unicode_funcs.h 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. /**
  2. * @file unicode_funcs.h
  3. * @author Ambroz Bizjak <ambrop7@gmail.com>
  4. *
  5. * @section LICENSE
  6. *
  7. * This file is part of BadVPN.
  8. *
  9. * BadVPN is free software: you can redistribute it and/or modify
  10. * it under the terms of the GNU General Public License version 2
  11. * as published by the Free Software Foundation.
  12. *
  13. * BadVPN is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. * GNU General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU General Public License along
  19. * with this program; if not, write to the Free Software Foundation, Inc.,
  20. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  21. */
  22. #ifndef BADVPN_UNICODE_FUNCS_H
  23. #define BADVPN_UNICODE_FUNCS_H
  24. #include <misc/expstring.h>
  25. #include <misc/bsize.h>
  26. #include <misc/Utf8Encoder.h>
  27. #include <misc/Utf8Decoder.h>
  28. #include <misc/Utf16Encoder.h>
  29. #include <misc/Utf16Decoder.h>
  /**
   * Converts UTF-16 input, supplied as a raw byte sequence, into a newly
   * allocated null-terminated UTF-8 string.
   *
   * @param data the UTF-16 input; every code unit occupies two bytes, most
   *             significant byte first (big endian)
   * @param data_len length of the input in bytes
   * @param out_is_error may be NULL. Otherwise, when a string is returned,
   *                     *out_is_error is set to 1 if a problem was found in
   *                     the input and to 0 if none was found. A decoded null
   *                     character counts as a problem and is left out of the
   *                     result; a trailing odd byte counts as a problem too.
   * @return a UTF-8 null-terminated string which can be freed with free(),
   *         or NULL if out of memory
   */
  static char *unicode_decode_utf16_to_utf8(const uint8_t *data, size_t data_len, int *out_is_error);
  43. /**
  44. * Decodes UTF-8 data into UTF-16 data as bytes.
  45. *
  46. * @param data UTF-8 data
  47. * @param data_len size of data in bytes
  48. * @param out output buffer
  49. * @param out_avail number of bytes available in output buffer
  50. * @param out_len if not NULL, *out_len will contain the number of bytes
  51. * required to store the resulting data (or overflow)
  52. * @param out_is_error if not NULL, *out_is_error will contain 0 or 1,
  53. * indicating whether there have been errors decoding
  54. * the input
  55. */
  56. static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error);
  57. static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error)
  58. {
  59. // will build the resulting UTF-8 string by appending to ExpString
  60. ExpString str;
  61. if (!ExpString_Init(&str)) {
  62. goto fail0;
  63. }
  64. // init UTF-16 decoder
  65. Utf16Decoder decoder;
  66. Utf16Decoder_Init(&decoder);
  67. // set initial input and input matching positions
  68. size_t i_in = 0;
  69. size_t i_ch = 0;
  70. int error = 0;
  71. while (i_in < data_len) {
  72. // read two input bytes from the input position
  73. uint8_t x = data[i_in++];
  74. if (i_in == data_len) {
  75. break;
  76. }
  77. uint8_t y = data[i_in++];
  78. // combine them into a 16-bit value
  79. uint16_t xy = (((uint16_t)x << 8) | (uint16_t)y);
  80. // give the 16-bit value to the UTF-16 decoder and maybe
  81. // receive a Unicode character back
  82. uint32_t ch;
  83. if (!Utf16Decoder_Input(&decoder, xy, &ch)) {
  84. continue;
  85. }
  86. if (!error) {
  87. // encode the Unicode character back into UTF-16
  88. uint16_t chenc[2];
  89. int chenc_n = Utf16Encoder_EncodeCharacter(ch, chenc);
  90. ASSERT(chenc_n > 0)
  91. // match the result with input
  92. for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
  93. uint8_t cx = (chenc[chenc_i] >> 8);
  94. uint8_t cy = (chenc[chenc_i] & 0xFF);
  95. if (i_ch >= data_len || data[i_ch] != cx) {
  96. error = 1;
  97. break;
  98. }
  99. i_ch++;
  100. if (i_ch >= data_len || data[i_ch] != cy) {
  101. error = 1;
  102. break;
  103. }
  104. i_ch++;
  105. }
  106. }
  107. // we don't like null Unicode characters because we're building a
  108. // null-terminated UTF-8 string
  109. if (ch == 0) {
  110. error = 1;
  111. continue;
  112. }
  113. // encode the Unicode character into UTF-8
  114. uint8_t enc[5];
  115. int enc_n = Utf8Encoder_EncodeCharacter(ch, enc);
  116. ASSERT(enc_n > 0)
  117. // append the resulting UTF-8 bytes to the result string
  118. enc[enc_n] = 0;
  119. if (!ExpString_Append(&str, enc)) {
  120. goto fail1;
  121. }
  122. }
  123. // check if we matched the whole input string when encoding back
  124. if (i_ch < data_len) {
  125. error = 1;
  126. }
  127. if (out_is_error) {
  128. *out_is_error = error;
  129. }
  130. return ExpString_Get(&str);
  131. fail1:
  132. ExpString_Free(&str);
  133. fail0:
  134. return NULL;
  135. }
  136. static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error)
  137. {
  138. Utf8Decoder decoder;
  139. Utf8Decoder_Init(&decoder);
  140. size_t i_in = 0;
  141. size_t i_ch = 0;
  142. bsize_t len = bsize_fromsize(0);
  143. int error = 0;
  144. while (i_in < data_len) {
  145. uint8_t x = data[i_in++];
  146. uint32_t ch;
  147. if (!Utf8Decoder_Input(&decoder, x, &ch)) {
  148. continue;
  149. }
  150. if (!error) {
  151. uint8_t chenc[4];
  152. int chenc_n = Utf8Encoder_EncodeCharacter(ch, chenc);
  153. ASSERT(chenc_n > 0)
  154. for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
  155. if (i_ch >= data_len || data[i_ch] != chenc[chenc_i]) {
  156. error = 1;
  157. break;
  158. }
  159. i_ch++;
  160. }
  161. }
  162. uint16_t enc[2];
  163. int enc_n = Utf16Encoder_EncodeCharacter(ch, enc);
  164. ASSERT(enc_n > 0)
  165. len = bsize_add(len, bsize_fromsize(2 * enc_n));
  166. for (int enc_i = 0; enc_i < enc_n; enc_i++) {
  167. if (out_avail == 0) {
  168. break;
  169. }
  170. *(out++) = (enc[enc_i] >> 8);
  171. out_avail--;
  172. if (out_avail == 0) {
  173. break;
  174. }
  175. *(out++) = (enc[enc_i] & 0xFF);
  176. out_avail--;
  177. }
  178. }
  179. if (i_ch < data_len) {
  180. error = 1;
  181. }
  182. if (out_len) {
  183. *out_len = len;
  184. }
  185. if (out_is_error) {
  186. *out_is_error = error;
  187. }
  188. }
  189. #endif