unicode_funcs.h 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. /**
  2. * @file unicode_funcs.h
  3. * @author Ambroz Bizjak <ambrop7@gmail.com>
  4. *
  5. * @section LICENSE
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions are met:
  9. * 1. Redistributions of source code must retain the above copyright
  10. * notice, this list of conditions and the following disclaimer.
  11. * 2. Redistributions in binary form must reproduce the above copyright
  12. * notice, this list of conditions and the following disclaimer in the
  13. * documentation and/or other materials provided with the distribution.
  14. * 3. Neither the name of the author nor the
  15. * names of its contributors may be used to endorse or promote products
  16. * derived from this software without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  19. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  20. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  21. * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  22. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  23. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  24. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  25. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  27. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28. */
  29. #ifndef BADVPN_UNICODE_FUNCS_H
  30. #define BADVPN_UNICODE_FUNCS_H
  31. #include <misc/expstring.h>
  32. #include <misc/bsize.h>
  33. #include <misc/Utf8Encoder.h>
  34. #include <misc/Utf8Decoder.h>
  35. #include <misc/Utf16Encoder.h>
  36. #include <misc/Utf16Decoder.h>
/**
 * Decodes UTF-16 data, given as bytes, into an allocated null-terminated
 * UTF-8 string.
 *
 * The input is consumed two bytes at a time. Every decoded character is
 * re-encoded to UTF-16 and compared against the input; if the input does not
 * match exactly (for example because the decoder skipped something, or
 * because a trailing odd byte is left over), an error is reported. Decoded
 * null characters are reported as an error and are not added to the result.
 *
 * @param data UTF-16 data, in big endian
 * @param data_len size of data in bytes
 * @param out_is_error if not NULL and the function returns a string,
 *                     *out_is_error will be set to 0 or 1, indicating
 *                     whether there have been errors decoding the input.
 *                     A null decoded character is treated as an error.
 *                     Not written when the function returns NULL.
 * @return A UTF-8 null-terminated string which can be freed with free(),
 *         or NULL if out of memory.
 */
static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error);
/**
 * Decodes UTF-8 data into UTF-16 data as bytes.
 *
 * Each UTF-16 code unit is written as two bytes, most significant byte
 * first. If the output buffer is too small, only the first out_avail bytes
 * of the result are written; the required size is still computed in full.
 *
 * Every decoded character is re-encoded to UTF-8 and compared against the
 * input; if the input does not match exactly, or if some input is left
 * unmatched at the end, an error is reported.
 *
 * @param data UTF-8 data
 * @param data_len size of data in bytes
 * @param out output buffer
 * @param out_avail number of bytes available in output buffer
 * @param out_len if not NULL, *out_len will contain the number of bytes
 *                required to store the resulting data (or overflow)
 * @param out_is_error if not NULL, *out_is_error will contain 0 or 1,
 *                     indicating whether there have been errors decoding
 *                     the input
 */
static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error);
  64. static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error)
  65. {
  66. // will build the resulting UTF-8 string by appending to ExpString
  67. ExpString str;
  68. if (!ExpString_Init(&str)) {
  69. goto fail0;
  70. }
  71. // init UTF-16 decoder
  72. Utf16Decoder decoder;
  73. Utf16Decoder_Init(&decoder);
  74. // set initial input and input matching positions
  75. size_t i_in = 0;
  76. size_t i_ch = 0;
  77. int error = 0;
  78. while (i_in < data_len) {
  79. // read two input bytes from the input position
  80. uint8_t x = data[i_in++];
  81. if (i_in == data_len) {
  82. break;
  83. }
  84. uint8_t y = data[i_in++];
  85. // combine them into a 16-bit value
  86. uint16_t xy = (((uint16_t)x << 8) | (uint16_t)y);
  87. // give the 16-bit value to the UTF-16 decoder and maybe
  88. // receive a Unicode character back
  89. uint32_t ch;
  90. if (!Utf16Decoder_Input(&decoder, xy, &ch)) {
  91. continue;
  92. }
  93. if (!error) {
  94. // encode the Unicode character back into UTF-16
  95. uint16_t chenc[2];
  96. int chenc_n = Utf16Encoder_EncodeCharacter(ch, chenc);
  97. ASSERT(chenc_n > 0)
  98. // match the result with input
  99. for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
  100. uint8_t cx = (chenc[chenc_i] >> 8);
  101. uint8_t cy = (chenc[chenc_i] & 0xFF);
  102. if (i_ch >= data_len || data[i_ch] != cx) {
  103. error = 1;
  104. break;
  105. }
  106. i_ch++;
  107. if (i_ch >= data_len || data[i_ch] != cy) {
  108. error = 1;
  109. break;
  110. }
  111. i_ch++;
  112. }
  113. }
  114. // we don't like null Unicode characters because we're building a
  115. // null-terminated UTF-8 string
  116. if (ch == 0) {
  117. error = 1;
  118. continue;
  119. }
  120. // encode the Unicode character into UTF-8
  121. uint8_t enc[5];
  122. int enc_n = Utf8Encoder_EncodeCharacter(ch, enc);
  123. ASSERT(enc_n > 0)
  124. // append the resulting UTF-8 bytes to the result string
  125. enc[enc_n] = 0;
  126. if (!ExpString_Append(&str, enc)) {
  127. goto fail1;
  128. }
  129. }
  130. // check if we matched the whole input string when encoding back
  131. if (i_ch < data_len) {
  132. error = 1;
  133. }
  134. if (out_is_error) {
  135. *out_is_error = error;
  136. }
  137. return ExpString_Get(&str);
  138. fail1:
  139. ExpString_Free(&str);
  140. fail0:
  141. return NULL;
  142. }
  143. static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error)
  144. {
  145. Utf8Decoder decoder;
  146. Utf8Decoder_Init(&decoder);
  147. size_t i_in = 0;
  148. size_t i_ch = 0;
  149. bsize_t len = bsize_fromsize(0);
  150. int error = 0;
  151. while (i_in < data_len) {
  152. uint8_t x = data[i_in++];
  153. uint32_t ch;
  154. if (!Utf8Decoder_Input(&decoder, x, &ch)) {
  155. continue;
  156. }
  157. if (!error) {
  158. uint8_t chenc[4];
  159. int chenc_n = Utf8Encoder_EncodeCharacter(ch, chenc);
  160. ASSERT(chenc_n > 0)
  161. for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
  162. if (i_ch >= data_len || data[i_ch] != chenc[chenc_i]) {
  163. error = 1;
  164. break;
  165. }
  166. i_ch++;
  167. }
  168. }
  169. uint16_t enc[2];
  170. int enc_n = Utf16Encoder_EncodeCharacter(ch, enc);
  171. ASSERT(enc_n > 0)
  172. len = bsize_add(len, bsize_fromsize(2 * enc_n));
  173. for (int enc_i = 0; enc_i < enc_n; enc_i++) {
  174. if (out_avail == 0) {
  175. break;
  176. }
  177. *(out++) = (enc[enc_i] >> 8);
  178. out_avail--;
  179. if (out_avail == 0) {
  180. break;
  181. }
  182. *(out++) = (enc[enc_i] & 0xFF);
  183. out_avail--;
  184. }
  185. }
  186. if (i_ch < data_len) {
  187. error = 1;
  188. }
  189. if (out_len) {
  190. *out_len = len;
  191. }
  192. if (out_is_error) {
  193. *out_is_error = error;
  194. }
  195. }
  196. #endif