| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225 |
- /**
- * @file unicode_funcs.h
- * @author Ambroz Bizjak <ambrop7@gmail.com>
- *
- * @section LICENSE
- *
- * This file is part of BadVPN.
- *
- * BadVPN is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * BadVPN is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- */
- #ifndef BADVPN_UNICODE_FUNCS_H
- #define BADVPN_UNICODE_FUNCS_H
- #include <misc/expstring.h>
- #include <misc/bsize.h>
- #include <misc/Utf8Encoder.h>
- #include <misc/Utf8Decoder.h>
- #include <misc/Utf16Encoder.h>
- #include <misc/Utf16Decoder.h>
- /**
- * Decodes UTF-16 data as bytes into an allocated null-terminated UTF-8 string.
- *
- * @param data UTF-16 data, in big endian
- * @param data_len size of data in bytes
- * @param out_is_error if not NULL and the function returns a string,
- * *out_is_error will be set to 0 or 1, indicating
- * whether there have been errors decoding the input.
- * A null decoded character is treated as an error.
- * @return A null-terminated UTF-8 string which can be freed with free(),
- * or NULL if out of memory.
- */
- static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error);
- /**
- * Decodes UTF-8 data into UTF-16 data as bytes, in big endian.
- *
- * @param data UTF-8 data
- * @param data_len size of data in bytes
- * @param out output buffer
- * @param out_avail number of bytes available in output buffer
- * @param out_len if not NULL, *out_len will be set to the number of bytes
- * required to store the complete result, even if that is more
- * than out_avail (or to an overflow value)
- * @param out_is_error if not NULL, *out_is_error will contain 0 or 1,
- * indicating whether there have been errors decoding
- * the input
- */
- static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error);
- static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error)
- {
- // will build the resulting UTF-8 string by appending to ExpString
- ExpString str;
- if (!ExpString_Init(&str)) {
- goto fail0;
- }
-
- // init UTF-16 decoder
- Utf16Decoder decoder;
- Utf16Decoder_Init(&decoder);
-
- // set initial input and input matching positions
- size_t i_in = 0;
- size_t i_ch = 0;
-
- int error = 0;
-
- while (i_in < data_len) {
- // read two input bytes from the input position
- uint8_t x = data[i_in++];
- if (i_in == data_len) {
- break;
- }
- uint8_t y = data[i_in++];
-
- // combine them into a 16-bit value
- uint16_t xy = (((uint16_t)x << 8) | (uint16_t)y);
-
- // give the 16-bit value to the UTF-16 decoder and maybe
- // receive a Unicode character back
- uint32_t ch;
- if (!Utf16Decoder_Input(&decoder, xy, &ch)) {
- continue;
- }
-
- if (!error) {
- // encode the Unicode character back into UTF-16
- uint16_t chenc[2];
- int chenc_n = Utf16Encoder_EncodeCharacter(ch, chenc);
- ASSERT(chenc_n > 0)
-
- // match the result with input
- for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
- uint8_t cx = (chenc[chenc_i] >> 8);
- uint8_t cy = (chenc[chenc_i] & 0xFF);
-
- if (i_ch >= data_len || data[i_ch] != cx) {
- error = 1;
- break;
- }
- i_ch++;
-
- if (i_ch >= data_len || data[i_ch] != cy) {
- error = 1;
- break;
- }
- i_ch++;
- }
- }
-
- // we don't like null Unicode characters because we're building a
- // null-terminated UTF-8 string
- if (ch == 0) {
- error = 1;
- continue;
- }
-
- // encode the Unicode character into UTF-8
- uint8_t enc[5];
- int enc_n = Utf8Encoder_EncodeCharacter(ch, enc);
- ASSERT(enc_n > 0)
-
- // append the resulting UTF-8 bytes to the result string
- enc[enc_n] = 0;
- if (!ExpString_Append(&str, enc)) {
- goto fail1;
- }
- }
-
- // check if we matched the whole input string when encoding back
- if (i_ch < data_len) {
- error = 1;
- }
-
- if (out_is_error) {
- *out_is_error = error;
- }
- return ExpString_Get(&str);
-
- fail1:
- ExpString_Free(&str);
- fail0:
- return NULL;
- }
- static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error)
- {
- Utf8Decoder decoder;
- Utf8Decoder_Init(&decoder);
-
- size_t i_in = 0;
- size_t i_ch = 0;
-
- bsize_t len = bsize_fromsize(0);
-
- int error = 0;
-
- while (i_in < data_len) {
- uint8_t x = data[i_in++];
-
- uint32_t ch;
- if (!Utf8Decoder_Input(&decoder, x, &ch)) {
- continue;
- }
-
- if (!error) {
- uint8_t chenc[4];
- int chenc_n = Utf8Encoder_EncodeCharacter(ch, chenc);
- ASSERT(chenc_n > 0)
-
- for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
- if (i_ch >= data_len || data[i_ch] != chenc[chenc_i]) {
- error = 1;
- break;
- }
- i_ch++;
- }
- }
-
- uint16_t enc[2];
- int enc_n = Utf16Encoder_EncodeCharacter(ch, enc);
- ASSERT(enc_n > 0)
-
- len = bsize_add(len, bsize_fromsize(2 * enc_n));
-
- for (int enc_i = 0; enc_i < enc_n; enc_i++) {
- if (out_avail == 0) {
- break;
- }
- *(out++) = (enc[enc_i] >> 8);
- out_avail--;
-
- if (out_avail == 0) {
- break;
- }
- *(out++) = (enc[enc_i] & 0xFF);
- out_avail--;
- }
- }
-
- if (i_ch < data_len) {
- error = 1;
- }
-
- if (out_len) {
- *out_len = len;
- }
- if (out_is_error) {
- *out_is_error = error;
- }
- }
- #endif
|