|
@@ -0,0 +1,225 @@
|
|
|
|
|
+/**
|
|
|
|
|
+ * @file unicode_funcs.h
|
|
|
|
|
+ * @author Ambroz Bizjak <ambrop7@gmail.com>
|
|
|
|
|
+ *
|
|
|
|
|
+ * @section LICENSE
|
|
|
|
|
+ *
|
|
|
|
|
+ * This file is part of BadVPN.
|
|
|
|
|
+ *
|
|
|
|
|
+ * BadVPN is free software: you can redistribute it and/or modify
|
|
|
|
|
+ * it under the terms of the GNU General Public License version 2
|
|
|
|
|
+ * as published by the Free Software Foundation.
|
|
|
|
|
+ *
|
|
|
|
|
+ * BadVPN is distributed in the hope that it will be useful,
|
|
|
|
|
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
+ * GNU General Public License for more details.
|
|
|
|
|
+ *
|
|
|
|
|
+ * You should have received a copy of the GNU General Public License along
|
|
|
|
|
+ * with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
|
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
|
|
|
+ */
|
|
|
|
|
+
|
|
|
|
|
+#ifndef BADVPN_UNICODE_FUNCS_H
|
|
|
|
|
+#define BADVPN_UNICODE_FUNCS_H
|
|
|
|
|
+
|
|
|
|
|
+#include <misc/expstring.h>
|
|
|
|
|
+#include <misc/bsize.h>
|
|
|
|
|
+#include <misc/Utf8Encoder.h>
|
|
|
|
|
+#include <misc/Utf8Decoder.h>
|
|
|
|
|
+#include <misc/Utf16Encoder.h>
|
|
|
|
|
+#include <misc/Utf16Decoder.h>
|
|
|
|
|
+
|
|
|
|
|
+/**
|
|
|
|
|
+ * Decodes UTF-16 data as bytes into an allocated null-terminated UTF-8 string.
|
|
|
|
|
+ *
|
|
|
|
|
+ * @param data UTF-16 data, in big endian
|
|
|
|
|
+ * @param data_len size of data in bytes
|
|
|
|
|
+ * @param out_is_error if not NULL and the function returns a string,
|
|
|
|
|
+ * *out_is_error will be set to 0 or 1, indicating
|
|
|
|
|
+ * whether there have been errors decoding the input.
|
|
|
|
|
+ * A null decoded character is treated as an error and is omitted from the result.
|
|
|
|
|
+ * @return A null-terminated UTF-8 string which can be freed with free(),
|
|
|
|
|
+ * or NULL if out of memory.
|
|
|
|
|
+ */
|
|
|
|
|
+static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error);
|
|
|
|
|
+
|
|
|
|
|
+/**
|
|
|
|
|
+ * Decodes UTF-8 data into UTF-16 data as bytes, in big endian.
|
|
|
|
|
+ *
|
|
|
|
|
+ * @param data UTF-8 data
|
|
|
|
|
+ * @param data_len size of data in bytes
|
|
|
|
|
+ * @param out output buffer
|
|
|
|
|
+ * @param out_avail number of bytes available in output buffer
|
|
|
|
|
+ * @param out_len if not NULL, *out_len will contain the number of bytes
|
|
|
|
|
+ * required to store the resulting data (or overflow)
|
|
|
|
|
+ * @param out_is_error if not NULL, *out_is_error will contain 0 or 1,
|
|
|
|
|
+ * indicating whether there have been errors decoding
|
|
|
|
|
+ * the input
|
|
|
|
|
+ */
|
|
|
|
|
+static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error);
|
|
|
|
|
+
|
|
|
|
|
+static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error)
|
|
|
|
|
+{
|
|
|
|
|
+ // will build the resulting UTF-8 string by appending to ExpString
|
|
|
|
|
+ ExpString str;
|
|
|
|
|
+ if (!ExpString_Init(&str)) {
|
|
|
|
|
+ goto fail0;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // init UTF-16 decoder
|
|
|
|
|
+ Utf16Decoder decoder;
|
|
|
|
|
+ Utf16Decoder_Init(&decoder);
|
|
|
|
|
+
|
|
|
|
|
+ // set initial input and input matching positions
|
|
|
|
|
+ size_t i_in = 0;
|
|
|
|
|
+ size_t i_ch = 0;
|
|
|
|
|
+
|
|
|
|
|
+ int error = 0;
|
|
|
|
|
+
|
|
|
|
|
+ while (i_in < data_len) {
|
|
|
|
|
+ // read two input bytes from the input position
|
|
|
|
|
+ uint8_t x = data[i_in++];
|
|
|
|
|
+ if (i_in == data_len) {
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ uint8_t y = data[i_in++];
|
|
|
|
|
+
|
|
|
|
|
+ // combine them into a 16-bit value
|
|
|
|
|
+ uint16_t xy = (((uint16_t)x << 8) | (uint16_t)y);
|
|
|
|
|
+
|
|
|
|
|
+ // give the 16-bit value to the UTF-16 decoder and maybe
|
|
|
|
|
+ // receive a Unicode character back
|
|
|
|
|
+ uint32_t ch;
|
|
|
|
|
+ if (!Utf16Decoder_Input(&decoder, xy, &ch)) {
|
|
|
|
|
+ continue;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if (!error) {
|
|
|
|
|
+ // encode the Unicode character back into UTF-16
|
|
|
|
|
+ uint16_t chenc[2];
|
|
|
|
|
+ int chenc_n = Utf16Encoder_EncodeCharacter(ch, chenc);
|
|
|
|
|
+ ASSERT(chenc_n > 0)
|
|
|
|
|
+
|
|
|
|
|
+ // match the result with input
|
|
|
|
|
+ for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
|
|
|
|
|
+ uint8_t cx = (chenc[chenc_i] >> 8);
|
|
|
|
|
+ uint8_t cy = (chenc[chenc_i] & 0xFF);
|
|
|
|
|
+
|
|
|
|
|
+ if (i_ch >= data_len || data[i_ch] != cx) {
|
|
|
|
|
+ error = 1;
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ i_ch++;
|
|
|
|
|
+
|
|
|
|
|
+ if (i_ch >= data_len || data[i_ch] != cy) {
|
|
|
|
|
+ error = 1;
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ i_ch++;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // we don't like null Unicode characters because we're building a
|
|
|
|
|
+ // null-terminated UTF-8 string
|
|
|
|
|
+ if (ch == 0) {
|
|
|
|
|
+ error = 1;
|
|
|
|
|
+ continue;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // encode the Unicode character into UTF-8
|
|
|
|
|
+ uint8_t enc[5];
|
|
|
|
|
+ int enc_n = Utf8Encoder_EncodeCharacter(ch, enc);
|
|
|
|
|
+ ASSERT(enc_n > 0)
|
|
|
|
|
+
|
|
|
|
|
+ // append the resulting UTF-8 bytes to the result string
|
|
|
|
|
+ enc[enc_n] = 0;
|
|
|
|
|
+ if (!ExpString_Append(&str, enc)) {
|
|
|
|
|
+ goto fail1;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // check if we matched the whole input string when encoding back
|
|
|
|
|
+ if (i_ch < data_len) {
|
|
|
|
|
+ error = 1;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if (out_is_error) {
|
|
|
|
|
+ *out_is_error = error;
|
|
|
|
|
+ }
|
|
|
|
|
+ return ExpString_Get(&str);
|
|
|
|
|
+
|
|
|
|
|
+fail1:
|
|
|
|
|
+ ExpString_Free(&str);
|
|
|
|
|
+fail0:
|
|
|
|
|
+ return NULL;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error)
|
|
|
|
|
+{
|
|
|
|
|
+ Utf8Decoder decoder;
|
|
|
|
|
+ Utf8Decoder_Init(&decoder);
|
|
|
|
|
+
|
|
|
|
|
+ size_t i_in = 0;
|
|
|
|
|
+ size_t i_ch = 0;
|
|
|
|
|
+
|
|
|
|
|
+ bsize_t len = bsize_fromsize(0);
|
|
|
|
|
+
|
|
|
|
|
+ int error = 0;
|
|
|
|
|
+
|
|
|
|
|
+ while (i_in < data_len) {
|
|
|
|
|
+ uint8_t x = data[i_in++];
|
|
|
|
|
+
|
|
|
|
|
+ uint32_t ch;
|
|
|
|
|
+ if (!Utf8Decoder_Input(&decoder, x, &ch)) {
|
|
|
|
|
+ continue;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if (!error) {
|
|
|
|
|
+ uint8_t chenc[4];
|
|
|
|
|
+ int chenc_n = Utf8Encoder_EncodeCharacter(ch, chenc);
|
|
|
|
|
+ ASSERT(chenc_n > 0)
|
|
|
|
|
+
|
|
|
|
|
+ for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
|
|
|
|
|
+ if (i_ch >= data_len || data[i_ch] != chenc[chenc_i]) {
|
|
|
|
|
+ error = 1;
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ i_ch++;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ uint16_t enc[2];
|
|
|
|
|
+ int enc_n = Utf16Encoder_EncodeCharacter(ch, enc);
|
|
|
|
|
+ ASSERT(enc_n > 0)
|
|
|
|
|
+
|
|
|
|
|
+ len = bsize_add(len, bsize_fromsize(2 * enc_n));
|
|
|
|
|
+
|
|
|
|
|
+ for (int enc_i = 0; enc_i < enc_n; enc_i++) {
|
|
|
|
|
+ if (out_avail == 0) {
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ *(out++) = (enc[enc_i] >> 8);
|
|
|
|
|
+ out_avail--;
|
|
|
|
|
+
|
|
|
|
|
+ if (out_avail == 0) {
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ *(out++) = (enc[enc_i] & 0xFF);
|
|
|
|
|
+ out_avail--;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if (i_ch < data_len) {
|
|
|
|
|
+ error = 1;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if (out_len) {
|
|
|
|
|
+ *out_len = len;
|
|
|
|
|
+ }
|
|
|
|
|
+ if (out_is_error) {
|
|
|
|
|
+ *out_is_error = error;
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+#endif
|