Sfoglia il codice sorgente

add some Unicode code

ambrop7 14 anni fa
parent
commit
5cffb05fd7
5 ha cambiato i file con 601 aggiunte e 0 eliminazioni
  1. 106 0
      misc/Utf16Decoder.h
  2. 60 0
      misc/Utf16Encoder.h
  3. 136 0
      misc/Utf8Decoder.h
  4. 74 0
      misc/Utf8Encoder.h
  5. 225 0
      misc/unicode_funcs.h

+ 106 - 0
misc/Utf16Decoder.h

@@ -0,0 +1,106 @@
+/**
+ * @file Utf16Decoder.h
+ * @author Ambroz Bizjak <ambrop7@gmail.com>
+ * 
+ * @section LICENSE
+ * 
+ * This file is part of BadVPN.
+ * 
+ * BadVPN is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ * 
+ * BadVPN is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef BADVPN_UTF16DECODER_H
+#define BADVPN_UTF16DECODER_H
+
+#include <stdint.h>
+
+#include <misc/debug.h>
+
+/**
+ * Decodes UTF-16 data into Unicode characters.
+ * 
+ * The decoder is fed one 16-bit code unit at a time and keeps just enough
+ * state to join a high surrogate with the low surrogate that follows it.
+ */
+typedef struct {
+    int cont;     // nonzero after a high surrogate, while a low surrogate is expected
+    uint32_t ch;  // bits contributed by the pending high surrogate (meaningful only if cont)
+} Utf16Decoder;
+
+/**
+ * Initializes the UTF-16 decoder.
+ * 
+ * @param o the object
+ */
+static void Utf16Decoder_Init (Utf16Decoder *o);
+
+/**
+ * Inputs a 16-bit value to the decoder.
+ * 
+ * A high surrogate is remembered and produces no output. A low surrogate
+ * completes the pending high surrogate into a character in the range
+ * 0x10000 - 0x10FFFF. A low surrogate with no pending high surrogate is
+ * dropped, and a pending high surrogate that is not followed by a low
+ * surrogate is discarded without being reported.
+ * 
+ * @param o the object
+ * @param b 16-bit value to input
+ * @param out_ch will receive a Unicode character if this function returns 1.
+ *               If written, the character will be in the range 0 - 0x10FFFF,
+ *               excluding the surrogate range 0xD800 - 0xDFFF.
+ * @return 1 if a Unicode character has been written to *out_ch, 0 if not
+ */
+static int Utf16Decoder_Input (Utf16Decoder *o, uint16_t b, uint32_t *out_ch);
+
+void Utf16Decoder_Init (Utf16Decoder *o)
+{
+    o->cont = 0;
+    
+    // not read while cont is 0, but zeroed so the whole object is defined
+    o->ch = 0;
+}
+
+int Utf16Decoder_Input (Utf16Decoder *o, uint16_t b, uint32_t *out_ch)
+{
+    // high surrogate
+    if (b >= UINT16_C(0xD800) && b <= UINT16_C(0xDBFF)) {
+        // set continuation state; this replaces any earlier pending high surrogate
+        o->cont = 1;
+        
+        // remember the upper 10 bits of the offset from 0x10000
+        o->ch = (uint32_t)(b - UINT16_C(0xD800)) << 10;
+        
+        return 0;
+    }
+    
+    // low surrogate
+    if (b >= UINT16_C(0xDC00) && b <= UINT16_C(0xDFFF)) {
+        // a low surrogate is only valid right after a high surrogate
+        if (!o->cont) {
+            return 0;
+        }
+        
+        // reset state
+        o->cont = 0;
+        
+        // The pair encodes (code point - 0x10000) in 20 bits, so the base must
+        // be added back. The result is always within 0x10000 - 0x10FFFF and
+        // therefore can never fall into the surrogate range.
+        *out_ch = UINT32_C(0x10000) + (o->ch | (uint32_t)(b - UINT16_C(0xDC00)));
+        return 1;
+    }
+    
+    // ordinary BMP code unit; drop any pending high surrogate
+    o->cont = 0;
+    
+    // return character
+    *out_ch = b;
+    return 1;
+}
+
+#endif

+ 60 - 0
misc/Utf16Encoder.h

@@ -0,0 +1,60 @@
+/**
+ * @file Utf16Encoder.h
+ * @author Ambroz Bizjak <ambrop7@gmail.com>
+ * 
+ * @section LICENSE
+ * 
+ * This file is part of BadVPN.
+ * 
+ * BadVPN is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ * 
+ * BadVPN is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef BADVPN_UTF16ENCODER_H
+#define BADVPN_UTF16ENCODER_H
+
+#include <stdint.h>
+
+/**
+ * Encodes one Unicode character as UTF-16.
+ * 
+ * Characters up to 0xFFFF are written as a single 16-bit value; characters
+ * from 0x10000 to 0x10FFFF are written as a high/low surrogate pair.
+ * Surrogate code points (0xD800 - 0xDFFF) and values above 0x10FFFF cannot
+ * be encoded; in that case nothing is written to out.
+ * 
+ * @param ch Unicode character to encode
+ * @param out will receive the encoded 16-bit values. Must have space for 2 values.
+ * @return number of 16-bit values written, 0-2, with 0 meaning the character cannot
+ *         be encoded
+ */
+static int Utf16Encoder_EncodeCharacter (uint32_t ch, uint16_t *out);
+
+int Utf16Encoder_EncodeCharacter (uint32_t ch, uint16_t *out)
+{
+    // beyond the Unicode range
+    if (ch > UINT32_C(0x10FFFF)) {
+        return 0;
+    }
+    
+    // supplementary planes: split the 20-bit offset into a surrogate pair
+    if (ch > UINT32_C(0xFFFF)) {
+        uint32_t offset = ch - UINT32_C(0x10000);
+        uint32_t high = UINT32_C(0xD800) + (offset >> 10);
+        uint32_t low = UINT32_C(0xDC00) + (offset & UINT32_C(0x3FF));
+        out[0] = (uint16_t)high;
+        out[1] = (uint16_t)low;
+        return 2;
+    }
+    
+    // surrogate code points are not characters
+    if (ch >= UINT32_C(0xD800) && ch <= UINT32_C(0xDFFF)) {
+        return 0;
+    }
+    
+    // BMP character, stored as-is
+    out[0] = (uint16_t)ch;
+    return 1;
+}
+
+#endif

+ 136 - 0
misc/Utf8Decoder.h

@@ -0,0 +1,136 @@
+/**
+ * @file Utf8Decoder.h
+ * @author Ambroz Bizjak <ambrop7@gmail.com>
+ * 
+ * @section LICENSE
+ * 
+ * This file is part of BadVPN.
+ * 
+ * BadVPN is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ * 
+ * BadVPN is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef BADVPN_UTF8DECODER_H
+#define BADVPN_UTF8DECODER_H
+
+#include <stdint.h>
+
+#include <misc/debug.h>
+
+/**
+ * Decodes UTF-8 data into Unicode characters.
+ * 
+ * Holds the state of the multi-byte sequence currently being decoded, so
+ * that input can be supplied one byte at a time.
+ */
+typedef struct {
+    int bytes;    // length (2-4) of the sequence in progress, or 0 when no sequence is in progress
+    int pos;      // index within the sequence of the next expected byte; set to 1 after a lead byte
+    uint32_t ch;  // code point bits accumulated so far for the sequence in progress
+} Utf8Decoder;
+
+/**
+ * Initializes the UTF-8 decoder.
+ * After this call the decoder has no multi-byte sequence in progress.
+ * 
+ * @param o the object
+ */
+static void Utf8Decoder_Init (Utf8Decoder *o);
+
+/**
+ * Inputs a byte to the decoder.
+ * 
+ * A lead byte starts a new sequence and abandons any sequence that was in
+ * progress. A byte that is neither a valid lead byte nor a continuation
+ * byte of a sequence in progress resets the decoder and is dropped.
+ * Overlong encodings are not rejected by the decoder; callers that need
+ * strict validation have to detect them themselves (for example by
+ * re-encoding the character and comparing with the input).
+ * 
+ * @param o the object
+ * @param b byte to input
+ * @param out_ch will receive a Unicode character if this function returns 1.
+ *               If written, the character will be in the range 0 - 0x10FFFF,
+ *               excluding the surrogate range 0xD800 - 0xDFFF.
+ * @return 1 if a Unicode character has been written to *out_ch, 0 if not
+ */
+static int Utf8Decoder_Input (Utf8Decoder *o, uint8_t b, uint32_t *out_ch);
+
+// Puts the decoder into its idle state (no multi-byte sequence in progress).
+void Utf8Decoder_Init (Utf8Decoder *o)
+{
+    // bytes == 0 is what marks the decoder as idle
+    o->bytes = 0;
+    
+    // not read while idle, but zeroed so the whole object has a defined value
+    o->pos = 0;
+    o->ch = 0;
+}
+
+// Feeds one byte to the decoder; returns 1 and writes *out_ch when the byte
+// completes a character, 0 otherwise.
+int Utf8Decoder_Input (Utf8Decoder *o, uint8_t b, uint32_t *out_ch)
+{
+    // ASCII byte: a complete character on its own
+    if (b < 0x80) {
+        o->bytes = 0;
+        *out_ch = b;
+        return 1;
+    }
+    
+    // classify lead bytes: sequence length and mask of the payload bits
+    int seq_len = 0;
+    uint8_t payload_mask = 0;
+    if ((b & 0xE0) == 0xC0) {
+        seq_len = 2;
+        payload_mask = 0x1F;
+    } else if ((b & 0xF0) == 0xE0) {
+        seq_len = 3;
+        payload_mask = 0x0F;
+    } else if ((b & 0xF8) == 0xF0) {
+        seq_len = 4;
+        payload_mask = 0x07;
+    }
+    
+    // lead byte: start a new sequence, abandoning any sequence in progress
+    if (seq_len != 0) {
+        o->bytes = seq_len;
+        o->pos = 1;
+        o->ch = (uint32_t)(b & payload_mask) << (6 * (seq_len - 1));
+        return 0;
+    }
+    
+    // anything other than a continuation byte of a sequence in progress
+    // is an error; reset state and drop the byte
+    if ((b & 0xC0) != 0x80 || o->bytes <= 0) {
+        o->bytes = 0;
+        return 0;
+    }
+    
+    ASSERT(o->bytes <= 4)
+    ASSERT(o->pos > 0)
+    ASSERT(o->pos < o->bytes)
+    
+    // number of continuation bytes still expected after this one
+    int remaining = o->bytes - o->pos - 1;
+    
+    // merge the six payload bits of this byte at their final position
+    o->ch |= (uint32_t)(b & 0x3F) << (6 * remaining);
+    
+    // more bytes to come, advance within the sequence
+    if (remaining != 0) {
+        o->pos++;
+        return 0;
+    }
+    
+    // sequence complete, go back to idle
+    o->bytes = 0;
+    
+    // don't report out-of-range characters or surrogates
+    uint32_t decoded = o->ch;
+    if (decoded > UINT32_C(0x10FFFF) || (decoded >= UINT32_C(0xD800) && decoded <= UINT32_C(0xDFFF))) {
+        return 0;
+    }
+    
+    *out_ch = decoded;
+    return 1;
+}
+
+#endif

+ 74 - 0
misc/Utf8Encoder.h

@@ -0,0 +1,74 @@
+/**
+ * @file Utf8Encoder.h
+ * @author Ambroz Bizjak <ambrop7@gmail.com>
+ * 
+ * @section LICENSE
+ * 
+ * This file is part of BadVPN.
+ * 
+ * BadVPN is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ * 
+ * BadVPN is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef BADVPN_UTF8ENCODER_H
+#define BADVPN_UTF8ENCODER_H
+
+#include <stdint.h>
+
+/**
+ * Encodes a Unicode character into a sequence of bytes according to UTF-8.
+ * 
+ * All characters in the range 0 - 0x10FFFF can be encoded, except for the
+ * surrogate code points 0xD800 - 0xDFFF. When the character cannot be
+ * encoded, nothing is written to out.
+ * 
+ * @param ch Unicode character to encode
+ * @param out will receive the encoded bytes. Must have space for 4 bytes.
+ * @return number of bytes written, 0-4, with 0 meaning the character cannot
+ *         be encoded
+ */
+static int Utf8Encoder_EncodeCharacter (uint32_t ch, uint8_t *out);
+
+int Utf8Encoder_EncodeCharacter (uint32_t ch, uint8_t *out)
+{
+    // one byte: 0xxxxxxx
+    if (ch <= UINT32_C(0x007F)) {
+        out[0] = (uint8_t)ch;
+        return 1;
+    }
+    
+    // two bytes: 110xxxxx 10xxxxxx
+    if (ch <= UINT32_C(0x07FF)) {
+        out[0] = (uint8_t)(0xC0 | (ch >> 6));
+        out[1] = (uint8_t)(0x80 | ((ch >> 0) & 0x3F));
+        return 2;
+    }
+    
+    // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
+    if (ch <= UINT32_C(0xFFFF)) {
+        // surrogates
+        if (ch >= UINT32_C(0xD800) && ch <= UINT32_C(0xDFFF)) {
+            return 0;
+        }
+        
+        out[0] = (uint8_t)(0xE0 | (ch >> 12));
+        out[1] = (uint8_t)(0x80 | ((ch >> 6) & 0x3F));
+        out[2] = (uint8_t)(0x80 | ((ch >> 0) & 0x3F));
+        return 3;
+    }
+    
+    // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    // The bound is inclusive: 0x10FFFF is the last valid code point.
+    if (ch <= UINT32_C(0x10FFFF)) {
+        out[0] = (uint8_t)(0xF0 | (ch >> 18));
+        out[1] = (uint8_t)(0x80 | ((ch >> 12) & 0x3F));
+        out[2] = (uint8_t)(0x80 | ((ch >> 6) & 0x3F));
+        out[3] = (uint8_t)(0x80 | ((ch >> 0) & 0x3F));
+        return 4;
+    }
+    
+    // beyond the Unicode range
+    return 0;
+}
+
+#endif

+ 225 - 0
misc/unicode_funcs.h

@@ -0,0 +1,225 @@
+/**
+ * @file unicode_funcs.h
+ * @author Ambroz Bizjak <ambrop7@gmail.com>
+ * 
+ * @section LICENSE
+ * 
+ * This file is part of BadVPN.
+ * 
+ * BadVPN is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ * 
+ * BadVPN is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef BADVPN_UNICODE_FUNCS_H
+#define BADVPN_UNICODE_FUNCS_H
+
+#include <misc/expstring.h>
+#include <misc/bsize.h>
+#include <misc/Utf8Encoder.h>
+#include <misc/Utf8Decoder.h>
+#include <misc/Utf16Encoder.h>
+#include <misc/Utf16Decoder.h>
+
+/**
+ * Decodes UTF-16 data as bytes into an allocated null-terminated UTF-8 string.
+ * 
+ * Input that does not decode to a character is skipped, and decoded null
+ * characters are left out of the result; both cases are reported through
+ * out_is_error. A trailing odd byte is not decoded and is reported as an
+ * error as well.
+ * 
+ * @param data UTF-16 data, in big endian
+ * @param data_len size of data in bytes
+ * @param out_is_error if not NULL and the function returns a string,
+ *                     *out_is_error will be set to 0 or 1, indicating
+ *                     whether there have been errors decoding the input.
+ *                     A null decoded character is treated as an error.
+ * @return An UTF-8 null-terminated string which can be freed with free(),
+ *         or NULL if out of memory.
+ */
+static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error);
+
+/**
+ * Decodes UTF-8 data into UTF-16 data as bytes.
+ * 
+ * The UTF-16 output is written in big endian. If the output buffer is too
+ * small, as many bytes as fit are written and the rest is discarded;
+ * *out_len still reports the size of the complete result.
+ * 
+ * @param data UTF-8 data
+ * @param data_len size of data in bytes
+ * @param out output buffer
+ * @param out_avail number of bytes available in output buffer
+ * @param out_len if not NULL, *out_len will contain the number of bytes
+ *                required to store the resulting data (or overflow)
+ * @param out_is_error if not NULL, *out_is_error will contain 0 or 1,
+ *                     indicating whether there have been errors decoding
+ *                     the input
+ */
+static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error);
+
+// Converts big-endian UTF-16 bytes into a newly built null-terminated UTF-8
+// string. Errors are detected by re-encoding every decoded character and
+// comparing the result with the input, byte for byte.
+static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error)
+{
+    // the UTF-8 result is accumulated here
+    ExpString result;
+    if (!ExpString_Init(&result)) {
+        return NULL;
+    }
+    
+    Utf16Decoder dec;
+    Utf16Decoder_Init(&dec);
+    
+    // read_pos: next input byte to decode
+    // match_pos: how far the input has been confirmed by re-encoding
+    size_t read_pos = 0;
+    size_t match_pos = 0;
+    
+    int had_error = 0;
+    
+    // consume complete 16-bit units; a trailing odd byte ends the loop
+    while (data_len - read_pos >= 2) {
+        // assemble the big-endian 16-bit value
+        uint8_t hi_byte = data[read_pos];
+        uint8_t lo_byte = data[read_pos + 1];
+        read_pos += 2;
+        uint16_t unit = (((uint16_t)hi_byte << 8) | (uint16_t)lo_byte);
+        
+        // the decoder may or may not have a character for us yet
+        uint32_t ch;
+        if (!Utf16Decoder_Input(&dec, unit, &ch)) {
+            continue;
+        }
+        
+        // Re-encode the character and check that it reproduces the input at
+        // the matching position. Once an error is known there is no point in
+        // matching any further.
+        if (!had_error) {
+            uint16_t reenc[2];
+            int reenc_n = Utf16Encoder_EncodeCharacter(ch, reenc);
+            ASSERT(reenc_n > 0)
+            
+            for (int k = 0; k < reenc_n && !had_error; k++) {
+                uint8_t expect[2];
+                expect[0] = (reenc[k] >> 8);
+                expect[1] = (reenc[k] & 0xFF);
+                
+                for (int j = 0; j < 2; j++) {
+                    if (match_pos >= data_len || data[match_pos] != expect[j]) {
+                        had_error = 1;
+                        break;
+                    }
+                    match_pos++;
+                }
+            }
+        }
+        
+        // a null character would terminate the resulting string early, so
+        // leave it out and report an error
+        if (ch == 0) {
+            had_error = 1;
+            continue;
+        }
+        
+        // encode as UTF-8, with room for the terminating null
+        uint8_t utf8[5];
+        int utf8_n = Utf8Encoder_EncodeCharacter(ch, utf8);
+        ASSERT(utf8_n > 0)
+        utf8[utf8_n] = 0;
+        
+        // append to the result; this fails only when out of memory
+        if (!ExpString_Append(&result, utf8)) {
+            ExpString_Free(&result);
+            return NULL;
+        }
+    }
+    
+    // any input not confirmed by re-encoding means something was skipped
+    if (match_pos < data_len) {
+        had_error = 1;
+    }
+    
+    if (out_is_error) {
+        *out_is_error = had_error;
+    }
+    return ExpString_Get(&result);
+}
+
+// Converts UTF-8 bytes into big-endian UTF-16 bytes, writing as much as fits
+// into the output buffer and reporting the size of the complete result.
+// Errors are detected by re-encoding every decoded character and comparing
+// the result with the input, byte for byte.
+static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error)
+{
+    Utf8Decoder dec;
+    Utf8Decoder_Init(&dec);
+    
+    // how far the input has been confirmed by re-encoding
+    size_t match_pos = 0;
+    
+    // size of the complete result, whether or not it fits into out
+    bsize_t needed = bsize_fromsize(0);
+    
+    int had_error = 0;
+    
+    for (size_t read_pos = 0; read_pos < data_len; read_pos++) {
+        // the decoder may or may not have a character for us yet
+        uint32_t ch;
+        if (!Utf8Decoder_Input(&dec, data[read_pos], &ch)) {
+            continue;
+        }
+        
+        // Re-encode the character and check that it reproduces the input at
+        // the matching position. Once an error is known there is no point in
+        // matching any further.
+        if (!had_error) {
+            uint8_t reenc[4];
+            int reenc_n = Utf8Encoder_EncodeCharacter(ch, reenc);
+            ASSERT(reenc_n > 0)
+            
+            for (int k = 0; k < reenc_n; k++) {
+                if (match_pos >= data_len || data[match_pos] != reenc[k]) {
+                    had_error = 1;
+                    break;
+                }
+                match_pos++;
+            }
+        }
+        
+        // encode as UTF-16
+        uint16_t units[2];
+        int units_n = Utf16Encoder_EncodeCharacter(ch, units);
+        ASSERT(units_n > 0)
+        
+        // account for the full size even if the output does not fit
+        needed = bsize_add(needed, bsize_fromsize(2 * units_n));
+        
+        // write high byte then low byte of each unit while space remains
+        for (int k = 0; k < units_n; k++) {
+            uint8_t pair[2];
+            pair[0] = (units[k] >> 8);
+            pair[1] = (units[k] & 0xFF);
+            
+            for (int j = 0; j < 2 && out_avail > 0; j++) {
+                *(out++) = pair[j];
+                out_avail--;
+            }
+        }
+    }
+    
+    // any input not confirmed by re-encoding means something was skipped
+    if (match_pos < data_len) {
+        had_error = 1;
+    }
+    
+    if (out_len) {
+        *out_len = needed;
+    }
+    if (out_is_error) {
+        *out_is_error = had_error;
+    }
+}
+
+#endif