Utf8Encoder.h 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. /**
  2. * @file Utf8Encoder.h
  3. * @author Ambroz Bizjak <ambrop7@gmail.com>
  4. *
  5. * @section LICENSE
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions are met:
  9. * 1. Redistributions of source code must retain the above copyright
  10. * notice, this list of conditions and the following disclaimer.
  11. * 2. Redistributions in binary form must reproduce the above copyright
  12. * notice, this list of conditions and the following disclaimer in the
  13. * documentation and/or other materials provided with the distribution.
  14. * 3. Neither the name of the author nor the
  15. * names of its contributors may be used to endorse or promote products
  16. * derived from this software without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  19. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  20. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  21. * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  22. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  23. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  24. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  25. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  27. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28. */
  29. #ifndef BADVPN_UTF8ENCODER_H
  30. #define BADVPN_UTF8ENCODER_H
  31. #include <stdint.h>
  32. /**
  33. * Encodes a Unicode character into a sequence of bytes according to UTF-8.
  34. *
  35. * @param ch Unicode character to encode
  36. * @param out will receive the encoded bytes. Must have space for 4 bytes.
  37. * @return number of bytes written, 0-4, with 0 meaning the character cannot
  38. * be encoded
  39. */
  40. static int Utf8Encoder_EncodeCharacter (uint32_t ch, uint8_t *out);
  41. int Utf8Encoder_EncodeCharacter (uint32_t ch, uint8_t *out)
  42. {
  43. if (ch <= UINT32_C(0x007F)) {
  44. out[0] = ch;
  45. return 1;
  46. }
  47. if (ch <= UINT32_C(0x07FF)) {
  48. out[0] = (0xC0 | (ch >> 6));
  49. out[1] = (0x80 | ((ch >> 0) & 0x3F));
  50. return 2;
  51. }
  52. if (ch <= UINT32_C(0xFFFF)) {
  53. // surrogates
  54. if (ch >= UINT32_C(0xD800) && ch <= UINT32_C(0xDFFF)) {
  55. return 0;
  56. }
  57. out[0] = (0xE0 | (ch >> 12));
  58. out[1] = (0x80 | ((ch >> 6) & 0x3F));
  59. out[2] = (0x80 | ((ch >> 0) & 0x3F));
  60. return 3;
  61. }
  62. if (ch < UINT32_C(0x10FFFF)) {
  63. out[0] = (0xF0 | (ch >> 18));
  64. out[1] = (0x80 | ((ch >> 12) & 0x3F));
  65. out[2] = (0x80 | ((ch >> 6) & 0x3F));
  66. out[3] = (0x80 | ((ch >> 0) & 0x3F));
  67. return 4;
  68. }
  69. return 0;
  70. }
  71. #endif