utf16.h

#ifndef utf16_h
#define utf16_h
// this header needs the following includes
#include "wchar.h"
#include "ascii.h"
#include "unicode.h"
// UTF16 encodes a code point in one or two 2-byte code units;
// each of these 2-byte building blocks is affected by endianness
// https://stackoverflow.com/questions/6240055/manually-converting-unicode-codepoints-into-utf-8-and-utf-16
// https://en.wikipedia.org/wiki/UTF-16
// https://linjan2.github.io/utf16-utf16.html
// not every code point is valid in UTF16: the surrogate range listed
// below is reserved and cannot represent a character on its own
// (a worked encoding example follows the tables below)
// BIG ENDIAN
// code point range    | bin code point representation | bin write representation            | bits encoded
// 0x0000   - 0xD7FF   | XXXXXXXX YYYYYYYY             | XXXXXXXX YYYYYYYY                   | 16
// 0xD800   - 0xDFFF   | (invalid code points)         | ...                                 | ...
// 0xE000   - 0xFFFF   | XXXXXXXX YYYYYYYY             | XXXXXXXX YYYYYYYY                   | 16
// 0x010000 - 0x10FFFF | XXYY YYYYYYWW ZZZZZZZZ        | 110110XX YYYYYYYY 110111WW ZZZZZZZZ | 20
// LITTLE ENDIAN (bs)
// code point range    | bin code point representation | bin write representation            | bits encoded
// 0x0000   - 0xD7FF   | XXXXXXXX YYYYYYYY             | YYYYYYYY XXXXXXXX                   | 16
// 0xD800   - 0xDFFF   | (invalid code points)         | ...                                 | ...
// 0xE000   - 0xFFFF   | XXXXXXXX YYYYYYYY             | YYYYYYYY XXXXXXXX                   | 16
// 0x010000 - 0x10FFFF | XXYY YYYYYYWW ZZZZZZZZ        | YYYYYYYY 110110XX ZZZZZZZZ 110111WW | 20
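// worked example (an illustrative addition, not part of the original table):
// U+1F600 lies in the last range, so the bias 0x10000 is subtracted first
//   0x1F600 - 0x10000 = 0x0F600 -> 0000 1111 0110 0000 0000 (20 bits)
//   high 10 bits 0000111101 -> 0x03D -> 110110XX YYYYYYYY -> 0xD8 0x3D
//   low  10 bits 1000000000 -> 0x200 -> 110111WW ZZZZZZZZ -> 0xDE 0x00
//   UTF16BE bytes: D8 3D DE 00
//   UTF16LE bytes: 3D D8 00 DE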
// ranges of the valid integers
#define OWL_UTF16_RANGE1_MIN 0x000000
#define OWL_UTF16_RANGE1_MAX 0x00D7FF
#define OWL_UTF16_RANGE2_MIN 0x00E000
#define OWL_UTF16_RANGE2_MAX 0x00FFFF
#define OWL_UTF16_RANGE3_MIN 0x010000
#define OWL_UTF16_RANGE3_MAX 0x10FFFF
// ranges of the invalid integers
#define OWL_INV_UTF16_RANGE1_MIN 0xD800
#define OWL_INV_UTF16_RANGE1_MAX 0xDFFF
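// validity check sketch (an illustrative assumption, not necessarily how
// owl_check_wchar_utf16 is implemented): a code point cp is encodable in
// UTF16 iff it falls inside one of the three valid ranges above, e.g.
//   (cp <= OWL_UTF16_RANGE1_MAX) ||
//   (cp >= OWL_UTF16_RANGE2_MIN && cp <= OWL_UTF16_RANGE2_MAX) ||
//   (cp >= OWL_UTF16_RANGE3_MIN && cp <= OWL_UTF16_RANGE3_MAX)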
// for when the character is in the 0x010000 - 0x10FFFF range the following mask is used
#define OWL_UTF16_MASK_1 0xFC // 11111100
// applying the mask to the relevant byte (the first byte of each 2-byte unit
// for big endian, the second for little endian) yields one of the following
// values if the bytes encode a valid UTF16 surrogate pair
#define OWL_UTF16_BLOCK_1 0xD8 // 11011000
#define OWL_UTF16_BLOCK_2 0xDC // 11011100
// also, for the same range, this is the bias subtracted from the actual
// code point so that the remaining value can be encoded in 20 bits
#define OWL_UTF16_LAST_RANGE_BIAS 0x10000
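// decoding sketch for the last range (illustrative, assuming big endian
// bytes b[0..3] laid out as in the table above): the first byte of each
// 2-byte unit is tested against the mask, then the 10 payload bits of each
// surrogate are recombined and the bias is added back
//   if ((b[0] & OWL_UTF16_MASK_1) == OWL_UTF16_BLOCK_1 &&
//       (b[2] & OWL_UTF16_MASK_1) == OWL_UTF16_BLOCK_2)
//   {
//       owl_wchar hi = ((owl_wchar)(b[0] & 0x03) << 8) | b[1]; // 10 bits XX YYYYYYYY
//       owl_wchar lo = ((owl_wchar)(b[2] & 0x03) << 8) | b[3]; // 10 bits WW ZZZZZZZZ
//       owl_wchar cp = (hi << 10 | lo) + OWL_UTF16_LAST_RANGE_BIAS;
//   }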
#define OWL_INV_ENC_UTF16BE_STR "[INV_ENC_UTF16BE]"
#define OWL_INV_ENC_UTF16LE_STR "[INV_ENC_UTF16LE]"
// check, get, write and print a UTF16 character
// NOTE: the "be" and "le" functions are for big endian and
// little endian byte ordered UTF16 encoded characters
// basic functions
owl_byte owl_check_wchar_utf16(owl_wchar ch);
owl_byte owl_check_wchar_enc_utf16be(owl_byte * src, owl_umax size);
owl_byte owl_check_wchar_enc_utf16le(owl_byte * src, owl_umax size);
owl_wchar owl_get_wchar_utf16be(owl_byte * src, owl_umax size);
owl_wchar owl_get_wchar_utf16le(owl_byte * src, owl_umax size);
owl_byte * owl_write_wchar_enc_utf16be(owl_wchar ch, owl_byte * dest, owl_umax size);
owl_byte * owl_write_wchar_enc_utf16le(owl_wchar ch, owl_byte * dest, owl_umax size);
owl_byte owl_print_wchar_utf16(owl_wchar ch);
owl_byte owl_print_wchar_enc_utf16be(owl_byte * src, owl_umax size);
owl_byte owl_print_wchar_enc_utf16le(owl_byte * src, owl_umax size);
// conversion to unicode
owl_wchar owl_wchar_utf16_as_unicode(owl_wchar utf16);
owl_wchar owl_wchar_unicode_as_utf16(owl_wchar unicode);
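// usage sketch (semantics assumed from the names and signatures above; the
// return and error conventions are not documented in this header):
//   owl_byte buf[4];
//   owl_write_wchar_enc_utf16be(0x1F600, buf, sizeof(buf)); // encode into big endian bytes
//   owl_wchar cp = owl_get_wchar_utf16be(buf, sizeof(buf)); // decode the bytes back
//   owl_print_wchar_enc_utf16be(buf, sizeof(buf));          // print the encoded character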
#include "utf16.c"
#endif // utf16_h