UTF8.h 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. #ifndef UTF8_HEADER
  2. #define UTF8_HEADER
  3. // this header needs
  4. #include <stdio.h>
  5. #include "../ARRAY.h"
  6. #include "../NUMBER/BITS.h"
  7. #include "ASCII.h"
  8. #include "UNICD.h"
  9. // a decoded UTF8 character is stored in the same integer type as a unicd_int
  10. typedef unicd_int utf8_int;
  11. // minimum, maximum and invalid-marker values of a utf8_int, plus the placeholder strings for invalid data
  12. #define MIN_UTF8_INT 0x000000 // MIN_UNICD_INT
  13. #define MAX_UTF8_INT 0x1FFFFF // MAX_UNICD_INT; largest 21 bit value. NOTE(review): RFC 3629 caps UTF-8 at U+10FFFF - confirm the wider limit is intended
  14. #define INV_UTF8_INT 0xFFFFFF // INV_UNICD_INT; greater than MAX_UTF8_INT, so it can never be a valid character
  15. #define INV_UTF8_INT_STR "[INV_UTF8_INT]" // presumably the text shown for an invalid utf8_int - confirm in UTF8.c
  16. #define INV_ENC_UTF8_STR "[INV_ENC_UTF8]" // presumably the text shown for an invalid encoded sequence - confirm in UTF8.c
  17. // UTF8 can encode characters from 1 byte to 4 bytes
  18. // https://en.wikipedia.org/wiki/UTF-8
  19. // https://linjan2.github.io/utf16-utf8.html
  20. // The encoding is a byte sequence, so it is independent of CPU endianness; the most significant bits are always written first (big endian order)
  21. //
  22. // integer rep ranges | binary rep of max number | memory written representation       | bits encoded
  23. // 0x00    - 0x7F     |                 0XXXXXXX | 0XXXXXXX                            |  7
  24. // 0x80    - 0x7FF    |             XXX XXYYYYYY | 110XXXXX 10YYYYYY                   | 11
  25. // 0x800   - 0xFFFF   |        XXXXYYYY YYZZZZZZ | 1110XXXX 10YYYYYY 10ZZZZZZ          | 16
  26. // 0x10000 - 0x1FFFFF |  XXXYY YYYYWWWW WWZZZZZZ | 11110XXX 10YYYYYY 10WWWWWW 10ZZZZZZ | 21
  27. // largest code point that fits in an encoding of each length (upper bounds of the ranges in the table above)
  28. #define MAX_UTF8_CHAR_1BYTE 0x7F // 7 bits encoded
  29. #define MAX_UTF8_CHAR_2BYTE 0x7FF // 11 bits encoded
  30. #define MAX_UTF8_CHAR_3BYTE 0xFFFF // 16 bits encoded
  31. #define MAX_UTF8_CHAR_4BYTE 0x1FFFFF // 21 bits encoded
  32. // 5 masks, each keeping the N leftmost bits of a byte; per the table above those bits hold the prefix that tells the byte kinds apart
  33. #define utf8_byte_mask_1 0x80 // leftmost bit 10000000 (a 0XXXXXXX byte masks to 0x00)
  34. #define utf8_byte_mask_2 0xC0 // 2 leftmost bits 11000000 (a 10XXXXXX continuation byte masks to 0x80)
  35. #define utf8_byte_mask_3 0xE0 // 3 leftmost bits 11100000 (a 110XXXXX lead byte masks to 0xC0)
  36. #define utf8_byte_mask_4 0xF0 // 4 leftmost bits 11110000 (a 1110XXXX lead byte masks to 0xE0)
  37. #define utf8_byte_mask_5 0xF8 // 5 leftmost bits 11111000 (a 11110XXX lead byte masks to 0xF0)
  38. // checking (the meaning of the returned byte is defined in UTF8.c and is not visible from this header)
  39. byte check_utf8_int(utf8_int ch); // takes a decoded character; presumably validates it against the limits above - confirm in UTF8.c
  40. byte check_enc_utf8(void * src, umax ssize); // takes encoded bytes at src; ssize is presumably the number of readable bytes - confirm in UTF8.c
  41. // get/write (conversion between the encoded bytes in memory and the utf8_int value)
  42. utf8_int get_utf8_int(void * src, umax ssize); // presumably decodes the character stored at src, with ssize as the readable byte count - confirm in UTF8.c
  43. void * write_enc_utf8(utf8_int ch, void * dest, umax dsize); // presumably encodes ch into dest, with dsize as its capacity; what the returned pointer refers to is not visible here - confirm in UTF8.c
  44. // printing (output destination and the meaning of the returned byte are defined in UTF8.c)
  45. byte print_utf8_int(utf8_int ch); // prints from a decoded character value
  46. byte print_enc_utf8(void * src, umax ssize); // prints from encoded bytes at src; ssize is presumably the readable byte count - confirm in UTF8.c
  47. // unicode conversion related (for compatibility with CHAR_ARR functions)
  48. // checking - NOTE(review): one check returns byte and the other bool; confirm the difference is intended
  49. byte check_utf8_as_unicd(utf8_int ch);
  50. bool check_unicd_as_utf8(unicd_int ch);
  51. // getting - utf8_int is a typedef of unicd_int, so both functions take and return the same underlying integer type
  52. unicd_int get_utf8_as_unicd(utf8_int ch);
  53. utf8_int get_unicd_as_utf8(unicd_int ch);
  54. #include "UTF8.c" // pulls the implementation into every file that includes this header; NOTE(review): if two linked .c files include it, non-static definitions would be duplicated - confirm single translation unit use
  55. #endif // UTF8_HEADER