12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
- #ifndef UTF8_HEADER
- #define UTF8_HEADER
- // this header needs
- #include <stdio.h>
- #include "../ARRAY.h"
- #include "../NUMBER/BITS.h"
- #include "ASCII.h"
- #include "UNICD.h"
- // a decoded UTF8 character is stored in the same integer type used for unicode code points (unicd_int)
- typedef unicd_int utf8_int;
- // smallest and largest utf8_int values, the sentinel for an invalid one, and placeholder strings for invalid input
- #define MIN_UTF8_INT 0x000000 // lowest value; author's note: MIN_UNICD_INT (presumably mirrors that macro -- TODO confirm in UNICD.h)
- #define MAX_UTF8_INT 0x1FFFFF // highest value, 21 bits all set; author's note: MAX_UNICD_INT
- #define INV_UTF8_INT 0xFFFFFF // lies above MAX_UTF8_INT; author's note: INV_UNICD_INT
- #define INV_UTF8_INT_STR "[INV_UTF8_INT]" // presumably the text emitted for an invalid utf8_int -- TODO confirm in UTF8.c
- #define INV_ENC_UTF8_STR "[INV_ENC_UTF8]" // presumably the text emitted for an invalid encoded sequence -- TODO confirm in UTF8.c
- // UTF8 encodes each character using 1 to 4 bytes
- // https://en.wikipedia.org/wiki/UTF-8
- // https://linjan2.github.io/utf16-utf8.html
- // It is independent of CPU endianness: the bytes are always written most significant first (big endian)
- //
- // integer rep ranges   | binary rep of max number  | memory written representation        | bits encoded
- // 0x00    - 0x7F       | 0XXXXXXX                  | 0XXXXXXX                             | 7
- // 0x80    - 0x7FF      | XXX XXYYYYYY              | 110XXXXX 10YYYYYY                    | 11
- // 0x800   - 0xFFFF     | XXXXYYYY YYZZZZZZ         | 1110XXXX 10YYYYYY 10ZZZZZZ           | 16
- // 0x10000 - 0x1FFFFF   | XXXYY YYYYWWWW WWZZZZZZ   | 11110XXX 10YYYYYY 10WWWWWW 10ZZZZZZ  | 21
- // largest code point that fits in an encoding of 1, 2, 3 and 4 bytes (the upper bounds of the ranges in the table above)
- #define MAX_UTF8_CHAR_1BYTE 0x7F // 7 bits encoded
- #define MAX_UTF8_CHAR_2BYTE 0x7FF // 11 bits encoded
- #define MAX_UTF8_CHAR_3BYTE 0xFFFF // 16 bits encoded
- #define MAX_UTF8_CHAR_4BYTE 0x1FFFFF // 21 bits encoded
- // 5 masks, each selecting a run of the most significant bits of a byte, cover every byte pattern listed in the table above
- #define utf8_byte_mask_1 0x80 // 10000000, leftmost bit (0 on the only byte of a 1 byte character)
- #define utf8_byte_mask_2 0xC0 // 11000000, 2 leftmost bits (10 on every continuation byte)
- #define utf8_byte_mask_3 0xE0 // 11100000, 3 leftmost bits (110 on the first byte of a 2 byte character)
- #define utf8_byte_mask_4 0xF0 // 11110000, 4 leftmost bits (1110 on the first byte of a 3 byte character)
- #define utf8_byte_mask_5 0xF8 // 11111000, 5 leftmost bits (11110 on the first byte of a 4 byte character)
- // checking: one function takes an already decoded character (utf8_int), the other a buffer holding an encoded one (enc_utf8)
- byte check_utf8_int(utf8_int ch); // NOTE(review): the meaning of the returned byte is not visible in this header -- see UTF8.c
- byte check_enc_utf8(void * src, umax ssize); // src: buffer to inspect; ssize: presumably the bytes available at src -- TODO confirm
- // get/write: conversion between the encoded bytes in memory and the utf8_int value
- utf8_int get_utf8_int(void * src, umax ssize); // presumably decodes the character stored at src (INV_UTF8_INT on failure?) -- TODO confirm in UTF8.c
- void * write_enc_utf8(utf8_int ch, void * dest, umax dsize); // presumably writes ch encoded into dest, limited by dsize; what the returned pointer refers to is not visible here -- TODO confirm
- // printing: one function per representation (decoded value / encoded buffer)
- byte print_utf8_int(utf8_int ch); // NOTE(review): output destination and return meaning are defined in UTF8.c
- byte print_enc_utf8(void * src, umax ssize); // same parameter pair as check_enc_utf8
- // unicode conversion related (for compatibility with CHAR_ARR functions); utf8_int is a typedef of unicd_int, so both directions use the same integer type
- // checking
- byte check_utf8_as_unicd(utf8_int ch); // NOTE(review): returns byte while its counterpart below returns bool -- confirm whether the difference is intended
- bool check_unicd_as_utf8(unicd_int ch); // presumably tells whether ch can be represented as a utf8_int -- TODO confirm in UTF8.c
- // getting
- unicd_int get_utf8_as_unicd(utf8_int ch); // utf8_int in, unicd_int out; the handling of invalid input is not visible here
- utf8_int get_unicd_as_utf8(unicd_int ch); // unicd_int in, utf8_int out; the handling of invalid input is not visible here
- #include "UTF8.c"
- #endif // UTF8_HEADER
|