// Humming-Owl / bean
							#ifndef UTF8_HEADER
#define UTF8_HEADER

// this header needs
#include <stdio.h>
#include "../ARRAY.h"
#include "../NUMBER/BITS.h"
#include "ASCII.h"
#include "UNICD.h"

// A UTF8 character (its decoded code point, not its encoded bytes) is represented
// with the same integer type used for unicode characters.
// NOTE(review): unicd_int is presumably declared in UNICD.h; its width and
// signedness are not visible from this header -- confirm there.
typedef unicd_int utf8_int;

// Minimum, maximum and "invalid" integers for the utf8_int representation.
// The name in parentheses is the UNICD macro each value is meant to mirror.
// NOTE(review): confirm these stay equal to the definitions in UNICD.h.
// NOTE(review): MAX_UTF8_INT is the full 21-bit range (0x1FFFFF); the Unicode
// standard itself stops at 0x10FFFF -- confirm the wider range is intended.
#define MIN_UTF8_INT 0x000000 // lowest valid value (MIN_UNICD_INT)
#define MAX_UTF8_INT 0x1FFFFF // highest valid value, 21 bits set (MAX_UNICD_INT)
#define INV_UTF8_INT 0xFFFFFF // sentinel, lies above MAX_UTF8_INT (INV_UNICD_INT)
// Placeholder strings for an invalid utf8_int / an invalid encoded sequence.
// Presumably what the print functions emit on invalid input -- verify in UTF8.c.
#define INV_UTF8_INT_STR "[INV_UTF8_INT]"
#define INV_ENC_UTF8_STR "[INV_ENC_UTF8]"

// UTF8 can encode characters from 1 byte to 4 bytes 
// https://en.wikipedia.org/wiki/UTF-8
// https://linjan2.github.io/utf16-utf8.html
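// The encoding is independent of CPU endianness: bytes are always stored in the
// order shown below, most significant bits first (big-endian style)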
//
// integer rep ranges   binary rep of max number  memory written representation         bits encoded
// 0x00 - 0x7F          0XXXXXXX                  0XXXXXXX                              7
// 0x80 - 0x7FF         XXX XXYYYYYY              110XXXXX 10YYYYYY                     11
// 0x800 - 0xFFFF       XXXXYYYY YYZZZZZZ         1110XXXX 10YYYYYY 10ZZZZZZ            16
// 0x10000 - 0x1FFFFF   XXXYY YYYYWWWW WWZZZZZZ   11110XXX 10YYYYYY 10WWWWWW 10ZZZZZZ   21 

// Important code point range limits: the largest code point that can be
// encoded with 1, 2, 3 and 4 bytes respectively (7, 11, 16 and 21 payload bits).
// A code point needs N bytes when it is above the (N-1)-byte limit and at or
// below the N-byte limit.
#define MAX_UTF8_CHAR_1BYTE 0x7F
#define MAX_UTF8_CHAR_2BYTE 0x7FF
#define MAX_UTF8_CHAR_3BYTE 0xFFFF
#define MAX_UTF8_CHAR_4BYTE 0x1FFFFF

// Five masks are enough to classify every byte of a UTF8 stream.
// ANDing a byte with mask N keeps only its N leftmost bits; following the
// encoding layout, the kept bits identify the kind of byte:
//   byte & utf8_byte_mask_1 == 0x00  ->  0XXXXXXX  single byte character
//   byte & utf8_byte_mask_2 == 0x80  ->  10XXXXXX  continuation byte
//   byte & utf8_byte_mask_3 == 0xC0  ->  110XXXXX  first byte of a 2 byte character
//   byte & utf8_byte_mask_4 == 0xE0  ->  1110XXXX  first byte of a 3 byte character
//   byte & utf8_byte_mask_5 == 0xF0  ->  11110XXX  first byte of a 4 byte character
// NOTE(review): the comparisons above are derived from the layout only; how
// UTF8.c actually applies the masks is not visible here.
#define utf8_byte_mask_1 0x80 // leftmost bit     10000000
#define utf8_byte_mask_2 0xC0 // 2 leftmost bits  11000000
#define utf8_byte_mask_3 0xE0 // 3 leftmost bits  11100000
#define utf8_byte_mask_4 0xF0 // 4 leftmost bits  11110000
#define utf8_byte_mask_5 0xF8 // 5 leftmost bits  11111000

// Naming used below: "utf8_int" is a decoded character held in an integer,
// "enc_utf8" is the encoded byte sequence stored in memory.
// The implementations live in UTF8.c (included at the end of this header); the
// contracts noted here are inferred from the signatures -- TODO confirm there.

// checking
// check_utf8_int: validates a decoded character.
//   presumably a range check against MIN_UTF8_INT / MAX_UTF8_INT; the meaning
//   of the returned byte (flag vs. encoded length) is not visible here.
byte check_utf8_int(utf8_int ch);
// check_enc_utf8: validates the encoded character starting at src.
//   ssize is presumably the number of readable bytes at src -- verify.
byte check_enc_utf8(void * src, umax ssize);
// get/write
// get_utf8_int: decodes the encoded character at src into a utf8_int.
//   presumably yields INV_UTF8_INT when the bytes are not valid -- verify.
utf8_int get_utf8_int(void * src, umax ssize);
// write_enc_utf8: encodes ch into the buffer dest of dsize bytes.
//   NOTE(review): what the returned pointer designates (dest, the position
//   after the written bytes, or NULL on failure) must be checked in UTF8.c.
void * write_enc_utf8(utf8_int ch, void * dest, umax dsize);
// printing
// print_utf8_int / print_enc_utf8: print a decoded / an encoded character.
//   stdio.h is included by this header, so output presumably goes through it;
//   the meaning of the returned byte is not visible here.
byte print_utf8_int(utf8_int ch);
byte print_enc_utf8(void * src, umax ssize);

// unicode conversion related (for compatibility with CHAR_ARR functions)
// Since utf8_int is a typedef of unicd_int, both directions take and return
// the same underlying integer type.

// checking
// whether a utf8_int is representable as a unicd_int, and the reverse.
// NOTE(review): the two checks return different types (byte vs bool) --
// confirm against UTF8.c whether this asymmetry is intentional.
// NOTE(review): bool is not provided by an include visible in this header;
// presumably it comes from one of the project headers -- verify.
byte check_utf8_as_unicd(utf8_int ch);
bool check_unicd_as_utf8(unicd_int ch);
// getting
// convert a character between the two representations; the result for a
// character that fails the matching check above is not visible here.
unicd_int get_utf8_as_unicd(utf8_int ch);
utf8_int get_unicd_as_utf8(unicd_int ch);

#include "UTF8.c"

#endif // UTF8_HEADER