123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291 |
- // C source file with the function definitions to handle UTF16 characters
- // check_utf16_int() function
- byte check_utf16_int(utf16_int ch)
- {
- // check if it is a valid code point value
- if (ch > MAX_UTF16_INT || (ch >= INV_UTF16_INT_RANGE1_MIN && ch <= INV_UTF16_INT_RANGE1_MAX))
- return 0;
-
- // 2 bytes long
- if (ch < UTF16_LAST_RANGE_BIAS)
- return 2;
- // 4 bytes long
- else
- return 4;
- }
- // check_enc_utf16be() function
- byte check_enc_utf16be(void * src, umax ssize)
- {
- // check function parameters
- if (src == NULL || ssize <= 1)
- return 0;
-
- // check the character
- byte * ptr = src;
-
- // 2 bytes check
- if (((ptr[0] & utf16_byte_mask_1) != utf16_block_1_bits)
- && ((ptr[0] & utf16_byte_mask_1) != utf16_block_2_bits))
- return 2;
- // 4 bytes check
- else if ((ssize >= 4)
- && ((ptr[0] & utf16_byte_mask_1) == utf16_block_1_bits)
- && ((ptr[2] & utf16_byte_mask_1) == utf16_block_2_bits))
- return 4;
- // else, the character is invalid
- return 0;
- }
- // check_enc_utf16le() function
- byte check_enc_utf16le(void * src, umax ssize)
- {
- // check function parameters
- // gonna reuse check_utf16_mem_be() in a clever way
- if (src == NULL || ssize <= 1)
- return 0;
-
- // make a byte array of size 4
- byte arr[4] = {0};
- // copy 4 bytes if mx_size allows it
- if (ssize >= 4) {
- cp_mem_bytes(src, 4, arr);
- reverse_array(arr, 2, 1);
- reverse_array(arr + 2, 2, 1);
- }
- else /* copy 2 bytes only */ {
- cp_mem_bytes(src, 2, arr);
- reverse_array(arr, 2, 1);
- }
-
- // in arr there will be a UTF16BE character
- return check_enc_utf16be(arr, ssize);
- }
- // get_utf16be_int() function
- utf16_int get_utf16be_int(void * src, umax ssize)
- {
- // check the encoded UTF16
- byte ch_size = check_enc_utf16be(src, ssize);
- if (ch_size == 0)
- return INV_UTF16_INT;
-
- // get integer depending on ch_size
- utf16_int ch = 0;
-
- // arrays to hold src data
- byte * tmp1 = allocate_memory(ch_size, 1);
- byte * tmp2 = allocate_memory(ch_size, 1);
- cp_mem_bytes(src, ch_size, tmp1);
- cp_mem_bytes(tmp1, ch_size, tmp2);
-
- // ptr to write to
- byte * ptr = (byte *) &ch;
-
- // 2 bytes
- if (ch_size == 2)
- cp_mem_bytes(tmp1, ch_size, (byte *) &ch + sizeof(utf16_int) - 2);
- else // 4 bytes
- {
- // first byte // ... ... ... OOOOOOOO (tmp1)
- ptr[sizeof(utf16_int) - 1] = tmp1[3]; // ... ... ... OOOOOOOO (ptr)
-
- // second byte // ... ... XXXXXXOO XXXXXXXX (tmp1)
- ptr[sizeof(utf16_int) - 2] = (tmp1[2] & (~ utf16_byte_mask_1)); // ... ... 000000OO OOOOOOOO (ptr)
-
- // third byte // ... OOOOOOOO XXXXXXXX XXXXXXXX (tmp1)
- tmp1[0] = 0; // 00000000 OOOOOOOO XXXXXXXX XXXXXXXX (tmp1)
- arr_n_bitshift(tmp1, 2, 1, BITSHIFT_LEFT, 2); // 000000OO OOOOOO00 XXXXXXXX XXXXXXXX (tmp1)
- ptr[sizeof(utf16_int) - 3] += tmp1[0]; // 00000000 000000OO 000000OO OOOOOOOO (ptr)
- ptr[sizeof(utf16_int) - 2] += tmp1[1]; // 00000000 000000OO OOOOOOOO OOOOOOOO (ptr)
-
- // fourth byte // XXXXXXOO XXXXXXXX XXXXXXXX XXXXXXXX (tmp2)
- ptr[sizeof(utf16_int) - 3] += (tmp2[0] &
- (~ utf16_byte_mask_1)) << 2; // 00000000 0000OOOO OOOOOOOO OOOOOOOO (ptr)
- }
-
- // put ch in the system byte order and add the bias
- if (SYS_ENDIAN == LITTLE)
- reverse_array(&ch, sizeof(utf16_int), 1);
- // check if the bias needs to be added
- if (ch_size == 4)
- ch += UTF16_LAST_RANGE_BIAS;
-
- // free memory used
- free_memory(tmp1);
- free_memory(tmp2);
-
- // return ch
- return ch;
- }
- // get_utf16le_int() function
- utf16_int get_utf16le_int(void * src, umax ssize)
- {
- // check the encoded UTF16
- byte ch_size = check_enc_utf16le(src, ssize);
- if (ch_size == 0)
- return INV_UTF16_INT;
-
- // make a byte array of size 4
- byte arr[4] = {0};
- // copy 4 bytes if mx_size allows it
- if (ssize >= 4) {
- cp_mem_bytes(src, 4, arr);
- reverse_array(arr, 2, 1);
- reverse_array(arr + 2, 2, 1);
- }
- else /* copy 2 bytes only */ {
- cp_mem_bytes(src, 2, arr);
- reverse_array(arr, 2, 1);
- }
-
- // in arr there will be a UTF16BE character
- return get_utf16be_int(arr, ssize);
- }
- // write_enc_utf16be() function
- void * write_enc_utf16be(utf16_int ch, void * dest, umax dsize)
- {
- // check params
- byte ch_size = check_utf16_int(ch);
- if (dsize < ch_size || dest == NULL || dsize == 0)
- return NULL;
-
- // else write the integer
- if (ch_size == 2) {
- // flip the integer bytes if needed
- if (SYS_ENDIAN == LITTLE)
- reverse_array(&ch, sizeof(utf16_int), 1);
- // write the integer bytes into dest
- cp_mem_bytes(&ch + sizeof(utf16_int) - 2, 2, dest);
- return (byte *) dest + 2;
- }
-
- // ch_size is 4
- ch -= UTF16_LAST_RANGE_BIAS;
- // array to hold src data
- byte * tmp1 = allocate_memory(4, 1);
-
- // copy the bytes depending on the system endian into tmp1
- if (SYS_ENDIAN == BIG)
- cp_mem_bytes((byte *) &ch + sizeof(utf16_int) - 4, 4, tmp1);
- else if (SYS_ENDIAN == LITTLE) {
- cp_mem_bytes(&ch, 4, tmp1);
- reverse_array(tmp1, 4, 1);
- }
-
- // write everything into tmp
- utf16_int tmp = 0;
- byte * ptr = (byte *) &tmp;
- // first byte // ... ... ... OOOOOOOO (tmp1)
- ptr[sizeof(utf16_int) - 1] = tmp1[3]; // ... ... ... OOOOOOOO (ptr)
- // second byte // ... ... XXXXXXOO OOOOOOOO (tmp1)
- ptr[sizeof(utf16_int) - 2] = (tmp1[2] & (~ utf16_byte_mask_1))
- + utf16_block_2_bits; // ... ... 110111OO OOOOOOOO (ptr)
- // third byte // ... XXXXxxOO OOOOOOXX ... (tmp1)
- arr_n_bitshift(tmp1 + 1, 2, 1, BITSHIFT_RIGHT, 2); // ... XXXXXXxx OOOOOOOO ... (tmp1)
- ptr[sizeof(utf16_int) - 3] = tmp1[2]; // ... OOOOOOOO 110111OO OOOOOOOO (ptr)
- // fourth byte (by using modified tmp1 from ^) // ... XXXXXXOO ... ... (tmp1)
- ptr[sizeof(utf16_int) - 4] = (tmp1[1] & (~ utf16_byte_mask_1))
- + utf16_block_1_bits; // 110110OO OOOOOOOO 110111OO OOOOOOOO (ptr)
- // free memory used
- free_memory(tmp1);
-
- // write the integer into dest
- cp_mem_bytes(ptr + sizeof(utf16_int) - 4, 4, dest);
- return (byte *) dest + 4;
- }
- // write_enc_utf16le() function
- void * write_enc_utf16le(utf16_int ch, void * dest, umax dsize)
- {
- // check params
- byte ch_size = check_utf16_int(ch);
- byte * write_result = write_enc_utf16be(ch, dest, dsize);
- if (write_result == NULL)
- return NULL;
-
- // handle the bytes written
- if (ch_size == 2)
- reverse_array(dest, 2, 1);
- else if (ch_size == 4) {
- reverse_array(dest, 2, 1);
- reverse_array((byte *) dest + 2, 2, 1);
- }
- return write_result;
- }
- // print_utf16_int() function
- byte print_utf16_int(utf16_int ch)
- {
- // invalid character
- byte rtn = check_utf16_int(ch);
- if (rtn == 0)
- printf(INV_UTF16_INT_STR);
- else
- print_unicd_int((unicd_int) ch);
- return rtn;
- }
- // print_enc_utf16be() function
- byte print_enc_utf16be(void * src, umax ssize)
- {
- // invalid character
- byte rtn = check_enc_utf16be(src, ssize);
- if (rtn == 0)
- printf(INV_ENC_UTF16_STR);
- else // print the encoded bytes
- for (byte i = 0; i < rtn; i++)
- printf("%02X", ((byte *) src + i)[0]);
- return rtn;
- }
- // print_enc_utf16le() function
- byte print_enc_utf16le(void * src, umax ssize)
- {
- // invalid character
- byte rtn = check_enc_utf16le(src, ssize);
- if (rtn == 0)
- printf(INV_ENC_UTF16_STR);
- else // print the encoded bytes
- for (byte i = 0; i < rtn; i++)
- printf("%02X", ((byte *) src + i)[0]);
- return rtn;
- }
- // check_utf16_as_unicd() function
- byte check_utf16_as_unicd(utf16_int ch)
- {
- // all characters on UTF16 are able to be converted to unicode
- return check_utf16_int(ch);
- }
- // check_unicd_as_utf16() function
- bool check_unicd_as_utf16(unicd_int ch)
- {
- // check if it is an UTF16 character (0x000000 - 0x10FFFF)
- if (ch >= MIN_UTF16_INT && ch <= MAX_UTF16_INT)
- return true;
- return false;
- }
- // get_utf16_as_unicd() function
- unicd_int get_utf16_as_unicd(utf16_int ch)
- {
- // check first, then return
- if (check_utf16_as_unicd(ch) == 0)
- return INV_UNICD_INT;
- return (unicd_int) ch;
- }
- // get_unicd_as_utf16() function
- utf16_int get_unicd_as_utf16(unicd_int ch)
- {
- // check first, then return
- if (check_unicd_as_utf16(ch) == 0)
- return INV_UTF16_INT;
- return (utf16_int) ch;
- }
|