UTF16.c 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. // C source file with the function definitions to handle UTF16 characters
  2. // check_utf16_int() function
  3. byte check_utf16_int(utf16_int ch)
  4. {
  5. // check if it is a valid code point value
  6. if (ch > MAX_UTF16_INT || (ch >= INV_UTF16_INT_RANGE1_MIN && ch <= INV_UTF16_INT_RANGE1_MAX))
  7. return 0;
  8. // 2 bytes long
  9. if (ch < UTF16_LAST_RANGE_BIAS)
  10. return 2;
  11. // 4 bytes long
  12. else
  13. return 4;
  14. }
  15. // check_enc_utf16_be() function
  16. byte check_enc_utf16be(void * src, umax ssize)
  17. {
  18. // check function parameters
  19. if (src == NULL || ssize <= 1)
  20. return 0;
  21. // check the character
  22. byte * ptr = src;
  23. // 2 bytes check
  24. if (((ptr[0] & utf16_byte_mask_1) != utf16_block_1_bits)
  25. && ((ptr[0] & utf16_byte_mask_1) != utf16_block_2_bits))
  26. return 2;
  27. // 4 bytes check
  28. else if ((ssize >= 4)
  29. && ((ptr[0] & utf16_byte_mask_1) == utf16_block_1_bits)
  30. && ((ptr[2] & utf16_byte_mask_1) == utf16_block_2_bits))
  31. return 4;
  32. // else, the character is invalid
  33. return 0;
  34. }
  35. // check_enc_utf16le() function
  36. byte check_enc_utf16le(void * src, umax ssize)
  37. {
  38. // check function parameters
  39. // gonna reuse check_utf16_mem_be() in a clever way
  40. if (src == NULL || ssize <= 1)
  41. return 0;
  42. // make a byte array of size 4
  43. byte arr[4] = {0};
  44. // copy 4 bytes if mx_size allows it
  45. if (ssize >= 4) {
  46. cp_mem_bytes(src, 4, arr);
  47. reverse_array(arr, 2, 1);
  48. reverse_array(arr + 2, 2, 1);
  49. }
  50. else /* copy 2 bytes only */ {
  51. cp_mem_bytes(src, 2, arr);
  52. reverse_array(arr, 2, 1);
  53. }
  54. // in arr there will be a UTF16BE character
  55. return check_enc_utf16be(arr, ssize);
  56. }
  57. // get_utf16be_int() function
  58. utf16_int get_utf16be_int(void * src, umax ssize)
  59. {
  60. // check the encoded UTF16
  61. byte ch_size = check_enc_utf16be(src, ssize);
  62. if (ch_size == 0)
  63. return INV_UTF16_INT;
  64. // get integer depending on ch_size
  65. utf16_int ch = 0;
  66. // arrays to hold src data
  67. byte * tmp1 = allocate_memory(ch_size, 1);
  68. byte * tmp2 = allocate_memory(ch_size, 1);
  69. cp_mem_bytes(src, ch_size, tmp1);
  70. cp_mem_bytes(tmp1, ch_size, tmp2);
  71. // ptr to write to
  72. byte * ptr = (byte *) &ch;
  73. // 2 bytes
  74. if (ch_size == 2)
  75. cp_mem_bytes(tmp1, ch_size, (byte *) &ch + sizeof(utf16_int) - 2);
  76. else // 4 bytes
  77. {
  78. // first byte // ... ... ... OOOOOOOO (tmp1)
  79. ptr[sizeof(utf16_int) - 1] = tmp1[3]; // ... ... ... OOOOOOOO (ptr)
  80. // second byte // ... ... XXXXXXOO XXXXXXXX (tmp1)
  81. ptr[sizeof(utf16_int) - 2] = (tmp1[2] & (~ utf16_byte_mask_1)); // ... ... 000000OO OOOOOOOO (ptr)
  82. // third byte // ... OOOOOOOO XXXXXXXX XXXXXXXX (tmp1)
  83. tmp1[0] = 0; // 00000000 OOOOOOOO XXXXXXXX XXXXXXXX (tmp1)
  84. arr_n_bitshift(tmp1, 2, 1, BITSHIFT_LEFT, 2); // 000000OO OOOOOO00 XXXXXXXX XXXXXXXX (tmp1)
  85. ptr[sizeof(utf16_int) - 3] += tmp1[0]; // 00000000 000000OO 000000OO OOOOOOOO (ptr)
  86. ptr[sizeof(utf16_int) - 2] += tmp1[1]; // 00000000 000000OO OOOOOOOO OOOOOOOO (ptr)
  87. // fourth byte // XXXXXXOO XXXXXXXX XXXXXXXX XXXXXXXX (tmp2)
  88. ptr[sizeof(utf16_int) - 3] += (tmp2[0] &
  89. (~ utf16_byte_mask_1)) << 2; // 00000000 0000OOOO OOOOOOOO OOOOOOOO (ptr)
  90. }
  91. // put ch in the system byte order and add the bias
  92. if (SYS_ENDIAN == LITTLE)
  93. reverse_array(&ch, sizeof(utf16_int), 1);
  94. // check if the bias needs to be added
  95. if (ch_size == 4)
  96. ch += UTF16_LAST_RANGE_BIAS;
  97. // free memory used
  98. free_memory(tmp1);
  99. free_memory(tmp2);
  100. // return ch
  101. return ch;
  102. }
  103. // get_utf16le_int() function
  104. utf16_int get_utf16le_int(void * src, umax ssize)
  105. {
  106. // check the encoded UTF16
  107. byte ch_size = check_enc_utf16le(src, ssize);
  108. if (ch_size == 0)
  109. return INV_UTF16_INT;
  110. // make a byte array of size 4
  111. byte arr[4] = {0};
  112. // copy 4 bytes if mx_size allows it
  113. if (ssize >= 4) {
  114. cp_mem_bytes(src, 4, arr);
  115. reverse_array(arr, 2, 1);
  116. reverse_array(arr + 2, 2, 1);
  117. }
  118. else /* copy 2 bytes only */ {
  119. cp_mem_bytes(src, 2, arr);
  120. reverse_array(arr, 2, 1);
  121. }
  122. // in arr there will be a UTF16BE character
  123. return get_utf16be_int(arr, ssize);
  124. }
  125. // write_enc_utf16be() function
  126. void * write_enc_utf16be(utf16_int ch, void * dest, umax dsize)
  127. {
  128. // check params
  129. byte ch_size = check_utf16_int(ch);
  130. if (dsize < ch_size || dest == NULL || dsize == 0)
  131. return NULL;
  132. // else write the integer
  133. if (ch_size == 2) {
  134. // flip the integer bytes if needed
  135. if (SYS_ENDIAN == LITTLE)
  136. reverse_array(&ch, sizeof(utf16_int), 1);
  137. // write the integer bytes into dest
  138. cp_mem_bytes(&ch + sizeof(utf16_int) - 2, 2, dest);
  139. return (byte *) dest + 2;
  140. }
  141. // ch_size is 4
  142. ch -= UTF16_LAST_RANGE_BIAS;
  143. // array to hold src data
  144. byte * tmp1 = allocate_memory(4, 1);
  145. // copy the bytes depending on the system endian into tmp1
  146. if (SYS_ENDIAN == BIG)
  147. cp_mem_bytes((byte *) &ch + sizeof(utf16_int) - 4, 4, tmp1);
  148. else if (SYS_ENDIAN == LITTLE) {
  149. cp_mem_bytes(&ch, 4, tmp1);
  150. reverse_array(tmp1, 4, 1);
  151. }
  152. // write everything into tmp
  153. utf16_int tmp = 0;
  154. byte * ptr = (byte *) &tmp;
  155. // first byte // ... ... ... OOOOOOOO (tmp1)
  156. ptr[sizeof(utf16_int) - 1] = tmp1[3]; // ... ... ... OOOOOOOO (ptr)
  157. // second byte // ... ... XXXXXXOO OOOOOOOO (tmp1)
  158. ptr[sizeof(utf16_int) - 2] = (tmp1[2] & (~ utf16_byte_mask_1))
  159. + utf16_block_2_bits; // ... ... 110111OO OOOOOOOO (ptr)
  160. // third byte // ... XXXXxxOO OOOOOOXX ... (tmp1)
  161. arr_n_bitshift(tmp1 + 1, 2, 1, BITSHIFT_RIGHT, 2); // ... XXXXXXxx OOOOOOOO ... (tmp1)
  162. ptr[sizeof(utf16_int) - 3] = tmp1[2]; // ... OOOOOOOO 110111OO OOOOOOOO (ptr)
  163. // fourth byte (by using modified tmp1 from ^) // ... XXXXXXOO ... ... (tmp1)
  164. ptr[sizeof(utf16_int) - 4] = (tmp1[1] & (~ utf16_byte_mask_1))
  165. + utf16_block_1_bits; // 110110OO OOOOOOOO 110111OO OOOOOOOO (ptr)
  166. // free memory used
  167. free_memory(tmp1);
  168. // write the integer into dest
  169. cp_mem_bytes(ptr + sizeof(utf16_int) - 4, 4, dest);
  170. return (byte *) dest + 4;
  171. }
  172. // write_enc_utf16le() function
  173. void * write_enc_utf16le(utf16_int ch, void * dest, umax dsize)
  174. {
  175. // check params
  176. byte ch_size = check_utf16_int(ch);
  177. byte * write_result = write_enc_utf16be(ch, dest, dsize);
  178. if (write_result == NULL)
  179. return NULL;
  180. // handle the bytes written
  181. if (ch_size == 2)
  182. reverse_array(dest, 2, 1);
  183. else if (ch_size == 4) {
  184. reverse_array(dest, 2, 1);
  185. reverse_array((byte *) dest + 2, 2, 1);
  186. }
  187. return write_result;
  188. }
  189. // print_utf16_int() function
  190. byte print_utf16_int(utf16_int ch)
  191. {
  192. // invalid character
  193. byte rtn = check_utf16_int(ch);
  194. if (rtn == 0)
  195. printf(INV_UTF16_INT_STR);
  196. else
  197. print_unicd_int((unicd_int) ch);
  198. return rtn;
  199. }
  200. // print_enc_utf16be() function
  201. byte print_enc_utf16be(void * src, umax ssize)
  202. {
  203. // invalid character
  204. byte rtn = check_enc_utf16be(src, ssize);
  205. if (rtn == 0)
  206. printf(INV_ENC_UTF16_STR);
  207. else // print the encoded bytes
  208. for (byte i = 0; i < rtn; i++)
  209. printf("%02X", ((byte *) src + i)[0]);
  210. return rtn;
  211. }
  212. // print_enc_utf16le() function
  213. byte print_enc_utf16le(void * src, umax ssize)
  214. {
  215. // invalid character
  216. byte rtn = check_enc_utf16le(src, ssize);
  217. if (rtn == 0)
  218. printf(INV_ENC_UTF16_STR);
  219. else // print the encoded bytes
  220. for (byte i = 0; i < rtn; i++)
  221. printf("%02X", ((byte *) src + i)[0]);
  222. return rtn;
  223. }
  224. // check_utf16_as_unicd() function
  225. byte check_utf16_as_unicd(utf16_int ch)
  226. {
  227. // all characters on UTF16 are able to be converted to unicode
  228. return check_utf16_int(ch);
  229. }
  230. // check_unicd_as_utf16() function
  231. bool check_unicd_as_utf16(unicd_int ch)
  232. {
  233. // check if it is an UTF16 character (0x000000 - 0x10FFFF)
  234. if (ch >= MIN_UTF16_INT && ch <= MAX_UTF16_INT)
  235. return true;
  236. return false;
  237. }
  238. // get_utf16_as_unicd() function
  239. unicd_int get_utf16_as_unicd(utf16_int ch)
  240. {
  241. // check first, then return
  242. if (check_utf16_as_unicd(ch) == 0)
  243. return INV_UNICD_INT;
  244. return (unicd_int) ch;
  245. }
  246. // get_unicd_as_utf16() function
  247. utf16_int get_unicd_as_utf16(unicd_int ch)
  248. {
  249. // check first, then return
  250. if (check_unicd_as_utf16(ch) == 0)
  251. return INV_UTF16_INT;
  252. return (utf16_int) ch;
  253. }