utf-8of16.c 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. /* Part of Scheme 48 1.9. See file COPYING for notices and license.
  2. *
  3. * Authors: Mike Sperber, Robert Ransom
  4. */
  /*
   * This file defines functions for dealing with a synthetic text
   * encoding called UTF-8of16.  It's like UTF-8, but also encodes unpaired
   * surrogates directly, which is what we need for the Windows API.
   */
  10. #include <windows.h>
  /*
   * Payload-bit masks for the first byte of a UTF-8of16 sequence, indexed
   * by the number of continuation bytes that follow it (0 to 3).
   * Read-only; unsigned so the values are plain bit patterns regardless of
   * whether plain char is signed.
   */
  static const unsigned char masks[4] = { 0x7f, 0x1f, 0x0f, 0x07 };
  12. /*
  13. * - NUL-terminates
  14. * - if utf_8of16 is NULL, we just compute the size
  15. * - returns size (sans NUL) needed for UTF-8of16
  16. */
  17. int
  18. s48_utf_16_to_utf_8of16(LPWSTR utf_16,
  19. unsigned char* utf_8of16)
  20. {
  21. int p = 0, i = 0;
  22. while (utf_16[i])
  23. {
  24. unsigned int c = utf_16[i];
  25. ++i;
  26. if ((c >= 0xD800) && (c <= 0xDBFF) /* high surrogate */
  27. && utf_16[i]
  28. && (utf_16[i] >= 0xDC00) && (utf_16[i] <= 0xDFFF)) /* low surrogate */
  29. {
  30. c = ((c - 0xd7c0) << 10) + (utf_16[i] & 0x3ff);
  31. ++i;
  32. }
  33. if (c <= 0x7f)
  34. {
  35. if (utf_8of16)
  36. utf_8of16[p] = (unsigned char) c;
  37. ++p;
  38. }
  39. else if (c <= 0x7ff)
  40. {
  41. if (utf_8of16)
  42. {
  43. utf_8of16[p] = (unsigned char) ((c >> 6) + 0xc0);
  44. utf_8of16[p+1] = (unsigned char) ((c & 0x3f) + 0x80);
  45. }
  46. p += 2;
  47. }
  48. else if (c <= 0xffff)
  49. {
  50. if (utf_8of16)
  51. {
  52. utf_8of16[p] = (unsigned char) ((c >> 12) + 0xe0);
  53. utf_8of16[p+1] = (unsigned char) (((c >> 6) & 0x3f) + 0x80);
  54. utf_8of16[p+2] = (unsigned char) ((c & 0x3f) + 0x80);
  55. }
  56. p += 3;
  57. }
  58. else
  59. {
  60. if (utf_8of16)
  61. {
  62. utf_8of16[p] = (unsigned char) ((c >> 18) + 0xf0);
  63. utf_8of16[p+1] = (unsigned char) (((c >> 12) + 0xe0) + 0x80);
  64. utf_8of16[p+2] = (unsigned char) (((c >> 6) & 0x3f) + 0x80);
  65. utf_8of16[p+3] = (unsigned char) ((c & 0x3f) + 0x80);
  66. }
  67. p += 4;
  68. }
  69. }
  70. if (utf_8of16)
  71. utf_8of16[p] = 0;
  72. return p;
  73. }
  74. /*
  75. * The table, and the associated decoding algorithm, is from
  76. * Richard Gillam: "Unicode Demystified", chapter 14
  77. */
  78. static char states[3][32] =
  79. {
  80. {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1},
  81. {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -2, -2, -2, -2, -2},
  82. {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 1, 1, 1, 1, 1, 1, 1, 1, -2, -2, -2, -2, -2, -2, -2, -2},
  83. };
  84. /*
  85. * - NUL-terminates
  86. * - if utf_16 is NULL, we just compute the size
  87. * - returns size (sans NUL) needed for UTF-16
  88. */
  89. int
  90. s48_utf_8of16_to_utf_16(const unsigned char* utf_8of16,
  91. LPWSTR utf_16,
  92. int* errorp)
  93. {
  94. int p = 0, q = 0, state = 0, error = 0;
  95. unsigned int scalar_value = 0;
  96. unsigned mask = 0;
  97. while (utf_8of16[q])
  98. {
  99. unsigned char c = utf_8of16[q];
  100. ++q;
  101. state = states[state][c >> 3];
  102. switch (state) {
  103. case 0:
  104. scalar_value += c & 0x7f;
  105. if (scalar_value > 0xffff)
  106. {
  107. if (utf_16)
  108. {
  109. utf_16[p] = (scalar_value >> 10) + 0xD7C0;
  110. utf_16[p+1] = (scalar_value & 0x3FF) + 0xDC00;
  111. }
  112. p += 2;
  113. }
  114. else
  115. {
  116. if (utf_16)
  117. utf_16[p] = scalar_value;
  118. ++p;
  119. }
  120. scalar_value = 0;
  121. mask = 0;
  122. break;
  123. case 1:
  124. case 2:
  125. if (mask == 0)
  126. mask = masks[state];
  127. scalar_value = (scalar_value + (c & mask)) << 6;
  128. mask = 0x3f;
  129. break;
  130. case -2:
  131. --q;
  132. /* fall thru */
  133. case -1:
  134. if (utf_16)
  135. utf_16[p] = 0xfffd;
  136. ++p;
  137. scalar_value = 0;
  138. state = 0;
  139. mask = 0;
  140. error = 1;
  141. break;
  142. }
  143. }
  144. if (errorp)
  145. *errorp = error;
  146. if (utf_16)
  147. utf_16[p] = 0;
  148. return p;
  149. }
  150. /*
  151. #include <stdlib.h>
  152. #include <stdio.h>
  153. int
  154. main(void)
  155. {
  156. unsigned int t1[] = { 'A', 'B', 0xd800, 0xd900, 0xdfff, 'C', 'D', 0 };
  157. int size_8 = s48_utf_16_to_utf_8of16(t1, NULL);
  158. printf("size_8 %d\n", size_8);
  159. unsigned char c[1000];
  160. size_8 = s48_utf_16_to_utf_8of16(t1, c);
  161. printf("size_8 %d\n", size_8);
  162. {
  163. int i = 0;
  164. while (i < size_8)
  165. {
  166. printf("%d: %4x\n", i, c[i]);
  167. ++i;
  168. }
  169. }
  170. unsigned int u[1000];
  171. int error;
  172. int size_16 = s48_utf_8of16_to_utf_16(c, NULL, &error);
  173. printf("size_16 %d\n", size_16);
  174. size_16 = s48_utf_8of16_to_utf_16(c, u, &error);
  175. printf("size_16 %d\n", size_16);
  176. {
  177. int i = 0;
  178. while (i < size_16)
  179. {
  180. printf("%d: %4x\n", i, u[i]);
  181. ++i;
  182. }
  183. }
  184. }
  185. */