utf8.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. /*
  2. * UTF-8 support routines
  3. *
  4. * Copyright 2000 Alexandre Julliard
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  19. */
  20. #include <string.h>
  21. #include "wine/asm.h"
  22. #ifdef __ASM_OBSOLETE
  23. #include "unicode.h"
  24. extern WCHAR wine_compose( const WCHAR *str ) DECLSPEC_HIDDEN;
  /*
   * Number of bytes that follow a given first byte of a UTF-8 sequence.
   * The table only covers bytes above 0x7f and is indexed by (byte - 0x80).
   * Entries are 0 for 0x80-0xc1 and for 0xf5-0xff.
   */
  static const char utf8_length[128] =
  {
      /* 0x80-0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x90-0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0xa0-0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0xb0-0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0xc0-0xcf */ 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0xd0-0xdf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0xe0-0xef */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      /* 0xf0-0xff */ 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };

  /* Mask selecting the payload bits of the first byte, indexed by the
   * number of following bytes (0 to 3). */
  static const unsigned char utf8_mask[4] =
  {
      0x7f,  /* no following byte */
      0x1f,  /* 1 following byte */
      0x0f,  /* 2 following bytes */
      0x07   /* 3 following bytes */
  };

  /* Smallest Unicode value that is accepted for a sequence with the given
   * number of following bytes (0 to 3). */
  static const unsigned int utf8_minval[4] =
  {
      0x0,      /* no following byte */
      0x80,     /* 1 following byte */
      0x800,    /* 2 following bytes */
      0x10000   /* 3 following bytes */
  };
  41. /* get the next char value taking surrogates into account */
  42. static inline unsigned int get_surrogate_value( const WCHAR *src, unsigned int srclen )
  43. {
  44. if (src[0] >= 0xd800 && src[0] <= 0xdfff) /* surrogate pair */
  45. {
  46. if (src[0] > 0xdbff || /* invalid high surrogate */
  47. srclen <= 1 || /* missing low surrogate */
  48. src[1] < 0xdc00 || src[1] > 0xdfff) /* invalid low surrogate */
  49. return 0;
  50. return 0x10000 + ((src[0] & 0x3ff) << 10) + (src[1] & 0x3ff);
  51. }
  52. return src[0];
  53. }
  54. /* query necessary dst length for src string */
  55. static inline int get_length_wcs_utf8( int flags, const WCHAR *src, unsigned int srclen )
  56. {
  57. int len;
  58. unsigned int val;
  59. for (len = 0; srclen; srclen--, src++)
  60. {
  61. if (*src < 0x80) /* 0x00-0x7f: 1 byte */
  62. {
  63. len++;
  64. continue;
  65. }
  66. if (*src < 0x800) /* 0x80-0x7ff: 2 bytes */
  67. {
  68. len += 2;
  69. continue;
  70. }
  71. if (!(val = get_surrogate_value( src, srclen )))
  72. {
  73. if (flags & WC_ERR_INVALID_CHARS) return -2;
  74. continue;
  75. }
  76. if (val < 0x10000) /* 0x800-0xffff: 3 bytes */
  77. len += 3;
  78. else /* 0x10000-0x10ffff: 4 bytes */
  79. {
  80. len += 4;
  81. src++;
  82. srclen--;
  83. }
  84. }
  85. return len;
  86. }
  87. /* wide char to UTF-8 string conversion */
  88. /* return -1 on dst buffer overflow, -2 on invalid input char */
  89. int wine_utf8_wcstombs_obsolete( int flags, const WCHAR *src, int srclen, char *dst, int dstlen )
  90. {
  91. int len;
  92. if (!dstlen) return get_length_wcs_utf8( flags, src, srclen );
  93. for (len = dstlen; srclen; srclen--, src++)
  94. {
  95. WCHAR ch = *src;
  96. unsigned int val;
  97. if (ch < 0x80) /* 0x00-0x7f: 1 byte */
  98. {
  99. if (!len--) return -1; /* overflow */
  100. *dst++ = ch;
  101. continue;
  102. }
  103. if (ch < 0x800) /* 0x80-0x7ff: 2 bytes */
  104. {
  105. if ((len -= 2) < 0) return -1; /* overflow */
  106. dst[1] = 0x80 | (ch & 0x3f);
  107. ch >>= 6;
  108. dst[0] = 0xc0 | ch;
  109. dst += 2;
  110. continue;
  111. }
  112. if (!(val = get_surrogate_value( src, srclen )))
  113. {
  114. if (flags & WC_ERR_INVALID_CHARS) return -2;
  115. continue;
  116. }
  117. if (val < 0x10000) /* 0x800-0xffff: 3 bytes */
  118. {
  119. if ((len -= 3) < 0) return -1; /* overflow */
  120. dst[2] = 0x80 | (val & 0x3f);
  121. val >>= 6;
  122. dst[1] = 0x80 | (val & 0x3f);
  123. val >>= 6;
  124. dst[0] = 0xe0 | val;
  125. dst += 3;
  126. }
  127. else /* 0x10000-0x10ffff: 4 bytes */
  128. {
  129. if ((len -= 4) < 0) return -1; /* overflow */
  130. dst[3] = 0x80 | (val & 0x3f);
  131. val >>= 6;
  132. dst[2] = 0x80 | (val & 0x3f);
  133. val >>= 6;
  134. dst[1] = 0x80 | (val & 0x3f);
  135. val >>= 6;
  136. dst[0] = 0xf0 | val;
  137. dst += 4;
  138. src++;
  139. srclen--;
  140. }
  141. }
  142. return dstlen - len;
  143. }
  144. /* helper for the various utf8 mbstowcs functions */
  145. static inline unsigned int decode_utf8_char( unsigned char ch, const char **str, const char *strend )
  146. {
  147. unsigned int len = utf8_length[ch-0x80];
  148. unsigned int res = ch & utf8_mask[len];
  149. const char *end = *str + len;
  150. if (end > strend) return ~0;
  151. switch(len)
  152. {
  153. case 3:
  154. if ((ch = end[-3] ^ 0x80) >= 0x40) break;
  155. res = (res << 6) | ch;
  156. (*str)++;
  157. case 2:
  158. if ((ch = end[-2] ^ 0x80) >= 0x40) break;
  159. res = (res << 6) | ch;
  160. (*str)++;
  161. case 1:
  162. if ((ch = end[-1] ^ 0x80) >= 0x40) break;
  163. res = (res << 6) | ch;
  164. (*str)++;
  165. if (res < utf8_minval[len]) break;
  166. return res;
  167. }
  168. return ~0;
  169. }
  170. /* query necessary dst length for src string with composition */
  171. static inline int get_length_mbs_utf8_compose( int flags, const char *src, int srclen )
  172. {
  173. int ret = 0;
  174. unsigned int res;
  175. WCHAR composed[2];
  176. const char *srcend = src + srclen;
  177. composed[0] = 0;
  178. while (src < srcend)
  179. {
  180. unsigned char ch = *src++;
  181. if (ch < 0x80) /* special fast case for 7-bit ASCII */
  182. {
  183. composed[0] = ch;
  184. ret++;
  185. continue;
  186. }
  187. if ((res = decode_utf8_char( ch, &src, srcend )) <= 0xffff)
  188. {
  189. if (composed[0])
  190. {
  191. composed[1] = res;
  192. if ((composed[0] = wine_compose( composed ))) continue;
  193. }
  194. composed[0] = res;
  195. ret++;
  196. }
  197. else if (res <= 0x10ffff)
  198. {
  199. ret += 2;
  200. composed[0] = 0; /* no composition for surrogates */
  201. }
  202. else if (flags & MB_ERR_INVALID_CHARS) return -2; /* bad char */
  203. /* otherwise ignore it */
  204. }
  205. return ret;
  206. }
  207. /* UTF-8 to wide char string conversion with composition */
  208. /* return -1 on dst buffer overflow, -2 on invalid input char */
  209. static int utf8_mbstowcs_compose( int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
  210. {
  211. unsigned int res;
  212. const char *srcend = src + srclen;
  213. WCHAR composed[2];
  214. WCHAR *dstend = dst + dstlen;
  215. if (!dstlen) return get_length_mbs_utf8_compose( flags, src, srclen );
  216. composed[0] = 0;
  217. while (src < srcend)
  218. {
  219. unsigned char ch = *src++;
  220. if (ch < 0x80) /* special fast case for 7-bit ASCII */
  221. {
  222. if (dst >= dstend) return -1; /* overflow */
  223. *dst++ = composed[0] = ch;
  224. continue;
  225. }
  226. if ((res = decode_utf8_char( ch, &src, srcend )) <= 0xffff)
  227. {
  228. if (composed[0])
  229. {
  230. composed[1] = res;
  231. if ((composed[0] = wine_compose( composed )))
  232. {
  233. dst[-1] = composed[0];
  234. continue;
  235. }
  236. }
  237. if (dst >= dstend) return -1; /* overflow */
  238. *dst++ = composed[0] = res;
  239. }
  240. else if (res <= 0x10ffff) /* we need surrogates */
  241. {
  242. if (dst >= dstend - 1) return -1; /* overflow */
  243. res -= 0x10000;
  244. *dst++ = 0xd800 | (res >> 10);
  245. *dst++ = 0xdc00 | (res & 0x3ff);
  246. composed[0] = 0; /* no composition for surrogates */
  247. }
  248. else if (flags & MB_ERR_INVALID_CHARS) return -2; /* bad char */
  249. /* otherwise ignore it */
  250. }
  251. return dstlen - (dstend - dst);
  252. }
  253. /* query necessary dst length for src string */
  254. static inline int get_length_mbs_utf8( int flags, const char *src, int srclen )
  255. {
  256. int ret = 0;
  257. unsigned int res;
  258. const char *srcend = src + srclen;
  259. while (src < srcend)
  260. {
  261. unsigned char ch = *src++;
  262. if (ch < 0x80) /* special fast case for 7-bit ASCII */
  263. {
  264. ret++;
  265. continue;
  266. }
  267. if ((res = decode_utf8_char( ch, &src, srcend )) <= 0x10ffff)
  268. {
  269. if (res > 0xffff) ret++;
  270. ret++;
  271. }
  272. else if (flags & MB_ERR_INVALID_CHARS) return -2; /* bad char */
  273. /* otherwise ignore it */
  274. }
  275. return ret;
  276. }
  277. /* UTF-8 to wide char string conversion */
  278. /* return -1 on dst buffer overflow, -2 on invalid input char */
  279. int wine_utf8_mbstowcs_obsolete( int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
  280. {
  281. unsigned int res;
  282. const char *srcend = src + srclen;
  283. WCHAR *dstend = dst + dstlen;
  284. if (flags & MB_COMPOSITE) return utf8_mbstowcs_compose( flags, src, srclen, dst, dstlen );
  285. if (!dstlen) return get_length_mbs_utf8( flags, src, srclen );
  286. while ((dst < dstend) && (src < srcend))
  287. {
  288. unsigned char ch = *src++;
  289. if (ch < 0x80) /* special fast case for 7-bit ASCII */
  290. {
  291. *dst++ = ch;
  292. continue;
  293. }
  294. if ((res = decode_utf8_char( ch, &src, srcend )) <= 0xffff)
  295. {
  296. *dst++ = res;
  297. }
  298. else if (res <= 0x10ffff) /* we need surrogates */
  299. {
  300. if (dst == dstend - 1) return -1; /* overflow */
  301. res -= 0x10000;
  302. *dst++ = 0xd800 | (res >> 10);
  303. *dst++ = 0xdc00 | (res & 0x3ff);
  304. }
  305. else if (flags & MB_ERR_INVALID_CHARS) return -2; /* bad char */
  306. /* otherwise ignore it */
  307. }
  308. if (src < srcend) return -1; /* overflow */
  309. return dstlen - (dstend - dst);
  310. }
  311. __ASM_OBSOLETE(wine_utf8_wcstombs);
  312. __ASM_OBSOLETE(wine_utf8_mbstowcs);
  313. #endif /* __ASM_OBSOLETE */