/* physfs_unicode.c */
  1. #include "physfs.h"
  2. #define __PHYSICSFS_INTERNAL__
  3. #include "physfs_internal.h"
  /*
   * From rfc3629, the UTF-8 spec:
   *  http://www.ietf.org/rfc/rfc3629.txt
   *
   *   Char. number range  |        UTF-8 octet sequence
   *      (hexadecimal)    |              (binary)
   *   --------------------+---------------------------------------------
   *   0000 0000-0000 007F | 0xxxxxxx
   *   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
   *   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
   *   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   */
  /*
   * Sentinel returned by utf8codepoint() when there are bogus bits in the
   *  stream. It may not be the best value, but it is one that can't be a real
   *  codepoint (0x10FFFF is the largest codepoint value in Unicode).
   *
   * PHYSFS_utf8ToUcs4() and PHYSFS_utf8ToUcs2() swap it for
   *  UNICODE_BOGUS_CHAR_CODEPOINT so that text which wants to try to recover
   *  gets something reasonable; code that wants to know whether a string has
   *  bad bits can compare against this value instead.
   */
  #define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF

  /*
   * This is the codepoint we currently substitute when there were bogus bits
   *  in a UTF-8 string. May not fly in Asian locales?
   */
  #define UNICODE_BOGUS_CHAR_CODEPOINT '?'
  31. static PHYSFS_uint32 utf8codepoint(const char **_str)
  32. {
  33. const char *str = *_str;
  34. PHYSFS_uint32 retval = 0;
  35. PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
  36. PHYSFS_uint32 octet2, octet3, octet4;
  37. if (octet == 0) /* null terminator, end of string. */
  38. return 0;
  39. else if (octet < 128) /* one octet char: 0 to 127 */
  40. {
  41. (*_str)++; /* skip to next possible start of codepoint. */
  42. return(octet);
  43. } /* else if */
  44. else if ((octet > 127) && (octet < 192)) /* bad (starts with 10xxxxxx). */
  45. {
  46. /*
  47. * Apparently each of these is supposed to be flagged as a bogus
  48. * char, instead of just resyncing to the next valid codepoint.
  49. */
  50. (*_str)++; /* skip to next possible start of codepoint. */
  51. return UNICODE_BOGUS_CHAR_VALUE;
  52. } /* else if */
  53. else if (octet < 224) /* two octets */
  54. {
  55. (*_str)++; /* advance at least one byte in case of an error */
  56. octet -= (128+64);
  57. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  58. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  59. return UNICODE_BOGUS_CHAR_VALUE;
  60. *_str += 1; /* skip to next possible start of codepoint. */
  61. retval = ((octet << 6) | (octet2 - 128));
  62. if ((retval >= 0x80) && (retval <= 0x7FF))
  63. return retval;
  64. } /* else if */
  65. else if (octet < 240) /* three octets */
  66. {
  67. (*_str)++; // advance at least one byte in case of an error
  68. octet -= (128+64+32);
  69. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  70. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  71. return UNICODE_BOGUS_CHAR_VALUE;
  72. octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  73. if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  74. return UNICODE_BOGUS_CHAR_VALUE;
  75. *_str += 2; /* skip to next possible start of codepoint. */
  76. retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
  77. /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
  78. switch (retval)
  79. {
  80. case 0xD800:
  81. case 0xDB7F:
  82. case 0xDB80:
  83. case 0xDBFF:
  84. case 0xDC00:
  85. case 0xDF80:
  86. case 0xDFFF:
  87. return UNICODE_BOGUS_CHAR_VALUE;
  88. } /* switch */
  89. /* 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. */
  90. if ((retval >= 0x800) && (retval <= 0xFFFD))
  91. return retval;
  92. } /* else if */
  93. else if (octet < 248) /* four octets */
  94. {
  95. (*_str)++; // advance at least one byte in case of an error
  96. octet -= (128+64+32+16);
  97. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  98. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  99. return UNICODE_BOGUS_CHAR_VALUE;
  100. octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  101. if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  102. return UNICODE_BOGUS_CHAR_VALUE;
  103. octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  104. if ((octet4 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  105. return UNICODE_BOGUS_CHAR_VALUE;
  106. *_str += 3; /* skip to next possible start of codepoint. */
  107. retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
  108. ((octet3 - 128) << 6) | ((octet4 - 128)) );
  109. if ((retval >= 0x10000) && (retval <= 0x10FFFF))
  110. return retval;
  111. } /* else if */
  112. /*
  113. * Five and six octet sequences became illegal in rfc3629.
  114. * We throw the codepoint away, but parse them to make sure we move
  115. * ahead the right number of bytes and don't overflow the buffer.
  116. */
  117. else if (octet < 252) /* five octets */
  118. {
  119. (*_str)++; // advance at least one byte in case of an error
  120. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  121. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  122. return UNICODE_BOGUS_CHAR_VALUE;
  123. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  124. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  125. return UNICODE_BOGUS_CHAR_VALUE;
  126. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  127. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  128. return UNICODE_BOGUS_CHAR_VALUE;
  129. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  130. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  131. return UNICODE_BOGUS_CHAR_VALUE;
  132. *_str += 4; /* skip to next possible start of codepoint. */
  133. return UNICODE_BOGUS_CHAR_VALUE;
  134. } /* else if */
  135. else /* six octets */
  136. {
  137. (*_str)++; // advance at least one byte in case of an error
  138. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  139. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  140. return UNICODE_BOGUS_CHAR_VALUE;
  141. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  142. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  143. return UNICODE_BOGUS_CHAR_VALUE;
  144. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  145. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  146. return UNICODE_BOGUS_CHAR_VALUE;
  147. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  148. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  149. return UNICODE_BOGUS_CHAR_VALUE;
  150. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  151. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  152. return UNICODE_BOGUS_CHAR_VALUE;
  153. *_str += 5; /* skip to next possible start of codepoint. */
  154. return UNICODE_BOGUS_CHAR_VALUE;
  155. } /* else if */
  156. return UNICODE_BOGUS_CHAR_VALUE;
  157. } /* utf8codepoint */
  158. void PHYSFS_utf8ToUcs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
  159. {
  160. len -= sizeof (PHYSFS_uint32); /* save room for null char. */
  161. while (len >= sizeof (PHYSFS_uint32))
  162. {
  163. PHYSFS_uint32 cp = utf8codepoint(&src);
  164. if (cp == 0)
  165. break;
  166. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  167. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  168. *(dst++) = cp;
  169. len -= sizeof (PHYSFS_uint32);
  170. } /* while */
  171. *dst = 0;
  172. } /* PHYSFS_utf8ToUcs4 */
  173. void PHYSFS_utf8ToUcs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
  174. {
  175. len -= sizeof (PHYSFS_uint16); /* save room for null char. */
  176. while (len >= sizeof (PHYSFS_uint16))
  177. {
  178. PHYSFS_uint32 cp = utf8codepoint(&src);
  179. if (cp == 0)
  180. break;
  181. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  182. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  183. /* !!! BLUESKY: UTF-16 surrogates? */
  184. if (cp > 0xFFFF)
  185. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  186. *(dst++) = cp;
  187. len -= sizeof (PHYSFS_uint16);
  188. } /* while */
  189. *dst = 0;
  190. } /* PHYSFS_utf8ToUcs2 */
  191. static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len)
  192. {
  193. char *dst = *_dst;
  194. PHYSFS_uint64 len = *_len;
  195. if (len == 0)
  196. return;
  197. if (cp > 0x10FFFF)
  198. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  199. else if ((cp == 0xFFFE) || (cp == 0xFFFF)) /* illegal values. */
  200. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  201. else
  202. {
  203. /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
  204. switch (cp)
  205. {
  206. case 0xD800:
  207. case 0xDB7F:
  208. case 0xDB80:
  209. case 0xDBFF:
  210. case 0xDC00:
  211. case 0xDF80:
  212. case 0xDFFF:
  213. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  214. } /* switch */
  215. } /* else */
  216. /* Do the encoding... */
  217. if (cp < 0x80)
  218. {
  219. *(dst++) = (char) cp;
  220. len--;
  221. } /* if */
  222. else if (cp < 0x800)
  223. {
  224. if (len < 2)
  225. len = 0;
  226. else
  227. {
  228. *(dst++) = (char) ((cp >> 6) | 128 | 64);
  229. *(dst++) = (char) (cp & 0x3F) | 128;
  230. len -= 2;
  231. } /* else */
  232. } /* else if */
  233. else if (cp < 0x10000)
  234. {
  235. if (len < 3)
  236. len = 0;
  237. else
  238. {
  239. *(dst++) = (char) ((cp >> 12) | 128 | 64 | 32);
  240. *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
  241. *(dst++) = (char) (cp & 0x3F) | 128;
  242. len -= 3;
  243. } /* else */
  244. } /* else if */
  245. else
  246. {
  247. if (len < 4)
  248. len = 0;
  249. else
  250. {
  251. *(dst++) = (char) ((cp >> 18) | 128 | 64 | 32 | 16);
  252. *(dst++) = (char) ((cp >> 12) & 0x3F) | 128;
  253. *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
  254. *(dst++) = (char) (cp & 0x3F) | 128;
  255. len -= 4;
  256. } /* else if */
  257. } /* else */
  258. *_dst = dst;
  259. *_len = len;
  260. } /* utf8fromcodepoint */
  261. #define UTF8FROMTYPE(typ, src, dst, len) \
  262. if (len == 0) return; \
  263. len--; \
  264. while (len) \
  265. { \
  266. const PHYSFS_uint32 cp = (PHYSFS_uint32) ((typ) (*(src++))); \
  267. if (cp == 0) break; \
  268. utf8fromcodepoint(cp, &dst, &len); \
  269. } \
  270. *dst = '\0'; \
  271. void PHYSFS_utf8FromUcs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
  272. {
  273. UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
  274. } /* PHYSFS_utf8FromUcs4 */
  275. void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
  276. {
  277. UTF8FROMTYPE(PHYSFS_uint64, src, dst, len);
  278. } /* PHYSFS_utf8FromUcs4 */
  279. /* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */
  280. void PHYSFS_utf8FromLatin1(const char *src, char *dst, PHYSFS_uint64 len)
  281. {
  282. UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
  283. } /* PHYSFS_utf8FromLatin1 */
  284. #undef UTF8FROMTYPE
  285. typedef struct CaseFoldMapping
  286. {
  287. PHYSFS_uint32 from;
  288. PHYSFS_uint32 to0;
  289. PHYSFS_uint32 to1;
  290. PHYSFS_uint32 to2;
  291. } CaseFoldMapping;
  292. typedef struct CaseFoldHashBucket
  293. {
  294. const PHYSFS_uint8 count;
  295. const CaseFoldMapping *list;
  296. } CaseFoldHashBucket;
  297. #include "physfs_casefolding.h"
  298. static void locate_case_fold_mapping(const PHYSFS_uint32 from,
  299. PHYSFS_uint32 *to)
  300. {
  301. PHYSFS_uint32 i;
  302. const PHYSFS_uint8 hashed = ((from ^ (from >> 8)) & 0xFF);
  303. const CaseFoldHashBucket *bucket = &case_fold_hash[hashed];
  304. const CaseFoldMapping *mapping = bucket->list;
  305. for (i = 0; i < bucket->count; i++, mapping++)
  306. {
  307. if (mapping->from == from)
  308. {
  309. to[0] = mapping->to0;
  310. to[1] = mapping->to1;
  311. to[2] = mapping->to2;
  312. return;
  313. } /* if */
  314. } /* for */
  315. /* Not found...there's no remapping for this codepoint. */
  316. to[0] = from;
  317. to[1] = 0;
  318. to[2] = 0;
  319. } /* locate_case_fold_mapping */
  320. static int utf8codepointcmp(const PHYSFS_uint32 cp1, const PHYSFS_uint32 cp2)
  321. {
  322. PHYSFS_uint32 folded1[3], folded2[3];
  323. locate_case_fold_mapping(cp1, folded1);
  324. locate_case_fold_mapping(cp2, folded2);
  325. return ( (folded1[0] == folded2[0]) &&
  326. (folded1[1] == folded2[1]) &&
  327. (folded1[2] == folded2[2]) );
  328. } /* utf8codepointcmp */
  329. int __PHYSFS_utf8strcasecmp(const char *str1, const char *str2)
  330. {
  331. while (1)
  332. {
  333. const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
  334. const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
  335. if (!utf8codepointcmp(cp1, cp2)) return 0;
  336. if (cp1 == 0) return 1;
  337. } /* while */
  338. return 0; /* shouldn't hit this. */
  339. } /* __PHYSFS_utf8strcasecmp */
  340. int __PHYSFS_utf8strnicmp(const char *str1, const char *str2, PHYSFS_uint32 n)
  341. {
  342. while (n > 0)
  343. {
  344. const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
  345. const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
  346. if (!utf8codepointcmp(cp1, cp2)) return 0;
  347. if (cp1 == 0) return 1;
  348. n--;
  349. } /* while */
  350. return 1; /* matched to n chars. */
  351. } /* __PHYSFS_utf8strnicmp */
  /*
   * Compare two null-terminated strings, ignoring the case of the ASCII
   *  letters 'A'-'Z' only. Every other byte must match exactly.
   *
   * Returns -1, 0 or 1, strcmp() style. Bytes are compared as unsigned char,
   *  as strcmp() does, so bytes >= 0x80 sort after all ASCII characters no
   *  matter whether plain char is signed on this platform.
   */
  int __PHYSFS_stricmpASCII(const char *str1, const char *str2)
  {
      while (1)
      {
          unsigned char cp1 = (unsigned char) *(str1++);
          unsigned char cp2 = (unsigned char) *(str2++);

          if ((cp1 >= 'A') && (cp1 <= 'Z'))
              cp1 = (unsigned char) (cp1 + ('a' - 'A'));
          if ((cp2 >= 'A') && (cp2 <= 'Z'))
              cp2 = (unsigned char) (cp2 + ('a' - 'A'));

          if (cp1 < cp2)
              return -1;
          else if (cp1 > cp2)
              return 1;
          else if (cp1 == 0)  /* they're both null chars? */
              return 0;
      } /* while */

      return 0;  /* shouldn't hit this. */
  } /* __PHYSFS_stricmpASCII */
  369. int __PHYSFS_strnicmpASCII(const char *str1, const char *str2, PHYSFS_uint32 n)
  370. {
  371. while (n-- > 0)
  372. {
  373. const char ch1 = *(str1++);
  374. const char ch2 = *(str2++);
  375. const char cp1 = ((ch1 >= 'A') && (ch1 <= 'Z')) ? (ch1+32) : ch1;
  376. const char cp2 = ((ch2 >= 'A') && (ch2 <= 'Z')) ? (ch2+32) : ch2;
  377. if (cp1 < cp2)
  378. return -1;
  379. else if (cp1 > cp2)
  380. return 1;
  381. else if (cp1 == 0) /* they're both null chars? */
  382. return 0;
  383. } /* while */
  384. return 0;
  385. } /* __PHYSFS_stricmpASCII */
  /* end of physfs_unicode.c ... */