sortkey.c 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. /*
  2. * Unicode sort key generation
  3. *
  4. * Copyright 2003 Dmitry Timoshkov
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  19. */
  20. #include "wine/asm.h"
  21. #ifdef __ASM_OBSOLETE
  22. #include "unicode.h"
  23. extern const unsigned int collation_table[];
  24. extern const unsigned short nfd_table[] DECLSPEC_HIDDEN;
  25. static const WCHAR *get_decomposition( WCHAR ch, unsigned int *len )
  26. {
  27. unsigned short offset = nfd_table[nfd_table[ch >> 8] + ((ch >> 4) & 0xf)] + (ch & 0xf);
  28. unsigned short start = nfd_table[offset];
  29. unsigned short end = nfd_table[offset + 1];
  30. if ((*len = end - start)) return nfd_table + start;
  31. *len = 1;
  32. return NULL;
  33. }
  34. /*
  35. * flags - normalization NORM_* flags
  36. *
  37. * FIXME: 'variable' flag not handled
  38. */
  39. int wine_get_sortkey_obsolete(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
  40. {
  41. WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
  42. int key_len[4];
  43. char *key_ptr[4];
  44. const WCHAR *src_save = src;
  45. int srclen_save = srclen;
  46. key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
  47. for (; srclen; srclen--, src++)
  48. {
  49. unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
  50. dummy[0] = *src;
  51. if (decomposed_len)
  52. {
  53. for (i = 0; i < decomposed_len; i++)
  54. {
  55. WCHAR wch = dummy[i];
  56. unsigned int ce;
  57. /* tests show that win2k just ignores NORM_IGNORENONSPACE,
  58. * and skips white space and punctuation characters for
  59. * NORM_IGNORESYMBOLS.
  60. */
  61. if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
  62. continue;
  63. if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
  64. ce = collation_table[collation_table[collation_table[wch >> 8] + ((wch >> 4) & 0x0f)] + (wch & 0xf)];
  65. if (ce != (unsigned int)-1)
  66. {
  67. if (ce >> 16) key_len[0] += 2;
  68. if ((ce >> 8) & 0xff) key_len[1]++;
  69. if ((ce >> 4) & 0x0f) key_len[2]++;
  70. if (ce & 1)
  71. {
  72. if (wch >> 8) key_len[3]++;
  73. key_len[3]++;
  74. }
  75. }
  76. else
  77. {
  78. key_len[0] += 2;
  79. if (wch >> 8) key_len[0]++;
  80. if (wch & 0xff) key_len[0]++;
  81. }
  82. }
  83. }
  84. }
  85. if (!dstlen) /* compute length */
  86. /* 4 * '\1' + key length */
  87. return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4;
  88. if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
  89. return 0; /* overflow */
  90. src = src_save;
  91. srclen = srclen_save;
  92. key_ptr[0] = dst;
  93. key_ptr[1] = key_ptr[0] + key_len[0] + 1;
  94. key_ptr[2] = key_ptr[1] + key_len[1] + 1;
  95. key_ptr[3] = key_ptr[2] + key_len[2] + 1;
  96. for (; srclen; srclen--, src++)
  97. {
  98. unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
  99. dummy[0] = *src;
  100. if (decomposed_len)
  101. {
  102. for (i = 0; i < decomposed_len; i++)
  103. {
  104. WCHAR wch = dummy[i];
  105. unsigned int ce;
  106. /* tests show that win2k just ignores NORM_IGNORENONSPACE,
  107. * and skips white space and punctuation characters for
  108. * NORM_IGNORESYMBOLS.
  109. */
  110. if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
  111. continue;
  112. if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
  113. ce = collation_table[collation_table[collation_table[wch >> 8] + ((wch >> 4) & 0x0f)] + (wch & 0xf)];
  114. if (ce != (unsigned int)-1)
  115. {
  116. WCHAR key;
  117. if ((key = ce >> 16))
  118. {
  119. *key_ptr[0]++ = key >> 8;
  120. *key_ptr[0]++ = key & 0xff;
  121. }
  122. /* make key 1 start from 2 */
  123. if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
  124. /* make key 2 start from 2 */
  125. if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
  126. /* key 3 is always a character code */
  127. if (ce & 1)
  128. {
  129. if (wch >> 8) *key_ptr[3]++ = wch >> 8;
  130. if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
  131. }
  132. }
  133. else
  134. {
  135. *key_ptr[0]++ = 0xff;
  136. *key_ptr[0]++ = 0xfe;
  137. if (wch >> 8) *key_ptr[0]++ = wch >> 8;
  138. if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
  139. }
  140. }
  141. }
  142. }
  143. *key_ptr[0] = '\1';
  144. *key_ptr[1] = '\1';
  145. *key_ptr[2] = '\1';
  146. *key_ptr[3]++ = '\1';
  147. *key_ptr[3] = 0;
  148. return key_ptr[3] - dst;
  149. }
  150. enum weight
  151. {
  152. UNICODE_WEIGHT,
  153. DIACRITIC_WEIGHT,
  154. CASE_WEIGHT
  155. };
  156. static unsigned int get_weight(WCHAR ch, enum weight type)
  157. {
  158. unsigned int ret;
  159. ret = collation_table[collation_table[collation_table[ch >> 8] + ((ch >> 4) & 0x0f)] + (ch & 0xf)];
  160. if (ret == (unsigned int)-1)
  161. return ch;
  162. switch(type)
  163. {
  164. case UNICODE_WEIGHT:
  165. return ret >> 16;
  166. case DIACRITIC_WEIGHT:
  167. return (ret >> 8) & 0xff;
  168. case CASE_WEIGHT:
  169. default:
  170. return (ret >> 4) & 0x0f;
  171. }
  172. }
  173. static void inc_str_pos(const WCHAR **str, int *len, unsigned int *dpos, unsigned int *dlen)
  174. {
  175. (*dpos)++;
  176. if (*dpos == *dlen)
  177. {
  178. *dpos = *dlen = 0;
  179. (*str)++;
  180. (*len)--;
  181. }
  182. }
  183. static inline int compare_weights(int flags, const WCHAR *str1, int len1,
  184. const WCHAR *str2, int len2, enum weight type)
  185. {
  186. unsigned int ce1, ce2, dpos1 = 0, dpos2 = 0, dlen1 = 0, dlen2 = 0;
  187. const WCHAR *dstr1 = NULL, *dstr2 = NULL;
  188. /* 32-bit collation element table format:
  189. * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
  190. * case weight - high 4 bit of low 8 bit.
  191. */
  192. while (len1 > 0 && len2 > 0)
  193. {
  194. if (!dlen1 && !(dstr1 = get_decomposition( *str1, &dlen1 ))) dstr1 = str1;
  195. if (!dlen2 && !(dstr2 = get_decomposition( *str2, &dlen2 ))) dstr2 = str2;
  196. if (flags & NORM_IGNORESYMBOLS)
  197. {
  198. int skip = 0;
  199. /* FIXME: not tested */
  200. if (get_char_typeW(dstr1[dpos1]) & (C1_PUNCT | C1_SPACE))
  201. {
  202. inc_str_pos(&str1, &len1, &dpos1, &dlen1);
  203. skip = 1;
  204. }
  205. if (get_char_typeW(dstr2[dpos2]) & (C1_PUNCT | C1_SPACE))
  206. {
  207. inc_str_pos(&str2, &len2, &dpos2, &dlen2);
  208. skip = 1;
  209. }
  210. if (skip) continue;
  211. }
  212. /* hyphen and apostrophe are treated differently depending on
  213. * whether SORT_STRINGSORT specified or not
  214. */
  215. if (type == UNICODE_WEIGHT && !(flags & SORT_STRINGSORT))
  216. {
  217. if (dstr1[dpos1] == '-' || dstr1[dpos1] == '\'')
  218. {
  219. if (dstr2[dpos2] != '-' && dstr2[dpos2] != '\'')
  220. {
  221. inc_str_pos(&str1, &len1, &dpos1, &dlen1);
  222. continue;
  223. }
  224. }
  225. else if (dstr2[dpos2] == '-' || dstr2[dpos2] == '\'')
  226. {
  227. inc_str_pos(&str2, &len2, &dpos2, &dlen2);
  228. continue;
  229. }
  230. }
  231. ce1 = get_weight(dstr1[dpos1], type);
  232. if (!ce1)
  233. {
  234. inc_str_pos(&str1, &len1, &dpos1, &dlen1);
  235. continue;
  236. }
  237. ce2 = get_weight(dstr2[dpos2], type);
  238. if (!ce2)
  239. {
  240. inc_str_pos(&str2, &len2, &dpos2, &dlen2);
  241. continue;
  242. }
  243. if (ce1 - ce2) return ce1 - ce2;
  244. inc_str_pos(&str1, &len1, &dpos1, &dlen1);
  245. inc_str_pos(&str2, &len2, &dpos2, &dlen2);
  246. }
  247. while (len1)
  248. {
  249. if (!dlen1 && !(dstr1 = get_decomposition( *str1, &dlen1 ))) dstr1 = str1;
  250. ce1 = get_weight(dstr1[dpos1], type);
  251. if (ce1) break;
  252. inc_str_pos(&str1, &len1, &dpos1, &dlen1);
  253. }
  254. while (len2)
  255. {
  256. if (!dlen2 && !(dstr2 = get_decomposition( *str2, &dlen2 ))) dstr2 = str2;
  257. ce2 = get_weight(dstr2[dpos2], type);
  258. if (ce2) break;
  259. inc_str_pos(&str2, &len2, &dpos2, &dlen2);
  260. }
  261. return len1 - len2;
  262. }
  263. int wine_compare_string_obsolete(int flags, const WCHAR *str1, int len1,
  264. const WCHAR *str2, int len2)
  265. {
  266. int ret;
  267. ret = compare_weights(flags, str1, len1, str2, len2, UNICODE_WEIGHT);
  268. if (!ret)
  269. {
  270. if (!(flags & NORM_IGNORENONSPACE))
  271. ret = compare_weights(flags, str1, len1, str2, len2, DIACRITIC_WEIGHT);
  272. if (!ret && !(flags & NORM_IGNORECASE))
  273. ret = compare_weights(flags, str1, len1, str2, len2, CASE_WEIGHT);
  274. }
  275. return ret;
  276. }
  277. __ASM_OBSOLETE(wine_get_sortkey);
  278. __ASM_OBSOLETE(wine_compare_string);
  279. #endif /* __ASM_OBSOLETE */