collationfastlatin.cpp 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100
  1. // Copyright (C) 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2013-2015, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * collationfastlatin.cpp
  9. *
  10. * created on: 2013aug18
  11. * created by: Markus W. Scherer
  12. */
  13. #include "unicode/utypes.h"
  14. #if !UCONFIG_NO_COLLATION
  15. #include "unicode/ucol.h"
  16. #include "collationdata.h"
  17. #include "collationfastlatin.h"
  18. #include "collationsettings.h"
  19. #include "uassert.h"
  20. U_NAMESPACE_BEGIN
  21. int32_t
  22. CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
  23. uint16_t *primaries, int32_t capacity) {
  24. const uint16_t *table = data->fastLatinTable;
  25. if(table == NULL) { return -1; }
  26. U_ASSERT(capacity == LATIN_LIMIT);
  27. if(capacity != LATIN_LIMIT) { return -1; }
  28. uint32_t miniVarTop;
  29. if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
  30. // No mini primaries are variable, set a variableTop just below the
  31. // lowest long mini primary.
  32. miniVarTop = MIN_LONG - 1;
  33. } else {
  34. int32_t headerLength = *table & 0xff;
  35. int32_t i = 1 + settings.getMaxVariable();
  36. if(i >= headerLength) {
  37. return -1; // variableTop >= digits, should not occur
  38. }
  39. miniVarTop = table[i];
  40. }
  41. UBool digitsAreReordered = FALSE;
  42. if(settings.hasReordering()) {
  43. uint32_t prevStart = 0;
  44. uint32_t beforeDigitStart = 0;
  45. uint32_t digitStart = 0;
  46. uint32_t afterDigitStart = 0;
  47. for(int32_t group = UCOL_REORDER_CODE_FIRST;
  48. group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES;
  49. ++group) {
  50. uint32_t start = data->getFirstPrimaryForGroup(group);
  51. start = settings.reorder(start);
  52. if(group == UCOL_REORDER_CODE_DIGIT) {
  53. beforeDigitStart = prevStart;
  54. digitStart = start;
  55. } else if(start != 0) {
  56. if(start < prevStart) {
  57. // The permutation affects the groups up to Latin.
  58. return -1;
  59. }
  60. // In the future, there might be a special group between digits & Latin.
  61. if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
  62. afterDigitStart = start;
  63. }
  64. prevStart = start;
  65. }
  66. }
  67. uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN);
  68. latinStart = settings.reorder(latinStart);
  69. if(latinStart < prevStart) {
  70. return -1;
  71. }
  72. if(afterDigitStart == 0) {
  73. afterDigitStart = latinStart;
  74. }
  75. if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
  76. digitsAreReordered = TRUE;
  77. }
  78. }
  79. table += (table[0] & 0xff); // skip the header
  80. for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
  81. uint32_t p = table[c];
  82. if(p >= MIN_SHORT) {
  83. p &= SHORT_PRIMARY_MASK;
  84. } else if(p > miniVarTop) {
  85. p &= LONG_PRIMARY_MASK;
  86. } else {
  87. p = 0;
  88. }
  89. primaries[c] = (uint16_t)p;
  90. }
  91. if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) {
  92. // Bail out for digits.
  93. for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
  94. }
  95. // Shift the miniVarTop above other options.
  96. return ((int32_t)miniVarTop << 16) | settings.options;
  97. }
  98. int32_t
  99. CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
  100. const UChar *left, int32_t leftLength,
  101. const UChar *right, int32_t rightLength) {
  102. // This is a modified copy of CollationCompare::compareUpToQuaternary(),
  103. // optimized for common Latin text.
  104. // Keep them in sync!
  105. // Keep compareUTF16() and compareUTF8() in sync very closely!
  106. U_ASSERT((table[0] >> 8) == VERSION);
  107. table += (table[0] & 0xff); // skip the header
  108. uint32_t variableTop = (uint32_t)options >> 16; // see getOptions()
  109. options &= 0xffff; // needed for CollationSettings::getStrength() to work
  110. // Check for supported characters, fetch mini CEs, and compare primaries.
  111. int32_t leftIndex = 0, rightIndex = 0;
  112. /**
  113. * Single mini CE or a pair.
  114. * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
  115. * If there is only one, then it is in the lower bits, and the upper bits are 0.
  116. */
  117. uint32_t leftPair = 0, rightPair = 0;
  118. for(;;) {
  119. // We fetch CEs until we get a non-ignorable primary or reach the end.
  120. while(leftPair == 0) {
  121. if(leftIndex == leftLength) {
  122. leftPair = EOS;
  123. break;
  124. }
  125. UChar32 c = left[leftIndex++];
  126. if(c <= LATIN_MAX) {
  127. leftPair = primaries[c];
  128. if(leftPair != 0) { break; }
  129. if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
  130. return BAIL_OUT_RESULT;
  131. }
  132. leftPair = table[c];
  133. } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
  134. leftPair = table[c - PUNCT_START + LATIN_LIMIT];
  135. } else {
  136. leftPair = lookup(table, c);
  137. }
  138. if(leftPair >= MIN_SHORT) {
  139. leftPair &= SHORT_PRIMARY_MASK;
  140. break;
  141. } else if(leftPair > variableTop) {
  142. leftPair &= LONG_PRIMARY_MASK;
  143. break;
  144. } else {
  145. leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
  146. if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
  147. leftPair = getPrimaries(variableTop, leftPair);
  148. }
  149. }
  150. while(rightPair == 0) {
  151. if(rightIndex == rightLength) {
  152. rightPair = EOS;
  153. break;
  154. }
  155. UChar32 c = right[rightIndex++];
  156. if(c <= LATIN_MAX) {
  157. rightPair = primaries[c];
  158. if(rightPair != 0) { break; }
  159. if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
  160. return BAIL_OUT_RESULT;
  161. }
  162. rightPair = table[c];
  163. } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
  164. rightPair = table[c - PUNCT_START + LATIN_LIMIT];
  165. } else {
  166. rightPair = lookup(table, c);
  167. }
  168. if(rightPair >= MIN_SHORT) {
  169. rightPair &= SHORT_PRIMARY_MASK;
  170. break;
  171. } else if(rightPair > variableTop) {
  172. rightPair &= LONG_PRIMARY_MASK;
  173. break;
  174. } else {
  175. rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
  176. if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
  177. rightPair = getPrimaries(variableTop, rightPair);
  178. }
  179. }
  180. if(leftPair == rightPair) {
  181. if(leftPair == EOS) { break; }
  182. leftPair = rightPair = 0;
  183. continue;
  184. }
  185. uint32_t leftPrimary = leftPair & 0xffff;
  186. uint32_t rightPrimary = rightPair & 0xffff;
  187. if(leftPrimary != rightPrimary) {
  188. // Return the primary difference.
  189. return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
  190. }
  191. if(leftPair == EOS) { break; }
  192. leftPair >>= 16;
  193. rightPair >>= 16;
  194. }
  195. // In the following, we need to re-fetch each character because we did not buffer the CEs,
  196. // but we know that the string is well-formed and
  197. // only contains supported characters and mappings.
  198. // We might skip the secondary level but continue with the case level
  199. // which is turned on separately.
  200. if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
  201. leftIndex = rightIndex = 0;
  202. leftPair = rightPair = 0;
  203. for(;;) {
  204. while(leftPair == 0) {
  205. if(leftIndex == leftLength) {
  206. leftPair = EOS;
  207. break;
  208. }
  209. UChar32 c = left[leftIndex++];
  210. if(c <= LATIN_MAX) {
  211. leftPair = table[c];
  212. } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
  213. leftPair = table[c - PUNCT_START + LATIN_LIMIT];
  214. } else {
  215. leftPair = lookup(table, c);
  216. }
  217. if(leftPair >= MIN_SHORT) {
  218. leftPair = getSecondariesFromOneShortCE(leftPair);
  219. break;
  220. } else if(leftPair > variableTop) {
  221. leftPair = COMMON_SEC_PLUS_OFFSET;
  222. break;
  223. } else {
  224. leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
  225. leftPair = getSecondaries(variableTop, leftPair);
  226. }
  227. }
  228. while(rightPair == 0) {
  229. if(rightIndex == rightLength) {
  230. rightPair = EOS;
  231. break;
  232. }
  233. UChar32 c = right[rightIndex++];
  234. if(c <= LATIN_MAX) {
  235. rightPair = table[c];
  236. } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
  237. rightPair = table[c - PUNCT_START + LATIN_LIMIT];
  238. } else {
  239. rightPair = lookup(table, c);
  240. }
  241. if(rightPair >= MIN_SHORT) {
  242. rightPair = getSecondariesFromOneShortCE(rightPair);
  243. break;
  244. } else if(rightPair > variableTop) {
  245. rightPair = COMMON_SEC_PLUS_OFFSET;
  246. break;
  247. } else {
  248. rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
  249. rightPair = getSecondaries(variableTop, rightPair);
  250. }
  251. }
  252. if(leftPair == rightPair) {
  253. if(leftPair == EOS) { break; }
  254. leftPair = rightPair = 0;
  255. continue;
  256. }
  257. uint32_t leftSecondary = leftPair & 0xffff;
  258. uint32_t rightSecondary = rightPair & 0xffff;
  259. if(leftSecondary != rightSecondary) {
  260. if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
  261. // Full support for backwards secondary requires backwards contraction matching
  262. // and moving backwards between merge separators.
  263. return BAIL_OUT_RESULT;
  264. }
  265. return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
  266. }
  267. if(leftPair == EOS) { break; }
  268. leftPair >>= 16;
  269. rightPair >>= 16;
  270. }
  271. }
  272. if((options & CollationSettings::CASE_LEVEL) != 0) {
  273. UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
  274. leftIndex = rightIndex = 0;
  275. leftPair = rightPair = 0;
  276. for(;;) {
  277. while(leftPair == 0) {
  278. if(leftIndex == leftLength) {
  279. leftPair = EOS;
  280. break;
  281. }
  282. UChar32 c = left[leftIndex++];
  283. leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
  284. if(leftPair < MIN_LONG) {
  285. leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
  286. }
  287. leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
  288. }
  289. while(rightPair == 0) {
  290. if(rightIndex == rightLength) {
  291. rightPair = EOS;
  292. break;
  293. }
  294. UChar32 c = right[rightIndex++];
  295. rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
  296. if(rightPair < MIN_LONG) {
  297. rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
  298. }
  299. rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
  300. }
  301. if(leftPair == rightPair) {
  302. if(leftPair == EOS) { break; }
  303. leftPair = rightPair = 0;
  304. continue;
  305. }
  306. uint32_t leftCase = leftPair & 0xffff;
  307. uint32_t rightCase = rightPair & 0xffff;
  308. if(leftCase != rightCase) {
  309. if((options & CollationSettings::UPPER_FIRST) == 0) {
  310. return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
  311. } else {
  312. return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
  313. }
  314. }
  315. if(leftPair == EOS) { break; }
  316. leftPair >>= 16;
  317. rightPair >>= 16;
  318. }
  319. }
  320. if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
  321. // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
  322. UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
  323. leftIndex = rightIndex = 0;
  324. leftPair = rightPair = 0;
  325. for(;;) {
  326. while(leftPair == 0) {
  327. if(leftIndex == leftLength) {
  328. leftPair = EOS;
  329. break;
  330. }
  331. UChar32 c = left[leftIndex++];
  332. leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
  333. if(leftPair < MIN_LONG) {
  334. leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
  335. }
  336. leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
  337. }
  338. while(rightPair == 0) {
  339. if(rightIndex == rightLength) {
  340. rightPair = EOS;
  341. break;
  342. }
  343. UChar32 c = right[rightIndex++];
  344. rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
  345. if(rightPair < MIN_LONG) {
  346. rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
  347. }
  348. rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
  349. }
  350. if(leftPair == rightPair) {
  351. if(leftPair == EOS) { break; }
  352. leftPair = rightPair = 0;
  353. continue;
  354. }
  355. uint32_t leftTertiary = leftPair & 0xffff;
  356. uint32_t rightTertiary = rightPair & 0xffff;
  357. if(leftTertiary != rightTertiary) {
  358. if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
  359. // Pass through EOS and MERGE_WEIGHT
  360. // and keep real tertiary weights larger than the MERGE_WEIGHT.
  361. // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
  362. if(leftTertiary > MERGE_WEIGHT) {
  363. leftTertiary ^= CASE_MASK;
  364. }
  365. if(rightTertiary > MERGE_WEIGHT) {
  366. rightTertiary ^= CASE_MASK;
  367. }
  368. }
  369. return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
  370. }
  371. if(leftPair == EOS) { break; }
  372. leftPair >>= 16;
  373. rightPair >>= 16;
  374. }
  375. if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
  376. leftIndex = rightIndex = 0;
  377. leftPair = rightPair = 0;
  378. for(;;) {
  379. while(leftPair == 0) {
  380. if(leftIndex == leftLength) {
  381. leftPair = EOS;
  382. break;
  383. }
  384. UChar32 c = left[leftIndex++];
  385. leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
  386. if(leftPair < MIN_LONG) {
  387. leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
  388. }
  389. leftPair = getQuaternaries(variableTop, leftPair);
  390. }
  391. while(rightPair == 0) {
  392. if(rightIndex == rightLength) {
  393. rightPair = EOS;
  394. break;
  395. }
  396. UChar32 c = right[rightIndex++];
  397. rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
  398. if(rightPair < MIN_LONG) {
  399. rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
  400. }
  401. rightPair = getQuaternaries(variableTop, rightPair);
  402. }
  403. if(leftPair == rightPair) {
  404. if(leftPair == EOS) { break; }
  405. leftPair = rightPair = 0;
  406. continue;
  407. }
  408. uint32_t leftQuaternary = leftPair & 0xffff;
  409. uint32_t rightQuaternary = rightPair & 0xffff;
  410. if(leftQuaternary != rightQuaternary) {
  411. return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
  412. }
  413. if(leftPair == EOS) { break; }
  414. leftPair >>= 16;
  415. rightPair >>= 16;
  416. }
  417. return UCOL_EQUAL;
  418. }
  419. int32_t
  420. CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
  421. const uint8_t *left, int32_t leftLength,
  422. const uint8_t *right, int32_t rightLength) {
  423. // Keep compareUTF16() and compareUTF8() in sync very closely!
  424. U_ASSERT((table[0] >> 8) == VERSION);
  425. table += (table[0] & 0xff); // skip the header
  426. uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions()
  427. options &= 0xffff; // needed for CollationSettings::getStrength() to work
  428. // Check for supported characters, fetch mini CEs, and compare primaries.
  429. int32_t leftIndex = 0, rightIndex = 0;
  430. /**
  431. * Single mini CE or a pair.
  432. * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
  433. * If there is only one, then it is in the lower bits, and the upper bits are 0.
  434. */
  435. uint32_t leftPair = 0, rightPair = 0;
  436. // Note: There is no need to assemble the code point.
  437. // We only need to look up the table entry for the character,
  438. // and nextPair() looks for whether c==0.
  439. for(;;) {
  440. // We fetch CEs until we get a non-ignorable primary or reach the end.
  441. while(leftPair == 0) {
  442. if(leftIndex == leftLength) {
  443. leftPair = EOS;
  444. break;
  445. }
  446. UChar32 c = left[leftIndex++];
  447. uint8_t t;
  448. if(c <= 0x7f) {
  449. leftPair = primaries[c];
  450. if(leftPair != 0) { break; }
  451. if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
  452. return BAIL_OUT_RESULT;
  453. }
  454. leftPair = table[c];
  455. } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
  456. 0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
  457. ++leftIndex;
  458. c = ((c - 0xc2) << 6) + t;
  459. leftPair = primaries[c];
  460. if(leftPair != 0) { break; }
  461. leftPair = table[c];
  462. } else {
  463. leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
  464. }
  465. if(leftPair >= MIN_SHORT) {
  466. leftPair &= SHORT_PRIMARY_MASK;
  467. break;
  468. } else if(leftPair > variableTop) {
  469. leftPair &= LONG_PRIMARY_MASK;
  470. break;
  471. } else {
  472. leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
  473. if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
  474. leftPair = getPrimaries(variableTop, leftPair);
  475. }
  476. }
  477. while(rightPair == 0) {
  478. if(rightIndex == rightLength) {
  479. rightPair = EOS;
  480. break;
  481. }
  482. UChar32 c = right[rightIndex++];
  483. uint8_t t;
  484. if(c <= 0x7f) {
  485. rightPair = primaries[c];
  486. if(rightPair != 0) { break; }
  487. if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
  488. return BAIL_OUT_RESULT;
  489. }
  490. rightPair = table[c];
  491. } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
  492. 0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
  493. ++rightIndex;
  494. c = ((c - 0xc2) << 6) + t;
  495. rightPair = primaries[c];
  496. if(rightPair != 0) { break; }
  497. rightPair = table[c];
  498. } else {
  499. rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
  500. }
  501. if(rightPair >= MIN_SHORT) {
  502. rightPair &= SHORT_PRIMARY_MASK;
  503. break;
  504. } else if(rightPair > variableTop) {
  505. rightPair &= LONG_PRIMARY_MASK;
  506. break;
  507. } else {
  508. rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
  509. if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
  510. rightPair = getPrimaries(variableTop, rightPair);
  511. }
  512. }
  513. if(leftPair == rightPair) {
  514. if(leftPair == EOS) { break; }
  515. leftPair = rightPair = 0;
  516. continue;
  517. }
  518. uint32_t leftPrimary = leftPair & 0xffff;
  519. uint32_t rightPrimary = rightPair & 0xffff;
  520. if(leftPrimary != rightPrimary) {
  521. // Return the primary difference.
  522. return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
  523. }
  524. if(leftPair == EOS) { break; }
  525. leftPair >>= 16;
  526. rightPair >>= 16;
  527. }
  528. // In the following, we need to re-fetch each character because we did not buffer the CEs,
  529. // but we know that the string is well-formed and
  530. // only contains supported characters and mappings.
  531. // We might skip the secondary level but continue with the case level
  532. // which is turned on separately.
  533. if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
  534. leftIndex = rightIndex = 0;
  535. leftPair = rightPair = 0;
  536. for(;;) {
  537. while(leftPair == 0) {
  538. if(leftIndex == leftLength) {
  539. leftPair = EOS;
  540. break;
  541. }
  542. UChar32 c = left[leftIndex++];
  543. if(c <= 0x7f) {
  544. leftPair = table[c];
  545. } else if(c <= LATIN_MAX_UTF8_LEAD) {
  546. leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
  547. } else {
  548. leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
  549. }
  550. if(leftPair >= MIN_SHORT) {
  551. leftPair = getSecondariesFromOneShortCE(leftPair);
  552. break;
  553. } else if(leftPair > variableTop) {
  554. leftPair = COMMON_SEC_PLUS_OFFSET;
  555. break;
  556. } else {
  557. leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
  558. leftPair = getSecondaries(variableTop, leftPair);
  559. }
  560. }
  561. while(rightPair == 0) {
  562. if(rightIndex == rightLength) {
  563. rightPair = EOS;
  564. break;
  565. }
  566. UChar32 c = right[rightIndex++];
  567. if(c <= 0x7f) {
  568. rightPair = table[c];
  569. } else if(c <= LATIN_MAX_UTF8_LEAD) {
  570. rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
  571. } else {
  572. rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
  573. }
  574. if(rightPair >= MIN_SHORT) {
  575. rightPair = getSecondariesFromOneShortCE(rightPair);
  576. break;
  577. } else if(rightPair > variableTop) {
  578. rightPair = COMMON_SEC_PLUS_OFFSET;
  579. break;
  580. } else {
  581. rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
  582. rightPair = getSecondaries(variableTop, rightPair);
  583. }
  584. }
  585. if(leftPair == rightPair) {
  586. if(leftPair == EOS) { break; }
  587. leftPair = rightPair = 0;
  588. continue;
  589. }
  590. uint32_t leftSecondary = leftPair & 0xffff;
  591. uint32_t rightSecondary = rightPair & 0xffff;
  592. if(leftSecondary != rightSecondary) {
  593. if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
  594. // Full support for backwards secondary requires backwards contraction matching
  595. // and moving backwards between merge separators.
  596. return BAIL_OUT_RESULT;
  597. }
  598. return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
  599. }
  600. if(leftPair == EOS) { break; }
  601. leftPair >>= 16;
  602. rightPair >>= 16;
  603. }
  604. }
  605. if((options & CollationSettings::CASE_LEVEL) != 0) {
  606. UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
  607. leftIndex = rightIndex = 0;
  608. leftPair = rightPair = 0;
  609. for(;;) {
  610. while(leftPair == 0) {
  611. if(leftIndex == leftLength) {
  612. leftPair = EOS;
  613. break;
  614. }
  615. UChar32 c = left[leftIndex++];
  616. leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
  617. if(leftPair < MIN_LONG) {
  618. leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
  619. }
  620. leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
  621. }
  622. while(rightPair == 0) {
  623. if(rightIndex == rightLength) {
  624. rightPair = EOS;
  625. break;
  626. }
  627. UChar32 c = right[rightIndex++];
  628. rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
  629. if(rightPair < MIN_LONG) {
  630. rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
  631. }
  632. rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
  633. }
  634. if(leftPair == rightPair) {
  635. if(leftPair == EOS) { break; }
  636. leftPair = rightPair = 0;
  637. continue;
  638. }
  639. uint32_t leftCase = leftPair & 0xffff;
  640. uint32_t rightCase = rightPair & 0xffff;
  641. if(leftCase != rightCase) {
  642. if((options & CollationSettings::UPPER_FIRST) == 0) {
  643. return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
  644. } else {
  645. return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
  646. }
  647. }
  648. if(leftPair == EOS) { break; }
  649. leftPair >>= 16;
  650. rightPair >>= 16;
  651. }
  652. }
  653. if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
  654. // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
  655. UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
  656. leftIndex = rightIndex = 0;
  657. leftPair = rightPair = 0;
  658. for(;;) {
  659. while(leftPair == 0) {
  660. if(leftIndex == leftLength) {
  661. leftPair = EOS;
  662. break;
  663. }
  664. UChar32 c = left[leftIndex++];
  665. leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
  666. if(leftPair < MIN_LONG) {
  667. leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
  668. }
  669. leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
  670. }
  671. while(rightPair == 0) {
  672. if(rightIndex == rightLength) {
  673. rightPair = EOS;
  674. break;
  675. }
  676. UChar32 c = right[rightIndex++];
  677. rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
  678. if(rightPair < MIN_LONG) {
  679. rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
  680. }
  681. rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
  682. }
  683. if(leftPair == rightPair) {
  684. if(leftPair == EOS) { break; }
  685. leftPair = rightPair = 0;
  686. continue;
  687. }
  688. uint32_t leftTertiary = leftPair & 0xffff;
  689. uint32_t rightTertiary = rightPair & 0xffff;
  690. if(leftTertiary != rightTertiary) {
  691. if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
  692. // Pass through EOS and MERGE_WEIGHT
  693. // and keep real tertiary weights larger than the MERGE_WEIGHT.
  694. // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
  695. if(leftTertiary > MERGE_WEIGHT) {
  696. leftTertiary ^= CASE_MASK;
  697. }
  698. if(rightTertiary > MERGE_WEIGHT) {
  699. rightTertiary ^= CASE_MASK;
  700. }
  701. }
  702. return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
  703. }
  704. if(leftPair == EOS) { break; }
  705. leftPair >>= 16;
  706. rightPair >>= 16;
  707. }
  708. if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
  709. leftIndex = rightIndex = 0;
  710. leftPair = rightPair = 0;
  711. for(;;) {
  712. while(leftPair == 0) {
  713. if(leftIndex == leftLength) {
  714. leftPair = EOS;
  715. break;
  716. }
  717. UChar32 c = left[leftIndex++];
  718. leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
  719. if(leftPair < MIN_LONG) {
  720. leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
  721. }
  722. leftPair = getQuaternaries(variableTop, leftPair);
  723. }
  724. while(rightPair == 0) {
  725. if(rightIndex == rightLength) {
  726. rightPair = EOS;
  727. break;
  728. }
  729. UChar32 c = right[rightIndex++];
  730. rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
  731. if(rightPair < MIN_LONG) {
  732. rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
  733. }
  734. rightPair = getQuaternaries(variableTop, rightPair);
  735. }
  736. if(leftPair == rightPair) {
  737. if(leftPair == EOS) { break; }
  738. leftPair = rightPair = 0;
  739. continue;
  740. }
  741. uint32_t leftQuaternary = leftPair & 0xffff;
  742. uint32_t rightQuaternary = rightPair & 0xffff;
  743. if(leftQuaternary != rightQuaternary) {
  744. return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
  745. }
  746. if(leftPair == EOS) { break; }
  747. leftPair >>= 16;
  748. rightPair >>= 16;
  749. }
  750. return UCOL_EQUAL;
  751. }
  752. uint32_t
  753. CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
  754. U_ASSERT(c > LATIN_MAX);
  755. if(PUNCT_START <= c && c < PUNCT_LIMIT) {
  756. return table[c - PUNCT_START + LATIN_LIMIT];
  757. } else if(c == 0xfffe) {
  758. return MERGE_WEIGHT;
  759. } else if(c == 0xffff) {
  760. return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
  761. } else {
  762. return BAIL_OUT;
  763. }
  764. }
  765. uint32_t
  766. CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
  767. const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
  768. // The caller handled ASCII and valid/supported Latin.
  769. U_ASSERT(c > 0x7f);
  770. int32_t i2 = sIndex + 1;
  771. if(i2 < sLength || sLength < 0) {
  772. uint8_t t1 = s8[sIndex];
  773. uint8_t t2 = s8[i2];
  774. sIndex += 2;
  775. if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
  776. return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF
  777. } else if(c == 0xef && t1 == 0xbf) {
  778. if(t2 == 0xbe) {
  779. return MERGE_WEIGHT; // U+FFFE
  780. } else if(t2 == 0xbf) {
  781. return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF
  782. }
  783. }
  784. }
  785. return BAIL_OUT;
  786. }
  787. uint32_t
  788. CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
  789. const uint8_t *s8, int32_t &sIndex) {
  790. // The caller handled ASCII.
  791. // The string is well-formed and contains only supported characters.
  792. U_ASSERT(c > 0x7f);
  793. if(c <= LATIN_MAX_UTF8_LEAD) {
  794. return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F
  795. }
  796. uint8_t t2 = s8[sIndex + 1];
  797. sIndex += 2;
  798. if(c == 0xe2) {
  799. return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF
  800. } else if(t2 == 0xbe) {
  801. return MERGE_WEIGHT; // U+FFFE
  802. } else {
  803. return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF
  804. }
  805. }
  806. uint32_t
  807. CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
  808. const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
  809. if(ce >= MIN_LONG || ce < CONTRACTION) {
  810. return ce; // simple or special mini CE
  811. } else if(ce >= EXPANSION) {
  812. int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
  813. return ((uint32_t)table[index + 1] << 16) | table[index];
  814. } else /* ce >= CONTRACTION */ {
  815. if(c == 0 && sLength < 0) {
  816. sLength = sIndex - 1;
  817. return EOS;
  818. }
  819. // Contraction list: Default mapping followed by
  820. // 0 or more single-character contraction suffix mappings.
  821. int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
  822. if(sIndex != sLength) {
  823. // Read the next character.
  824. int32_t c2;
  825. int32_t nextIndex = sIndex;
  826. if(s16 != NULL) {
  827. c2 = s16[nextIndex++];
  828. if(c2 > LATIN_MAX) {
  829. if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
  830. c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF
  831. } else if(c2 == 0xfffe || c2 == 0xffff) {
  832. c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions.
  833. } else {
  834. return BAIL_OUT;
  835. }
  836. }
  837. } else {
  838. c2 = s8[nextIndex++];
  839. if(c2 > 0x7f) {
  840. uint8_t t;
  841. if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
  842. 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
  843. c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F
  844. ++nextIndex;
  845. } else {
  846. int32_t i2 = nextIndex + 1;
  847. if(i2 < sLength || sLength < 0) {
  848. if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
  849. 0x80 <= (t = s8[i2]) && t <= 0xbf) {
  850. c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF
  851. } else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
  852. ((t = s8[i2]) == 0xbe || t == 0xbf)) {
  853. c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions.
  854. } else {
  855. return BAIL_OUT;
  856. }
  857. } else {
  858. return BAIL_OUT;
  859. }
  860. nextIndex += 2;
  861. }
  862. }
  863. }
  864. if(c2 == 0 && sLength < 0) {
  865. sLength = sIndex;
  866. c2 = -1;
  867. }
  868. // Look for the next character in the contraction suffix list,
  869. // which is in ascending order of single suffix characters.
  870. int32_t i = index;
  871. int32_t head = table[i]; // first skip the default mapping
  872. int32_t x;
  873. do {
  874. i += head >> CONTR_LENGTH_SHIFT;
  875. head = table[i];
  876. x = head & CONTR_CHAR_MASK;
  877. } while(x < c2);
  878. if(x == c2) {
  879. index = i;
  880. sIndex = nextIndex;
  881. }
  882. }
  883. // Return the CE or CEs for the default or contraction mapping.
  884. int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
  885. if(length == 1) {
  886. return BAIL_OUT;
  887. }
  888. ce = table[index + 1];
  889. if(length == 2) {
  890. return ce;
  891. } else {
  892. return ((uint32_t)table[index + 2] << 16) | ce;
  893. }
  894. }
  895. }
  896. uint32_t
  897. CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
  898. if(pair <= 0xffff) {
  899. // one mini CE
  900. if(pair >= MIN_SHORT) {
  901. pair = getSecondariesFromOneShortCE(pair);
  902. } else if(pair > variableTop) {
  903. pair = COMMON_SEC_PLUS_OFFSET;
  904. } else if(pair >= MIN_LONG) {
  905. pair = 0; // variable
  906. }
  907. // else special mini CE
  908. } else {
  909. uint32_t ce = pair & 0xffff;
  910. if(ce >= MIN_SHORT) {
  911. pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
  912. } else if(ce > variableTop) {
  913. pair = TWO_COMMON_SEC_PLUS_OFFSET;
  914. } else {
  915. U_ASSERT(ce >= MIN_LONG);
  916. pair = 0; // variable
  917. }
  918. }
  919. return pair;
  920. }
  921. uint32_t
  922. CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
  923. // Primary+caseLevel: Ignore case level weights of primary ignorables.
  924. // Otherwise: Ignore case level weights of secondary ignorables.
  925. // For details see the comments in the CollationCompare class.
  926. // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
  927. if(pair <= 0xffff) {
  928. // one mini CE
  929. if(pair >= MIN_SHORT) {
  930. // A high secondary weight means we really have two CEs,
  931. // a primary CE and a secondary CE.
  932. uint32_t ce = pair;
  933. pair &= CASE_MASK; // explicit weight of primary CE
  934. if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
  935. pair |= LOWER_CASE << 16; // implied weight of secondary CE
  936. }
  937. } else if(pair > variableTop) {
  938. pair = LOWER_CASE;
  939. } else if(pair >= MIN_LONG) {
  940. pair = 0; // variable
  941. }
  942. // else special mini CE
  943. } else {
  944. // two mini CEs, same primary groups, neither expands like above
  945. uint32_t ce = pair & 0xffff;
  946. if(ce >= MIN_SHORT) {
  947. if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
  948. pair &= CASE_MASK;
  949. } else {
  950. pair &= TWO_CASES_MASK;
  951. }
  952. } else if(ce > variableTop) {
  953. pair = TWO_LOWER_CASES;
  954. } else {
  955. U_ASSERT(ce >= MIN_LONG);
  956. pair = 0; // variable
  957. }
  958. }
  959. return pair;
  960. }
  961. uint32_t
  962. CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
  963. if(pair <= 0xffff) {
  964. // one mini CE
  965. if(pair >= MIN_SHORT) {
  966. // A high secondary weight means we really have two CEs,
  967. // a primary CE and a secondary CE.
  968. uint32_t ce = pair;
  969. if(withCaseBits) {
  970. pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
  971. if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
  972. pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
  973. }
  974. } else {
  975. pair = (pair & TERTIARY_MASK) + TER_OFFSET;
  976. if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
  977. pair |= COMMON_TER_PLUS_OFFSET << 16;
  978. }
  979. }
  980. } else if(pair > variableTop) {
  981. pair = (pair & TERTIARY_MASK) + TER_OFFSET;
  982. if(withCaseBits) {
  983. pair |= LOWER_CASE;
  984. }
  985. } else if(pair >= MIN_LONG) {
  986. pair = 0; // variable
  987. }
  988. // else special mini CE
  989. } else {
  990. // two mini CEs, same primary groups, neither expands like above
  991. uint32_t ce = pair & 0xffff;
  992. if(ce >= MIN_SHORT) {
  993. if(withCaseBits) {
  994. pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
  995. } else {
  996. pair &= TWO_TERTIARIES_MASK;
  997. }
  998. pair += TWO_TER_OFFSETS;
  999. } else if(ce > variableTop) {
  1000. pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
  1001. if(withCaseBits) {
  1002. pair |= TWO_LOWER_CASES;
  1003. }
  1004. } else {
  1005. U_ASSERT(ce >= MIN_LONG);
  1006. pair = 0; // variable
  1007. }
  1008. }
  1009. return pair;
  1010. }
  1011. uint32_t
  1012. CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
  1013. // Return the primary weight of a variable CE,
  1014. // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
  1015. if(pair <= 0xffff) {
  1016. // one mini CE
  1017. if(pair >= MIN_SHORT) {
  1018. // A high secondary weight means we really have two CEs,
  1019. // a primary CE and a secondary CE.
  1020. if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
  1021. pair = TWO_SHORT_PRIMARIES_MASK;
  1022. } else {
  1023. pair = SHORT_PRIMARY_MASK;
  1024. }
  1025. } else if(pair > variableTop) {
  1026. pair = SHORT_PRIMARY_MASK;
  1027. } else if(pair >= MIN_LONG) {
  1028. pair &= LONG_PRIMARY_MASK; // variable
  1029. }
  1030. // else special mini CE
  1031. } else {
  1032. // two mini CEs, same primary groups, neither expands like above
  1033. uint32_t ce = pair & 0xffff;
  1034. if(ce > variableTop) {
  1035. pair = TWO_SHORT_PRIMARIES_MASK;
  1036. } else {
  1037. U_ASSERT(ce >= MIN_LONG);
  1038. pair &= TWO_LONG_PRIMARIES_MASK; // variable
  1039. }
  1040. }
  1041. return pair;
  1042. }
  1043. U_NAMESPACE_END
  1044. #endif // !UCONFIG_NO_COLLATION