utrie.h 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. *
  6. * Copyright (C) 2001-2011, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. ******************************************************************************
  10. * file name: utrie.h
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2001nov08
  16. * created by: Markus W. Scherer
  17. */
  18. #ifndef __UTRIE_H__
  19. #define __UTRIE_H__
  20. #include "unicode/utypes.h"
  21. #include "unicode/utf16.h"
  22. U_CDECL_BEGIN
  23. /**
  24. * \file
  25. *
  26. * This is a common implementation of a "folded" trie.
  27. * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
  28. * Unicode code points (0..0x10ffff).
  29. *
  30. * This implementation is optimized for getting values while walking forward
  31. * through a UTF-16 string.
  32. * Therefore, the simplest and fastest access macros are the
  33. * _FROM_LEAD() and _FROM_OFFSET_TRAIL() macros.
  34. *
  35. * The _FROM_BMP() macros are a little more complicated; they get values
  36. * even for lead surrogate code _points_, while the _FROM_LEAD() macros
  37. * get special "folded" values for lead surrogate code _units_ if
  38. * there is relevant data associated with them.
  39. * From such a folded value, an offset needs to be extracted to supply
  40. * to the _FROM_OFFSET_TRAIL() macros.
  41. *
  42. * Most of the more complex (and more convenient) functions/macros call a callback function
  43. * to get that offset from the folded value for a lead surrogate unit.
  44. */
  45. /**
  46. * Trie constants, defining shift widths, index array lengths, etc.
  47. */
  48. enum {
  49. /** Shift size for shifting right the input index. 1..9 */
  50. UTRIE_SHIFT=5,
  51. /** Number of data values in a stage 2 (data array) block. 2, 4, 8, .., 0x200 */
  52. UTRIE_DATA_BLOCK_LENGTH=1<<UTRIE_SHIFT,
  53. /** Mask for getting the lower bits from the input index. */
  54. UTRIE_MASK=UTRIE_DATA_BLOCK_LENGTH-1,
  55. /**
  56. * Lead surrogate code points' index displacement in the index array.
  57. * 0x10000-0xd800=0x2800
  58. */
  59. UTRIE_LEAD_INDEX_DISP=0x2800>>UTRIE_SHIFT,
  60. /**
  61. * Shift size for shifting left the index array values.
  62. * Increases possible data size with 16-bit index values at the cost
  63. * of compactability.
  64. * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY.
  65. * 0..UTRIE_SHIFT
  66. */
  67. UTRIE_INDEX_SHIFT=2,
  68. /** The alignment size of a stage 2 data block. Also the granularity for compaction. */
  69. UTRIE_DATA_GRANULARITY=1<<UTRIE_INDEX_SHIFT,
  70. /** Number of bits of a trail surrogate that are used in index table lookups. */
  71. UTRIE_SURROGATE_BLOCK_BITS=10-UTRIE_SHIFT,
  72. /**
  73. * Number of index (stage 1) entries per lead surrogate.
  74. * Same as number of index entries for 1024 trail surrogates,
  75. * ==0x400>>UTRIE_SHIFT
  76. */
  77. UTRIE_SURROGATE_BLOCK_COUNT=(1<<UTRIE_SURROGATE_BLOCK_BITS),
  78. /** Length of the BMP portion of the index (stage 1) array. */
  79. UTRIE_BMP_INDEX_LENGTH=0x10000>>UTRIE_SHIFT
  80. };
  81. /**
  82. * Length of the index (stage 1) array before folding.
  83. * Maximum number of Unicode code points (0x110000) shifted right by UTRIE_SHIFT.
  84. */
  85. #define UTRIE_MAX_INDEX_LENGTH (0x110000>>UTRIE_SHIFT)
  86. /**
  87. * Maximum length of the runtime data (stage 2) array.
  88. * Limited by 16-bit index values that are left-shifted by UTRIE_INDEX_SHIFT.
  89. */
  90. #define UTRIE_MAX_DATA_LENGTH (0x10000<<UTRIE_INDEX_SHIFT)
  91. /**
  92. * Maximum length of the build-time data (stage 2) array.
  93. * The maximum length is 0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
  94. * (Number of Unicode code points + one all-initial-value block +
  95. * possible duplicate entries for 1024 lead surrogates.)
  96. */
  97. #define UTRIE_MAX_BUILD_TIME_DATA_LENGTH (0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400)
  98. /**
  99. * Number of bytes for a dummy trie.
  100. * A dummy trie is an empty runtime trie, used when a real data trie cannot
  101. * be loaded.
  102. * The number of bytes works for Latin-1-linear tries with 32-bit data
  103. * (worst case).
  104. *
  105. * Calculation:
  106. * BMP index + 1 index block for lead surrogate code points +
  107. * Latin-1-linear array + 1 data block for lead surrogate code points
  108. *
  109. * Latin-1: if(UTRIE_SHIFT<=8) { 256 } else { included in first data block }
  110. *
  111. * @see utrie_unserializeDummy
  112. */
  113. #define UTRIE_DUMMY_SIZE ((UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT)*2+(UTRIE_SHIFT<=8?256:UTRIE_DATA_BLOCK_LENGTH)*4+UTRIE_DATA_BLOCK_LENGTH*4)
  114. /**
  115. * Runtime UTrie callback function.
  116. * Extract from a lead surrogate's data the
  117. * index array offset of the indexes for that lead surrogate.
  118. *
  119. * @param data data value for a surrogate from the trie, including the folding offset
  120. * @return offset>=UTRIE_BMP_INDEX_LENGTH, or 0 if there is no data for the lead surrogate
  121. */
  122. typedef int32_t U_CALLCONV
  123. UTrieGetFoldingOffset(uint32_t data);
  124. /**
  125. * Run-time Trie structure.
  126. *
  127. * Either the data table is 16 bits wide and accessed via the index
  128. * pointer, with each index item increased by indexLength;
  129. * in this case, data32==NULL.
  130. *
  131. * Or the data table is 32 bits wide and accessed via the data32 pointer.
  132. */
  133. struct UTrie {
  134. const uint16_t *index;
  135. const uint32_t *data32; /* NULL if 16b data is used via index */
  136. /**
  137. * This function is not used in _FROM_LEAD, _FROM_BMP, and _FROM_OFFSET_TRAIL macros.
  138. * If convenience macros like _GET16 or _NEXT32 are used, this function must be set.
  139. *
  140. * utrie_unserialize() sets a default function which simply returns
  141. * the lead surrogate's value itself - which is the inverse of the default
  142. * folding function used by utrie_serialize().
  143. *
  144. * @see UTrieGetFoldingOffset
  145. */
  146. UTrieGetFoldingOffset *getFoldingOffset;
  147. int32_t indexLength, dataLength;
  148. uint32_t initialValue;
  149. UBool isLatin1Linear;
  150. };
  151. #ifndef __UTRIE2_H__
  152. typedef struct UTrie UTrie;
  153. #endif
  154. /** Internal trie getter from an offset (0 if c16 is a BMP/lead units) and a 16-bit unit */
  155. #define _UTRIE_GET_RAW(trie, data, offset, c16) \
  156. (trie)->data[ \
  157. ((int32_t)((trie)->index[(offset)+((c16)>>UTRIE_SHIFT)])<<UTRIE_INDEX_SHIFT)+ \
  158. ((c16)&UTRIE_MASK) \
  159. ]
  160. /** Internal trie getter from a pair of surrogates */
  161. #define _UTRIE_GET_FROM_PAIR(trie, data, c, c2, result, resultType) UPRV_BLOCK_MACRO_BEGIN { \
  162. int32_t __offset; \
  163. \
  164. /* get data for lead surrogate */ \
  165. (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
  166. __offset=(trie)->getFoldingOffset(result); \
  167. \
  168. /* get the real data from the folded lead/trail units */ \
  169. if(__offset>0) { \
  170. (result)=_UTRIE_GET_RAW((trie), data, __offset, (c2)&0x3ff); \
  171. } else { \
  172. (result)=(resultType)((trie)->initialValue); \
  173. } \
  174. } UPRV_BLOCK_MACRO_END
  175. /** Internal trie getter from a BMP code point, treating a lead surrogate as a normal code point */
  176. #define _UTRIE_GET_FROM_BMP(trie, data, c16) \
  177. _UTRIE_GET_RAW(trie, data, 0xd800<=(c16) && (c16)<=0xdbff ? UTRIE_LEAD_INDEX_DISP : 0, c16)
  178. /**
  179. * Internal trie getter from a code point.
  180. * Could be faster(?) but longer with
  181. * if((c32)<=0xd7ff) { (result)=_UTRIE_GET_RAW(trie, data, 0, c32); }
  182. */
  183. #define _UTRIE_GET(trie, data, c32, result, resultType) UPRV_BLOCK_MACRO_BEGIN { \
  184. if((uint32_t)(c32)<=0xffff) { \
  185. /* BMP code points */ \
  186. (result)=_UTRIE_GET_FROM_BMP(trie, data, c32); \
  187. } else if((uint32_t)(c32)<=0x10ffff) { \
  188. /* supplementary code point */ \
  189. UChar __lead16=U16_LEAD(c32); \
  190. _UTRIE_GET_FROM_PAIR(trie, data, __lead16, c32, result, resultType); \
  191. } else { \
  192. /* out of range */ \
  193. (result)=(resultType)((trie)->initialValue); \
  194. } \
  195. } UPRV_BLOCK_MACRO_END
  196. /** Internal next-post-increment: get the next code point (c, c2) and its data */
  197. #define _UTRIE_NEXT(trie, data, src, limit, c, c2, result, resultType) UPRV_BLOCK_MACRO_BEGIN { \
  198. (c)=*(src)++; \
  199. if(!U16_IS_LEAD(c)) { \
  200. (c2)=0; \
  201. (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
  202. } else if((src)!=(limit) && U16_IS_TRAIL((c2)=*(src))) { \
  203. ++(src); \
  204. _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \
  205. } else { \
  206. /* unpaired lead surrogate code point */ \
  207. (c2)=0; \
  208. (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
  209. } \
  210. } UPRV_BLOCK_MACRO_END
  211. /** Internal previous: get the previous code point (c, c2) and its data */
  212. #define _UTRIE_PREVIOUS(trie, data, start, src, c, c2, result, resultType) UPRV_BLOCK_MACRO_BEGIN { \
  213. (c)=*--(src); \
  214. if(!U16_IS_SURROGATE(c)) { \
  215. (c2)=0; \
  216. (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
  217. } else if(!U16_IS_SURROGATE_LEAD(c)) { \
  218. /* trail surrogate */ \
  219. if((start)!=(src) && U16_IS_LEAD((c2)=*((src)-1))) { \
  220. --(src); \
  221. (result)=(c); (c)=(c2); (c2)=(UChar)(result); /* swap c, c2 */ \
  222. _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \
  223. } else { \
  224. /* unpaired trail surrogate code point */ \
  225. (c2)=0; \
  226. (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
  227. } \
  228. } else { \
  229. /* unpaired lead surrogate code point */ \
  230. (c2)=0; \
  231. (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
  232. } \
  233. } UPRV_BLOCK_MACRO_END
  234. /* Public UTrie API ---------------------------------------------------------*/
  235. /**
  236. * Get a pointer to the contiguous part of the data array
  237. * for the Latin-1 range (U+0000..U+00ff).
  238. * Must be used only if the Latin-1 range is in fact linear
  239. * (trie->isLatin1Linear).
  240. *
  241. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  242. * @return (const uint16_t *) pointer to values for Latin-1 code points
  243. */
  244. #define UTRIE_GET16_LATIN1(trie) ((trie)->index+(trie)->indexLength+UTRIE_DATA_BLOCK_LENGTH)
  245. /**
  246. * Get a pointer to the contiguous part of the data array
  247. * for the Latin-1 range (U+0000..U+00ff).
  248. * Must be used only if the Latin-1 range is in fact linear
  249. * (trie->isLatin1Linear).
  250. *
  251. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  252. * @return (const uint32_t *) pointer to values for Latin-1 code points
  253. */
  254. #define UTRIE_GET32_LATIN1(trie) ((trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
  255. /**
  256. * Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff).
  257. * c16 may be a lead surrogate, which may have a value including a folding offset.
  258. *
  259. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  260. * @param c16 (UChar, in) the input BMP code point
  261. * @return (uint16_t) trie lookup result
  262. */
  263. #define UTRIE_GET16_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, index, 0, c16)
  264. /**
  265. * Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff).
  266. * c16 may be a lead surrogate, which may have a value including a folding offset.
  267. *
  268. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  269. * @param c16 (UChar, in) the input BMP code point
  270. * @return (uint32_t) trie lookup result
  271. */
  272. #define UTRIE_GET32_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, data32, 0, c16)
  273. /**
  274. * Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff).
  275. * Even lead surrogate code points are treated as normal code points,
  276. * with unfolded values that may differ from _FROM_LEAD() macro results for them.
  277. *
  278. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  279. * @param c16 (UChar, in) the input BMP code point
  280. * @return (uint16_t) trie lookup result
  281. */
  282. #define UTRIE_GET16_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, index, c16)
  283. /**
  284. * Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff).
  285. * Even lead surrogate code points are treated as normal code points,
  286. * with unfolded values that may differ from _FROM_LEAD() macro results for them.
  287. *
  288. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  289. * @param c16 (UChar, in) the input BMP code point
  290. * @return (uint32_t) trie lookup result
  291. */
  292. #define UTRIE_GET32_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, data32, c16)
  293. /**
  294. * Get a 16-bit trie value from a code point.
  295. * Even lead surrogate code points are treated as normal code points,
  296. * with unfolded values that may differ from _FROM_LEAD() macro results for them.
  297. *
  298. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  299. * @param c32 (UChar32, in) the input code point
  300. * @param result (uint16_t, out) uint16_t variable for the trie lookup result
  301. */
  302. #define UTRIE_GET16(trie, c32, result) _UTRIE_GET(trie, index, c32, result, uint16_t)
  303. /**
  304. * Get a 32-bit trie value from a code point.
  305. * Even lead surrogate code points are treated as normal code points,
  306. * with unfolded values that may differ from _FROM_LEAD() macro results for them.
  307. *
  308. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  309. * @param c32 (UChar32, in) the input code point
  310. * @param result (uint32_t, out) uint32_t variable for the trie lookup result
  311. */
  312. #define UTRIE_GET32(trie, c32, result) _UTRIE_GET(trie, data32, c32, result, uint32_t)
  313. /**
  314. * Get the next code point (c, c2), post-increment src,
  315. * and get a 16-bit value from the trie.
  316. *
  317. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  318. * @param src (const UChar *, in/out) the source text pointer
  319. * @param limit (const UChar *, in) the limit pointer for the text, or NULL
  320. * @param c (UChar, out) variable for the BMP or lead code unit
  321. * @param c2 (UChar, out) variable for 0 or the trail code unit
  322. * @param result (uint16_t, out) uint16_t variable for the trie lookup result
  323. */
  324. #define UTRIE_NEXT16(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, index, src, limit, c, c2, result, uint16_t)
  325. /**
  326. * Get the next code point (c, c2), post-increment src,
  327. * and get a 32-bit value from the trie.
  328. *
  329. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  330. * @param src (const UChar *, in/out) the source text pointer
  331. * @param limit (const UChar *, in) the limit pointer for the text, or NULL
  332. * @param c (UChar, out) variable for the BMP or lead code unit
  333. * @param c2 (UChar, out) variable for 0 or the trail code unit
  334. * @param result (uint32_t, out) uint32_t variable for the trie lookup result
  335. */
  336. #define UTRIE_NEXT32(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, data32, src, limit, c, c2, result, uint32_t)
  337. /**
  338. * Get the previous code point (c, c2), pre-decrement src,
  339. * and get a 16-bit value from the trie.
  340. *
  341. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  342. * @param start (const UChar *, in) the start pointer for the text, or NULL
  343. * @param src (const UChar *, in/out) the source text pointer
  344. * @param c (UChar, out) variable for the BMP or lead code unit
  345. * @param c2 (UChar, out) variable for 0 or the trail code unit
  346. * @param result (uint16_t, out) uint16_t variable for the trie lookup result
  347. */
  348. #define UTRIE_PREVIOUS16(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, index, start, src, c, c2, result, uint16_t)
  349. /**
  350. * Get the previous code point (c, c2), pre-decrement src,
  351. * and get a 32-bit value from the trie.
  352. *
  353. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  354. * @param start (const UChar *, in) the start pointer for the text, or NULL
  355. * @param src (const UChar *, in/out) the source text pointer
  356. * @param c (UChar, out) variable for the BMP or lead code unit
  357. * @param c2 (UChar, out) variable for 0 or the trail code unit
  358. * @param result (uint32_t, out) uint32_t variable for the trie lookup result
  359. */
  360. #define UTRIE_PREVIOUS32(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, data32, start, src, c, c2, result, uint32_t)
  361. /**
  362. * Get a 16-bit trie value from a pair of surrogates.
  363. *
  364. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  365. * @param c (UChar, in) a lead surrogate
  366. * @param c2 (UChar, in) a trail surrogate
  367. * @param result (uint16_t, out) uint16_t variable for the trie lookup result
  368. */
  369. #define UTRIE_GET16_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, index, c, c2, result, uint16_t)
  370. /**
  371. * Get a 32-bit trie value from a pair of surrogates.
  372. *
  373. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  374. * @param c (UChar, in) a lead surrogate
  375. * @param c2 (UChar, in) a trail surrogate
  376. * @param result (uint32_t, out) uint32_t variable for the trie lookup result
  377. */
  378. #define UTRIE_GET32_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, data32, c, c2, result, uint32_t)
  379. /**
  380. * Get a 16-bit trie value from a folding offset (from the value of a lead surrogate)
  381. * and a trail surrogate.
  382. *
  383. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  384. * @param offset (int32_t, in) the folding offset from the value of a lead surrogate
  385. * @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
  386. * @return (uint16_t) trie lookup result
  387. */
  388. #define UTRIE_GET16_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, index, offset, (c2)&0x3ff)
  389. /**
  390. * Get a 32-bit trie value from a folding offset (from the value of a lead surrogate)
  391. * and a trail surrogate.
  392. *
  393. * @param trie (const UTrie *, in) a pointer to the runtime trie structure
  394. * @param offset (int32_t, in) the folding offset from the value of a lead surrogate
  395. * @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
  396. * @return (uint32_t) trie lookup result
  397. */
  398. #define UTRIE_GET32_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, data32, offset, (c2)&0x3ff)
  399. /* enumeration callback types */
  400. /**
  401. * Callback from utrie_enum(), extracts a uint32_t value from a
  402. * trie value. This value will be passed on to the UTrieEnumRange function.
  403. *
  404. * @param context an opaque pointer, as passed into utrie_enum()
  405. * @param value a value from the trie
  406. * @return the value that is to be passed on to the UTrieEnumRange function
  407. */
  408. typedef uint32_t U_CALLCONV
  409. UTrieEnumValue(const void *context, uint32_t value);
  410. /**
  411. * Callback from utrie_enum(), is called for each contiguous range
  412. * of code points with the same value as retrieved from the trie and
  413. * transformed by the UTrieEnumValue function.
  414. *
  415. * The callback function can stop the enumeration by returning false.
  416. *
  417. * @param context an opaque pointer, as passed into utrie_enum()
  418. * @param start the first code point in a contiguous range with value
  419. * @param limit one past the last code point in a contiguous range with value
  420. * @param value the value that is set for all code points in [start..limit[
  421. * @return false to stop the enumeration
  422. */
  423. typedef UBool U_CALLCONV
  424. UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value);
  425. /**
  426. * Enumerate efficiently all values in a trie.
  427. * For each entry in the trie, the value to be delivered is passed through
  428. * the UTrieEnumValue function.
  429. * The value is unchanged if that function pointer is NULL.
  430. *
  431. * For each contiguous range of code points with a given value,
  432. * the UTrieEnumRange function is called.
  433. *
  434. * @param trie a pointer to the runtime trie structure
  435. * @param enumValue a pointer to a function that may transform the trie entry value,
  436. * or NULL if the values from the trie are to be used directly
  437. * @param enumRange a pointer to a function that is called for each contiguous range
  438. * of code points with the same value
  439. * @param context an opaque pointer that is passed on to the callback functions
  440. */
  441. U_CAPI void U_EXPORT2
  442. utrie_enum(const UTrie *trie,
  443. UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context);
  444. /**
  445. * Unserialize a trie from 32-bit-aligned memory.
  446. * Inverse of utrie_serialize().
  447. * Fills the UTrie runtime trie structure with the settings for the trie data.
  448. *
  449. * @param trie a pointer to the runtime trie structure
  450. * @param data a pointer to 32-bit-aligned memory containing trie data
  451. * @param length the number of bytes available at data
  452. * @param pErrorCode an in/out ICU UErrorCode
  453. * @return the number of bytes at data taken up by the trie data
  454. */
  455. U_CAPI int32_t U_EXPORT2
  456. utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode);
  457. /**
  458. * "Unserialize" a dummy trie.
  459. * A dummy trie is an empty runtime trie, used when a real data trie cannot
  460. * be loaded.
  461. *
  462. * The input memory is filled so that the trie always returns the initialValue,
  463. * or the leadUnitValue for lead surrogate code points.
  464. * The Latin-1 part is always set up to be linear.
  465. *
  466. * @param trie a pointer to the runtime trie structure
  467. * @param data a pointer to 32-bit-aligned memory to be filled with the dummy trie data
  468. * @param length the number of bytes available at data (recommended to use UTRIE_DUMMY_SIZE)
  469. * @param initialValue the initial value that is set for all code points
  470. * @param leadUnitValue the value for lead surrogate code _units_ that do not
  471. * have associated supplementary data
  472. * @param pErrorCode an in/out ICU UErrorCode
  473. *
  474. * @see UTRIE_DUMMY_SIZE
  475. * @see utrie_open
  476. */
  477. U_CAPI int32_t U_EXPORT2
  478. utrie_unserializeDummy(UTrie *trie,
  479. void *data, int32_t length,
  480. uint32_t initialValue, uint32_t leadUnitValue,
  481. UBool make16BitTrie,
  482. UErrorCode *pErrorCode);
  483. /**
  484. * Default implementation for UTrie.getFoldingOffset, set automatically by
  485. * utrie_unserialize().
  486. * Simply returns the lead surrogate's value itself - which is the inverse
  487. * of the default folding function used by utrie_serialize().
  488. * Exported for static const UTrie structures.
  489. *
  490. * @see UTrieGetFoldingOffset
  491. */
  492. U_CAPI int32_t U_EXPORT2
  493. utrie_defaultGetFoldingOffset(uint32_t data);
  494. /* Building a trie ----------------------------------------------------------*/
  495. /**
  496. * Build-time trie structure.
  497. * Opaque definition, here only to make fillIn parameters possible
  498. * for utrie_open() and utrie_clone().
  499. */
  500. struct UNewTrie {
  501. /**
  502. * Index values at build-time are 32 bits wide for easier processing.
  503. * Bit 31 is set if the data block is used by multiple index values (from utrie_setRange()).
  504. */
  505. int32_t index[UTRIE_MAX_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT];
  506. uint32_t *data;
  507. uint32_t leadUnitValue;
  508. int32_t indexLength, dataCapacity, dataLength;
  509. UBool isAllocated, isDataAllocated;
  510. UBool isLatin1Linear, isCompacted;
  511. /**
  512. * Map of adjusted indexes, used in utrie_compact().
  513. * Maps from original indexes to new ones.
  514. */
  515. int32_t map[UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT];
  516. };
  517. typedef struct UNewTrie UNewTrie;
  518. /**
  519. * Build-time trie callback function, used with utrie_serialize().
  520. * This function calculates a lead surrogate's value including a folding offset
  521. * from the 1024 supplementary code points [start..start+1024[ .
  522. * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0.
  523. *
  524. * The folding offset is provided by the caller.
  525. * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
  526. * Instead of the offset itself, n can be stored in 10 bits -
  527. * or fewer if it can be assumed that few lead surrogates have associated data.
  528. *
  529. * The returned value must be
  530. * - not zero if and only if there is relevant data
  531. * for the corresponding 1024 supplementary code points
  532. * - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(..., offset))==offset
  533. *
  534. * @return a folded value, or 0 if there is no relevant data for the lead surrogate.
  535. */
  536. typedef uint32_t U_CALLCONV
  537. UNewTrieGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset);
  538. /**
  539. * Open a build-time trie structure.
  540. * The size of the build-time data array is specified to avoid allocating a large
  541. * array in all cases. The array itself can also be passed in.
  542. *
  543. * Although the trie is never fully expanded to a linear array, especially when
  544. * utrie_setRange32() is used, the data array could be large during build time.
  545. * The maximum length is
  546. * UTRIE_MAX_BUILD_TIME_DATA_LENGTH=0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
  547. * (Number of Unicode code points + one all-initial-value block +
  548. * possible duplicate entries for 1024 lead surrogates.)
  549. * (UTRIE_DATA_BLOCK_LENGTH<=0x200 in all cases.)
  550. *
  551. * @param fillIn a pointer to a UNewTrie structure to be initialized (will not be released), or
  552. * NULL if one is to be allocated
  553. * @param aliasData a pointer to a data array to be used (will not be released), or
  554. * NULL if one is to be allocated
  555. * @param maxDataLength the capacity of aliasData (if not NULL) or
  556. * the length of the data array to be allocated
  557. * @param initialValue the initial value that is set for all code points
  558. * @param leadUnitValue the value for lead surrogate code _units_ that do not
  559. * have associated supplementary data
  560. * @param latin1Linear a flag indicating whether the Latin-1 range is to be allocated and
  561. * kept in a linear, contiguous part of the data array
  562. * @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
  563. */
  564. U_CAPI UNewTrie * U_EXPORT2
  565. utrie_open(UNewTrie *fillIn,
  566. uint32_t *aliasData, int32_t maxDataLength,
  567. uint32_t initialValue, uint32_t leadUnitValue,
  568. UBool latin1Linear);
  569. /**
  570. * Clone a build-time trie structure with all entries.
  571. *
  572. * @param fillIn like in utrie_open()
  573. * @param other the build-time trie structure to clone
  574. * @param aliasData like in utrie_open(),
  575. * used if aliasDataLength>=(capacity of other's data array)
  576. * @param aliasDataLength the length of aliasData
  577. * @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
  578. */
  579. U_CAPI UNewTrie * U_EXPORT2
  580. utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataLength);
  581. /**
  582. * Close a build-time trie structure, and release memory
  583. * that was allocated by utrie_open() or utrie_clone().
  584. *
  585. * @param trie the build-time trie
  586. */
  587. U_CAPI void U_EXPORT2
  588. utrie_close(UNewTrie *trie);
  589. /**
  590. * Get the data array of a build-time trie.
  591. * The data may be modified, but entries that are equal before
  592. * must still be equal after modification.
  593. *
  594. * @param trie the build-time trie
  595. * @param pLength (out) a pointer to a variable that receives the number
  596. * of entries in the data array
  597. * @return the data array
  598. */
  599. U_CAPI uint32_t * U_EXPORT2
  600. utrie_getData(UNewTrie *trie, int32_t *pLength);
  601. /**
  602. * Set a value for a code point.
  603. *
  604. * @param trie the build-time trie
  605. * @param c the code point
  606. * @param value the value
  607. * @return false if a failure occurred (illegal argument or data array overrun)
  608. */
  609. U_CAPI UBool U_EXPORT2
  610. utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value);
  611. /**
  612. * Get a value from a code point as stored in the build-time trie.
  613. *
  614. * @param trie the build-time trie
  615. * @param c the code point
  616. * @param pInBlockZero if not NULL, then *pInBlockZero is set to true
  617. * iff the value is retrieved from block 0;
  618. * block 0 is the all-initial-value initial block
  619. * @return the value
  620. */
  621. U_CAPI uint32_t U_EXPORT2
  622. utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero);
  623. /**
  624. * Set a value in a range of code points [start..limit[.
  625. * All code points c with start<=c<limit will get the value if
  626. * overwrite is true or if the old value is 0.
  627. *
  628. * @param trie the build-time trie
  629. * @param start the first code point to get the value
  630. * @param limit one past the last code point to get the value
  631. * @param value the value
  632. * @param overwrite flag for whether old non-initial values are to be overwritten
  633. * @return false if a failure occurred (illegal argument or data array overrun)
  634. */
  635. U_CAPI UBool U_EXPORT2
  636. utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite);
  637. /**
  638. * Compact the build-time trie after all values are set, and then
  639. * serialize it into 32-bit aligned memory.
  640. *
  641. * After this, the trie can only be serizalized again and/or closed;
  642. * no further values can be added.
  643. *
  644. * @see utrie_unserialize()
  645. *
  646. * @param trie the build-time trie
  647. * @param data a pointer to 32-bit-aligned memory for the trie data
  648. * @param capacity the number of bytes available at data
  649. * @param getFoldedValue a callback function that calculates the value for
  650. * a lead surrogate from all of its supplementary code points
  651. * and the folding offset;
  652. * if NULL, then a default function is used which returns just
  653. * the input offset when there are any non-initial-value entries
  654. * @param reduceTo16Bits flag for whether the values are to be reduced to a
  655. * width of 16 bits for serialization and runtime
  656. * @param pErrorCode a UErrorCode argument; among other possible error codes:
  657. * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
  658. * - U_MEMORY_ALLOCATION_ERROR if the trie data array is too small
  659. * - U_INDEX_OUTOFBOUNDS_ERROR if the index or data arrays are too long after compaction for serialization
  660. *
  661. * @return the number of bytes written for the trie
  662. */
  663. U_CAPI int32_t U_EXPORT2
  664. utrie_serialize(UNewTrie *trie, void *data, int32_t capacity,
  665. UNewTrieGetFoldedValue *getFoldedValue,
  666. UBool reduceTo16Bits,
  667. UErrorCode *pErrorCode);
  668. /* serialization ------------------------------------------------------------ */
  669. // UTrie signature values, in platform endianness and opposite endianness.
  670. // The UTrie signature ASCII byte values spell "Trie".
  671. #define UTRIE_SIG 0x54726965
  672. #define UTRIE_OE_SIG 0x65697254
  673. /**
  674. * Trie data structure in serialized form:
  675. *
  676. * UTrieHeader header;
  677. * uint16_t index[header.indexLength];
  678. * uint16_t data[header.dataLength];
  679. * @internal
  680. */
  681. typedef struct UTrieHeader {
  682. /** "Trie" in big-endian US-ASCII (0x54726965) */
  683. uint32_t signature;
  684. /**
  685. * options bit field:
  686. * 9 1=Latin-1 data is stored linearly at data+UTRIE_DATA_BLOCK_LENGTH
  687. * 8 0=16-bit data, 1=32-bit data
  688. * 7..4 UTRIE_INDEX_SHIFT // 0..UTRIE_SHIFT
  689. * 3..0 UTRIE_SHIFT // 1..9
  690. */
  691. uint32_t options;
  692. /** indexLength is a multiple of UTRIE_SURROGATE_BLOCK_COUNT */
  693. int32_t indexLength;
  694. /** dataLength>=UTRIE_DATA_BLOCK_LENGTH */
  695. int32_t dataLength;
  696. } UTrieHeader;
  697. /**
  698. * Constants for use with UTrieHeader.options.
  699. * @internal
  700. */
  701. enum {
  702. /** Mask to get the UTRIE_SHIFT value from options. */
  703. UTRIE_OPTIONS_SHIFT_MASK=0xf,
  704. /** Shift options right this much to get the UTRIE_INDEX_SHIFT value. */
  705. UTRIE_OPTIONS_INDEX_SHIFT=4,
  706. /** If set, then the data (stage 2) array is 32 bits wide. */
  707. UTRIE_OPTIONS_DATA_IS_32_BIT=0x100,
  708. /**
  709. * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data (stage 2) array
  710. * as a simple, linear array at data+UTRIE_DATA_BLOCK_LENGTH.
  711. */
  712. UTRIE_OPTIONS_LATIN1_IS_LINEAR=0x200
  713. };
  714. U_CDECL_END
  715. #endif