/* ucdata.c */
  /*
   * Copyright 1996, 1997, 1998 Computing Research Labs,
   * New Mexico State University
   *
   * Permission is hereby granted, free of charge, to any person obtaining a
   * copy of this software and associated documentation files (the "Software"),
   * to deal in the Software without restriction, including without limitation
   * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   * and/or sell copies of the Software, and to permit persons to whom the
   * Software is furnished to do so, subject to the following conditions:
   *
   * The above copyright notice and this permission notice shall be included in
   * all copies or substantial portions of the Software.
   *
   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
   * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
   * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
   * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
   * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   */
  /*
   * RCS identification string kept in the object file.  It is left out of
   * lint builds, and marked unused for GCC so the otherwise unreferenced
   * variable does not trigger a warning.
   */
  #ifndef lint
  static char rcsid[]
  #ifdef __GNUC__
      __attribute__ ((unused))
  #endif
      = "$Id: ucdata.c,v 1.1 1999/01/08 00:19:11 ftang%netscape.com Exp $";
  #endif
  30. #include <stdio.h>
  31. #include <stdlib.h>
  32. #include <string.h>
  33. #ifndef WIN32
  34. #include <unistd.h>
  35. #endif
  36. #include "ucdata.h"
  /**************************************************************************
   *
   * Miscellaneous types, data, and support functions.
   *
   **************************************************************************/
  /*
   * Header found at the start of every data file read by the loaders in this
   * file.  It is read with a single fread() of sizeof(_ucheader_t) bytes.
   */
  typedef struct {
      /* Byte order mark; the loaders byte-swap the data when it is 0xfffe. */
      unsigned short bom;

      /* Element count; each loader scales it for its own table. */
      unsigned short cnt;

      /*
       * Either a total byte count (bytes) or a pair of table lengths (len),
       * depending on which data file is being read.
       *
       * NOTE(review): `unsigned long' is wider than 32 bits on some
       * platforms, which changes the size of this header -- confirm that the
       * data files are generated on the platform that reads them.
       */
      union {
          unsigned long bytes;
          unsigned short len[2];
      } size;
  } _ucheader_t;
  /*
   * Single-bit masks for a 32-bit word: masks32[n] has only bit n set.
   */
  static unsigned long masks32[32] = {
      1UL <<  0, 1UL <<  1, 1UL <<  2, 1UL <<  3,
      1UL <<  4, 1UL <<  5, 1UL <<  6, 1UL <<  7,
      1UL <<  8, 1UL <<  9, 1UL << 10, 1UL << 11,
      1UL << 12, 1UL << 13, 1UL << 14, 1UL << 15,
      1UL << 16, 1UL << 17, 1UL << 18, 1UL << 19,
      1UL << 20, 1UL << 21, 1UL << 22, 1UL << 23,
      1UL << 24, 1UL << 25, 1UL << 26, 1UL << 27,
      1UL << 28, 1UL << 29, 1UL << 30, 1UL << 31
  };
  /*
   * Byte-order swapping helpers for 16-bit and 32-bit quantities.
   *
   * Every byte is masked with 0xff after it is shifted into place.  Without
   * the mask, bits above the value being swapped leak into the result: a
   * negative `short' is sign-extended by the right shift, and an
   * `unsigned long' wider than 32 bits keeps its upper bits after `>> 24'.
   *
   * Both macros evaluate their argument more than once, so the argument must
   * not have side effects.  endian_long() expects an unsigned long operand.
   */
  #define endian_short(cc) ((((cc) >> 8) & 0xff) | (((cc) & 0xff) << 8))
  #define endian_long(cc) ((((cc) & 0xff) << 24) | ((((cc) >> 8) & 0xff) << 16) | \
                           ((((cc) >> 16) & 0xff) << 8) | (((cc) >> 24) & 0xff))
  /*
   * Search a colon-separated list of directories for `filename' and open the
   * first candidate that fopen() accepts.
   *
   *   paths    - colon-separated directory list; may be NULL or empty.
   *   filename - name appended to each directory after a '/'.
   *   mode     - mode string handed to fopen() unchanged.
   *
   * Returns the open stream, or 0 when the file name is NULL or empty, or no
   * candidate could be opened.
   *
   * Each candidate is assembled in a fixed BUFSIZ buffer.  A candidate that
   * does not fit (directory + '/' + file name + terminator) is skipped rather
   * than written past the end of the buffer.
   */
  static FILE *
  #ifdef __STDC__
  _ucopenfile(char *paths, char *filename, char *mode)
  #else
  _ucopenfile(paths, filename, mode)
  char *paths, *filename, *mode;
  #endif
  {
      FILE *f;
      char *dp, *end, path[BUFSIZ];
      size_t dirlen, namelen;

      if (filename == 0 || *filename == 0)
        return 0;

      namelen = strlen(filename);

      dp = paths;
      while (dp && *dp) {
          /*
           * Find the end of the current directory component.
           */
          for (end = dp; *end && *end != ':'; end++) ;
          dirlen = (size_t) (end - dp);

          /*
           * Only build and try the candidate when it fits in the buffer.
           */
          if (namelen + 2 <= sizeof(path) &&
              dirlen <= sizeof(path) - namelen - 2) {
              memcpy(path, dp, dirlen);
              path[dirlen] = '/';
              memcpy(path + dirlen + 1, filename, namelen + 1);

              if ((f = fopen(path, mode)) != 0)
                return f;
          }

          /*
           * Step over the separator, if there is one.
           */
          dp = (*end == ':') ? end + 1 : end;
      }

      return 0;
  }
  /**************************************************************************
   *
   * Support for the character properties.
   *
   **************************************************************************/
  /*
   * State of the loaded character property table.  A size of zero means that
   * nothing is loaded.
   */

  /* Per-property offsets into the range array, plus one trailing entry. */
  static unsigned short *_ucprop_offsets = 0;

  /* Range array; lives in the same allocation as the offsets. */
  static unsigned long *_ucprop_ranges = 0;

  /* Count taken from the header of the property data file. */
  static unsigned long _ucprop_size = 0;
  101. static void
  102. #ifdef __STDC__
  103. _ucprop_load(char *paths, int reload)
  104. #else
  105. _ucprop_load(paths, reload)
  106. char *paths;
  107. int reload;
  108. #endif
  109. {
  110. FILE *in;
  111. unsigned long size, i;
  112. _ucheader_t hdr;
  113. if (_ucprop_size > 0) {
  114. if (!reload)
  115. /*
  116. * The character properties have already been loaded.
  117. */
  118. return;
  119. /*
  120. * Unload the current character property data in preparation for
  121. * loading a new copy. Only the first array has to be deallocated
  122. * because all the memory for the arrays is allocated as a single
  123. * block.
  124. */
  125. free((char *) _ucprop_offsets);
  126. _ucprop_size = 0;
  127. }
  128. if ((in = _ucopenfile(paths, "ctype.dat", "rb")) == 0)
  129. return;
  130. /*
  131. * Load the header.
  132. */
  133. fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
  134. if (hdr.bom == 0xfffe) {
  135. hdr.cnt = endian_short(hdr.cnt);
  136. hdr.size.bytes = endian_long(hdr.size.bytes);
  137. }
  138. if ((_ucprop_size = hdr.cnt) == 0) {
  139. fclose(in);
  140. return;
  141. }
  142. /*
  143. * Allocate all the storage needed for the lookup table.
  144. */
  145. _ucprop_offsets = (unsigned short *) malloc(hdr.size.bytes);
  146. /*
  147. * Calculate the offset into the storage for the ranges. The offsets
  148. * array is on a 4-byte boundary and one larger than the value provided in
  149. * the header count field. This means the offset to the ranges must be
  150. * calculated after aligning the count to a 4-byte boundary.
  151. */
  152. if ((size = ((hdr.cnt + 1) * sizeof(unsigned short))) & 3)
  153. size += 4 - (size & 3);
  154. size >>= 1;
  155. _ucprop_ranges = (unsigned long *) (_ucprop_offsets + size);
  156. /*
  157. * Load the offset array.
  158. */
  159. fread((char *) _ucprop_offsets, sizeof(unsigned short), size, in);
  160. /*
  161. * Do an endian swap if necessary. Don't forget there is an extra node on
  162. * the end with the final index.
  163. */
  164. if (hdr.bom == 0xfffe) {
  165. for (i = 0; i <= _ucprop_size; i++)
  166. _ucprop_offsets[i] = endian_short(_ucprop_offsets[i]);
  167. }
  168. /*
  169. * Load the ranges. The number of elements is in the last array position
  170. * of the offsets.
  171. */
  172. fread((char *) _ucprop_ranges, sizeof(unsigned long),
  173. _ucprop_offsets[_ucprop_size], in);
  174. fclose(in);
  175. /*
  176. * Do an endian swap if necessary.
  177. */
  178. if (hdr.bom == 0xfffe) {
  179. for (i = 0; i < _ucprop_offsets[_ucprop_size]; i++)
  180. _ucprop_ranges[i] = endian_long(_ucprop_ranges[i]);
  181. }
  182. }
  183. static void
  184. #ifdef __STDC__
  185. _ucprop_unload(void)
  186. #else
  187. _ucprop_unload()
  188. #endif
  189. {
  190. if (_ucprop_size == 0)
  191. return;
  192. /*
  193. * Only need to free the offsets because the memory is allocated as a
  194. * single block.
  195. */
  196. free((char *) _ucprop_offsets);
  197. _ucprop_size = 0;
  198. }
  199. static int
  200. #ifdef __STDC__
  201. _ucprop_lookup(unsigned long code, unsigned long n)
  202. #else
  203. _ucprop_lookup(code, n)
  204. unsigned long code, n;
  205. #endif
  206. {
  207. long l, r, m;
  208. /*
  209. * There is an extra node on the end of the offsets to allow this routine
  210. * to work right. If the index is 0xffff, then there are no nodes for the
  211. * property.
  212. */
  213. if ((l = _ucprop_offsets[n]) == 0xffff)
  214. return 0;
  215. /*
  216. * Locate the next offset that is not 0xffff. The sentinel at the end of
  217. * the array is the max index value.
  218. */
  219. for (m = 1;
  220. n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++) ;
  221. r = _ucprop_offsets[n + m] - 1;
  222. while (l <= r) {
  223. /*
  224. * Determine a "mid" point and adjust to make sure the mid point is at
  225. * the beginning of a range pair.
  226. */
  227. m = (l + r) >> 1;
  228. m -= (m & 1);
  229. if (code > _ucprop_ranges[m + 1])
  230. l = m + 2;
  231. else if (code < _ucprop_ranges[m])
  232. r = m - 2;
  233. else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
  234. return 1;
  235. }
  236. return 0;
  237. }
  238. int
  239. #ifdef __STDC__
  240. ucisprop(unsigned long code, unsigned long mask1, unsigned long mask2)
  241. #else
  242. ucisprop(code, mask1, mask2)
  243. unsigned long code, mask1, mask2;
  244. #endif
  245. {
  246. unsigned long i;
  247. if (mask1 == 0 && mask2 == 0)
  248. return 0;
  249. for (i = 0; mask1 && i < 32; i++) {
  250. if ((mask1 & masks32[i]) && _ucprop_lookup(code, i))
  251. return 1;
  252. }
  253. for (i = 32; mask2 && i < _ucprop_size; i++) {
  254. if ((mask2 & masks32[i & 31]) && _ucprop_lookup(code, i))
  255. return 1;
  256. }
  257. return 0;
  258. }
  /**************************************************************************
   *
   * Support for case mapping.
   *
   **************************************************************************/
  /*
   * State of the loaded case mapping table.  A size of zero means that
   * nothing is loaded.
   */

  /* The mapping table, stored as triples of unsigned longs. */
  static unsigned long *_uccase_map = 0;

  /* Lengths, in array elements, of the first two sections of the table. */
  static unsigned short _uccase_len[2] = { 0, 0 };

  /* Total number of elements in the mapping table. */
  static unsigned long _uccase_size = 0;
  267. static void
  268. #ifdef __STDC__
  269. _uccase_load(char *paths, int reload)
  270. #else
  271. _uccase_load(paths, reload)
  272. char *paths;
  273. int reload;
  274. #endif
  275. {
  276. FILE *in;
  277. unsigned long i;
  278. _ucheader_t hdr;
  279. if (_uccase_size > 0) {
  280. if (!reload)
  281. /*
  282. * The case mappings have already been loaded.
  283. */
  284. return;
  285. free((char *) _uccase_map);
  286. _uccase_size = 0;
  287. }
  288. if ((in = _ucopenfile(paths, "case.dat", "rb")) == 0)
  289. return;
  290. /*
  291. * Load the header.
  292. */
  293. fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
  294. if (hdr.bom == 0xfffe) {
  295. hdr.cnt = endian_short(hdr.cnt);
  296. hdr.size.len[0] = endian_short(hdr.size.len[0]);
  297. hdr.size.len[1] = endian_short(hdr.size.len[1]);
  298. }
  299. /*
  300. * Set the node count and lengths of the upper and lower case mapping
  301. * tables.
  302. */
  303. _uccase_size = hdr.cnt * 3;
  304. _uccase_len[0] = hdr.size.len[0] * 3;
  305. _uccase_len[1] = hdr.size.len[1] * 3;
  306. _uccase_map = (unsigned long *)
  307. malloc(_uccase_size * sizeof(unsigned long));
  308. /*
  309. * Load the case mapping table.
  310. */
  311. fread((char *) _uccase_map, sizeof(unsigned long), _uccase_size, in);
  312. /*
  313. * Do an endian swap if necessary.
  314. */
  315. if (hdr.bom == 0xfffe) {
  316. for (i = 0; i < _uccase_size; i++)
  317. _uccase_map[i] = endian_long(_uccase_map[i]);
  318. }
  319. }
  320. static void
  321. #ifdef __STDC__
  322. _uccase_unload(void)
  323. #else
  324. _uccase_unload()
  325. #endif
  326. {
  327. if (_uccase_size == 0)
  328. return;
  329. free((char *) _uccase_map);
  330. _uccase_size = 0;
  331. }
  332. static unsigned long
  333. #ifdef __STDC__
  334. _uccase_lookup(unsigned long code, long l, long r, int field)
  335. #else
  336. _uccase_lookup(code, l, r, field)
  337. unsigned long code;
  338. long l, r;
  339. int field;
  340. #endif
  341. {
  342. long m;
  343. /*
  344. * Do the binary search.
  345. */
  346. while (l <= r) {
  347. /*
  348. * Determine a "mid" point and adjust to make sure the mid point is at
  349. * the beginning of a case mapping triple.
  350. */
  351. m = (l + r) >> 1;
  352. m -= (m % 3);
  353. if (code > _uccase_map[m])
  354. l = m + 3;
  355. else if (code < _uccase_map[m])
  356. r = m - 3;
  357. else if (code == _uccase_map[m])
  358. return _uccase_map[m + field];
  359. }
  360. return code;
  361. }
  362. unsigned long
  363. #ifdef __STDC__
  364. uctoupper(unsigned long code)
  365. #else
  366. uctoupper(code)
  367. unsigned long code;
  368. #endif
  369. {
  370. int field;
  371. long l, r;
  372. if (ucisupper(code))
  373. return code;
  374. if (ucislower(code)) {
  375. /*
  376. * The character is lower case.
  377. */
  378. field = 1;
  379. l = _uccase_len[0];
  380. r = (l + _uccase_len[1]) - 1;
  381. } else {
  382. /*
  383. * The character is title case.
  384. */
  385. field = 2;
  386. l = _uccase_len[0] + _uccase_len[1];
  387. r = _uccase_size - 1;
  388. }
  389. return _uccase_lookup(code, l, r, field);
  390. }
  391. unsigned long
  392. #ifdef __STDC__
  393. uctolower(unsigned long code)
  394. #else
  395. uctolower(code)
  396. unsigned long code;
  397. #endif
  398. {
  399. int field;
  400. long l, r;
  401. if (ucislower(code))
  402. return code;
  403. if (ucisupper(code)) {
  404. /*
  405. * The character is upper case.
  406. */
  407. field = 1;
  408. l = 0;
  409. r = _uccase_len[0] - 1;
  410. } else {
  411. /*
  412. * The character is title case.
  413. */
  414. field = 2;
  415. l = _uccase_len[0] + _uccase_len[1];
  416. r = _uccase_size - 1;
  417. }
  418. return _uccase_lookup(code, l, r, field);
  419. }
  420. unsigned long
  421. #ifdef __STDC__
  422. uctotitle(unsigned long code)
  423. #else
  424. uctotitle(code)
  425. unsigned long code;
  426. #endif
  427. {
  428. int field;
  429. long l, r;
  430. if (ucistitle(code))
  431. return code;
  432. /*
  433. * The offset will always be the same for converting to title case.
  434. */
  435. field = 2;
  436. if (ucisupper(code)) {
  437. /*
  438. * The character is upper case.
  439. */
  440. l = 0;
  441. r = _uccase_len[0] - 1;
  442. } else {
  443. /*
  444. * The character is lower case.
  445. */
  446. l = _uccase_len[0];
  447. r = (l + _uccase_len[1]) - 1;
  448. }
  449. return _uccase_lookup(code, l, r, field);
  450. }
  /**************************************************************************
   *
   * Support for decompositions.
   *
   **************************************************************************/
  /*
   * State of the loaded decomposition table.  A size of zero means that
   * nothing is loaded.
   */

  /* Code+offset pairs, followed by one extra element. */
  static unsigned long *_ucdcmp_nodes = 0;

  /* Decomposition data; lives in the same allocation as the nodes. */
  static unsigned long *_ucdcmp_decomp = 0;

  /* Number of elements taken up by the code+offset pairs. */
  static unsigned long _ucdcmp_size = 0;
  459. static void
  460. #ifdef __STDC__
  461. _ucdcmp_load(char *paths, int reload)
  462. #else
  463. _ucdcmp_load(paths, reload)
  464. char *paths;
  465. int reload;
  466. #endif
  467. {
  468. FILE *in;
  469. unsigned long size, i;
  470. _ucheader_t hdr;
  471. if (_ucdcmp_size > 0) {
  472. if (!reload)
  473. /*
  474. * The decompositions have already been loaded.
  475. */
  476. return;
  477. free((char *) _ucdcmp_nodes);
  478. _ucdcmp_size = 0;
  479. }
  480. if ((in = _ucopenfile(paths, "decomp.dat", "rb")) == 0)
  481. return;
  482. /*
  483. * Load the header.
  484. */
  485. fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
  486. if (hdr.bom == 0xfffe) {
  487. hdr.cnt = endian_short(hdr.cnt);
  488. hdr.size.bytes = endian_long(hdr.size.bytes);
  489. }
  490. _ucdcmp_size = hdr.cnt << 1;
  491. _ucdcmp_nodes = (unsigned long *) malloc(hdr.size.bytes);
  492. _ucdcmp_decomp = _ucdcmp_nodes + (_ucdcmp_size + 1);
  493. /*
  494. * Read the decomposition data in.
  495. */
  496. size = hdr.size.bytes / sizeof(unsigned long);
  497. fread((char *) _ucdcmp_nodes, sizeof(unsigned long), size, in);
  498. /*
  499. * Do an endian swap if necessary.
  500. */
  501. if (hdr.bom == 0xfffe) {
  502. for (i = 0; i < size; i++)
  503. _ucdcmp_nodes[i] = endian_long(_ucdcmp_nodes[i]);
  504. }
  505. }
  506. static void
  507. #ifdef __STDC__
  508. _ucdcmp_unload(void)
  509. #else
  510. _ucdcmp_unload()
  511. #endif
  512. {
  513. if (_ucdcmp_size == 0)
  514. return;
  515. /*
  516. * Only need to free the offsets because the memory is allocated as a
  517. * single block.
  518. */
  519. free((char *) _ucdcmp_nodes);
  520. _ucdcmp_size = 0;
  521. }
  522. int
  523. #ifdef __STDC__
  524. ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
  525. #else
  526. ucdecomp(code, num, decomp)
  527. unsigned long code, *num, **decomp;
  528. #endif
  529. {
  530. long l, r, m;
  531. l = 0;
  532. r = _ucdcmp_nodes[_ucdcmp_size] - 1;
  533. while (l <= r) {
  534. /*
  535. * Determine a "mid" point and adjust to make sure the mid point is at
  536. * the beginning of a code+offset pair.
  537. */
  538. m = (l + r) >> 1;
  539. m -= (m & 1);
  540. if (code > _ucdcmp_nodes[m])
  541. l = m + 2;
  542. else if (code < _ucdcmp_nodes[m])
  543. r = m - 2;
  544. else if (code == _ucdcmp_nodes[m]) {
  545. *num = _ucdcmp_nodes[m + 3] - _ucdcmp_nodes[m + 1];
  546. *decomp = &_ucdcmp_decomp[_ucdcmp_nodes[m + 1]];
  547. return 1;
  548. }
  549. }
  550. return 0;
  551. }
  /*
   * Decompose a Hangul syllable arithmetically.
   *
   *   code   - the character code to decompose.
   *   num    - receives the number of elements written (2 or 3).
   *   decomp - receives the elements; all three slots are written even when
   *            only two are reported.
   *
   * Returns 0 without touching the outputs when ucishangul() rejects `code',
   * and 1 otherwise.
   */
  int
  #ifdef __STDC__
  ucdecomp_hangul(unsigned long code, unsigned long *num, unsigned long decomp[])
  #else
  ucdecomp_hangul(code, num, decomp)
  unsigned long code, *num, decomp[];
  #endif
  {
      unsigned long index, last;

      if (!ucishangul(code))
        return 0;

      index = code - 0xac00;
      last = index % 28;

      decomp[0] = 0x1100 + (index / 588);
      decomp[1] = 0x1161 + ((index % 588) / 28);
      decomp[2] = 0x11a7 + last;

      /*
       * A third element equal to 0x11a7 is not counted.
       */
      *num = (last == 0) ? 2 : 3;

      return 1;
  }
  /**************************************************************************
   *
   * Support for combining classes.
   *
   **************************************************************************/
  /*
   * State of the loaded combining class table.  A size of zero means that
   * nothing is loaded.
   */

  /* The table, stored as triples of unsigned longs. */
  static unsigned long *_uccmcl_nodes = 0;

  /* Total number of elements in the table. */
  static unsigned long _uccmcl_size = 0;
  576. static void
  577. #ifdef __STDC__
  578. _uccmcl_load(char *paths, int reload)
  579. #else
  580. _uccmcl_load(paths, reload)
  581. char *paths;
  582. int reload;
  583. #endif
  584. {
  585. FILE *in;
  586. unsigned long i;
  587. _ucheader_t hdr;
  588. if (_uccmcl_size > 0) {
  589. if (!reload)
  590. /*
  591. * The combining classes have already been loaded.
  592. */
  593. return;
  594. free((char *) _uccmcl_nodes);
  595. _uccmcl_size = 0;
  596. }
  597. if ((in = _ucopenfile(paths, "cmbcl.dat", "rb")) == 0)
  598. return;
  599. /*
  600. * Load the header.
  601. */
  602. fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
  603. if (hdr.bom == 0xfffe) {
  604. hdr.cnt = endian_short(hdr.cnt);
  605. hdr.size.bytes = endian_long(hdr.size.bytes);
  606. }
  607. _uccmcl_size = hdr.cnt * 3;
  608. _uccmcl_nodes = (unsigned long *) malloc(hdr.size.bytes);
  609. /*
  610. * Read the combining classes in.
  611. */
  612. fread((char *) _uccmcl_nodes, sizeof(unsigned long), _uccmcl_size, in);
  613. /*
  614. * Do an endian swap if necessary.
  615. */
  616. if (hdr.bom == 0xfffe) {
  617. for (i = 0; i < _uccmcl_size; i++)
  618. _uccmcl_nodes[i] = endian_long(_uccmcl_nodes[i]);
  619. }
  620. }
  621. static void
  622. #ifdef __STDC__
  623. _uccmcl_unload(void)
  624. #else
  625. _uccmcl_unload()
  626. #endif
  627. {
  628. if (_uccmcl_size == 0)
  629. return;
  630. free((char *) _uccmcl_nodes);
  631. _uccmcl_size = 0;
  632. }
  633. unsigned long
  634. #ifdef __STDC__
  635. uccombining_class(unsigned long code)
  636. #else
  637. uccombining_class(code)
  638. unsigned long code;
  639. #endif
  640. {
  641. long l, r, m;
  642. l = 0;
  643. r = _uccmcl_size - 1;
  644. while (l <= r) {
  645. m = (l + r) >> 1;
  646. m -= (m % 3);
  647. if (code > _uccmcl_nodes[m + 1])
  648. l = m + 3;
  649. else if (code < _uccmcl_nodes[m])
  650. r = m - 3;
  651. else if (code >= _uccmcl_nodes[m] && code <= _uccmcl_nodes[m + 1])
  652. return _uccmcl_nodes[m + 2];
  653. }
  654. return 0;
  655. }
  /**************************************************************************
   *
   * Support for numeric values.
   *
   **************************************************************************/
  /*
   * State of the loaded number table.  A size of zero means that nothing is
   * loaded.
   */

  /* Number of elements taken up by the code+offset pairs. */
  static unsigned long _ucnum_size = 0;

  /* Code+offset pairs. */
  static unsigned long *_ucnum_nodes = 0;

  /* Numerator/denominator values; same allocation as the nodes. */
  static short *_ucnum_vals = 0;
  664. static void
  665. #ifdef __STDC__
  666. _ucnumb_load(char *paths, int reload)
  667. #else
  668. _ucnumb_load(paths, reload)
  669. char *paths;
  670. int reload;
  671. #endif
  672. {
  673. FILE *in;
  674. unsigned long size, i;
  675. _ucheader_t hdr;
  676. if (_ucnum_size > 0) {
  677. if (!reload)
  678. /*
  679. * The numbers have already been loaded.
  680. */
  681. return;
  682. free((char *) _ucnum_nodes);
  683. _ucnum_size = 0;
  684. }
  685. if ((in = _ucopenfile(paths, "num.dat", "rb")) == 0)
  686. return;
  687. /*
  688. * Load the header.
  689. */
  690. fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
  691. if (hdr.bom == 0xfffe) {
  692. hdr.cnt = endian_short(hdr.cnt);
  693. hdr.size.bytes = endian_long(hdr.size.bytes);
  694. }
  695. _ucnum_size = hdr.cnt;
  696. _ucnum_nodes = (unsigned long *) malloc(hdr.size.bytes);
  697. _ucnum_vals = (short *) (_ucnum_nodes + _ucnum_size);
  698. /*
  699. * Read the combining classes in.
  700. */
  701. fread((char *) _ucnum_nodes, sizeof(unsigned char), hdr.size.bytes, in);
  702. /*
  703. * Do an endian swap if necessary.
  704. */
  705. if (hdr.bom == 0xfffe) {
  706. for (i = 0; i < _ucnum_size; i++)
  707. _ucnum_nodes[i] = endian_long(_ucnum_nodes[i]);
  708. /*
  709. * Determine the number of values that have to be adjusted.
  710. */
  711. size = (hdr.size.bytes -
  712. (_ucnum_size * (sizeof(unsigned long) << 1))) /
  713. sizeof(short);
  714. for (i = 0; i < size; i++)
  715. _ucnum_vals[i] = endian_short(_ucnum_vals[i]);
  716. }
  717. }
  718. static void
  719. #ifdef __STDC__
  720. _ucnumb_unload(void)
  721. #else
  722. _ucnumb_unload()
  723. #endif
  724. {
  725. if (_ucnum_size == 0)
  726. return;
  727. free((char *) _ucnum_nodes);
  728. _ucnum_size = 0;
  729. }
  730. int
  731. #ifdef __STDC__
  732. ucnumber_lookup(unsigned long code, struct ucnumber *num)
  733. #else
  734. ucnumber_lookup(code, num)
  735. unsigned long code;
  736. struct ucnumber *num;
  737. #endif
  738. {
  739. long l, r, m;
  740. short *vp;
  741. l = 0;
  742. r = _ucnum_size - 1;
  743. while (l <= r) {
  744. /*
  745. * Determine a "mid" point and adjust to make sure the mid point is at
  746. * the beginning of a code+offset pair.
  747. */
  748. m = (l + r) >> 1;
  749. m -= (m & 1);
  750. if (code > _ucnum_nodes[m])
  751. l = m + 2;
  752. else if (code < _ucnum_nodes[m])
  753. r = m - 2;
  754. else {
  755. vp = _ucnum_vals + _ucnum_nodes[m + 1];
  756. num->numerator = (int) *vp++;
  757. num->denominator = (int) *vp;
  758. return 1;
  759. }
  760. }
  761. return 0;
  762. }
  763. int
  764. #ifdef __STDC__
  765. ucdigit_lookup(unsigned long code, int *digit)
  766. #else
  767. ucdigit_lookup(code, digit)
  768. unsigned long code;
  769. int *digit;
  770. #endif
  771. {
  772. long l, r, m;
  773. short *vp;
  774. l = 0;
  775. r = _ucnum_size - 1;
  776. while (l <= r) {
  777. /*
  778. * Determine a "mid" point and adjust to make sure the mid point is at
  779. * the beginning of a code+offset pair.
  780. */
  781. m = (l + r) >> 1;
  782. m -= (m & 1);
  783. if (code > _ucnum_nodes[m])
  784. l = m + 2;
  785. else if (code < _ucnum_nodes[m])
  786. r = m - 2;
  787. else {
  788. vp = _ucnum_vals + _ucnum_nodes[m + 1];
  789. if (*vp == *(vp + 1)) {
  790. *digit = *vp;
  791. return 1;
  792. }
  793. return 0;
  794. }
  795. }
  796. return 0;
  797. }
  798. struct ucnumber
  799. #ifdef __STDC__
  800. ucgetnumber(unsigned long code)
  801. #else
  802. ucgetnumber(code)
  803. unsigned long code;
  804. #endif
  805. {
  806. struct ucnumber num;
  807. /*
  808. * Initialize with some arbitrary value, because the caller simply cannot
  809. * tell for sure if the code is a number without calling the ucisnumber()
  810. * macro before calling this function.
  811. */
  812. num.numerator = num.denominator = -111;
  813. (void) ucnumber_lookup(code, &num);
  814. return num;
  815. }
  /*
   * Convenience wrapper around ucdigit_lookup() that returns the digit by
   * value.
   *
   * The result is preset to the arbitrary value -111 and stays that way when
   * `code' has no digit entry.  The result of the lookup is not reported, so
   * a caller that needs to know whether `code' is a digit has to check
   * beforehand.
   */
  int
  #ifdef __STDC__
  ucgetdigit(unsigned long code)
  #else
  ucgetdigit(code)
  unsigned long code;
  #endif
  {
      int value = -111;

      (void) ucdigit_lookup(code, &value);

      return value;
  }
  /**************************************************************************
   *
   * Setup and cleanup routines.
   *
   **************************************************************************/
  839. void
  840. #ifdef __STDC__
  841. ucdata_load(char *paths, int masks)
  842. #else
  843. ucdata_load(paths, masks)
  844. char *paths;
  845. int masks;
  846. #endif
  847. {
  848. if (masks & UCDATA_CTYPE)
  849. _ucprop_load(paths, 0);
  850. if (masks & UCDATA_CASE)
  851. _uccase_load(paths, 0);
  852. if (masks & UCDATA_DECOMP)
  853. _ucdcmp_load(paths, 0);
  854. if (masks & UCDATA_CMBCL)
  855. _uccmcl_load(paths, 0);
  856. if (masks & UCDATA_NUM)
  857. _ucnumb_load(paths, 0);
  858. }
  859. void
  860. #ifdef __STDC__
  861. ucdata_unload(int masks)
  862. #else
  863. ucdata_unload(masks)
  864. int masks;
  865. #endif
  866. {
  867. if (masks & UCDATA_CTYPE)
  868. _ucprop_unload();
  869. if (masks & UCDATA_CASE)
  870. _uccase_unload();
  871. if (masks & UCDATA_DECOMP)
  872. _ucdcmp_unload();
  873. if (masks & UCDATA_CMBCL)
  874. _uccmcl_unload();
  875. if (masks & UCDATA_NUM)
  876. _ucnumb_unload();
  877. }
  878. void
  879. #ifdef __STDC__
  880. ucdata_reload(char *paths, int masks)
  881. #else
  882. ucdata_reload(paths, masks)
  883. char *paths;
  884. int masks;
  885. #endif
  886. {
  887. if (masks & UCDATA_CTYPE)
  888. _ucprop_load(paths, 1);
  889. if (masks & UCDATA_CASE)
  890. _uccase_load(paths, 1);
  891. if (masks & UCDATA_DECOMP)
  892. _ucdcmp_load(paths, 1);
  893. if (masks & UCDATA_CMBCL)
  894. _uccmcl_load(paths, 1);
  895. if (masks & UCDATA_NUM)
  896. _ucnumb_load(paths, 1);
  897. }
  #ifdef TEST

  /*
   * Print one number for the test driver as "<tag>: <name> = n/d", or as
   * "<tag>: <name> = n" when numerator and denominator are equal.
   */
  static void
  #ifdef __STDC__
  _uctest_show_number(char *tag, char *name, struct ucnumber *num)
  #else
  _uctest_show_number(tag, name, num)
  char *tag, *name;
  struct ucnumber *num;
  #endif
  {
      if (num->numerator != num->denominator)
        printf("%s: %s = %d/%d\n", tag, name, num->numerator,
               num->denominator);
      else
        printf("%s: %s = %d\n", tag, name, num->numerator);
  }

  /*
   * Run ucnumber_lookup() on `code' and print either its value or a
   * "NOT A NUMBER" line.
   */
  static void
  #ifdef __STDC__
  _uctest_lookup_number(char *name, unsigned long code)
  #else
  _uctest_lookup_number(name, code)
  char *name;
  unsigned long code;
  #endif
  {
      struct ucnumber num;

      if (ucnumber_lookup(code, &num))
        _uctest_show_number("UCNUMBER", name, &num);
      else
        printf("UCNUMBER: %s NOT A NUMBER\n", name);
  }

  /*
   * Run ucgetnumber() on `code' and print whatever it returns.
   */
  static void
  #ifdef __STDC__
  _uctest_get_number(char *name, unsigned long code)
  #else
  _uctest_get_number(name, code)
  char *name;
  unsigned long code;
  #endif
  {
      struct ucnumber num;

      num = ucgetnumber(code);
      _uctest_show_number("UCGETNUMBER", name, &num);
  }

  /*
   * Small self-test driver, built only when TEST is defined.  It loads the
   * data files from the current directory with ucdata_setup(), prints the
   * results of a fixed set of queries, and releases the data again with
   * ucdata_cleanup().
   *
   * main() returns int as the C standard requires, and the combining class
   * is printed with %lu because it is an unsigned long.
   */
  int
  #ifdef __STDC__
  main(void)
  #else
  main()
  #endif
  {
      int dig;
      unsigned long i, lo, *dec;

      ucdata_setup(".");

      if (ucisweak(0x30))
        printf("WEAK\n");
      else
        printf("NOT WEAK\n");

      printf("LOWER 0x%04lX\n", uctolower(0xff3a));
      printf("UPPER 0x%04lX\n", uctoupper(0xff5a));

      if (ucisalpha(0x1d5))
        printf("ALPHA\n");
      else
        printf("NOT ALPHA\n");

      if (ucisupper(0x1d5)) {
          printf("UPPER\n");
          lo = uctolower(0x1d5);
          printf("0x%04lx\n", lo);
          lo = uctotitle(0x1d5);
          printf("0x%04lx\n", lo);
      } else
        printf("NOT UPPER\n");

      if (ucistitle(0x1d5))
        printf("TITLE\n");
      else
        printf("NOT TITLE\n");

      if (uciscomposite(0x1d5))
        printf("COMPOSITE\n");
      else
        printf("NOT COMPOSITE\n");

      if (ucdecomp(0x1d5, &lo, &dec)) {
          for (i = 0; i < lo; i++)
            printf("0x%04lx ", dec[i]);
          putchar('\n');
      }

      if ((lo = uccombining_class(0x41)) != 0)
        printf("0x41 CCL %lu\n", lo);

      if (ucisxdigit(0xfeff))
        printf("0xFEFF HEX DIGIT\n");
      else
        printf("0xFEFF NOT HEX DIGIT\n");

      if (ucisdefined(0x10000))
        printf("0x10000 DEFINED\n");
      else
        printf("0x10000 NOT DEFINED\n");

      _uctest_lookup_number("0x30", 0x30UL);
      _uctest_lookup_number("0xbc", 0xbcUL);
      _uctest_lookup_number("0xff19", 0xff19UL);
      _uctest_lookup_number("0x4e00", 0x4e00UL);

      if (ucdigit_lookup(0x06f9, &dig))
        printf("UCDIGIT: 0x6f9 = %d\n", dig);
      else
        printf("UCDIGIT: 0x6f9 NOT A NUMBER\n");

      dig = ucgetdigit(0x0969);
      printf("UCGETDIGIT: 0x969 = %d\n", dig);

      _uctest_get_number("0x30", 0x30UL);
      _uctest_get_number("0xbc", 0xbcUL);
      _uctest_get_number("0xff19", 0xff19UL);

      ucdata_cleanup();

      return 0;
  }

  #endif /* TEST */