srfi-14.c 58 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147
  1. /* Copyright 2001,2004,2006-2007,2009,2011,2018-2019,2022
  2. Free Software Foundation, Inc.
  3. This file is part of Guile.
  4. Guile is free software: you can redistribute it and/or modify it
  5. under the terms of the GNU Lesser General Public License as published
  6. by the Free Software Foundation, either version 3 of the License, or
  7. (at your option) any later version.
  8. Guile is distributed in the hope that it will be useful, but WITHOUT
  9. ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10. FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
  11. License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with Guile. If not, see
  14. <https://www.gnu.org/licenses/>. */
  15. #ifdef HAVE_CONFIG_H
  16. # include <config.h>
  17. #endif
  18. #include <stdio.h>
  19. #include <string.h>
  20. #include <unictype.h>
  21. #include "boolean.h"
  22. #include "chars.h"
  23. #include "eval.h"
  24. #include "gsubr.h"
  25. #include "list.h"
  26. #include "modules.h"
  27. #include "numbers.h"
  28. #include "pairs.h"
  29. #include "ports.h"
  30. #include "procs.h"
  31. #include "smob.h"
  32. #include "strings.h"
  33. #include "symbols.h"
  34. #include "values.h"
  35. #include "srfi-14.h"
  36. /* Include the pre-computed standard charset data. */
  37. #include "srfi-14.i.c"
  38. static const scm_t_bits SCM_CHARSET_F_IMMUTABLE = 1 << 16;
  39. static inline int
  40. scm_charset_is_immutable (SCM charset)
  41. {
  42. return SCM_SMOB_DATA_0 (charset) & SCM_CHARSET_F_IMMUTABLE;
  43. }
/* Assert, through SCM_ASSERT_TYPE, that X is a charset which is not
   flagged immutable.  POS is the argument position handed to the
   assertion, and FUNC_NAME must be defined wherever this is used.
   Note that X is evaluated more than once.  */
#define SCM_VALIDATE_MUTABLE_CHARSET(pos, x) \
do { \
SCM_ASSERT_TYPE (SCM_CHARSETP (x) && !scm_charset_is_immutable (x), \
x, pos, FUNC_NAME, "mutable charset"); \
} while (0)
  49. static const scm_t_char_range cs_full_ranges[] = {
  50. {0x0000, SCM_CODEPOINT_SURROGATE_START - 1},
  51. {SCM_CODEPOINT_SURROGATE_END + 1, SCM_CODEPOINT_MAX}
  52. };
  53. static const size_t cs_full_len = 2;
  54. #define SCM_CHARSET_DATA(charset) ((scm_t_char_set *) SCM_SMOB_DATA (charset))
  55. #define SCM_CHARSET_DATA(charset) ((scm_t_char_set *) SCM_SMOB_DATA (charset))
  56. #define SCM_CHARSET_SET(cs, idx) \
  57. scm_i_charset_set (SCM_CHARSET_DATA (cs), idx)
  58. #define SCM_CHARSET_UNSET(cs, idx) \
  59. scm_i_charset_unset (SCM_CHARSET_DATA (cs), idx)
/* Smob type code for character sets.  Starts out as 0; this chunk of
   the file does not show where it is assigned.  */
int scm_tc16_charset = 0;
/* Smob type code for character set cursors.  Starts out as 0 as well.  */
int scm_tc16_charset_cursor = 0;
  63. /* True if N exists in charset CS. */
  64. int
  65. scm_i_charset_get (scm_t_char_set *cs, scm_t_wchar n)
  66. {
  67. size_t i;
  68. i = 0;
  69. while (i < cs->len)
  70. {
  71. if (cs->ranges[i].lo <= n && n <= cs->ranges[i].hi)
  72. return 1;
  73. i++;
  74. }
  75. return 0;
  76. }
  77. /* Put N into charset CS. */
  78. void
  79. scm_i_charset_set (scm_t_char_set *cs, scm_t_wchar n)
  80. {
  81. size_t i;
  82. size_t len;
  83. len = cs->len;
  84. i = 0;
  85. while (i < len)
  86. {
  87. /* Already in this range */
  88. if (cs->ranges[i].lo <= n && n <= cs->ranges[i].hi)
  89. {
  90. return;
  91. }
  92. if (n == cs->ranges[i].lo - 1)
  93. {
  94. /* This char is one below the current range. */
  95. if (i > 0 && cs->ranges[i - 1].hi + 1 == n)
  96. {
  97. /* It is also one above the previous range. */
  98. /* This is an impossible condition: in the previous
  99. iteration, the test for 'one above the current range'
  100. should already have inserted the character here. */
  101. abort ();
  102. }
  103. else
  104. {
  105. /* Expand the range down by one. */
  106. cs->ranges[i].lo = n;
  107. return;
  108. }
  109. }
  110. else if (n == cs->ranges[i].hi + 1)
  111. {
  112. /* This char is one above the current range. */
  113. if (i < len - 1 && cs->ranges[i + 1].lo - 1 == n)
  114. {
  115. /* It is also one below the next range, so combine them. */
  116. cs->ranges[i].hi = cs->ranges[i + 1].hi;
  117. if (i < len - 2)
  118. memmove (cs->ranges + (i + 1), cs->ranges + (i + 2),
  119. sizeof (scm_t_char_range) * (len - i - 2));
  120. cs->ranges = scm_gc_realloc (cs->ranges,
  121. sizeof (scm_t_char_range) * len,
  122. sizeof (scm_t_char_range) * (len -
  123. 1),
  124. "character-set");
  125. cs->len = len - 1;
  126. return;
  127. }
  128. else
  129. {
  130. /* Expand the range up by one. */
  131. cs->ranges[i].hi = n;
  132. return;
  133. }
  134. }
  135. else if (n < cs->ranges[i].lo - 1)
  136. {
  137. /* This is a new range below the current one. */
  138. cs->ranges = scm_gc_realloc (cs->ranges,
  139. sizeof (scm_t_char_range) * len,
  140. sizeof (scm_t_char_range) * (len + 1),
  141. "character-set");
  142. memmove (cs->ranges + (i + 1), cs->ranges + i,
  143. sizeof (scm_t_char_range) * (len - i));
  144. cs->ranges[i].lo = n;
  145. cs->ranges[i].hi = n;
  146. cs->len = len + 1;
  147. return;
  148. }
  149. i++;
  150. }
  151. /* This is a new range above all previous ranges. */
  152. if (len == 0)
  153. {
  154. cs->ranges = scm_gc_malloc (sizeof (scm_t_char_range), "character-set");
  155. }
  156. else
  157. {
  158. cs->ranges = scm_gc_realloc (cs->ranges,
  159. sizeof (scm_t_char_range) * len,
  160. sizeof (scm_t_char_range) * (len + 1),
  161. "character-set");
  162. }
  163. cs->ranges[len].lo = n;
  164. cs->ranges[len].hi = n;
  165. cs->len = len + 1;
  166. return;
  167. }
  168. /* Put LO to HI inclusive into charset CS. */
  169. static void
  170. scm_i_charset_set_range (scm_t_char_set *cs, scm_t_wchar lo, scm_t_wchar hi)
  171. {
  172. size_t i;
  173. i = 0;
  174. while (i < cs->len)
  175. {
  176. /* Already in this range */
  177. if (cs->ranges[i].lo <= lo && cs->ranges[i].hi >= hi)
  178. return;
  179. /* cur: +---+
  180. new: +---+
  181. */
  182. if (cs->ranges[i].lo - 1 > hi)
  183. {
  184. /* Add a new range below the current one. */
  185. cs->ranges = scm_gc_realloc (cs->ranges,
  186. sizeof (scm_t_char_range) * cs->len,
  187. sizeof (scm_t_char_range) * (cs->len + 1),
  188. "character-set");
  189. memmove (cs->ranges + (i + 1), cs->ranges + i,
  190. sizeof (scm_t_char_range) * (cs->len - i));
  191. cs->ranges[i].lo = lo;
  192. cs->ranges[i].hi = hi;
  193. cs->len += 1;
  194. return;
  195. }
  196. /* cur: +---+ or +---+ or +---+
  197. new: +---+ +---+ +---+
  198. */
  199. if (cs->ranges[i].lo > lo
  200. && (cs->ranges[i].lo - 1 <= hi && cs->ranges[i].hi >= hi))
  201. {
  202. cs->ranges[i].lo = lo;
  203. return;
  204. }
  205. /* cur: +---+ or +---+ or +---+
  206. new: +---+ +---+ +---+
  207. */
  208. else if (cs->ranges[i].hi + 1 >= lo && cs->ranges[i].hi < hi)
  209. {
  210. if (cs->ranges[i].lo > lo)
  211. cs->ranges[i].lo = lo;
  212. if (cs->ranges[i].hi < hi)
  213. cs->ranges[i].hi = hi;
  214. while (i < cs->len - 1)
  215. {
  216. /* cur: --+ +---+
  217. new: -----+
  218. */
  219. if (cs->ranges[i + 1].lo - 1 > hi)
  220. break;
  221. /* cur: --+ +---+ or --+ +---+ or --+ +--+
  222. new: -----+ ------+ ---------+
  223. */
  224. /* Combine this range with the previous one. */
  225. if (cs->ranges[i + 1].hi > hi)
  226. cs->ranges[i].hi = cs->ranges[i + 1].hi;
  227. if (i + 1 < cs->len)
  228. memmove (cs->ranges + i + 1, cs->ranges + i + 2,
  229. sizeof (scm_t_char_range) * (cs->len - i - 2));
  230. cs->ranges = scm_gc_realloc (cs->ranges,
  231. sizeof (scm_t_char_range) * cs->len,
  232. sizeof (scm_t_char_range) * (cs->len - 1),
  233. "character-set");
  234. cs->len -= 1;
  235. }
  236. return;
  237. }
  238. i ++;
  239. }
  240. /* This is a new range above all previous ranges. */
  241. if (cs->len == 0)
  242. {
  243. cs->ranges = scm_gc_malloc (sizeof (scm_t_char_range), "character-set");
  244. }
  245. else
  246. {
  247. cs->ranges = scm_gc_realloc (cs->ranges,
  248. sizeof (scm_t_char_range) * cs->len,
  249. sizeof (scm_t_char_range) * (cs->len + 1),
  250. "character-set");
  251. }
  252. cs->len += 1;
  253. cs->ranges[cs->len - 1].lo = lo;
  254. cs->ranges[cs->len - 1].hi = hi;
  255. return;
  256. }
  257. /* If N is in charset CS, remove it. */
  258. void
  259. scm_i_charset_unset (scm_t_char_set *cs, scm_t_wchar n)
  260. {
  261. size_t i;
  262. size_t len;
  263. len = cs->len;
  264. i = 0;
  265. while (i < len)
  266. {
  267. if (n < cs->ranges[i].lo)
  268. /* Not in this set. */
  269. return;
  270. if (n == cs->ranges[i].lo && n == cs->ranges[i].hi)
  271. {
  272. /* Remove this one-character range. */
  273. if (len == 1)
  274. {
  275. scm_gc_free (cs->ranges,
  276. sizeof (scm_t_char_range) * cs->len,
  277. "character-set");
  278. cs->ranges = NULL;
  279. cs->len = 0;
  280. return;
  281. }
  282. else if (i < len - 1)
  283. {
  284. memmove (cs->ranges + i, cs->ranges + (i + 1),
  285. sizeof (scm_t_char_range) * (len - i - 1));
  286. cs->ranges = scm_gc_realloc (cs->ranges,
  287. sizeof (scm_t_char_range) * len,
  288. sizeof (scm_t_char_range) * (len -
  289. 1),
  290. "character-set");
  291. cs->len = len - 1;
  292. return;
  293. }
  294. else if (i == len - 1)
  295. {
  296. cs->ranges = scm_gc_realloc (cs->ranges,
  297. sizeof (scm_t_char_range) * len,
  298. sizeof (scm_t_char_range) * (len -
  299. 1),
  300. "character-set");
  301. cs->len = len - 1;
  302. return;
  303. }
  304. }
  305. else if (n == cs->ranges[i].lo)
  306. {
  307. /* Shrink this range from the left. */
  308. cs->ranges[i].lo = n + 1;
  309. return;
  310. }
  311. else if (n == cs->ranges[i].hi)
  312. {
  313. /* Shrink this range from the right. */
  314. cs->ranges[i].hi = n - 1;
  315. return;
  316. }
  317. else if (n > cs->ranges[i].lo && n < cs->ranges[i].hi)
  318. {
  319. /* Split this range into two pieces. */
  320. cs->ranges = scm_gc_realloc (cs->ranges,
  321. sizeof (scm_t_char_range) * len,
  322. sizeof (scm_t_char_range) * (len + 1),
  323. "character-set");
  324. if (i < len - 1)
  325. memmove (cs->ranges + (i + 2), cs->ranges + (i + 1),
  326. sizeof (scm_t_char_range) * (len - i - 1));
  327. cs->ranges[i + 1].hi = cs->ranges[i].hi;
  328. cs->ranges[i + 1].lo = n + 1;
  329. cs->ranges[i].hi = n - 1;
  330. cs->len = len + 1;
  331. return;
  332. }
  333. i++;
  334. }
  335. /* This value is above all ranges, so do nothing here. */
  336. return;
  337. }
  338. static int
  339. charsets_equal (scm_t_char_set *a, scm_t_char_set *b)
  340. {
  341. if (a->len != b->len)
  342. return 0;
  343. /* Empty charsets may have ranges == NULL. We must avoid passing
  344. NULL to memcmp, even if the length is zero, to avoid undefined
  345. behavior. */
  346. if (a->len == 0)
  347. return 1;
  348. if (memcmp (a->ranges, b->ranges, sizeof (scm_t_char_range) * a->len) != 0)
  349. return 0;
  350. return 1;
  351. }
  352. /* Return true if every character in A is also in B. */
  353. static int
  354. charsets_leq (scm_t_char_set *a, scm_t_char_set *b)
  355. {
  356. size_t i = 0, j = 0;
  357. scm_t_wchar alo, ahi;
  358. if (a->len == 0)
  359. return 1;
  360. if (b->len == 0)
  361. return 0;
  362. while (i < a->len)
  363. {
  364. alo = a->ranges[i].lo;
  365. ahi = a->ranges[i].hi;
  366. while (b->ranges[j].hi < alo)
  367. {
  368. if (j < b->len - 1)
  369. j++;
  370. else
  371. return 0;
  372. }
  373. if (alo < b->ranges[j].lo || ahi > b->ranges[j].hi)
  374. return 0;
  375. i++;
  376. }
  377. return 1;
  378. }
  379. /* Merge B into A. */
  380. static void
  381. charsets_union (scm_t_char_set *a, scm_t_char_set *b)
  382. {
  383. size_t i = 0;
  384. scm_t_wchar blo, bhi;
  385. if (b->len == 0)
  386. return;
  387. if (a->len == 0)
  388. {
  389. a->len = b->len;
  390. a->ranges = scm_gc_malloc (sizeof (scm_t_char_range) * b->len,
  391. "character-set");
  392. memcpy (a->ranges, b->ranges, sizeof (scm_t_char_range) * b->len);
  393. return;
  394. }
  395. while (i < b->len)
  396. {
  397. blo = b->ranges[i].lo;
  398. bhi = b->ranges[i].hi;
  399. scm_i_charset_set_range (a, blo, bhi);
  400. i++;
  401. }
  402. return;
  403. }
  404. /* Remove elements not both in A and B from A. */
  405. static void
  406. charsets_intersection (scm_t_char_set *a, scm_t_char_set *b)
  407. {
  408. size_t i = 0;
  409. scm_t_wchar blo, bhi, n;
  410. scm_t_char_set *c;
  411. if (a->len == 0)
  412. return;
  413. if (b->len == 0)
  414. {
  415. scm_gc_free (a->ranges, sizeof (scm_t_char_range) * a->len,
  416. "character-set");
  417. a->len = 0;
  418. return;
  419. }
  420. c = (scm_t_char_set *) scm_malloc (sizeof (scm_t_char_set));
  421. c->len = 0;
  422. c->ranges = NULL;
  423. while (i < b->len)
  424. {
  425. blo = b->ranges[i].lo;
  426. bhi = b->ranges[i].hi;
  427. for (n = blo; n <= bhi; n++)
  428. if (scm_i_charset_get (a, n))
  429. scm_i_charset_set (c, n);
  430. i++;
  431. }
  432. scm_gc_free (a->ranges, sizeof (scm_t_char_range) * a->len,
  433. "character-set");
  434. a->len = c->len;
  435. if (c->len != 0)
  436. a->ranges = c->ranges;
  437. else
  438. a->ranges = NULL;
  439. free (c);
  440. return;
  441. }
  442. #define SCM_ADD_RANGE(low, high) \
  443. do { \
  444. p->ranges[idx].lo = (low); \
  445. p->ranges[idx++].hi = (high); \
  446. } while (0)
  447. #define SCM_ADD_RANGE_SKIP_SURROGATES(low, high) \
  448. do { \
  449. p->ranges[idx].lo = (low); \
  450. p->ranges[idx++].hi = SCM_CODEPOINT_SURROGATE_START - 1; \
  451. p->ranges[idx].lo = SCM_CODEPOINT_SURROGATE_END + 1; \
  452. p->ranges[idx++].hi = (high); \
  453. } while (0)
  454. /* Make P the compelement of Q. */
  455. static void
  456. charsets_complement (scm_t_char_set *p, scm_t_char_set *q)
  457. {
  458. int k, idx;
  459. idx = 0;
  460. if (q->len == 0)
  461. {
  462. /* Fill with all valid codepoints. */
  463. p->len = 2;
  464. p->ranges = scm_gc_malloc (sizeof (scm_t_char_range) * 2,
  465. "character-set");
  466. SCM_ADD_RANGE_SKIP_SURROGATES (0, SCM_CODEPOINT_MAX);
  467. return;
  468. }
  469. if (p->len > 0)
  470. scm_gc_free (p->ranges, sizeof (scm_t_char_set) * p->len,
  471. "character-set");
  472. /* Count the number of ranges needed for the output. */
  473. p->len = 0;
  474. if (q->ranges[0].lo > 0)
  475. p->len++;
  476. if (q->ranges[q->len - 1].hi < SCM_CODEPOINT_MAX)
  477. p->len++;
  478. p->len += q->len;
  479. p->ranges =
  480. (scm_t_char_range *) scm_gc_malloc (sizeof (scm_t_char_range) * p->len,
  481. "character-set");
  482. if (q->ranges[0].lo > 0)
  483. {
  484. if (q->ranges[0].lo > SCM_CODEPOINT_SURROGATE_END)
  485. SCM_ADD_RANGE_SKIP_SURROGATES (0, q->ranges[0].lo - 1);
  486. else
  487. SCM_ADD_RANGE (0, q->ranges[0].lo - 1);
  488. }
  489. for (k = 1; k < q->len; k++)
  490. {
  491. if (q->ranges[k - 1].hi < SCM_CODEPOINT_SURROGATE_START
  492. && q->ranges[k].lo - 1 > SCM_CODEPOINT_SURROGATE_END)
  493. SCM_ADD_RANGE_SKIP_SURROGATES (q->ranges[k - 1].hi + 1, q->ranges[k].lo - 1);
  494. else
  495. SCM_ADD_RANGE (q->ranges[k - 1].hi + 1, q->ranges[k].lo - 1);
  496. }
  497. if (q->ranges[q->len - 1].hi < SCM_CODEPOINT_MAX)
  498. {
  499. if (q->ranges[q->len - 1].hi < SCM_CODEPOINT_SURROGATE_START)
  500. SCM_ADD_RANGE_SKIP_SURROGATES (q->ranges[q->len - 1].hi + 1, SCM_CODEPOINT_MAX);
  501. else
  502. SCM_ADD_RANGE (q->ranges[q->len - 1].hi + 1, SCM_CODEPOINT_MAX);
  503. }
  504. return;
  505. }
  506. #undef SCM_ADD_RANGE
  507. #undef SCM_ADD_RANGE_SKIP_SURROGATES
  508. /* Replace A with elements only found in one of A or B. */
  509. static void
  510. charsets_xor (scm_t_char_set *a, scm_t_char_set *b)
  511. {
  512. size_t i = 0;
  513. scm_t_wchar blo, bhi, n;
  514. if (b->len == 0)
  515. {
  516. return;
  517. }
  518. if (a->len == 0)
  519. {
  520. a->ranges =
  521. (scm_t_char_range *) scm_gc_malloc (sizeof (scm_t_char_range) *
  522. b->len, "character-set");
  523. a->len = b->len;
  524. memcpy (a->ranges, b->ranges, sizeof (scm_t_char_range) * a->len);
  525. return;
  526. }
  527. while (i < b->len)
  528. {
  529. blo = b->ranges[i].lo;
  530. bhi = b->ranges[i].hi;
  531. for (n = blo; n <= bhi; n++)
  532. {
  533. if (scm_i_charset_get (a, n))
  534. scm_i_charset_unset (a, n);
  535. else
  536. scm_i_charset_set (a, n);
  537. }
  538. i++;
  539. }
  540. return;
  541. }
  542. /* Smob print hook for character sets. */
  543. static int
  544. charset_print (SCM charset, SCM port, scm_print_state *pstate SCM_UNUSED)
  545. {
  546. size_t i;
  547. int first = 1;
  548. scm_t_char_set *p;
  549. const size_t max_ranges_to_print = 50;
  550. p = SCM_CHARSET_DATA (charset);
  551. scm_puts ("#<charset {", port);
  552. for (i = 0; i < p->len; i++)
  553. {
  554. if (first)
  555. first = 0;
  556. else
  557. scm_puts (" ", port);
  558. scm_write (SCM_MAKE_CHAR (p->ranges[i].lo), port);
  559. if (p->ranges[i].lo != p->ranges[i].hi)
  560. {
  561. scm_puts ("..", port);
  562. scm_write (SCM_MAKE_CHAR (p->ranges[i].hi), port);
  563. }
  564. if (i >= max_ranges_to_print)
  565. {
  566. /* Too many to print here. Quit early. */
  567. scm_puts (" ...", port);
  568. break;
  569. }
  570. }
  571. scm_puts ("}>", port);
  572. return 1;
  573. }
  574. /* Smob print hook for character sets cursors. */
  575. static int
  576. charset_cursor_print (SCM cursor, SCM port,
  577. scm_print_state *pstate SCM_UNUSED)
  578. {
  579. scm_t_char_set_cursor *cur;
  580. cur = (scm_t_char_set_cursor *) SCM_SMOB_DATA (cursor);
  581. scm_puts ("#<charset-cursor ", port);
  582. if (cur->range == (size_t) (-1))
  583. scm_puts ("(empty)", port);
  584. else
  585. {
  586. scm_write (scm_from_size_t (cur->range), port);
  587. scm_puts (":", port);
  588. scm_write (scm_from_int32 (cur->n), port);
  589. }
  590. scm_puts (">", port);
  591. return 1;
  592. }
  593. /* Create a new, empty character set. */
  594. static SCM
  595. make_char_set (const char *func_name)
  596. {
  597. scm_t_char_set *p;
  598. p = scm_gc_malloc (sizeof (scm_t_char_set), "character-set");
  599. memset (p, 0, sizeof (scm_t_char_set));
  600. SCM_RETURN_NEWSMOB (scm_tc16_charset, p);
  601. }
  602. SCM_DEFINE (scm_char_set_p, "char-set?", 1, 0, 0,
  603. (SCM obj),
  604. "Return @code{#t} if @var{obj} is a character set, @code{#f}\n"
  605. "otherwise.")
  606. #define FUNC_NAME s_scm_char_set_p
  607. {
  608. return scm_from_bool (SCM_SMOB_PREDICATE (scm_tc16_charset, obj));
  609. }
  610. #undef FUNC_NAME
  611. SCM_DEFINE (scm_char_set_eq, "char-set=", 0, 0, 1,
  612. (SCM char_sets),
  613. "Return @code{#t} if all given character sets are equal.")
  614. #define FUNC_NAME s_scm_char_set_eq
  615. {
  616. int argnum = 1;
  617. scm_t_char_set *cs1_data = NULL;
  618. SCM_VALIDATE_REST_ARGUMENT (char_sets);
  619. while (!scm_is_null (char_sets))
  620. {
  621. SCM csi = SCM_CAR (char_sets);
  622. scm_t_char_set *csi_data;
  623. SCM_VALIDATE_SMOB (argnum, csi, charset);
  624. argnum++;
  625. csi_data = SCM_CHARSET_DATA (csi);
  626. if (cs1_data == NULL)
  627. cs1_data = csi_data;
  628. else if (!charsets_equal (cs1_data, csi_data))
  629. return SCM_BOOL_F;
  630. char_sets = SCM_CDR (char_sets);
  631. }
  632. return SCM_BOOL_T;
  633. }
  634. #undef FUNC_NAME
  635. SCM_DEFINE (scm_char_set_leq, "char-set<=", 0, 0, 1,
  636. (SCM char_sets),
  637. "Return @code{#t} if every character set @var{char_set}i is a subset\n"
  638. "of character set @var{char_set}i+1.")
  639. #define FUNC_NAME s_scm_char_set_leq
  640. {
  641. int argnum = 1;
  642. scm_t_char_set *prev_data = NULL;
  643. SCM_VALIDATE_REST_ARGUMENT (char_sets);
  644. while (!scm_is_null (char_sets))
  645. {
  646. SCM csi = SCM_CAR (char_sets);
  647. scm_t_char_set *csi_data;
  648. SCM_VALIDATE_SMOB (argnum, csi, charset);
  649. argnum++;
  650. csi_data = SCM_CHARSET_DATA (csi);
  651. if (prev_data)
  652. {
  653. if (!charsets_leq (prev_data, csi_data))
  654. return SCM_BOOL_F;
  655. }
  656. prev_data = csi_data;
  657. char_sets = SCM_CDR (char_sets);
  658. }
  659. return SCM_BOOL_T;
  660. }
  661. #undef FUNC_NAME
  662. SCM_DEFINE (scm_char_set_hash, "char-set-hash", 1, 1, 0,
  663. (SCM cs, SCM bound),
  664. "Compute a hash value for the character set @var{cs}. If\n"
  665. "@var{bound} is given and non-zero, it restricts the\n"
  666. "returned value to the range 0 @dots{} @var{bound} - 1.")
  667. #define FUNC_NAME s_scm_char_set_hash
  668. {
  669. const unsigned long default_bnd = 871;
  670. unsigned long bnd;
  671. scm_t_char_set *p;
  672. unsigned long val = 0;
  673. int k;
  674. scm_t_wchar c;
  675. SCM_VALIDATE_SMOB (1, cs, charset);
  676. if (SCM_UNBNDP (bound))
  677. bnd = default_bnd;
  678. else
  679. {
  680. bnd = scm_to_ulong (bound);
  681. if (bnd == 0)
  682. bnd = default_bnd;
  683. }
  684. p = SCM_CHARSET_DATA (cs);
  685. for (k = 0; k < p->len; k++)
  686. {
  687. for (c = p->ranges[k].lo; c <= p->ranges[k].hi; c++)
  688. val = c + (val << 1);
  689. }
  690. return scm_from_ulong (val % bnd);
  691. }
  692. #undef FUNC_NAME
  693. SCM_DEFINE (scm_char_set_cursor, "char-set-cursor", 1, 0, 0,
  694. (SCM cs), "Return a cursor into the character set @var{cs}.")
  695. #define FUNC_NAME s_scm_char_set_cursor
  696. {
  697. scm_t_char_set *cs_data;
  698. scm_t_char_set_cursor *cur_data;
  699. SCM_VALIDATE_SMOB (1, cs, charset);
  700. cs_data = SCM_CHARSET_DATA (cs);
  701. cur_data =
  702. (scm_t_char_set_cursor *) scm_gc_malloc (sizeof (scm_t_char_set_cursor),
  703. "charset-cursor");
  704. if (cs_data->len == 0)
  705. {
  706. cur_data->range = (size_t) (-1);
  707. cur_data->n = 0;
  708. }
  709. else
  710. {
  711. cur_data->range = 0;
  712. cur_data->n = cs_data->ranges[0].lo;
  713. }
  714. SCM_RETURN_NEWSMOB (scm_tc16_charset_cursor, cur_data);
  715. }
  716. #undef FUNC_NAME
  717. SCM_DEFINE (scm_char_set_ref, "char-set-ref", 2, 0, 0,
  718. (SCM cs, SCM cursor),
  719. "Return the character at the current cursor position\n"
  720. "@var{cursor} in the character set @var{cs}. It is an error to\n"
  721. "pass a cursor for which @code{end-of-char-set?} returns true.")
  722. #define FUNC_NAME s_scm_char_set_ref
  723. {
  724. scm_t_char_set *cs_data;
  725. scm_t_char_set_cursor *cur_data;
  726. size_t i;
  727. SCM_VALIDATE_SMOB (1, cs, charset);
  728. SCM_VALIDATE_SMOB (2, cursor, charset_cursor);
  729. cs_data = SCM_CHARSET_DATA (cs);
  730. cur_data = (scm_t_char_set_cursor *) SCM_SMOB_DATA (cursor);
  731. /* Validate that this cursor is still true. */
  732. i = cur_data->range;
  733. if (i == (size_t) (-1)
  734. || i >= cs_data->len
  735. || cur_data->n < cs_data->ranges[i].lo
  736. || cur_data->n > cs_data->ranges[i].hi)
  737. SCM_MISC_ERROR ("invalid character set cursor: ~A", scm_list_1 (cursor));
  738. return SCM_MAKE_CHAR (cur_data->n);
  739. }
  740. #undef FUNC_NAME
  741. SCM_DEFINE (scm_char_set_cursor_next, "char-set-cursor-next", 2, 0, 0,
  742. (SCM cs, SCM cursor),
  743. "Advance the character set cursor @var{cursor} to the next\n"
  744. "character in the character set @var{cs}. It is an error if the\n"
  745. "cursor given satisfies @code{end-of-char-set?}.")
  746. #define FUNC_NAME s_scm_char_set_cursor_next
  747. {
  748. scm_t_char_set *cs_data;
  749. scm_t_char_set_cursor *cur_data;
  750. size_t i;
  751. SCM_VALIDATE_SMOB (1, cs, charset);
  752. SCM_VALIDATE_SMOB (2, cursor, charset_cursor);
  753. cs_data = SCM_CHARSET_DATA (cs);
  754. cur_data = (scm_t_char_set_cursor *) SCM_SMOB_DATA (cursor);
  755. /* Validate that this cursor is still true. */
  756. i = cur_data->range;
  757. if (i == (size_t) (-1)
  758. || i >= cs_data->len
  759. || cur_data->n < cs_data->ranges[i].lo
  760. || cur_data->n > cs_data->ranges[i].hi)
  761. SCM_MISC_ERROR ("invalid character set cursor: ~A", scm_list_1 (cursor));
  762. /* Increment the cursor. */
  763. if (cur_data->n == cs_data->ranges[i].hi)
  764. {
  765. if (i + 1 < cs_data->len)
  766. {
  767. cur_data->range = i + 1;
  768. cur_data->n = cs_data->ranges[i + 1].lo;
  769. }
  770. else
  771. {
  772. /* This is the end of the road. */
  773. cur_data->range = (size_t) (-1);
  774. cur_data->n = 0;
  775. }
  776. }
  777. else
  778. {
  779. cur_data->n = cur_data->n + 1;
  780. }
  781. return cursor;
  782. }
  783. #undef FUNC_NAME
  784. SCM_DEFINE (scm_end_of_char_set_p, "end-of-char-set?", 1, 0, 0,
  785. (SCM cursor),
  786. "Return @code{#t} if @var{cursor} has reached the end of a\n"
  787. "character set, @code{#f} otherwise.")
  788. #define FUNC_NAME s_scm_end_of_char_set_p
  789. {
  790. scm_t_char_set_cursor *cur_data;
  791. SCM_VALIDATE_SMOB (1, cursor, charset_cursor);
  792. cur_data = (scm_t_char_set_cursor *) SCM_SMOB_DATA (cursor);
  793. if (cur_data->range == (size_t) (-1))
  794. return SCM_BOOL_T;
  795. return SCM_BOOL_F;
  796. }
  797. #undef FUNC_NAME
  798. SCM_DEFINE (scm_char_set_fold, "char-set-fold", 3, 0, 0,
  799. (SCM kons, SCM knil, SCM cs),
  800. "Fold the procedure @var{kons} over the character set @var{cs},\n"
  801. "initializing it with @var{knil}.")
  802. #define FUNC_NAME s_scm_char_set_fold
  803. {
  804. scm_t_char_set *cs_data;
  805. int k;
  806. scm_t_wchar n;
  807. SCM_VALIDATE_PROC (1, kons);
  808. SCM_VALIDATE_SMOB (3, cs, charset);
  809. cs_data = SCM_CHARSET_DATA (cs);
  810. if (cs_data->len == 0)
  811. return knil;
  812. for (k = 0; k < cs_data->len; k++)
  813. for (n = cs_data->ranges[k].lo; n <= cs_data->ranges[k].hi; n++)
  814. {
  815. knil = scm_call_2 (kons, SCM_MAKE_CHAR (n), knil);
  816. }
  817. return knil;
  818. }
  819. #undef FUNC_NAME
  820. SCM_DEFINE (scm_char_set_unfold, "char-set-unfold", 4, 1, 0,
  821. (SCM p, SCM f, SCM g, SCM seed, SCM base_cs),
  822. "This is a fundamental constructor for character sets.\n"
  823. "@itemize @bullet\n"
  824. "@item @var{g} is used to generate a series of ``seed'' values\n"
  825. "from the initial seed: @var{seed}, (@var{g} @var{seed}),\n"
  826. "(@var{g}^2 @var{seed}), (@var{g}^3 @var{seed}), @dots{}\n"
  827. "@item @var{p} tells us when to stop -- when it returns true\n"
  828. "when applied to one of the seed values.\n"
  829. "@item @var{f} maps each seed value to a character. These\n"
  830. "characters are added to the base character set @var{base_cs} to\n"
  831. "form the result; @var{base_cs} defaults to the empty set.\n"
  832. "@end itemize")
  833. #define FUNC_NAME s_scm_char_set_unfold
  834. {
  835. SCM result, tmp;
  836. SCM_VALIDATE_PROC (1, p);
  837. SCM_VALIDATE_PROC (2, f);
  838. SCM_VALIDATE_PROC (3, g);
  839. if (!SCM_UNBNDP (base_cs))
  840. {
  841. SCM_VALIDATE_SMOB (5, base_cs, charset);
  842. result = scm_char_set_copy (base_cs);
  843. }
  844. else
  845. result = make_char_set (FUNC_NAME);
  846. tmp = scm_call_1 (p, seed);
  847. while (scm_is_false (tmp))
  848. {
  849. SCM ch = scm_call_1 (f, seed);
  850. if (!SCM_CHARP (ch))
  851. SCM_MISC_ERROR ("procedure ~S returned non-char", scm_list_1 (f));
  852. SCM_CHARSET_SET (result, SCM_CHAR (ch));
  853. seed = scm_call_1 (g, seed);
  854. tmp = scm_call_1 (p, seed);
  855. }
  856. return result;
  857. }
  858. #undef FUNC_NAME
  859. SCM_DEFINE (scm_char_set_unfold_x, "char-set-unfold!", 5, 0, 0,
  860. (SCM p, SCM f, SCM g, SCM seed, SCM base_cs),
  861. "This is a fundamental constructor for character sets.\n"
  862. "@itemize @bullet\n"
  863. "@item @var{g} is used to generate a series of ``seed'' values\n"
  864. "from the initial seed: @var{seed}, (@var{g} @var{seed}),\n"
  865. "(@var{g}^2 @var{seed}), (@var{g}^3 @var{seed}), @dots{}\n"
  866. "@item @var{p} tells us when to stop -- when it returns true\n"
  867. "when applied to one of the seed values.\n"
  868. "@item @var{f} maps each seed value to a character. These\n"
  869. "characters are added to the base character set @var{base_cs} to\n"
  870. "form the result; @var{base_cs} defaults to the empty set.\n"
  871. "@end itemize")
  872. #define FUNC_NAME s_scm_char_set_unfold_x
  873. {
  874. SCM tmp;
  875. SCM_VALIDATE_PROC (1, p);
  876. SCM_VALIDATE_PROC (2, f);
  877. SCM_VALIDATE_PROC (3, g);
  878. SCM_VALIDATE_MUTABLE_CHARSET (5, base_cs);
  879. tmp = scm_call_1 (p, seed);
  880. while (scm_is_false (tmp))
  881. {
  882. SCM ch = scm_call_1 (f, seed);
  883. if (!SCM_CHARP (ch))
  884. SCM_MISC_ERROR ("procedure ~S returned non-char", scm_list_1 (f));
  885. SCM_CHARSET_SET (base_cs, SCM_CHAR (ch));
  886. seed = scm_call_1 (g, seed);
  887. tmp = scm_call_1 (p, seed);
  888. }
  889. return base_cs;
  890. }
  891. #undef FUNC_NAME
  892. SCM_DEFINE (scm_char_set_for_each, "char-set-for-each", 2, 0, 0,
  893. (SCM proc, SCM cs),
  894. "Apply @var{proc} to every character in the character set\n"
  895. "@var{cs}. The return value is not specified.")
  896. #define FUNC_NAME s_scm_char_set_for_each
  897. {
  898. scm_t_char_set *cs_data;
  899. int k;
  900. scm_t_wchar n;
  901. SCM_VALIDATE_PROC (1, proc);
  902. SCM_VALIDATE_SMOB (2, cs, charset);
  903. cs_data = SCM_CHARSET_DATA (cs);
  904. if (cs_data->len == 0)
  905. return SCM_UNSPECIFIED;
  906. for (k = 0; k < cs_data->len; k++)
  907. for (n = cs_data->ranges[k].lo; n <= cs_data->ranges[k].hi; n++)
  908. {
  909. scm_call_1 (proc, SCM_MAKE_CHAR (n));
  910. }
  911. return SCM_UNSPECIFIED;
  912. }
  913. #undef FUNC_NAME
  914. SCM_DEFINE (scm_char_set_map, "char-set-map", 2, 0, 0,
  915. (SCM proc, SCM cs),
  916. "Map the procedure @var{proc} over every character in @var{cs}.\n"
  917. "@var{proc} must be a character -> character procedure.")
  918. #define FUNC_NAME s_scm_char_set_map
  919. {
  920. SCM result;
  921. int k;
  922. scm_t_char_set *cs_data;
  923. scm_t_wchar n;
  924. SCM_VALIDATE_PROC (1, proc);
  925. SCM_VALIDATE_SMOB (2, cs, charset);
  926. result = make_char_set (FUNC_NAME);
  927. cs_data = SCM_CHARSET_DATA (cs);
  928. if (cs_data->len == 0)
  929. return result;
  930. for (k = 0; k < cs_data->len; k++)
  931. for (n = cs_data->ranges[k].lo; n <= cs_data->ranges[k].hi; n++)
  932. {
  933. SCM ch = scm_call_1 (proc, SCM_MAKE_CHAR (n));
  934. if (!SCM_CHARP (ch))
  935. SCM_MISC_ERROR ("procedure ~S returned non-char",
  936. scm_list_1 (proc));
  937. SCM_CHARSET_SET (result, SCM_CHAR (ch));
  938. }
  939. return result;
  940. }
  941. #undef FUNC_NAME
  942. SCM_DEFINE (scm_char_set_copy, "char-set-copy", 1, 0, 0,
  943. (SCM cs),
  944. "Return a newly allocated character set containing all\n"
  945. "characters in @var{cs}.")
  946. #define FUNC_NAME s_scm_char_set_copy
  947. {
  948. SCM ret;
  949. scm_t_char_set *p1, *p2;
  950. SCM_VALIDATE_SMOB (1, cs, charset);
  951. ret = make_char_set (FUNC_NAME);
  952. p1 = SCM_CHARSET_DATA (cs);
  953. p2 = SCM_CHARSET_DATA (ret);
  954. p2->len = p1->len;
  955. if (p1->len == 0)
  956. p2->ranges = NULL;
  957. else
  958. {
  959. p2->ranges = scm_gc_malloc (sizeof (scm_t_char_range) * p1->len,
  960. "character-set");
  961. memcpy (p2->ranges, p1->ranges, sizeof (scm_t_char_range) * p1->len);
  962. }
  963. return ret;
  964. }
  965. #undef FUNC_NAME
  966. SCM_DEFINE (scm_char_set, "char-set", 0, 0, 1,
  967. (SCM rest),
  968. "Return a character set containing all given characters.")
  969. #define FUNC_NAME s_scm_char_set
  970. {
  971. SCM cs;
  972. int argnum = 1;
  973. SCM_VALIDATE_REST_ARGUMENT (rest);
  974. cs = make_char_set (FUNC_NAME);
  975. while (!scm_is_null (rest))
  976. {
  977. scm_t_wchar c;
  978. SCM_VALIDATE_CHAR_COPY (argnum, SCM_CAR (rest), c);
  979. argnum++;
  980. rest = SCM_CDR (rest);
  981. SCM_CHARSET_SET (cs, c);
  982. }
  983. return cs;
  984. }
  985. #undef FUNC_NAME
  986. SCM_DEFINE (scm_list_to_char_set, "list->char-set", 1, 1, 0,
  987. (SCM list, SCM base_cs),
  988. "Convert the character list @var{list} to a character set. If\n"
  989. "the character set @var{base_cs} is given, the character in this\n"
  990. "set are also included in the result.")
  991. #define FUNC_NAME s_scm_list_to_char_set
  992. {
  993. SCM cs;
  994. SCM_VALIDATE_LIST (1, list);
  995. if (SCM_UNBNDP (base_cs))
  996. cs = make_char_set (FUNC_NAME);
  997. else
  998. {
  999. SCM_VALIDATE_SMOB (2, base_cs, charset);
  1000. cs = scm_char_set_copy (base_cs);
  1001. }
  1002. while (!scm_is_null (list))
  1003. {
  1004. SCM chr = SCM_CAR (list);
  1005. scm_t_wchar c;
  1006. SCM_VALIDATE_CHAR_COPY (0, chr, c);
  1007. list = SCM_CDR (list);
  1008. SCM_CHARSET_SET (cs, c);
  1009. }
  1010. return cs;
  1011. }
  1012. #undef FUNC_NAME
  1013. SCM_DEFINE (scm_list_to_char_set_x, "list->char-set!", 2, 0, 0,
  1014. (SCM list, SCM base_cs),
  1015. "Convert the character list @var{list} to a character set. The\n"
  1016. "characters are added to @var{base_cs} and @var{base_cs} is\n"
  1017. "returned.")
  1018. #define FUNC_NAME s_scm_list_to_char_set_x
  1019. {
  1020. SCM_VALIDATE_LIST (1, list);
  1021. SCM_VALIDATE_MUTABLE_CHARSET (2, base_cs);
  1022. while (!scm_is_null (list))
  1023. {
  1024. SCM chr = SCM_CAR (list);
  1025. scm_t_wchar c;
  1026. SCM_VALIDATE_CHAR_COPY (0, chr, c);
  1027. list = SCM_CDR (list);
  1028. SCM_CHARSET_SET (base_cs, c);
  1029. }
  1030. return base_cs;
  1031. }
  1032. #undef FUNC_NAME
  1033. SCM_DEFINE (scm_string_to_char_set, "string->char-set", 1, 1, 0,
  1034. (SCM str, SCM base_cs),
  1035. "Convert the string @var{str} to a character set. If the\n"
  1036. "character set @var{base_cs} is given, the characters in this\n"
  1037. "set are also included in the result.")
  1038. #define FUNC_NAME s_scm_string_to_char_set
  1039. {
  1040. SCM cs;
  1041. size_t k = 0, len;
  1042. SCM_VALIDATE_STRING (1, str);
  1043. if (SCM_UNBNDP (base_cs))
  1044. cs = make_char_set (FUNC_NAME);
  1045. else
  1046. {
  1047. SCM_VALIDATE_SMOB (2, base_cs, charset);
  1048. cs = scm_char_set_copy (base_cs);
  1049. }
  1050. len = scm_i_string_length (str);
  1051. while (k < len)
  1052. {
  1053. scm_t_wchar c = scm_i_string_ref (str, k++);
  1054. SCM_CHARSET_SET (cs, c);
  1055. }
  1056. scm_remember_upto_here_1 (str);
  1057. return cs;
  1058. }
  1059. #undef FUNC_NAME
  1060. SCM_DEFINE (scm_string_to_char_set_x, "string->char-set!", 2, 0, 0,
  1061. (SCM str, SCM base_cs),
  1062. "Convert the string @var{str} to a character set. The\n"
  1063. "characters from the string are added to @var{base_cs}, and\n"
  1064. "@var{base_cs} is returned.")
  1065. #define FUNC_NAME s_scm_string_to_char_set_x
  1066. {
  1067. size_t k = 0, len;
  1068. SCM_VALIDATE_STRING (1, str);
  1069. SCM_VALIDATE_MUTABLE_CHARSET (2, base_cs);
  1070. len = scm_i_string_length (str);
  1071. while (k < len)
  1072. {
  1073. scm_t_wchar c = scm_i_string_ref (str, k++);
  1074. SCM_CHARSET_SET (base_cs, c);
  1075. }
  1076. scm_remember_upto_here_1 (str);
  1077. return base_cs;
  1078. }
  1079. #undef FUNC_NAME
  1080. SCM_DEFINE (scm_char_set_filter, "char-set-filter", 2, 1, 0,
  1081. (SCM pred, SCM cs, SCM base_cs),
  1082. "Return a character set containing every character from @var{cs}\n"
  1083. "so that it satisfies @var{pred}. If provided, the characters\n"
  1084. "from @var{base_cs} are added to the result.")
  1085. #define FUNC_NAME s_scm_char_set_filter
  1086. {
  1087. SCM ret;
  1088. int k;
  1089. scm_t_wchar n;
  1090. scm_t_char_set *p;
  1091. SCM_VALIDATE_PROC (1, pred);
  1092. SCM_VALIDATE_SMOB (2, cs, charset);
  1093. if (!SCM_UNBNDP (base_cs))
  1094. {
  1095. SCM_VALIDATE_SMOB (3, base_cs, charset);
  1096. ret = scm_char_set_copy (base_cs);
  1097. }
  1098. else
  1099. ret = make_char_set (FUNC_NAME);
  1100. p = SCM_CHARSET_DATA (cs);
  1101. if (p->len == 0)
  1102. return ret;
  1103. for (k = 0; k < p->len; k++)
  1104. for (n = p->ranges[k].lo; n <= p->ranges[k].hi; n++)
  1105. {
  1106. SCM res = scm_call_1 (pred, SCM_MAKE_CHAR (n));
  1107. if (scm_is_true (res))
  1108. SCM_CHARSET_SET (ret, n);
  1109. }
  1110. return ret;
  1111. }
  1112. #undef FUNC_NAME
  1113. SCM_DEFINE (scm_char_set_filter_x, "char-set-filter!", 3, 0, 0,
  1114. (SCM pred, SCM cs, SCM base_cs),
  1115. "Return a character set containing every character from @var{cs}\n"
  1116. "so that it satisfies @var{pred}. The characters are added to\n"
  1117. "@var{base_cs} and @var{base_cs} is returned.")
  1118. #define FUNC_NAME s_scm_char_set_filter_x
  1119. {
  1120. int k;
  1121. scm_t_wchar n;
  1122. scm_t_char_set *p;
  1123. SCM_VALIDATE_PROC (1, pred);
  1124. SCM_VALIDATE_SMOB (2, cs, charset);
  1125. SCM_VALIDATE_MUTABLE_CHARSET (3, base_cs);
  1126. p = SCM_CHARSET_DATA (cs);
  1127. if (p->len == 0)
  1128. return base_cs;
  1129. for (k = 0; k < p->len; k++)
  1130. for (n = p->ranges[k].lo; n <= p->ranges[k].hi; n++)
  1131. {
  1132. SCM res = scm_call_1 (pred, SCM_MAKE_CHAR (n));
  1133. if (scm_is_true (res))
  1134. SCM_CHARSET_SET (base_cs, n);
  1135. }
  1136. return base_cs;
  1137. }
  1138. #undef FUNC_NAME
  1139. /* Return a character set containing all the characters from [LOWER,UPPER),
  1140. giving range errors if ERROR, adding chars from BASE_CS, and recycling
  1141. BASE_CS if REUSE is true. */
  1142. static SCM
  1143. scm_i_ucs_range_to_char_set (const char *FUNC_NAME, SCM lower, SCM upper,
  1144. SCM error, SCM base_cs, int reuse)
  1145. {
  1146. SCM cs;
  1147. size_t clower, cupper;
  1148. clower = scm_to_size_t (lower);
  1149. cupper = scm_to_size_t (upper) - 1;
  1150. SCM_ASSERT_RANGE (2, upper, cupper >= clower);
  1151. if (!SCM_UNBNDP (error))
  1152. {
  1153. if (scm_is_true (error))
  1154. {
  1155. SCM_ASSERT_RANGE (1, lower, SCM_IS_UNICODE_CHAR (clower));
  1156. SCM_ASSERT_RANGE (2, upper, SCM_IS_UNICODE_CHAR (cupper));
  1157. if (clower < SCM_CODEPOINT_SURROGATE_START
  1158. && cupper > SCM_CODEPOINT_SURROGATE_END)
  1159. scm_error(scm_out_of_range_key,
  1160. FUNC_NAME, "invalid range - contains surrogate characters: ~S to ~S",
  1161. scm_list_2 (lower, upper), scm_list_1 (upper));
  1162. }
  1163. }
  1164. if (SCM_UNBNDP (base_cs))
  1165. cs = make_char_set (FUNC_NAME);
  1166. else
  1167. {
  1168. if (reuse)
  1169. {
  1170. SCM_VALIDATE_MUTABLE_CHARSET (3, base_cs);
  1171. cs = base_cs;
  1172. }
  1173. else
  1174. {
  1175. SCM_VALIDATE_SMOB (3, base_cs, charset);
  1176. cs = scm_char_set_copy (base_cs);
  1177. }
  1178. }
  1179. if ((clower >= SCM_CODEPOINT_SURROGATE_START && clower <= SCM_CODEPOINT_SURROGATE_END)
  1180. && (cupper >= SCM_CODEPOINT_SURROGATE_START && cupper <= SCM_CODEPOINT_SURROGATE_END))
  1181. return cs;
  1182. if (clower > SCM_CODEPOINT_MAX)
  1183. clower = SCM_CODEPOINT_MAX;
  1184. if (clower >= SCM_CODEPOINT_SURROGATE_START && clower <= SCM_CODEPOINT_SURROGATE_END)
  1185. clower = SCM_CODEPOINT_SURROGATE_END + 1;
  1186. if (cupper > SCM_CODEPOINT_MAX)
  1187. cupper = SCM_CODEPOINT_MAX;
  1188. if (cupper >= SCM_CODEPOINT_SURROGATE_START && cupper <= SCM_CODEPOINT_SURROGATE_END)
  1189. cupper = SCM_CODEPOINT_SURROGATE_START - 1;
  1190. if (clower < SCM_CODEPOINT_SURROGATE_START && cupper > SCM_CODEPOINT_SURROGATE_END)
  1191. {
  1192. scm_i_charset_set_range (SCM_CHARSET_DATA (cs), clower, SCM_CODEPOINT_SURROGATE_START - 1);
  1193. scm_i_charset_set_range (SCM_CHARSET_DATA (cs), SCM_CODEPOINT_SURROGATE_END + 1, cupper);
  1194. }
  1195. else
  1196. scm_i_charset_set_range (SCM_CHARSET_DATA (cs), clower, cupper);
  1197. return cs;
  1198. }
  1199. SCM_DEFINE (scm_ucs_range_to_char_set, "ucs-range->char-set", 2, 2, 0,
  1200. (SCM lower, SCM upper, SCM error, SCM base_cs),
  1201. "Return a character set containing all characters whose\n"
  1202. "character codes lie in the half-open range\n"
  1203. "[@var{lower},@var{upper}).\n"
  1204. "\n"
  1205. "If @var{error} is a true value, an error is signaled if the\n"
  1206. "specified range contains characters which are not valid\n"
  1207. "Unicode code points. If @var{error} is @code{#f},\n"
  1208. "these characters are silently left out of the resulting\n"
  1209. "character set.\n"
  1210. "\n"
  1211. "The characters in @var{base_cs} are added to the result, if\n"
  1212. "given.")
  1213. #define FUNC_NAME s_scm_ucs_range_to_char_set
  1214. {
  1215. return scm_i_ucs_range_to_char_set (FUNC_NAME, lower, upper,
  1216. error, base_cs, 0);
  1217. }
  1218. #undef FUNC_NAME
  1219. SCM_DEFINE (scm_ucs_range_to_char_set_x, "ucs-range->char-set!", 4, 0, 0,
  1220. (SCM lower, SCM upper, SCM error, SCM base_cs),
  1221. "Return a character set containing all characters whose\n"
  1222. "character codes lie in the half-open range\n"
  1223. "[@var{lower},@var{upper}).\n"
  1224. "\n"
  1225. "If @var{error} is a true value, an error is signaled if the\n"
  1226. "specified range contains characters which are not contained in\n"
  1227. "the implemented character range. If @var{error} is @code{#f},\n"
  1228. "these characters are silently left out of the resulting\n"
  1229. "character set.\n"
  1230. "\n"
  1231. "The characters are added to @var{base_cs} and @var{base_cs} is\n"
  1232. "returned.")
  1233. #define FUNC_NAME s_scm_ucs_range_to_char_set_x
  1234. {
  1235. SCM_VALIDATE_SMOB (4, base_cs, charset);
  1236. return scm_i_ucs_range_to_char_set (FUNC_NAME, lower, upper,
  1237. error, base_cs, 1);
  1238. }
  1239. #undef FUNC_NAME
  1240. SCM_DEFINE (scm_to_char_set, "->char-set", 1, 0, 0,
  1241. (SCM x),
  1242. "Coerces x into a char-set. @var{x} may be a string, character or char-set. A string is converted to the set of its constituent characters; a character is converted to a singleton set; a char-set is returned as-is.")
  1243. #define FUNC_NAME s_scm_to_char_set
  1244. {
  1245. if (scm_is_string (x))
  1246. return scm_string_to_char_set (x, SCM_UNDEFINED);
  1247. else if (SCM_CHARP (x))
  1248. return scm_char_set (scm_list_1 (x));
  1249. else if (SCM_SMOB_PREDICATE (scm_tc16_charset, x))
  1250. return x;
  1251. else
  1252. scm_wrong_type_arg (NULL, 0, x);
  1253. }
  1254. #undef FUNC_NAME
  1255. SCM_DEFINE (scm_char_set_size, "char-set-size", 1, 0, 0,
  1256. (SCM cs),
  1257. "Return the number of elements in character set @var{cs}.")
  1258. #define FUNC_NAME s_scm_char_set_size
  1259. {
  1260. int k, count = 0;
  1261. scm_t_char_set *cs_data;
  1262. SCM_VALIDATE_SMOB (1, cs, charset);
  1263. cs_data = SCM_CHARSET_DATA (cs);
  1264. if (cs_data->len == 0)
  1265. return scm_from_int (0);
  1266. for (k = 0; k < cs_data->len; k++)
  1267. count += cs_data->ranges[k].hi - cs_data->ranges[k].lo + 1;
  1268. return scm_from_int (count);
  1269. }
  1270. #undef FUNC_NAME
  1271. SCM_DEFINE (scm_char_set_count, "char-set-count", 2, 0, 0,
  1272. (SCM pred, SCM cs),
  1273. "Return the number of the elements int the character set\n"
  1274. "@var{cs} which satisfy the predicate @var{pred}.")
  1275. #define FUNC_NAME s_scm_char_set_count
  1276. {
  1277. int k, count = 0;
  1278. scm_t_wchar n;
  1279. scm_t_char_set *cs_data;
  1280. SCM_VALIDATE_PROC (1, pred);
  1281. SCM_VALIDATE_SMOB (2, cs, charset);
  1282. cs_data = SCM_CHARSET_DATA (cs);
  1283. if (cs_data->len == 0)
  1284. return scm_from_int (0);
  1285. for (k = 0; k < cs_data->len; k++)
  1286. for (n = cs_data->ranges[k].lo; n <= cs_data->ranges[k].hi; n++)
  1287. {
  1288. SCM res = scm_call_1 (pred, SCM_MAKE_CHAR (n));
  1289. if (scm_is_true (res))
  1290. count++;
  1291. }
  1292. return SCM_I_MAKINUM (count);
  1293. }
  1294. #undef FUNC_NAME
  1295. SCM_DEFINE (scm_char_set_to_list, "char-set->list", 1, 0, 0,
  1296. (SCM cs),
  1297. "Return a list containing the elements of the character set\n"
  1298. "@var{cs}.")
  1299. #define FUNC_NAME s_scm_char_set_to_list
  1300. {
  1301. int k;
  1302. scm_t_wchar n;
  1303. SCM result = SCM_EOL;
  1304. scm_t_char_set *p;
  1305. SCM_VALIDATE_SMOB (1, cs, charset);
  1306. p = SCM_CHARSET_DATA (cs);
  1307. if (p->len == 0)
  1308. return SCM_EOL;
  1309. for (k = p->len - 1; k >= 0; k--)
  1310. for (n = p->ranges[k].hi; n >= p->ranges[k].lo; n--)
  1311. result = scm_cons (SCM_MAKE_CHAR (n), result);
  1312. return result;
  1313. }
  1314. #undef FUNC_NAME
  1315. SCM_DEFINE (scm_char_set_to_string, "char-set->string", 1, 0, 0,
  1316. (SCM cs),
  1317. "Return a string containing the elements of the character set\n"
  1318. "@var{cs}. The order in which the characters are placed in the\n"
  1319. "string is not defined.")
  1320. #define FUNC_NAME s_scm_char_set_to_string
  1321. {
  1322. int k;
  1323. int count = 0;
  1324. int idx = 0;
  1325. int wide = 0;
  1326. SCM result;
  1327. scm_t_wchar n;
  1328. scm_t_char_set *cs_data;
  1329. char *buf;
  1330. scm_t_wchar *wbuf;
  1331. SCM_VALIDATE_SMOB (1, cs, charset);
  1332. cs_data = SCM_CHARSET_DATA (cs);
  1333. if (cs_data->len == 0)
  1334. return scm_nullstr;
  1335. if (cs_data->ranges[cs_data->len - 1].hi > 255)
  1336. wide = 1;
  1337. count = scm_to_int (scm_char_set_size (cs));
  1338. if (wide)
  1339. result = scm_i_make_wide_string (count, &wbuf, 0);
  1340. else
  1341. result = scm_i_make_string (count, &buf, 0);
  1342. for (k = 0; k < cs_data->len; k++)
  1343. for (n = cs_data->ranges[k].lo; n <= cs_data->ranges[k].hi; n++)
  1344. {
  1345. if (wide)
  1346. wbuf[idx++] = n;
  1347. else
  1348. buf[idx++] = n;
  1349. }
  1350. return result;
  1351. }
  1352. #undef FUNC_NAME
  1353. SCM_DEFINE (scm_char_set_contains_p, "char-set-contains?", 2, 0, 0,
  1354. (SCM cs, SCM ch),
  1355. "Return @code{#t} iff the character @var{ch} is contained in the\n"
  1356. "character set @var{cs}.")
  1357. #define FUNC_NAME s_scm_char_set_contains_p
  1358. {
  1359. SCM_VALIDATE_SMOB (1, cs, charset);
  1360. SCM_VALIDATE_CHAR (2, ch);
  1361. return scm_from_bool (SCM_CHARSET_GET (cs, SCM_CHAR (ch)));
  1362. }
  1363. #undef FUNC_NAME
  1364. SCM_DEFINE (scm_char_set_every, "char-set-every", 2, 0, 0,
  1365. (SCM pred, SCM cs),
  1366. "Return a true value if every character in the character set\n"
  1367. "@var{cs} satisfies the predicate @var{pred}.")
  1368. #define FUNC_NAME s_scm_char_set_every
  1369. {
  1370. int k;
  1371. scm_t_wchar n;
  1372. SCM res = SCM_BOOL_T;
  1373. scm_t_char_set *cs_data;
  1374. SCM_VALIDATE_PROC (1, pred);
  1375. SCM_VALIDATE_SMOB (2, cs, charset);
  1376. cs_data = SCM_CHARSET_DATA (cs);
  1377. if (cs_data->len == 0)
  1378. return SCM_BOOL_T;
  1379. for (k = 0; k < cs_data->len; k++)
  1380. for (n = cs_data->ranges[k].lo; n <= cs_data->ranges[k].hi; n++)
  1381. {
  1382. res = scm_call_1 (pred, SCM_MAKE_CHAR (n));
  1383. if (scm_is_false (res))
  1384. return res;
  1385. }
  1386. return SCM_BOOL_T;
  1387. }
  1388. #undef FUNC_NAME
  1389. SCM_DEFINE (scm_char_set_any, "char-set-any", 2, 0, 0,
  1390. (SCM pred, SCM cs),
  1391. "Return a true value if any character in the character set\n"
  1392. "@var{cs} satisfies the predicate @var{pred}.")
  1393. #define FUNC_NAME s_scm_char_set_any
  1394. {
  1395. int k;
  1396. scm_t_wchar n;
  1397. scm_t_char_set *cs_data;
  1398. SCM_VALIDATE_PROC (1, pred);
  1399. SCM_VALIDATE_SMOB (2, cs, charset);
  1400. cs_data = SCM_CHARSET_DATA (cs);
  1401. if (cs_data->len == 0)
  1402. return SCM_BOOL_T;
  1403. for (k = 0; k < cs_data->len; k++)
  1404. for (n = cs_data->ranges[k].lo; n <= cs_data->ranges[k].hi; n++)
  1405. {
  1406. SCM res = scm_call_1 (pred, SCM_MAKE_CHAR (n));
  1407. if (scm_is_true (res))
  1408. return res;
  1409. }
  1410. return SCM_BOOL_F;
  1411. }
  1412. #undef FUNC_NAME
  1413. SCM_DEFINE (scm_char_set_adjoin, "char-set-adjoin", 1, 0, 1,
  1414. (SCM cs, SCM rest),
  1415. "Add all character arguments to the first argument, which must\n"
  1416. "be a character set.")
  1417. #define FUNC_NAME s_scm_char_set_adjoin
  1418. {
  1419. SCM_VALIDATE_SMOB (1, cs, charset);
  1420. SCM_VALIDATE_REST_ARGUMENT (rest);
  1421. cs = scm_char_set_copy (cs);
  1422. while (!scm_is_null (rest))
  1423. {
  1424. SCM chr = SCM_CAR (rest);
  1425. scm_t_wchar c;
  1426. SCM_VALIDATE_CHAR_COPY (1, chr, c);
  1427. rest = SCM_CDR (rest);
  1428. SCM_CHARSET_SET (cs, c);
  1429. }
  1430. return cs;
  1431. }
  1432. #undef FUNC_NAME
  1433. SCM_DEFINE (scm_char_set_delete, "char-set-delete", 1, 0, 1,
  1434. (SCM cs, SCM rest),
  1435. "Delete all character arguments from the first argument, which\n"
  1436. "must be a character set.")
  1437. #define FUNC_NAME s_scm_char_set_delete
  1438. {
  1439. SCM_VALIDATE_SMOB (1, cs, charset);
  1440. SCM_VALIDATE_REST_ARGUMENT (rest);
  1441. cs = scm_char_set_copy (cs);
  1442. while (!scm_is_null (rest))
  1443. {
  1444. SCM chr = SCM_CAR (rest);
  1445. scm_t_wchar c;
  1446. SCM_VALIDATE_CHAR_COPY (1, chr, c);
  1447. rest = SCM_CDR (rest);
  1448. SCM_CHARSET_UNSET (cs, c);
  1449. }
  1450. return cs;
  1451. }
  1452. #undef FUNC_NAME
  1453. SCM_DEFINE (scm_char_set_adjoin_x, "char-set-adjoin!", 1, 0, 1,
  1454. (SCM cs, SCM rest),
  1455. "Add all character arguments to the first argument, which must\n"
  1456. "be a character set.")
  1457. #define FUNC_NAME s_scm_char_set_adjoin_x
  1458. {
  1459. SCM_VALIDATE_MUTABLE_CHARSET (1, cs);
  1460. SCM_VALIDATE_REST_ARGUMENT (rest);
  1461. while (!scm_is_null (rest))
  1462. {
  1463. SCM chr = SCM_CAR (rest);
  1464. scm_t_wchar c;
  1465. SCM_VALIDATE_CHAR_COPY (1, chr, c);
  1466. rest = SCM_CDR (rest);
  1467. SCM_CHARSET_SET (cs, c);
  1468. }
  1469. return cs;
  1470. }
  1471. #undef FUNC_NAME
  1472. SCM_DEFINE (scm_char_set_delete_x, "char-set-delete!", 1, 0, 1,
  1473. (SCM cs, SCM rest),
  1474. "Delete all character arguments from the first argument, which\n"
  1475. "must be a character set.")
  1476. #define FUNC_NAME s_scm_char_set_delete_x
  1477. {
  1478. SCM_VALIDATE_MUTABLE_CHARSET (1, cs);
  1479. SCM_VALIDATE_REST_ARGUMENT (rest);
  1480. while (!scm_is_null (rest))
  1481. {
  1482. SCM chr = SCM_CAR (rest);
  1483. scm_t_wchar c;
  1484. SCM_VALIDATE_CHAR_COPY (1, chr, c);
  1485. rest = SCM_CDR (rest);
  1486. SCM_CHARSET_UNSET (cs, c);
  1487. }
  1488. return cs;
  1489. }
  1490. #undef FUNC_NAME
  1491. SCM_DEFINE (scm_char_set_complement, "char-set-complement", 1, 0, 0,
  1492. (SCM cs), "Return the complement of the character set @var{cs}.")
  1493. #define FUNC_NAME s_scm_char_set_complement
  1494. {
  1495. SCM res;
  1496. scm_t_char_set *p, *q;
  1497. SCM_VALIDATE_SMOB (1, cs, charset);
  1498. res = make_char_set (FUNC_NAME);
  1499. p = SCM_CHARSET_DATA (res);
  1500. q = SCM_CHARSET_DATA (cs);
  1501. charsets_complement (p, q);
  1502. return res;
  1503. }
  1504. #undef FUNC_NAME
  1505. SCM_DEFINE (scm_char_set_union, "char-set-union", 0, 0, 1,
  1506. (SCM rest),
  1507. "Return the union of all argument character sets.")
  1508. #define FUNC_NAME s_scm_char_set_union
  1509. {
  1510. int c = 1;
  1511. SCM res;
  1512. scm_t_char_set *p;
  1513. SCM_VALIDATE_REST_ARGUMENT (rest);
  1514. res = make_char_set (FUNC_NAME);
  1515. p = SCM_CHARSET_DATA (res);
  1516. while (!scm_is_null (rest))
  1517. {
  1518. SCM cs = SCM_CAR (rest);
  1519. SCM_VALIDATE_SMOB (c, cs, charset);
  1520. c++;
  1521. rest = SCM_CDR (rest);
  1522. charsets_union (p, (scm_t_char_set *) SCM_SMOB_DATA (cs));
  1523. }
  1524. return res;
  1525. }
  1526. #undef FUNC_NAME
  1527. SCM_DEFINE (scm_char_set_intersection, "char-set-intersection", 0, 0, 1,
  1528. (SCM rest),
  1529. "Return the intersection of all argument character sets.")
  1530. #define FUNC_NAME s_scm_char_set_intersection
  1531. {
  1532. SCM res;
  1533. SCM_VALIDATE_REST_ARGUMENT (rest);
  1534. if (scm_is_null (rest))
  1535. res = make_char_set (FUNC_NAME);
  1536. else
  1537. {
  1538. scm_t_char_set *p;
  1539. int argnum = 2;
  1540. res = scm_char_set_copy (SCM_CAR (rest));
  1541. p = SCM_CHARSET_DATA (res);
  1542. rest = SCM_CDR (rest);
  1543. while (scm_is_pair (rest))
  1544. {
  1545. SCM cs = SCM_CAR (rest);
  1546. scm_t_char_set *cs_data;
  1547. SCM_VALIDATE_SMOB (argnum, cs, charset);
  1548. argnum++;
  1549. cs_data = SCM_CHARSET_DATA (cs);
  1550. rest = SCM_CDR (rest);
  1551. charsets_intersection (p, cs_data);
  1552. }
  1553. }
  1554. return res;
  1555. }
  1556. #undef FUNC_NAME
  1557. SCM_DEFINE (scm_char_set_difference, "char-set-difference", 1, 0, 1,
  1558. (SCM cs1, SCM rest),
  1559. "Return the difference of all argument character sets.")
  1560. #define FUNC_NAME s_scm_char_set_difference
  1561. {
  1562. int c = 2;
  1563. SCM res, compl;
  1564. scm_t_char_set *p, *q;
  1565. SCM_VALIDATE_SMOB (1, cs1, charset);
  1566. SCM_VALIDATE_REST_ARGUMENT (rest);
  1567. res = scm_char_set_copy (cs1);
  1568. p = SCM_CHARSET_DATA (res);
  1569. compl = make_char_set (FUNC_NAME);
  1570. q = SCM_CHARSET_DATA (compl);
  1571. while (!scm_is_null (rest))
  1572. {
  1573. SCM cs = SCM_CAR (rest);
  1574. SCM_VALIDATE_SMOB (c, cs, charset);
  1575. c++;
  1576. rest = SCM_CDR (rest);
  1577. charsets_complement (q, SCM_CHARSET_DATA (cs));
  1578. charsets_intersection (p, q);
  1579. }
  1580. return res;
  1581. }
  1582. #undef FUNC_NAME
  1583. SCM_DEFINE (scm_char_set_xor, "char-set-xor", 0, 0, 1,
  1584. (SCM rest),
  1585. "Return the exclusive-or of all argument character sets.")
  1586. #define FUNC_NAME s_scm_char_set_xor
  1587. {
  1588. SCM res;
  1589. SCM_VALIDATE_REST_ARGUMENT (rest);
  1590. if (scm_is_null (rest))
  1591. res = make_char_set (FUNC_NAME);
  1592. else
  1593. {
  1594. int argnum = 2;
  1595. scm_t_char_set *p;
  1596. res = scm_char_set_copy (SCM_CAR (rest));
  1597. p = SCM_CHARSET_DATA (res);
  1598. rest = SCM_CDR (rest);
  1599. while (scm_is_pair (rest))
  1600. {
  1601. SCM cs = SCM_CAR (rest);
  1602. scm_t_char_set *cs_data;
  1603. SCM_VALIDATE_SMOB (argnum, cs, charset);
  1604. argnum++;
  1605. cs_data = SCM_CHARSET_DATA (cs);
  1606. rest = SCM_CDR (rest);
  1607. charsets_xor (p, cs_data);
  1608. }
  1609. }
  1610. return res;
  1611. }
  1612. #undef FUNC_NAME
  1613. SCM_DEFINE (scm_char_set_diff_plus_intersection, "char-set-diff+intersection", 1, 0, 1,
  1614. (SCM cs1, SCM rest),
  1615. "Return the difference and the intersection of all argument\n"
  1616. "character sets.")
  1617. #define FUNC_NAME s_scm_char_set_diff_plus_intersection
  1618. {
  1619. int c = 2;
  1620. SCM res1, res2;
  1621. scm_t_char_set *p, *q;
  1622. SCM_VALIDATE_SMOB (1, cs1, charset);
  1623. SCM_VALIDATE_REST_ARGUMENT (rest);
  1624. res1 = scm_char_set_copy (cs1);
  1625. res2 = make_char_set (FUNC_NAME);
  1626. p = SCM_CHARSET_DATA (res1);
  1627. q = SCM_CHARSET_DATA (res2);
  1628. while (!scm_is_null (rest))
  1629. {
  1630. SCM cs = SCM_CAR (rest);
  1631. scm_t_char_set *r;
  1632. SCM_VALIDATE_SMOB (c, cs, charset);
  1633. c++;
  1634. r = SCM_CHARSET_DATA (cs);
  1635. charsets_union (q, r);
  1636. charsets_intersection (p, r);
  1637. rest = SCM_CDR (rest);
  1638. }
  1639. return scm_values_2 (res1, res2);
  1640. }
  1641. #undef FUNC_NAME
  1642. SCM_DEFINE (scm_char_set_complement_x, "char-set-complement!", 1, 0, 0,
  1643. (SCM cs), "Return the complement of the character set @var{cs}.")
  1644. #define FUNC_NAME s_scm_char_set_complement_x
  1645. {
  1646. SCM_VALIDATE_MUTABLE_CHARSET (1, cs);
  1647. cs = scm_char_set_complement (cs);
  1648. return cs;
  1649. }
  1650. #undef FUNC_NAME
  1651. SCM_DEFINE (scm_char_set_union_x, "char-set-union!", 1, 0, 1,
  1652. (SCM cs1, SCM rest),
  1653. "Return the union of all argument character sets.")
  1654. #define FUNC_NAME s_scm_char_set_union_x
  1655. {
  1656. SCM_VALIDATE_MUTABLE_CHARSET (1, cs1);
  1657. SCM_VALIDATE_REST_ARGUMENT (rest);
  1658. cs1 = scm_char_set_union (scm_cons (cs1, rest));
  1659. return cs1;
  1660. }
  1661. #undef FUNC_NAME
  1662. SCM_DEFINE (scm_char_set_intersection_x, "char-set-intersection!", 1, 0, 1,
  1663. (SCM cs1, SCM rest),
  1664. "Return the intersection of all argument character sets.")
  1665. #define FUNC_NAME s_scm_char_set_intersection_x
  1666. {
  1667. SCM_VALIDATE_MUTABLE_CHARSET (1, cs1);
  1668. SCM_VALIDATE_REST_ARGUMENT (rest);
  1669. cs1 = scm_char_set_intersection (scm_cons (cs1, rest));
  1670. return cs1;
  1671. }
  1672. #undef FUNC_NAME
  1673. SCM_DEFINE (scm_char_set_difference_x, "char-set-difference!", 1, 0, 1,
  1674. (SCM cs1, SCM rest),
  1675. "Return the difference of all argument character sets.")
  1676. #define FUNC_NAME s_scm_char_set_difference_x
  1677. {
  1678. SCM_VALIDATE_MUTABLE_CHARSET (1, cs1);
  1679. SCM_VALIDATE_REST_ARGUMENT (rest);
  1680. cs1 = scm_char_set_difference (cs1, rest);
  1681. return cs1;
  1682. }
  1683. #undef FUNC_NAME
  1684. SCM_DEFINE (scm_char_set_xor_x, "char-set-xor!", 1, 0, 1,
  1685. (SCM cs1, SCM rest),
  1686. "Return the exclusive-or of all argument character sets.")
  1687. #define FUNC_NAME s_scm_char_set_xor_x
  1688. {
  1689. SCM_VALIDATE_MUTABLE_CHARSET (1, cs1);
  1690. /* a side-effecting variant should presumably give consistent results:
  1691. (define a (char-set #\a))
  1692. (char-set-xor a a a) -> char set #\a
  1693. (char-set-xor! a a a) -> char set #\a
  1694. */
  1695. cs1 = scm_char_set_xor (scm_cons (cs1, rest));
  1696. return cs1;
  1697. }
  1698. #undef FUNC_NAME
  1699. SCM_DEFINE (scm_char_set_diff_plus_intersection_x,
  1700. "char-set-diff+intersection!", 2, 0, 1, (SCM cs1, SCM cs2,
  1701. SCM rest),
  1702. "Return the difference and the intersection of all argument\n"
  1703. "character sets.")
  1704. #define FUNC_NAME s_scm_char_set_diff_plus_intersection_x
  1705. {
  1706. SCM diff, intersect;
  1707. diff = scm_char_set_difference (cs1, scm_cons (cs2, rest));
  1708. intersect =
  1709. scm_char_set_intersection (scm_cons (cs1, scm_cons (cs2, rest)));
  1710. cs1 = diff;
  1711. cs2 = intersect;
  1712. return scm_values_2 (cs1, cs2);
  1713. }
  1714. #undef FUNC_NAME
  1715. /* Standard character sets. */
  1716. SCM scm_char_set_lower_case;
  1717. SCM scm_char_set_upper_case;
  1718. SCM scm_char_set_title_case;
  1719. SCM scm_char_set_letter;
  1720. SCM scm_char_set_digit;
  1721. SCM scm_char_set_letter_and_digit;
  1722. SCM scm_char_set_graphic;
  1723. SCM scm_char_set_printing;
  1724. SCM scm_char_set_whitespace;
  1725. SCM scm_char_set_iso_control;
  1726. SCM scm_char_set_punctuation;
  1727. SCM scm_char_set_symbol;
  1728. SCM scm_char_set_hex_digit;
  1729. SCM scm_char_set_blank;
  1730. SCM scm_char_set_ascii;
  1731. SCM scm_char_set_empty;
  1732. SCM scm_char_set_designated;
  1733. SCM scm_char_set_full;
  1734. /* Create an empty character set and return it after binding it to NAME. */
  1735. static inline SCM
  1736. define_charset (const char *name, size_t len, const scm_t_char_range *ranges)
  1737. {
  1738. SCM cs;
  1739. scm_t_char_set *p = scm_gc_malloc_pointerless (sizeof (scm_t_char_set),
  1740. "charset");
  1741. p->len = len;
  1742. /* Strip const qualifier but add immutable flag on SCM. */
  1743. p->ranges = (scm_t_char_range *) ranges;
  1744. SCM_NEWSMOB (cs, scm_tc16_charset | SCM_CHARSET_F_IMMUTABLE, p);
  1745. scm_c_define (name, cs);
  1746. return cs;
  1747. }
  1748. SCM_DEFINE (scm_sys_char_set_dump, "%char-set-dump", 1, 0, 0, (SCM charset),
  1749. "Returns an association list containing debugging information\n"
  1750. "for @var{charset}. The association list has the following entries."
  1751. "@table @code\n"
  1752. "@item char-set\n"
  1753. "The char-set itself.\n"
  1754. "@item len\n"
  1755. "The number of character ranges the char-set contains\n"
  1756. "@item ranges\n"
  1757. "A list of lists where each sublist a range of code points\n"
  1758. "and their associated characters"
  1759. "@end table")
  1760. #define FUNC_NAME s_scm_sys_char_set_dump
  1761. {
  1762. SCM e1, e2, e3;
  1763. SCM ranges = SCM_EOL, elt;
  1764. size_t i;
  1765. scm_t_char_set *cs;
  1766. char codepoint_string_lo[13], codepoint_string_hi[13];
  1767. SCM_VALIDATE_SMOB (1, charset, charset);
  1768. cs = SCM_CHARSET_DATA (charset);
  1769. e1 = scm_cons (scm_from_latin1_symbol ("char-set"),
  1770. charset);
  1771. e2 = scm_cons (scm_from_latin1_symbol ("n"),
  1772. scm_from_size_t (cs->len));
  1773. for (i = 0; i < cs->len; i++)
  1774. {
  1775. if (cs->ranges[i].lo > 0xFFFF)
  1776. sprintf (codepoint_string_lo, "U+%06x", cs->ranges[i].lo);
  1777. else
  1778. sprintf (codepoint_string_lo, "U+%04x", cs->ranges[i].lo);
  1779. if (cs->ranges[i].hi > 0xFFFF)
  1780. sprintf (codepoint_string_hi, "U+%06x", cs->ranges[i].hi);
  1781. else
  1782. sprintf (codepoint_string_hi, "U+%04x", cs->ranges[i].hi);
  1783. elt = scm_list_4 (SCM_MAKE_CHAR (cs->ranges[i].lo),
  1784. SCM_MAKE_CHAR (cs->ranges[i].hi),
  1785. scm_from_locale_string (codepoint_string_lo),
  1786. scm_from_locale_string (codepoint_string_hi));
  1787. ranges = scm_append (scm_list_2 (ranges,
  1788. scm_list_1 (elt)));
  1789. }
  1790. e3 = scm_cons (scm_from_latin1_symbol ("ranges"),
  1791. ranges);
  1792. return scm_list_3 (e1, e2, e3);
  1793. }
  1794. #undef FUNC_NAME
  1795. #define DEFINE_CHARSET(name, stem) \
  1796. define_charset ("char-set:" name, cs_##stem##_len, cs_##stem##_ranges)
  1797. void
  1798. scm_init_srfi_14 (void)
  1799. {
  1800. scm_tc16_charset = scm_make_smob_type ("character-set", 0);
  1801. scm_set_smob_print (scm_tc16_charset, charset_print);
  1802. scm_tc16_charset_cursor = scm_make_smob_type ("char-set-cursor", 0);
  1803. scm_set_smob_print (scm_tc16_charset_cursor, charset_cursor_print);
  1804. scm_char_set_upper_case = DEFINE_CHARSET ("upper-case", upper_case);
  1805. scm_char_set_lower_case = DEFINE_CHARSET ("lower-case", lower_case);
  1806. scm_char_set_title_case = DEFINE_CHARSET ("title-case", title_case);
  1807. scm_char_set_letter = DEFINE_CHARSET ("letter", letter);
  1808. scm_char_set_digit = DEFINE_CHARSET ("digit", digit);
  1809. scm_char_set_letter_and_digit = DEFINE_CHARSET ("letter+digit",
  1810. letter_plus_digit);
  1811. scm_char_set_graphic = DEFINE_CHARSET ("graphic", graphic);
  1812. scm_char_set_printing = DEFINE_CHARSET ("printing", printing);
  1813. scm_char_set_whitespace = DEFINE_CHARSET ("whitespace", whitespace);
  1814. scm_char_set_iso_control = DEFINE_CHARSET ("iso-control", iso_control);
  1815. scm_char_set_punctuation = DEFINE_CHARSET ("punctuation", punctuation);
  1816. scm_char_set_symbol = DEFINE_CHARSET ("symbol", symbol);
  1817. scm_char_set_hex_digit = DEFINE_CHARSET ("hex-digit", hex_digit);
  1818. scm_char_set_blank = DEFINE_CHARSET ("blank", blank);
  1819. scm_char_set_ascii = DEFINE_CHARSET ("ascii", ascii);
  1820. scm_char_set_empty = DEFINE_CHARSET ("empty", empty);
  1821. scm_char_set_designated = DEFINE_CHARSET ("designated", designated);
  1822. scm_char_set_full = DEFINE_CHARSET ("full", full);
  1823. #include "srfi-14.x"
  1824. }
  1825. /* End of srfi-14.c. */