unidata_to_charset.pl 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. #!/usr/bin/perl
  2. # unidata_to_charset.pl --- Compute SRFI-14 charsets from UnicodeData.txt
  3. #
  4. # Copyright (C) 2009, 2010 Free Software Foundation, Inc.
  5. #
  6. # This library is free software; you can redistribute it and/or
  7. # modify it under the terms of the GNU Lesser General Public
  8. # License as published by the Free Software Foundation; either
  9. # version 3 of the License, or (at your option) any later version.
  10. #
  11. # This library is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. # Lesser General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU Lesser General Public
  17. # License along with this library; if not, write to the Free Software
  18. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  # Input: the Unicode character database.  Output: the generated C fragment.
  # Both handles are file-scoped lexicals that compute() reads and writes.
  my ($in, $out);
  open($in, "<", "UnicodeData.txt")
      || die "Can't open UnicodeData.txt: $!";
  open($out, ">", "srfi-14.i.c")
      || die "Can't open srfi-14.i.c: $!";
  # char-set:lower-case, following Java's specification.  A character is
  # lowercase when all of the following hold:
  #   * it lies outside the range [U+2000,U+2FFF];
  #   * the Unicode attribute table gives no lowercase mapping for it;
  #   * the table gives it an uppercase mapping, or its name contains
  #     the words "SMALL LETTER" or "SMALL LIGATURE".
  #
  # Arguments: codepoint (integer), character name, general category
  # (unused), uppercase mapping, lowercase mapping.  Either mapping may be
  # undef or the empty string when absent.
  # Returns 1 when the character qualifies, 0 otherwise.
  sub lower_case {
      my ($cp, $char_name, undef, $to_upper, $to_lower) = @_;

      # Guard clauses: either of these rules the character out.
      return 0 if $cp >= 0x2000 && $cp <= 0x2FFF;
      return 0 if defined($to_lower) && $to_lower ne "";

      # Either of these rules it in.
      return 1 if defined($to_upper) && $to_upper ne "";
      return 1 if $char_name =~ /(SMALL LETTER|SMALL LIGATURE)/;

      return 0;
  }
  # char-set:upper-case, following Java's specification.  A character is
  # uppercase when all of the following hold:
  #   * it lies outside the range [U+2000,U+2FFF];
  #   * the Unicode attribute table gives no uppercase mapping for it
  #     (this excludes titlecase characters);
  #   * the table gives it a lowercase mapping, or its name contains
  #     the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
  #
  # Arguments: codepoint (integer), character name, general category
  # (unused), uppercase mapping, lowercase mapping.  Either mapping may be
  # undef or the empty string when absent.
  # Returns 1 when the character qualifies, 0 otherwise.
  sub upper_case {
      my ($cp, $char_name, undef, $to_upper, $to_lower) = @_;

      # Guard clauses: either of these rules the character out.
      return 0 if $cp >= 0x2000 && $cp <= 0x2FFF;
      return 0 if defined($to_upper) && $to_upper ne "";

      # Either of these rules it in.
      return 1 if defined($to_lower) && $to_lower ne "";
      return 1 if $char_name =~ /(CAPITAL LETTER|CAPITAL LIGATURE)/;

      return 0;
  }
  # char-set:title-case.  A character is titlecase exactly when its
  # category in the character attribute database is Lt.
  #
  # Only the third argument (the general category) is examined; it may be
  # undef.  Returns 1 for titlecase characters, 0 otherwise.
  sub title_case {
      my (undef, undef, $general_category) = @_;
      return (defined($general_category) && $general_category eq "Lt") ? 1 : 0;
  }
  # char-set:letter.  A letter is any character whose category in the
  # Unicode character database is one of the letter categories
  # (Lu, Ll, Lt, Lm, Lo).
  #
  # Only the third argument (the general category) is examined; it may be
  # undef.  Returns 1 for letters, 0 otherwise.
  sub letter {
      my $general_category = $_[2];
      return 0 unless defined $general_category;

      # Exact-match lookup table of the five letter categories.
      my %is_letter_category = map { $_ => 1 } qw(Lu Ll Lt Lm Lo);
      return $is_letter_category{$general_category} ? 1 : 0;
  }
  # char-set:digit.  A character is a digit when its category in the
  # character attribute database is Nd.  In Latin-1 and ASCII the only such
  # characters are 0123456789; Unicode has further digits in other code
  # blocks, such as Gujarati digits and Tibetan digits.
  #
  # Only the third argument (the general category) is examined; it may be
  # undef.  Returns 1 for digits, 0 otherwise.
  sub digit {
      my (undef, undef, $general_category) = @_;
      return 0 unless defined $general_category;
      return $general_category eq "Nd" ? 1 : 0;
  }
  # char-set:hex-digit.  The only hex digits are 0123456789abcdefABCDEF.
  #
  # Only the first argument (the integer codepoint) is examined.
  # Returns 1 for hex digits, 0 otherwise.
  sub hex_digit {
      my ($cp) = @_;

      # Inclusive codepoint spans: '0'-'9', 'A'-'F', 'a'-'f'.
      for my $span ([0x30, 0x39], [0x41, 0x46], [0x61, 0x66]) {
          my ($low, $high) = @{$span};
          return 1 if $cp >= $low && $cp <= $high;
      }
      return 0;
  }
  # char-set:letter+digit.  The union of char-set:letter and char-set:digit.
  #
  # Takes the same five arguments as the other predicates and forwards them
  # unchanged to letter() and digit().  Returns 1 when either accepts the
  # character, 0 otherwise.
  sub letter_plus_digit {
      my ($cp, $char_name, $general_category, $to_upper, $to_lower) = @_;
      my @record = ($cp, $char_name, $general_category, $to_upper, $to_lower);

      return 1 if letter(@record);
      return 1 if digit(@record);
      return 0;
  }
  # char-set:graphic.  Characters that would 'use ink' when printed.
  #
  # Only the third argument (the general category) is examined: the
  # character is graphic when the category contains one of the capital
  # letters L, M, N, P or S.
  # Returns 1 for graphic characters, 0 otherwise.
  sub graphic {
      my $general_category = $_[2];
      return ($general_category =~ /L|M|N|P|S/) ? 1 : 0;
  }
  # char-set:whitespace.  A whitespace character is either
  #   * a character with one of the space, line, or paragraph separator
  #     categories (Zs, Zl or Zp) of the Unicode character database, or
  #   * one of these control characters:
  #       U+0009 Horizontal tabulation (\t control-I)
  #       U+000A Line feed             (\n control-J)
  #       U+000B Vertical tabulation   (\v control-K)
  #       U+000C Form feed             (\f control-L)
  #       U+000D Carriage return       (\r control-M)
  #
  # Examines the codepoint (first argument) and the general category (third
  # argument).  Returns 1 for whitespace, 0 otherwise.
  sub whitespace {
      my ($cp, undef, $general_category) = @_;

      return 1 if $general_category =~ /Zs|Zl|Zp/;

      # U+0009 through U+000D, each compared for exact equality.
      return 1 if grep { $cp == $_ } 0x9, 0xA, 0xB, 0xC, 0xD;

      return 0;
  }
  # char-set:printing.  A printing character is one that would occupy space
  # when printed, i.e. a graphic character or a space character:
  # char-set:printing is the union of char-set:whitespace and
  # char-set:graphic.
  #
  # Takes the same five arguments as the other predicates and forwards them
  # unchanged to whitespace() and graphic().  Returns 1 when either accepts
  # the character, 0 otherwise.
  sub printing {
      my ($cp, $char_name, $general_category, $to_upper, $to_lower) = @_;
      my @record = ($cp, $char_name, $general_category, $to_upper, $to_lower);

      return 1 if whitespace(@record);
      return 1 if graphic(@record);
      return 0;
  }
  # char-set:iso-control.  The ISO control characters are the
  # Unicode/Latin-1 characters in the ranges [U+0000,U+001F] and
  # [U+007F,U+009F].
  #
  # Only the first argument (the integer codepoint) is examined.
  # Returns 1 for control characters, 0 otherwise.
  sub iso_control {
      my ($cp) = @_;

      return 1 if $cp >= 0x00 && $cp <= 0x1F;
      return 1 if $cp >= 0x7F && $cp <= 0x9F;
      return 0;
  }
  # char-set:punctuation.  A punctuation character is any character that
  # has one of the punctuation categories in the Unicode character database
  # (Pc, Pd, Ps, Pe, Pi, Pf, or Po).
  #
  # Note that SRFI-14 gives conflicting requirements: it claims that only
  # the Unicode punctuation is necessary, yet explicitly calls out the soft
  # hyphen character (U+00AD) as punctuation.  Current versions of Unicode
  # consider U+00AD to be a formatting character, not punctuation.  This
  # predicate consults the category alone and makes no special case for
  # U+00AD.
  #
  # Only the third argument (the general category) is examined.
  # Returns 1 when the category contains a capital P, 0 otherwise.
  sub punctuation {
      my $general_category = $_[2];
      return ($general_category =~ /P/) ? 1 : 0;
  }
  # char-set:symbol.  A symbol is any character that has one of the symbol
  # categories in the Unicode character database (Sm, Sc, Sk, or So).
  #
  # Only the third argument (the general category) is examined.
  # Returns 1 when the category contains a capital S, 0 otherwise.
  sub symbol {
      my $general_category = $_[2];
      return ($general_category =~ /S/) ? 1 : 0;
  }
  # char-set:blank.  Blank chars are horizontal whitespace.  A blank
  # character is either
  #   * a character with the space separator category (Zs) in the Unicode
  #     character database, or
  #   * U+0009 Horizontal tabulation (\t control-I).
  #
  # Examines the codepoint (first argument) and the general category (third
  # argument).  Returns 1 for blanks, 0 otherwise.
  sub blank {
      my ($cp, undef, $general_category) = @_;

      return 1 if $general_category =~ /Zs/;
      return 1 if $cp == 0x9;
      return 0;
  }
  # char-set:ascii.  Every codepoint up to and including U+007F.
  #
  # Only the first argument (the integer codepoint) is examined.
  # Returns 1 for ASCII characters, 0 otherwise.
  sub ascii {
      my ($cp) = @_;
      return $cp <= 0x7F ? 1 : 0;
  }
  # char-set:empty.  No character belongs to this set: the predicate
  # ignores its arguments and always returns 0.
  sub empty {
      return 0;
  }
  # char-set:designated.  All characters except for the surrogates.
  #
  # Only the third argument (the general category) is examined: a category
  # containing "Cs" marks a surrogate.
  # Returns 0 for surrogates, 1 for everything else.
  sub designated {
      my $general_category = $_[2];
      return ($general_category =~ /Cs/) ? 0 : 1;
  }
  # compute NAME
  #
  # Generate the two C structures that describe one character set and write
  # them to $out:
  #   * scm_t_char_range cs_NAME_ranges[] -- the inclusive [start, end]
  #     codepoint ranges whose characters satisfy the predicate, and
  #   * scm_t_char_set cs_NAME -- the number of ranges plus a pointer to
  #     that array.
  #
  # NAME is the name of a predicate sub.  It is called as
  # NAME(codepoint, name, category, uppercase, lowercase) for each record
  # read from $in and must return 1 for characters that belong to the set.
  #
  # Side effects: rewinds and re-reads $in, appends to $out, and prints NAME
  # to STDOUT as a progress indicator.  Dies if the rewind fails or if a
  # "First" range marker is not followed by another line.
  sub compute {
      my ($f) = @_;

      # Resolve the predicate once.  Taking a reference with \&{...} is the
      # by-name lookup that "strict refs" permits, unlike calling &$f directly.
      my $predicate = \&{$f};

      # Completed ranges, each stored as [start, end].  $start/$end describe
      # the range currently being grown; -1 means no range is open yet.
      my @ranges;
      my $start = -1;
      my $end   = -1;

      seek($in, 0, 0) or die "Can't seek to beginning of file: $!";
      print "$f\n";

      # A named lexical is used instead of $_ so the predicates and their
      # regex matches cannot disturb the line being processed.
      while (my $record = <$in>) {
          chomp $record;

          # A blank line carries no record; without this guard it would be
          # treated as codepoint 0 with an undefined name and category.
          next if $record !~ /\S/;

          # Parse the semicolon-delimited UnicodeData.txt record.
          my @fields = split(/;/, $record);

          # The codepoint: an integer
          my $codepoint = hex($fields[0]);
          # If this is a character range, the last character in this range
          my $codepoint_end = $codepoint;
          # The name of the character
          my $name = $fields[1];
          # A two-character category code, such as Ll (lower-case letter)
          my $category = $fields[2];
          # The codepoint of the uppercase version of this char
          my $uppercase = $fields[12];
          # The codepoint of the lowercase version of this char
          my $lowercase = $fields[13];

          my $pass = $predicate->($codepoint, $name, $category,
                                  $uppercase, $lowercase);
          next if $pass != 1;

          # Some pairs of lines in UnicodeData.txt delimit ranges of
          # characters: the line after a "First" entry gives the range's end.
          if ($name =~ /First/) {
              my $last_record = <$in>;

              # Test the read itself rather than $!: errno is only meaningful
              # right after a failing call and may hold a stale value here.
              defined($last_record)
                  or die "Range opened by '$name' has no closing line";
              $codepoint_end = hex((split(/;/, $last_record))[0]);
          }

          # Compute ranges of characters [start:end] that meet the criteria.
          if ($start == -1) {
              # First matching character: open the first range.
              ($start, $end) = ($codepoint, $codepoint_end);
          }
          elsif ($end + 1 == $codepoint) {
              # Adjacent to the open range: extend it.
              $end = $codepoint_end;
          }
          else {
              # Gap: store the open range and start a new one.
              push @ranges, [$start, $end];
              ($start, $end) = ($codepoint, $codepoint_end);
          }
      }

      # The range still open when the input ends has not been stored yet.
      push @ranges, [$start, $end] if $start != -1;

      # Print the C struct that contains the range list.  Entries are
      # separated by commas, with no comma after the final entry.
      print $out "scm_t_char_range cs_" . $f . "_ranges[] = {\n";
      if (@ranges) {
          my @entries = map { sprintf(" {0x%04x, 0x%04x}", @{$_}) } @ranges;
          print {$out} join(",\n", @entries), "\n";
      }
      print $out "};\n\n";

      # Print the C struct that contains the range list length and pointer
      # to the range list.
      my $len = scalar @ranges;
      print $out "scm_t_char_set cs_${f} = {\n";
      print $out " $len,\n";
      print $out " cs_" . $f . "_ranges\n";
      print $out "};\n\n";
  }
  # Write a bit of a header
  print $out "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n";
  print $out "/* This file is #include'd by srfi-14.c. */\n\n";
  print $out "/* This file was generated from\n";
  print $out " http://unicode.org/Public/UNIDATA/UnicodeData.txt\n";
  print $out " with the unidata_to_charset.pl script. */\n\n";

  # Write the C structs for each SRFI-14 charset, in this fixed order.
  # Each name is the name of a predicate sub defined earlier in this file.
  for my $charset (qw(
      lower_case upper_case title_case
      letter digit hex_digit letter_plus_digit
      graphic whitespace printing iso_control
      punctuation symbol blank
      ascii empty designated
  )) {
      compute($charset);
  }

  close $in;

  # Buffered output is flushed at close, so a failed write (for example a
  # full disk) may only be reported here.  Stop rather than hand a truncated
  # file on to 'indent'.
  close($out) or die "Can't close srfi-14.i.c: $!";

  # Reformat the generated file.  exec replaces this process and only
  # returns if 'indent' could not be started; that is reported but is not
  # treated as fatal.
  exec ('indent srfi-14.i.c') or print STDERR "call to 'indent' failed: $!";
  # And we're done.