unicode-muncher.pl 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546
  1. #!/usr/bin/perl -w
  2. # unicode-muncher.pl -- generate Unicode database for java.lang.Character
  3. # Copyright (C) 1998, 2002 Free Software Foundation, Inc.
  4. #
  5. # This file is part of GNU Classpath.
  6. #
  7. # GNU Classpath is free software; you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation; either version 2, or (at your option)
  10. # any later version.
  11. #
  12. # GNU Classpath is distributed in the hope that it will be useful, but
  13. # WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. # General Public License for more details.
  16. #
  17. # You should have received a copy of the GNU General Public License
  18. # along with GNU Classpath; see the file COPYING. If not, write to the
  19. # Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  20. # 02111-1307 USA.
  21. #
  22. # Linking this library statically or dynamically with other modules is
  23. # making a combined work based on this library. Thus, the terms and
  24. # conditions of the GNU General Public License cover the whole
  25. # combination.
  26. #
  27. # As a special exception, the copyright holders of this library give you
  28. # permission to link this library with independent modules to produce an
  29. # executable, regardless of the license terms of these independent
  30. # modules, and to copy and distribute the resulting executable under
  31. # terms of your choice, provided that you also meet, for each linked
  32. # independent module, the terms and conditions of the license of that
  33. # module. An independent module is a module which is not derived from
  34. # or based on this library. If you modify this library, you may extend
  35. # this exception to your version of the library, but you are not
  36. # obligated to do so. If you do not wish to do so, delete this
  37. # exception statement from your version.
  38. # Code for reading UnicodeData.txt and generating the code for
  39. # gnu.java.lang.CharData. For now, the relevant Unicode definition files
  40. # are found in libjava/gnu/gcj/convert/.
  41. #
  42. # Inspired by code from Jochen Hoenicke.
  43. # author Eric Blake <ebb9@email.byu.edu>
  44. #
  45. # Usage: ./unicode-muncher.pl <UnicodeData.txt> <CharData.java>
  46. # where <UnicodeData.txt> is obtained from www.unicode.org (named
  47. # UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <CharData.java>
  48. # is the final location for the Java interface gnu.java.lang.CharData.
  49. # As of JDK 1.4, use Unicode version 3.0.0 for best results.
  ##
  ## Render one 16-bit code unit as the text that represents it inside a
  ## Java String literal.
  ##
  ## Argument: an integer in -0x8000 .. 0xffff.  Negative values are taken
  ##   as signed 16-bit quantities and wrapped into 0x8000 .. 0xffff.
  ## Returns: a three-digit octal escape for values below 0x20, a backslash
  ##   escape for '"' (0x22) and '\' (0x5c), the character itself for the
  ##   remaining values below 0x7f, and a \uXXXX escape for everything else.
  ## Dies when the argument lies outside the accepted range.
  ##
  sub javaChar($) {
      my $code = shift;
      if ($code < -0x8000 || $code > 0xffff) {
          die "Out of range: $code\n";
      }

      # Fold a signed 16-bit value onto its unsigned equivalent.
      $code += 0x10000 if $code < 0;

      my $literal;
      if ($code < 0x20) {
          # Octal is shorter than \uXXXX for these values.
          $literal = sprintf("\\%03o", $code);
      }
      elsif ($code == 0x22) {
          $literal = "\\\"";
      }
      elsif ($code == 0x5c) {
          $literal = "\\\\";
      }
      elsif ($code < 0x7f) {
          # Printable ASCII goes out unescaped.
          $literal = chr($code);
      }
      else {
          $literal = sprintf("\\u%04x", $code);
      }
      return $literal;
  }
  64. ##
  65. ## Convert the text UnicodeData file from www.unicode.org into a Java
  66. ## interface with string constants holding the compressed information.
  67. ##
  # Unicode general category codes.  The position of a code in this list is
  # the type number recorded for a character.  SKIPPED occupies index 17 so
  # that the codes after it keep their positions (presumably it matches no
  # category that occurs in the data file).
  my @TYPECODES = qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf
                     SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf);
  # Bidirectional class codes; the list position is the recorded direction.
  my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);
  # Flag bits that are or'ed into the type number.
  my $NOBREAK_FLAG = 32;
  my $MIRRORED_FLAG = 64;
  # Per code point: pack("n5", type, numValue, upper, lower, direction).
  my @info = ();
  # Packed (character, titlecase character) pairs for the title exceptions.
  my $titlecase = "";
  # Lines read so far; only used to pace the progress dots.
  my $count = 0;
  # First code point of a "<..., First>" range that still awaits its
  # "<..., Last>" line, or 0 when no range is open.
  my $range = 0;

  # Reverse lookup tables (code => list position) so each line costs one
  # hash probe instead of a linear regex search through the lists.
  my %typeIndex = map { $TYPECODES[$_] => $_ } 0 .. $#TYPECODES;
  my %dirIndex = map { $DIRCODES[$_] => $_ } 0 .. $#DIRCODES;

  die "Usage: $0 <UnicodeData.txt> <CharData.java>" unless @ARGV == 2;
  # Three-argument open: the file name can never be mistaken for a mode.
  open(my $unicode, '<', $ARGV[0])
      or die "Can't open Unicode attribute file: $!\n";

  # Stage 1: Parse the attribute file
  $| = 1;
  print "GNU Classpath Unicode Attribute Database Generator 2.0\n";
  print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
  print "Parsing attributes file";
  while (my $line = <$unicode>) {
      print "." unless $count++ % 1000;
      chomp $line;
      $line =~ s/\r//g;
      next unless $line =~ /\S/;    # tolerate blank lines

      # The -1 limit keeps trailing empty fields; without it the case
      # mapping fields of most lines come back undefined and every use of
      # them warns under -w.
      my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef,
          $numeric, $mirrored, undef, undef, $upcase, $lowcase, $title)
          = split /;/, $line, -1;
      $ch = hex($ch);
      # Skip code points above U+FFFF: the tables built below only cover
      # the 16-bit range 0 .. 0xffff.
      next if $ch > 0xffff;

      die "$ch: Unknown type: $category" unless exists $typeIndex{$category};
      my $type = $typeIndex{$category};
      $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
      $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);

      # Numeric value: the integer itself, -1 when the field is empty, -2
      # when the field holds something other than a plain integer.
      my $numValue;
      if ($numeric =~ /^[0-9]+$/) {
          $numValue = $numeric;
          die "numValue too big: $ch, $numValue\n" if $numValue >= 0x7fff;
      } elsif ($numeric eq "") {
          # Letters A-Z and a-z, plain (0x41.., 0x61..) and in the
          # 0xff21.. / 0xff41.. ranges, are given the values 10 .. 35.
          if ($ch >= 0x0041 && $ch <= 0x005a) {
              $numValue = $ch - 0x0037;
          } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
              $numValue = $ch - 0x0057;
          } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
              $numValue = $ch - 0xff17;
          } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
              $numValue = $ch - 0xff37;
          } else {
              $numValue = -1;
          }
      } else {
          $numValue = -2;
      }

      # Case mappings are stored as the distance from the character itself.
      my $upperchar = $upcase ? hex($upcase) - $ch : 0;
      my $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
      # Only characters whose titlecase differs from their uppercase are
      # listed; an empty title field means the character maps to itself.
      if ($title ne $upcase) {
          my $titlechar = $title ? hex($title) : $ch;
          $titlecase .= pack("n2", $ch, $titlechar);
      }

      # Unknown bidirectional classes are recorded as -1.
      my $direction = exists $dirIndex{$bidir} ? $dirIndex{$bidir} : -1;

      my $record = pack("n5", $type, $numValue, $upperchar, $lowerchar,
                        $direction);
      if ($range) {
          # This line must close the open range; every code point strictly
          # between its two ends receives the same record.
          die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
          for ($range + 1 .. $ch - 1) {
              $info[$_] = $record;
          }
          $range = 0;
      } elsif ($name =~ /First>$/) {
          $range = $ch;
      }
      $info[$ch] = $record;
  }
  close $unicode;
  # A range that was opened but never closed would silently leave its
  # interior unassigned.
  die "Unterminated range starting at $range\n" if $range;
  # Stage 2: Compress the data structures
  #
  # Walk all 65536 code points and intern their packed records.  @charinfo
  # receives one [numValue, upper, lower, direction] row per distinct
  # record, %charhash maps a record to its row number, and $info collects
  # one 16-bit word per code point: (row number << 7) | type-with-flags.
  # NOTE(review): the interning key is the whole record, type included, so
  # rows with equal attributes but different types are stored separately.
  print "\nCompressing data structures";
  $count = 0;
  my $info = '';
  my %charhash = ();
  my @charinfo = ();
  foreach my $ch (0x0000 .. 0xffff) {
      print "." if $count++ % 0x1000 == 0;

      # Code points absent from the data file get type 0, no numeric
      # value, no case mapping and no direction.
      $info[$ch] = pack("n5", 0, -1, 0, 0, -1) unless defined $info[$ch];

      my $record = $info[$ch];
      my ($type, @attributes) = unpack("n5", $record);
      unless (exists $charhash{$record}) {
          push @charinfo, [ @attributes ];
          $charhash{$record} = $#charinfo;
      }
      $info .= pack("n", ($charhash{$record} << 7) | $type);
  }
  # Try every block size 2**3 .. 2**8: cut $info into blocks, keep the
  # distinct ones, overlap blocks whose tail equals another block's head,
  # and remember the shift whose tables need the fewest bytes.
  # Results: $bestshift, $bestblkstr (the merged data) and $bestest (bytes).
  my $charlen = scalar @charinfo;
  my $bestshift;
  my $bestest = 1000000;
  my $bestblkstr;
  # The row number has to fit into the 9 bits above the 7 type/flag bits.
  die "Too many unique character entries: $charlen\n" if $charlen > 512;
  print "\nUnique character entries: $charlen\n";
  foreach my $shift (3 .. 8) {
      my $blksize = 1 << $shift;
      my %seen = ();
      my @blkarray = ();
      print "shift: $shift";

      # Collect the distinct blocks in order of first appearance.
      for (my $start = 0; $start < 0x10000; $start += $blksize) {
          my $blkkey = substr($info, 2 * $start, 2 * $blksize);
          next if exists $seen{$blkkey};
          push @blkarray, $blkkey;
          $seen{$blkkey} = $#blkarray;
      }
      my $blknum = @blkarray;
      my $blocklen = $blknum * $blksize;
      printf " before %5d", $blocklen;

      # Now we try to pack the blkarray as tight as possible by finding
      # matching heads and tails, longest overlap first.
      foreach my $overlap (reverse 1 .. $blksize - 1) {
          my $bytes = 2 * $overlap;    # two bytes per entry

          # Index every surviving block by its last $overlap entries.
          my %tails = ();
          foreach my $k (0 .. $#blkarray) {
              next unless defined $blkarray[$k];
              my $tail = substr($blkarray[$k], length($blkarray[$k]) - $bytes);
              push @{ $tails{$tail} }, $k;
          }

          # tails are calculated, now calculate the heads and merge.
        BLOCK:
          foreach my $k (0 .. $#blkarray) {
              next BLOCK unless defined $blkarray[$k];
              my $tomerge = $k;
              while (1) {
                  my $head = substr($blkarray[$tomerge], 0, $bytes);
                  my $entry = $tails{$head};
                  next BLOCK unless defined $entry;
                  my $other = shift @$entry;
                  if ($other == $tomerge) {
                      # A block cannot be merged with itself: put it back
                      # at the end of the list and use the next candidate,
                      # if there is one.
                      my $only_self = !@$entry;
                      push @$entry, $other;
                      next BLOCK if $only_self;
                      $other = shift @$entry;
                  }
                  delete $tails{$head} unless @$entry;

                  # a match was found: $other followed by the part of
                  # $tomerge that lies beyond the shared entries
                  my $merge = $blkarray[$other]
                      . substr($blkarray[$tomerge], $bytes);
                  $blocklen -= $overlap;
                  $blknum--;
                  if ($other < $tomerge) {
                      # Keep the result in the lower slot and let it take
                      # over $tomerge's place in the tail index.
                      $blkarray[$tomerge] = undef;
                      $blkarray[$other] = $merge;
                      my $tail = substr($merge, length($merge) - $bytes);
                      $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
                                        @{ $tails{$tail} } ];
                      next BLOCK;
                  }
                  # Otherwise the result stays in $tomerge's slot and the
                  # search continues with its new head.
                  $blkarray[$tomerge] = $merge;
                  $blkarray[$other] = undef;
              }
          }
      }

      # Concatenate whatever survived the merging.
      my $blockstr = join '', grep { defined } @blkarray;
      die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;

      # Two bytes per merged entry plus two bytes per block offset.
      my $estimate = 2 * $blocklen + (0x20000 >> $shift);
      printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
      if ($estimate < $bestest) {
          ($bestest, $bestshift, $bestblkstr) = ($estimate, $shift, $blockstr);
      }
  }
  # For each block of the chosen size, find where its contents sit in the
  # merged data and record that position relative to the block's first
  # character, reduced to 16 bits.
  my @blocks;
  my $blksize = 1 << $bestshift;
  foreach my $blk (0 .. (0x10000 >> $bestshift) - 1) {
      my $j = $blk * $blksize;
      my $blkkey = substr($info, 2 * $j, 2 * $blksize);
      my $index = index($bestblkstr, $blkkey);
      # Entries are two bytes wide, so only an even byte offset is a real
      # hit.  Keep searching past odd offsets; -1 (no match) is odd too.
      while ($index % 2) {
          die "not found: $j" if $index == -1;
          $index = index($bestblkstr, $blkkey, $index + 1);
      }
      push @blocks, ($index / 2 - $j) & 0xffff;
  }
  # Phase 3: Generate the file
  #
  # A Java class file stores a String constant in at most 0xffff bytes of
  # UTF-8 and one char may need three bytes, hence the limit of 0xffff / 3
  # characters per table.
  die "UTF-8 limit of blocks may be exceeded: " . scalar(@blocks) . "\n"
      if @blocks > 0xffff / 3;
  # $bestblkstr holds two bytes per DATA character, so count characters
  # (not bytes) to apply the same limit as for BLOCKS.
  die "UTF-8 limit of data may be exceeded: "
      . (length($bestblkstr) / 2) . "\n"
      if length($bestblkstr) / 2 > 0xffff / 3;

  ##
  ## Write the initializer of a Java String constant to a filehandle.
  ##
  ## Arguments: the output filehandle, followed by the 16-bit values that
  ##   make up the string, in order.
  ## Output: ` = "..."` for the first eleven characters, then one
  ##   `+ "..."` continuation line per further eleven characters.  An empty
  ##   list yields ` = ""` so that the generated field always has a value.
  ## The terminating `;` is left to the caller.
  ##
  sub emitJavaString {
      my ($out, @values) = @_;
      if (!@values) {
          print {$out} " = \"\"";
          return;
      }
      my $opener = " = \"";
      while (my @row = splice(@values, 0, 11)) {
          print {$out} $opener, (map { javaChar($_) } @row), "\"";
          $opener = "\n + \"";
      }
      return;
  }

  {
      print "Generating $ARGV[1] with shift of $bestshift";

      # The input file name ends up inside a Java string literal, so quote
      # any backslash or double quote it contains.
      (my $source = $ARGV[0]) =~ s/(["\\])/\\$1/g;

      open my $out, '>', $ARGV[1] or die "Failed creating output file: $!\n";
      print {$out} <<EOF;
/* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
Copyright (C) 2002 Free Software Foundation, Inc.
*** This file is generated by scripts/unicode-muncher.pl ***
This file is part of GNU Classpath.
GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING. If not, write to the
Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA.
Linking this library statically or dynamically with other modules is
making a combined work based on this library. Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.
As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module. An independent module is a module which is not derived from
or based on this library. If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so. If you do not wish to do so, delete this
exception statement from your version. */
package gnu.java.lang;
/**
* This contains the info about the unicode characters, that
* java.lang.Character needs. It is generated automatically from
* <code>$ARGV[0]</code>, by some
* perl scripts. This Unicode definition file can be found on the
* <a href="http://www.unicode.org">http://www.unicode.org</a> website.
* JDK 1.4 uses Unicode version 3.0.0.
*
* The data is stored as string constants, but Character will convert these
* Strings to their respective <code>char[]</code> components. The field
* <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
* characters within <code>DATA</code>. The DATA field, in turn, stores
* information about each character in the low order bits, and an offset
* into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
* <code>NUM_VALUE</code>, and <code>DIRECTION</code>. Notice that the
* attribute tables are much smaller than 0xffff entries; as many characters
* in Unicode share common attributes. Finally, there is a listing for
* <code>TITLE</code> exceptions (most characters just have the same
* title case as upper case).
*
* \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
* Eric Blake)
* \@see Character
*/
public interface CharData
{
/**
* The Unicode definition file that was parsed to build this database.
*/
String SOURCE = \"$source\";
/**
* The character shift amount to look up the block offset. In other words,
* <code>(char) (BLOCKS.value[ch >> SHIFT] + ch)</code> is the index where
* <code>ch</code> is described in <code>DATA</code>.
*/
int SHIFT = $bestshift;
/**
* The mapping of character blocks to their location in <code>DATA</code>.
* Each entry has been adjusted so that the 16-bit sum with the desired
* character gives the actual index into <code>DATA</code>.
*/
String BLOCKS
EOF
      emitJavaString($out, @blocks);

      print {$out} <<EOF;
;
/**
* Information about each character. The low order 5 bits form the
* character type, the next bit is a flag for non-breaking spaces, and the
* next bit is a flag for mirrored directionality. The high order 9 bits
* form the offset into the attribute tables. Note that this limits the
* number of unique character attributes to 512, which is not a problem
* as of Unicode version 3.2.0, but may soon become one.
*/
String DATA
EOF
      emitJavaString($out, unpack("n*", $bestblkstr));

      print {$out} <<EOF;
;
/**
* This is the attribute table for computing the numeric value of a
* character. The value is -1 if Unicode does not define a value, -2
* if the value is not a positive integer, otherwise it is the value.
* Note that this is a signed value, but stored as an unsigned char
* since this is a String literal.
*/
String NUM_VALUE
EOF
      emitJavaString($out, map { $_->[0] } @charinfo);

      print {$out} <<EOF;
;
/**
* This is the attribute table for computing the uppercase representation
* of a character. The value is the signed difference between the
* character and its uppercase version. Note that this is stored as an
* unsigned char since this is a String literal.
*/
String UPPER
EOF
      emitJavaString($out, map { $_->[1] } @charinfo);

      print {$out} <<EOF;
;
/**
* This is the attribute table for computing the lowercase representation
* of a character. The value is the signed difference between the
* character and its lowercase version. Note that this is stored as an
* unsigned char since this is a String literal.
*/
String LOWER
EOF
      emitJavaString($out, map { $_->[2] } @charinfo);

      print {$out} <<EOF;
;
/**
* This is the attribute table for computing the directionality class
* of a character. At present, the value is in the range 0 - 18 if the
* character has a direction, otherwise it is -1. Note that this is
* stored as an unsigned char since this is a String literal.
*/
String DIRECTION
EOF
      emitJavaString($out, map { $_->[3] } @charinfo);

      print {$out} <<EOF;
;
/**
* This is the listing of titlecase special cases (all other character
* can use <code>UPPER</code> to determine their titlecase). The listing
* is a sequence of character pairs; converting the first character of the
* pair to titlecase produces the second character.
*/
String TITLE
EOF
      emitJavaString($out, unpack("n*", $titlecase));

      print {$out} ";\n}\n";
      # Buffered write errors only show up when the handle is closed.
      close $out or die "Failed writing output file: $!\n";
  }
  print "\nDone.\n";