Utf8CaseGenerate.php 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. <?php
  2. # Copyright (C) 2004,2008 Brion Vibber <brion@pobox.com>
  3. # http://www.mediawiki.org/
  4. #
  5. # This program is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation; either version 2 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License along
  16. # with this program; if not, write to the Free Software Foundation, Inc.,
  17. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. # http://www.gnu.org/copyleft/gpl.html
  19. /**
  20. * This script generates Utf8Case.inc from the Unicode Character Database
  21. * and supplementary files.
  22. *
  23. * @ingroup UtfNormal
  24. * @access private
  25. */
  26. /** */
  // This generator is a maintenance tool: refuse to run under any non-CLI SAPI.
  if ( PHP_SAPI !== 'cli' ) {
      die( "Run me from the command line please.\n" );
  }

  require_once 'UtfNormalUtil.php';

  // Open the Unicode Character Database dump from the current working directory.
  $in = fopen( "UnicodeData.txt", "rt" );
  if ( $in === false ) {
      // Tell the operator where to get the data file, then exit with a failure status.
      echo "Can't open UnicodeData.txt for reading.\n";
      echo "If necessary, fetch this file from the internet:\n";
      echo "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
      exit( -1 );
  }
  // Simple 1:1 case mappings, keyed by the UTF-8 encoding of the source character.
  $wikiUpperChars = array();
  $wikiLowerChars = array();

  print "Reading character definitions...\n";
  while( false !== ( $line = fgets( $in ) ) ) {
      // Records are semicolon-separated. explode() replaces split(), which was
      // part of the POSIX regex extension removed in PHP 7.0; ';' carries no
      // regex meaning, so the resulting columns are the same.
      $columns = explode( ';', rtrim( $line, "\r\n" ) );
      if( count( $columns ) < 14 ) {
          // Blank or truncated record: it has no case-mapping columns to read.
          continue;
      }

      // Column 0 is the code point; columns 12 and 13 hold the simple
      // uppercase and lowercase mappings. All three are hexadecimal strings,
      // and a mapping column is empty when the character has no mapping.
      $codepoint = $columns[0];
      $simpleUpper = $columns[12];
      $simpleLower = $columns[13];

      $source = codepointToUtf8( hexdec( $codepoint ) );
      if( $simpleUpper !== '' ) {
          $wikiUpperChars[$source] = codepointToUtf8( hexdec( $simpleUpper ) );
      }
      if( $simpleLower !== '' ) {
          $wikiLowerChars[$source] = codepointToUtf8( hexdec( $simpleLower ) );
      }
  }
  fclose( $in );
  // Serialise both case maps as PHP source and write them to Utf8Case.php
  // in the current working directory.
  $out = fopen("Utf8Case.php", "wt");
  if( !$out ) {
      print "Can't create file Utf8Case.php\n";
      exit(-1);
  }

  $outUpperChars = escapeArray( $wikiUpperChars );
  $outLowerChars = escapeArray( $wikiLowerChars );
  // Template for the generated file. The lines inside the literal are emitted
  // verbatim, so they must stay unindented here.
  $outdata = "<" . "?php
/**
* Simple 1:1 upper/lowercase switching arrays for utf-8 text
* Won't get context-sensitive things yet
*
* Hack for bugs in ucfirst() and company
*
* These are pulled from memcached if possible, as this is faster than filling
* up a big array manually.
* @ingroup Language
*/
/*
* Translation array to get upper case character
*/
\$wikiUpperChars = $outUpperChars;
/*
* Translation array to get lower case character
*/
\$wikiLowerChars = $outLowerChars;\n";

  // fputs() returns false on error and may report fewer bytes than requested;
  // either case must not be reported as success.
  $written = fputs( $out, $outdata );
  fclose( $out );
  if( $written === false || $written < strlen( $outdata ) ) {
      print "Can't write to file Utf8Case.php\n";
      exit(-1);
  }
  print "Wrote out Utf8Case.php\n";
  /**
   * Render an associative array as the source text of a PHP array() literal.
   *
   * Each key/value pair becomes one line produced by escapeLine(); lines are
   * joined with ",\n" and wrapped in "array(\n" ... "\n)".
   *
   * @param array $arr Map to serialise
   * @return string PHP source for the array literal
   */
  function escapeArray( $arr ) {
      $rows = array();
      foreach ( $arr as $from => $to ) {
          $rows[] = escapeLine( $from, $to );
      }
      $body = implode( ",\n", $rows );
      return "array(\n" . $body . "\n)";
  }
  /**
   * Format one key/value pair as a tab-indented, single-quoted array entry.
   *
   * Both sides are passed through escapeSingleString(), which is not defined
   * in this file (presumably provided by UtfNormalUtil.php).
   *
   * @param mixed $key Array key
   * @param mixed $val Array value
   * @return string A line of the form "\t'key' => 'value'"
   */
  function escapeLine( $key, $val ) {
      $quotedKey = "'" . escapeSingleString( $key ) . "'";
      $quotedVal = "'" . escapeSingleString( $val ) . "'";
      return "\t" . $quotedKey . " => " . $quotedVal;
  }