YarrCanonicalizeUCS2.js 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. /*
  2. * Copyright (C) 2012 Apple Inc. All rights reserved.
  3. *
  4. * Redistribution and use in source and binary forms, with or without
  5. * modification, are permitted provided that the following conditions
  6. * are met:
  7. * 1. Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * 2. Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. *
  13. * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  14. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16. * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
  17. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18. * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19. * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20. * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21. * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24. */
  // Canonicalize a character code for case-insensitive matching.
  // See ES 5.1, 15.10.2.8.
  //
  // ch: a character code (number).
  // Returns the character code of the upper-cased character. ch is returned
  // unchanged when upper-casing produces more than one code unit, or when it
  // would map a code >= 128 onto a code < 128.
  function canonicalize(ch)
  {
      var upper = String.fromCharCode(ch).toUpperCase();
      if (upper.length <= 1) {
          var upperCode = upper.charCodeAt(0);
          var crossesIntoAscii = ch >= 128 && upperCode < 128;
          if (!crossesIntoAscii)
              return upperCode;
      }
      // Multi-unit expansion, or a non-ASCII character folding into ASCII.
      return ch;
  }
  // Upper bounds (inclusive) of the UCS2 and Latin-1 character code ranges.
  var MAX_UCS2 = 0xFFFF;
  var MAX_LATIN = 0xFF;
  // Sparse array indexed by canonical value; each present entry is the list
  // of character codes that canonicalize to that value.
  var groupedCanonically = [];
  // Pass 1: walk every character code in ascending order and file it under
  // its canonical value.
  var i = 0;
  while (i <= MAX_UCS2) {
      var ch = canonicalize(i);
      groupedCanonically[ch] = groupedCanonically[ch] || [];
      groupedCanonically[ch].push(i);
      ++i;
  }
  var typeInfo = [];
  var latinTypeInfo = [];
  var characterSetInfo = [];
  // Pass 2: populate typeInfo, latinTypeInfo & characterSetInfo. Every table
  // entry is a "TypeName:payload" string for one character code:
  //  - typeInfo records how the character relates to the other characters it
  //    canonicalizes with (unique, member of a set, or one of a pair).
  //  - latinTypeInfo records the same relationship restricted to character
  //    codes <= MAX_LATIN.
  //  - characterSetInfo collects every group of more than two characters; the
  //    payload of a "CanonicalizeSet" entry is an index into it.
  // NOTE(review): the type names are presumably enumerators declared in
  // YarrCanonicalizeUCS2.h - confirm against that header.
  for (var cu = 0; cu < groupedCanonically.length; ++cu) {
      // The set of characters that canonicalize to cu. groupedCanonically is
      // sparse, so skip values that no character canonicalizes to.
      var characters = groupedCanonically[cu];
      if (!characters)
          continue;
      // If there is only one, it is unique.
      if (characters.length === 1) {
          typeInfo[characters[0]] = "CanonicalizeUnique:0";
          latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
          continue;
      }
      // Sort numerically, so characters[0] is the smallest code in the group.
      characters.sort(function(x, y) { return x - y; });
      // If there are more than two characters, create an entry in characterSetInfo.
      if (characters.length > 2) {
          for (var i = 0; i < characters.length; ++i)
              typeInfo[characters[i]] = "CanonicalizeSet:" + characterSetInfo.length;
          characterSetInfo.push(characters);
          // The latin table can only point non-latin members at a single latin one.
          if (characters[1] <= MAX_LATIN)
              throw new Error("sets with more than one latin character not supported!");
          if (characters[0] <= MAX_LATIN) {
              for (i = 0; i < characters.length; ++i)
                  latinTypeInfo[characters[i]] = "CanonicalizeLatinOther:" + characters[0];
              // The latin member itself overrides the entry written by the loop above.
              latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0";
          } else {
              for (i = 0; i < characters.length; ++i)
                  latinTypeInfo[characters[i]] = "CanonicalizeLatinInvalid:0";
          }
          continue;
      }
      // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner.
      var lo = characters[0];
      var hi = characters[1];
      var delta = hi - lo;
      if (delta === 1) {
          // Adjacent partners: "Aligned" when the low partner is even, "Unaligned" when odd.
          var type = (lo & 1) ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0";
          typeInfo[lo] = type;
          typeInfo[hi] = type;
      } else {
          // Payload is the distance between the two partners.
          typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
          typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
      }
      if (lo > MAX_LATIN) {
          // Neither partner is latin.
          latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0";
          latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
      } else if (hi > MAX_LATIN) {
          // Only the low partner is latin; the high partner refers back to it.
          latinTypeInfo[lo] = "CanonicalizeLatinSelf:0";
          latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
      } else {
          // Both partners are latin: they must differ exactly by the 0x20 bit,
          // with that bit clear in the low partner.
          if (delta !== 0x20 || (lo & 0x20))
              throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
          latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
          latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
      }
  }
  // Collapse a per-character table of type strings into maximal runs.
  //
  // types: array indexed by character code (0..MAX_UCS2) holding type strings.
  // Returns an array of {begin, end, type} objects, in ascending order, where
  // begin..end (inclusive) is a run of consecutive character codes that all
  // share the same type string.
  function coalesceRanges(types)
  {
      var ranges = [];
      for (var end = 0; end <= MAX_UCS2; ++end) {
          var begin = end;
          var type = types[end];
          // Extend the run while the next character has the same type.
          while (end < MAX_UCS2 && types[end + 1] === type)
              ++end;
          ranges.push({begin:begin, end:end, type:type});
      }
      return ranges;
  }
  // Pass 3: coalesce types into ranges.
  var rangeInfo = coalesceRanges(typeInfo);
  // Pass 4: coalesce latin-1 types into ranges.
  var latinRangeInfo = coalesceRanges(latinTypeInfo);
  // Format a value as a C uint16_t literal.
  //
  // x: a number, or a string that Number() can convert.
  // Returns "0x", then the lower-case hexadecimal digits of x left-padded with
  // zeros to at least four characters, then the "u" suffix.
  function hex(x)
  {
      var digits = Number(x).toString(16);
      for (var missing = 4 - digits.length; missing > 0; --missing)
          digits = "0" + digits;
      return "0x" + digits + "u";
  }
  // License header placed at the top of the generated file: one array entry
  // per output line, joined with newlines (no trailing newline).
  var copyright = [
      "/*",
      " * Copyright (C) 2012 Apple Inc. All rights reserved.",
      " *",
      " * Redistribution and use in source and binary forms, with or without",
      " * modification, are permitted provided that the following conditions",
      " * are met:",
      " * 1. Redistributions of source code must retain the above copyright",
      " * notice, this list of conditions and the following disclaimer.",
      " * 2. Redistributions in binary form must reproduce the above copyright",
      " * notice, this list of conditions and the following disclaimer in the",
      " * documentation and/or other materials provided with the distribution.",
      " *",
      " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY",
      " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE",
      " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR",
      " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR",
      " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,",
      " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,",
      " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR",
      " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY",
      " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT",
      " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE",
      // The trailing space on the next line is part of the emitted text.
      " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ",
      " */"
  ].join("\n");
  // Emit the generated C++ source, one print() call per output line.
  // print is not defined in this file; presumably it is supplied by the
  // JavaScript shell that runs this script.

  // Print one C initializer row per range: begin, end, payload value, type name.
  // ranges: array of {begin, end, type} objects where type is "TypeName:payload".
  function printRanges(ranges)
  {
      for (var r = 0; r < ranges.length; ++r) {
          var info = ranges[r];
          var typeAndValue = info.type.split(':');
          print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
      }
  }

  print(copyright);
  print();
  print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
  print();
  print('#include "config.h"');
  print('#include "YarrCanonicalizeUCS2.h"');
  print();
  print("namespace JSC { namespace Yarr {");
  print();
  // NOTE(review): this #include is emitted after the namespace has been
  // opened; left unchanged to keep the generated output identical - confirm
  // whether it should precede the namespace.
  print("#include <stdint.h>");
  print();
  // One zero-terminated array per canonicalization set.
  for (var setIndex = 0; setIndex < characterSetInfo.length; ++setIndex) {
      var set = characterSetInfo[setIndex];
      var members = "";
      for (var j = 0; j < set.length; ++j)
          members += hex(set[j]) + ", ";
      print("uint16_t ucs2CharacterSet" + setIndex + "[] = { " + members + "0 };");
  }
  print();
  // Table of pointers to the per-set arrays, in the same order as above.
  print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";");
  print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
  for (setIndex = 0; setIndex < characterSetInfo.length; ++setIndex)
      print(" ucs2CharacterSet" + setIndex + ",");
  print("};");
  print();
  print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";");
  print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {");
  printRanges(rangeInfo);
  print("};");
  print();
  print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";");
  print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {");
  printRanges(latinRangeInfo);
  print("};");
  print();
  print("} } // JSC::Yarr");
  print();