unicode_parsedata.nim 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. import strutils, algorithm
  2. let
  3. # this file was obtained from:
  4. # https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
  5. filename = "tools/UnicodeData.txt"
  6. data = readFile(filename).strip.splitLines()
  7. const
  8. # see the table here:
  9. # https://www.unicode.org/reports/tr44/#GC_Values_Table
  10. letters = ["Lu", "Ll", "Lt", "Lm", "Lo"]
  11. spaces = ["Zs", "Zl", "Zp"]
  12. type
  13. Ranges = tuple[start, stop, diff: int]
  14. Singlets = tuple[code, diff: int]
  15. NonLetterRanges = tuple[start, stop: int]
  16. var
  17. toUpper = newSeq[Singlets]()
  18. toLower = newSeq[Singlets]()
  19. toTitle = newSeq[Singlets]()
  20. alphas = newSeq[int]()
  21. unispaces = newSeq[int]()
  22. proc parseData(data: seq[string]) =
  23. proc doAdd(firstCode, lastCode: int, category, uc, lc, tc: string) =
  24. if category notin spaces and category notin letters:
  25. return
  26. if firstCode != lastCode:
  27. doAssert uc == "" and lc == "" and tc == ""
  28. if uc.len > 0:
  29. let diff = 500 + uc.parseHexInt() - firstCode
  30. toUpper.add (firstCode, diff)
  31. if lc.len > 0:
  32. let diff = 500 + lc.parseHexInt() - firstCode
  33. toLower.add (firstCode, diff)
  34. if tc.len > 0 and tc != uc:
  35. # if titlecase is different than uppercase
  36. let diff = 500 + tc.parseHexInt() - firstCode
  37. if diff != 500:
  38. toTitle.add (firstCode, diff)
  39. for code in firstCode..lastCode:
  40. if category in spaces:
  41. unispaces.add code
  42. else:
  43. alphas.add code
  44. var idx = 0
  45. while idx < data.len:
  46. let
  47. line = data[idx]
  48. fields = line.split(';')
  49. code = fields[0].parseHexInt()
  50. name = fields[1]
  51. category = fields[2]
  52. uc = fields[12]
  53. lc = fields[13]
  54. tc = fields[14]
  55. inc(idx)
  56. if name.endsWith(", First>"):
  57. doAssert idx < data.len
  58. let
  59. nextLine = data[idx]
  60. nextFields = nextLine.split(';')
  61. nextCode = nextFields[0].parseHexInt()
  62. nextName = nextFields[1]
  63. inc(idx)
  64. doAssert nextName.endsWith(", Last>")
  65. doAdd(code, nextCode, category, uc, lc, tc)
  66. else:
  67. doAdd(code, code, category, uc, lc, tc)
  68. proc splitRanges(a: seq[Singlets], r: var seq[Ranges], s: var seq[Singlets]) =
  69. ## Splits `toLower`, `toUpper` and `toTitle` into separate sequences:
  70. ## - `r` contains continuous ranges with the same characteristics
  71. ## (their upper/lower version is the same distance away)
  72. ## - `s` contains single code points
  73. var i, j: int
  74. while i < a.len:
  75. j = 1
  76. let
  77. startCode = a[i].code
  78. startDiff = a[i].diff
  79. while i + j <= a.len:
  80. if i+j >= a.len or a[i+j].code != startCode+j or a[i+j].diff != startDiff:
  81. if j == 1:
  82. s.add (startCode, startDiff)
  83. else:
  84. r.add (startCode, a[i+j-1].code, startDiff)
  85. i += j-1
  86. break
  87. else:
  88. inc j
  89. inc i
  90. proc splitRanges(a: seq[int], r: var seq[NonLetterRanges], s: var seq[int]) =
  91. ## Splits `alphas` and `unispaces` into separate sequences:
  92. ## - `r` contains continuous ranges
  93. ## - `s` contains single code points
  94. var i, j: int
  95. while i < a.len:
  96. j = 1
  97. let startCode = a[i]
  98. while i + j <= a.len:
  99. if i+j >= a.len or a[i+j] != startCode+j:
  100. if j == 1:
  101. s.add startCode
  102. else:
  103. r.add (startCode, a[i+j-1])
  104. i += j-1
  105. break
  106. else:
  107. inc j
  108. inc i
  109. proc splitSpaces(a: seq[int], r: var seq[NonLetterRanges], s: var seq[int]) =
  110. ## Spaces are special because of the way how `isWhiteSpace` and `split`
  111. ## are implemented.
  112. ##
  113. ## All spaces are added both to `r` (ranges) and `s` (singlets).
  114. var i, j: int
  115. while i < a.len:
  116. j = 1
  117. let startCode = a[i]
  118. while i + j <= a.len:
  119. if i+j >= a.len or a[i+j] != startCode+j:
  120. r.add (startCode, a[i+j-1])
  121. i += j-1
  122. break
  123. else:
  124. inc j
  125. inc i
  126. s = a
  127. var
  128. toupperRanges = newSeq[Ranges]()
  129. toupperSinglets = newSeq[Singlets]()
  130. tolowerRanges = newSeq[Ranges]()
  131. tolowerSinglets = newSeq[Singlets]()
  132. totitleRanges = newSeq[Ranges]()
  133. totitleSinglets = newSeq[Singlets]()
  134. spaceRanges = newSeq[NonLetterRanges]()
  135. unicodeSpaces = newSeq[int]()
  136. alphaRanges = newSeq[NonLetterRanges]()
  137. alphaSinglets = newSeq[int]()
  138. parseData(data)
  139. splitRanges(toLower, tolowerRanges, tolowerSinglets)
  140. splitRanges(toUpper, toUpperRanges, toUpperSinglets)
  141. splitRanges(toTitle, toTitleRanges, toTitleSinglets)
  142. splitRanges(alphas, alphaRanges, alphaSinglets)
  143. # manually add "special" spaces
  144. for i in 9 .. 13:
  145. unispaces.add i
  146. unispaces.add 0x85
  147. unispaces.sort()
  148. splitSpaces(unispaces, spaceRanges, unicodeSpaces)
  149. var output: string
  150. proc createHeader(output: var string) =
  151. output.add "# This file was created from a script.\n\n"
  152. output.add "const\n"
  153. proc `$`(r: Ranges): string =
  154. let
  155. start = "0x" & toHex(r.start, 5) & "'i32"
  156. stop = "0x" & toHex(r.stop, 5) & "'i32"
  157. result = "$#, $#, $#,\n" % [start, stop, $r.diff]
  158. proc `$`(r: Singlets): string =
  159. let code = "0x" & toHex(r.code, 5) & "'i32"
  160. result = "$#, $#,\n" % [code, $r.diff]
  161. proc `$`(r: NonLetterRanges): string =
  162. let
  163. start = "0x" & toHex(r.start, 5) & "'i32"
  164. stop = "0x" & toHex(r.stop, 5) & "'i32"
  165. result = "$#, $#,\n" % [start, stop]
  166. proc outputSeq(s: seq[Ranges|Singlets|NonLetterRanges], name: string,
  167. output: var string) =
  168. output.add " $# = [\n" % name
  169. for r in s:
  170. output.add " " & $r
  171. output.add " ]\n\n"
  172. proc outputSeq(s: seq[int], name: string, output: var string) =
  173. output.add " $# = [\n" % name
  174. for i in s:
  175. output.add " 0x$#'i32,\n" % toHex(i, 5)
  176. output.add " ]\n\n"
  177. proc outputSpaces(s: seq[int], name: string, output: var string) =
  178. output.add " $# = [\n" % name
  179. for i in s:
  180. output.add " Rune 0x$#,\n" % toHex(i, 5)
  181. output.add " ]\n\n"
  182. output.createHeader()
  183. outputSeq(tolowerRanges, "toLowerRanges", output)
  184. outputSeq(tolowerSinglets, "toLowerSinglets", output)
  185. outputSeq(toupperRanges, "toUpperRanges", output)
  186. outputSeq(toupperSinglets, "toUpperSinglets", output)
  187. outputSeq(totitleSinglets, "toTitleSinglets", output)
  188. outputSeq(alphaRanges, "alphaRanges", output)
  189. outputSeq(alphaSinglets, "alphaSinglets", output)
  190. outputSeq(spaceRanges, "spaceRanges", output)
  191. outputSpaces(unispaces, "unicodeSpaces", output) # array of runes
  192. let outfile = "lib/pure/includes/unicode_ranges.nim"
  193. outfile.writeFile(output)