# unicode_parsedata.nim (5.5 KB)
  1. import strutils, algorithm
  2. let
  3. # this file was obtained from:
  4. # https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
  5. filename = "tools/UnicodeData.txt"
  6. data = readFile(filename).strip.splitLines()
  7. const
  8. # see the table here:
  9. # https://www.unicode.org/reports/tr44/#GC_Values_Table
  10. letters = ["Lu", "Ll", "Lt", "Lm", "Lo"]
  11. spaces = ["Zs", "Zl", "Zp"]
  12. type
  13. Ranges = tuple[start, stop, diff: int]
  14. Singlets = tuple[code, diff: int]
  15. NonLetterRanges = tuple[start, stop: int]
  16. var
  17. toUpper = newSeq[Singlets]()
  18. toLower = newSeq[Singlets]()
  19. toTitle = newSeq[Singlets]()
  20. alphas = newSeq[int]()
  21. unispaces = newSeq[int]()
  22. proc parseData(data: seq[string]) =
  23. for line in data:
  24. let
  25. fields = line.split(';')
  26. code = fields[0].parseHexInt()
  27. category = fields[2]
  28. uc = fields[12]
  29. lc = fields[13]
  30. tc = fields[14]
  31. if category notin spaces and category notin letters:
  32. continue
  33. if uc.len > 0:
  34. let diff = 500 + uc.parseHexInt() - code
  35. toUpper.add (code, diff)
  36. if lc.len > 0:
  37. let diff = 500 + lc.parseHexInt() - code
  38. toLower.add (code, diff)
  39. if tc.len > 0 and tc != uc:
  40. # if titlecase is different than uppercase
  41. let diff = 500 + tc.parseHexInt() - code
  42. if diff != 500:
  43. toTitle.add (code, diff)
  44. if category in spaces:
  45. unispaces.add code
  46. else:
  47. alphas.add code
  48. proc splitRanges(a: seq[Singlets], r: var seq[Ranges], s: var seq[Singlets]) =
  49. ## Splits `toLower`, `toUpper` and `toTitle` into separate sequences:
  50. ## - `r` contains continuous ranges with the same characteristics
  51. ## (their upper/lower version is the same distance away)
  52. ## - `s` contains single code points
  53. var i, j: int
  54. while i < a.len:
  55. j = 1
  56. let
  57. startCode = a[i].code
  58. startDiff = a[i].diff
  59. while i + j <= a.len:
  60. if i+j >= a.len or a[i+j].code != startCode+j or a[i+j].diff != startDiff:
  61. if j == 1:
  62. s.add (startCode, startDiff)
  63. else:
  64. r.add (startCode, a[i+j-1].code, startDiff)
  65. i += j-1
  66. break
  67. else:
  68. inc j
  69. inc i
  70. proc splitRanges(a: seq[int], r: var seq[NonLetterRanges], s: var seq[int]) =
  71. ## Splits `alphas` and `unispaces` into separate sequences:
  72. ## - `r` contains continuous ranges
  73. ## - `s` contains single code points
  74. var i, j: int
  75. while i < a.len:
  76. j = 1
  77. let startCode = a[i]
  78. while i + j <= a.len:
  79. if i+j >= a.len or a[i+j] != startCode+j:
  80. if j == 1:
  81. s.add startCode
  82. else:
  83. r.add (startCode, a[i+j-1])
  84. i += j-1
  85. break
  86. else:
  87. inc j
  88. inc i
  89. proc splitSpaces(a: seq[int], r: var seq[NonLetterRanges], s: var seq[int]) =
  90. ## Spaces are special because of the way how `isWhiteSpace` and `split`
  91. ## are implemented.
  92. ##
  93. ## All spaces are added both to `r` (ranges) and `s` (singlets).
  94. var i, j: int
  95. while i < a.len:
  96. j = 1
  97. let startCode = a[i]
  98. while i + j <= a.len:
  99. if i+j >= a.len or a[i+j] != startCode+j:
  100. r.add (startCode, a[i+j-1])
  101. i += j-1
  102. break
  103. else:
  104. inc j
  105. inc i
  106. s = a
  107. var
  108. toupperRanges = newSeq[Ranges]()
  109. toupperSinglets = newSeq[Singlets]()
  110. tolowerRanges = newSeq[Ranges]()
  111. tolowerSinglets = newSeq[Singlets]()
  112. totitleRanges = newSeq[Ranges]()
  113. totitleSinglets = newSeq[Singlets]()
  114. spaceRanges = newSeq[NonLetterRanges]()
  115. unicodeSpaces = newSeq[int]()
  116. alphaRanges = newSeq[NonLetterRanges]()
  117. alphaSinglets = newSeq[int]()
  118. parseData(data)
  119. splitRanges(toLower, tolowerRanges, tolowerSinglets)
  120. splitRanges(toUpper, toUpperRanges, toUpperSinglets)
  121. splitRanges(toTitle, toTitleRanges, toTitleSinglets)
  122. splitRanges(alphas, alphaRanges, alphaSinglets)
  123. # manually add "special" spaces
  124. for i in 9 .. 13:
  125. unispaces.add i
  126. unispaces.add 0x85
  127. unispaces.sort()
  128. splitSpaces(unispaces, spaceRanges, unicodeSpaces)
  129. var output: string
  130. proc createHeader(output: var string) =
  131. output.add "# This file was created from a script.\n\n"
  132. output.add "const\n"
  133. proc `$`(r: Ranges): string =
  134. let
  135. start = "0x" & toHex(r.start, 5)
  136. stop = "0x" & toHex(r.stop, 5)
  137. result = "$#, $#, $#,\n" % [start, stop, $r.diff]
  138. proc `$`(r: Singlets): string =
  139. let code = "0x" & toHex(r.code, 5)
  140. result = "$#, $#,\n" % [code, $r.diff]
  141. proc `$`(r: NonLetterRanges): string =
  142. let
  143. start = "0x" & toHex(r.start, 5)
  144. stop = "0x" & toHex(r.stop, 5)
  145. result = "$#, $#,\n" % [start, stop]
  146. proc outputSeq(s: seq[Ranges|Singlets|NonLetterRanges], name: string,
  147. output: var string) =
  148. output.add " $# = [\n" % name
  149. for r in s:
  150. output.add " " & $r
  151. output.add " ]\n\n"
  152. proc outputSeq(s: seq[int], name: string, output: var string) =
  153. output.add " $# = [\n" % name
  154. for i in s:
  155. output.add " 0x$#,\n" % toHex(i, 5)
  156. output.add " ]\n\n"
  157. proc outputSpaces(s: seq[int], name: string, output: var string) =
  158. output.add " $# = [\n" % name
  159. for i in s:
  160. output.add " Rune 0x$#,\n" % toHex(i, 5)
  161. output.add " ]\n\n"
  162. output.createHeader()
  163. outputSeq(tolowerRanges, "toLowerRanges", output)
  164. outputSeq(tolowerSinglets, "toLowerSinglets", output)
  165. outputSeq(toupperRanges, "toUpperRanges", output)
  166. outputSeq(toupperSinglets, "toUpperSinglets", output)
  167. outputSeq(totitleSinglets, "toTitleSinglets", output)
  168. outputSeq(alphaRanges, "alphaRanges", output)
  169. outputSeq(alphaSinglets, "alphaSinglets", output)
  170. outputSeq(spaceRanges, "spaceRanges", output)
  171. outputSpaces(unispaces, "unicodeSpaces", output) # array of runes
  172. let outfile = "lib/pure/includes/unicode_ranges.nim"
  173. outfile.writeFile(output)