punycode.nim 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2016 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## Implements a representation of Unicode with the limited
  10. ## ASCII character subset.
  11. import strutils
  12. import unicode
  13. # issue #3045
  const
    # Digit alphabet size and the clamp range applied to each threshold `t`
    # in the encode/decode loops below.
    Base = 36
    TMin = 1
    TMax = 26

    # Tuning values consumed by `adapt` when it recomputes the bias.
    Skew = 38
    Damp = 700

    # Starting state shared by `encode` and `decode`.
    InitialBias = 72
    InitialN = 0x80 # first code point that `isBasic` rejects

    # Separates the verbatim basic characters from the encoded digits.
    Delimiter = '-'
  type PunyError* = object of ValueError ## Punycode encode/decode failure.
  proc decodeDigit(x: char): int {.raises: [PunyError].} =
    ## Converts one digit character to its numeric value.
    ##
    ## Letters of either case map to 0..25 and `'0'..'9'` map to 26..35.
    ## Any other character raises `PunyError`.
    result =
      case x
      of '0'..'9': ord(x) - ord('0') + 26
      of 'A'..'Z': ord(x) - ord('A')
      of 'a'..'z': ord(x) - ord('a')
      else: raise newException(PunyError, "Bad input")
  proc encodeDigit(digit: int): Rune {.raises: [PunyError].} =
    ## Converts a numeric digit value to the rune that represents it.
    ##
    ## Values 0..25 become `'a'..'z'` and 26..35 become `'0'..'9'`.
    ## Anything outside 0..35 raises `PunyError`.
    case digit
    of 0..25:
      result = Rune(digit + ord('a'))
    of 26..35:
      result = Rune(digit - 26 + ord('0'))
    else:
      raise newException(PunyError, "internal error in punycode encoding")
  proc isBasic(c: char): bool =
    ## True when the byte `c` is below 0x80.
    result = ord(c) <= 0x7F
  proc isBasic(r: Rune): bool =
    ## True when the code point of `r` is below 0x80.
    result = int(r) <= 0x7F
  proc adapt(delta, numPoints: int, first: bool): int =
    ## Computes the next bias from the delta that was just processed.
    ##
    ## `delta` is divided by `Damp` when `first` is true and by 2 otherwise,
    ## then increased by its own share per `numPoints`. While the value stays
    ## above the threshold it is divided by `Base - TMin`, adding `Base` to
    ## the result on every step.
    const threshold = ((Base - TMin) * TMax) div 2
    var scaled = delta div (if first: Damp else: 2)
    scaled += scaled div numPoints
    while scaled > threshold:
      scaled = scaled div (Base - TMin)
      result += Base
    result += (Base - TMin + 1) * scaled div (scaled + Skew)
  proc encode*(prefix, s: string): string {.raises: [PunyError].} =
    ## Encodes `s`, which may contain Unicode, and prepends `prefix`.
    ##
    ## Basic (below 0x80) characters are copied verbatim, followed by
    ## `Delimiter` if there was at least one. The remaining code points are
    ## then emitted in increasing order as variable-length digit sequences.
    ## Raises `PunyError` if the running delta becomes negative.
    result = prefix
    var
      delta = 0
      codePoint = InitialN
      bias = InitialBias
      basicCount = 0
      pending = 0

    # Pass 1: copy the basic characters and count everything else.
    for ch in s.runes:
      if not ch.isBasic:
        inc pending
      else:
        inc basicCount
        result.add($ch)

    var handled = basicCount
    if basicCount > 0:
      result.add(Delimiter) # only emitted when basic characters exist

    while pending != 0:
      # Smallest code point in `s` that has not been handled yet.
      var smallest: int = high(int32)
      for ch in s.runes:
        let cp = int(ch)
        if cp >= codePoint and cp < smallest:
          smallest = cp

      delta += (smallest - codePoint) * (handled + 1)
      if delta < 0:
        raise newException(PunyError, "invalid label " & s)
      codePoint = smallest

      for ch in s.runes:
        let cp = int(ch)
        if cp < codePoint:
          # Already-emitted (or basic) code point: one more position to skip.
          inc delta
          if delta < 0:
            raise newException(PunyError, "invalid label " & s)
        elif cp == codePoint:
          # Emit `delta` as a variable-length integer.
          var q = delta
          var k = Base
          while true:
            let t = clamp(k - bias, TMin, TMax)
            if q < t:
              break
            result.add($encodeDigit(t + (q - t) mod (Base - t)))
            q = (q - t) div (Base - t)
            k += Base
          result.add($encodeDigit(q))
          bias = adapt(delta, handled + 1, handled == basicCount)
          delta = 0
          inc handled
          dec pending

      inc delta
      inc codePoint
  proc encode*(s: string): string {.raises: [PunyError].} =
    ## Encodes `s`, which may contain Unicode, using an empty prefix.
    encode("", s)
  proc decode*(encoded: string): string {.raises: [PunyError].} =
    ## Decodes a Punycode-encoded string.
    ##
    ## Everything before the last `Delimiter` is copied verbatim and must
    ## consist of basic (below 0x80) characters. The rest is read as a series
    ## of variable-length integers; each one yields a code point and the rune
    ## position at which it is inserted into the output.
    ##
    ## Raises `PunyError` when the copied part contains a non-basic char, when
    ## a digit character is invalid, when the input ends in the middle of a
    ## number, or when a decoded value does not fit in an `int32`.
    var
      n = InitialN
      i = 0
      bias = InitialBias
    var pos = rfind(encoded, Delimiter)
    result = ""

    if pos > 0:
      # Found a delimiter: copy the basic characters in front of it.
      for j in 0 ..< pos:
        let c = encoded[j]
        if not c.isBasic:
          raise newException(PunyError, "Encoded contains a non-basic char")
        result.add(c)
      inc pos # continue right after the delimiter
    else:
      pos = 0 # no usable delimiter: the whole input is digits

    while pos < len(encoded):
      let oldi = i
      var w = 1
      var k = Base
      while true:
        if pos == len(encoded):
          raise newException(PunyError, "Bad input: " & encoded)
        let c = encoded[pos]
        inc pos
        let digit = decodeDigit(c)
        if digit > (high(int32) - i) div w:
          raise newException(PunyError, "Too large a value: " & $digit)
        i += digit * w
        let t =
          if k <= bias: TMin
          elif k >= bias + TMax: TMax
          else: k - bias
        if digit < t:
          break
        w *= Base - t
        k += Base

      # Output length in runes (plus the one being inserted); computed once
      # per decoded value instead of on every use.
      let outLen = runeLen(result) + 1
      bias = adapt(i - oldi, outLen, oldi == 0)

      if i div outLen > high(int32) - n:
        raise newException(PunyError, "Value too large")

      n += i div outLen
      i = i mod outLen

      # `i` counts runes, but `insert` takes a byte index. Translate it so a
      # rune is never inserted inside an earlier multi-byte sequence
      # (e.g. "ndam" must give "öü": the ü goes after the two-byte ö).
      var byteOffset = 0
      var skipped = 0
      while skipped < i:
        byteOffset += runeLenAt(result, byteOffset)
        inc skipped
      insert(result, $Rune(n), byteOffset)
      inc i
  runnableExamples:
    # Checks one (plain text, Punycode) pair in both directions.
    template roundTrip(plain, puny: string) =
      doAssert encode(plain) == puny
      doAssert decode(puny) == plain

    static:
      # Pure ASCII input only gains a trailing delimiter.
      roundTrip("", "")
      roundTrip("a", "a-")
      roundTrip("A", "A-")
      roundTrip("3", "3-")
      roundTrip("-", "--")
      roundTrip("--", "---")
      roundTrip("abc", "abc-")
      roundTrip("London", "London-")
      roundTrip("Lloyd-Atkinson", "Lloyd-Atkinson-")
      roundTrip("This has spaces", "This has spaces-")
      # Input containing a non-ASCII character.
      roundTrip("ü", "tda")
      roundTrip("München", "Mnchen-3ya")
      roundTrip("Mnchen-3ya", "Mnchen-3ya-")
      roundTrip("München-Ost", "Mnchen-Ost-9db")
      roundTrip("Bahnhof München-Ost", "Bahnhof Mnchen-Ost-u6b")
  when isMainModule:
    # Self-checks that run only when this module is the main program.
    block explicitEmptyPrefix:
      let encodedBook = encode("", "bücher")
      assert(decode(encodedBook) == "bücher")
    block defaultPrefix:
      let encodedCity = encode("münchen")
      assert(decode(encodedCity) == "münchen")
    block customPrefix:
      let withPrefix = encode("xn--", "münchen")
      assert withPrefix == "xn--mnchen-3ya"