unicode.nim 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module provides support to handle the Unicode UTF-8 encoding.
  10. ##
  11. ## There are no specialized ``insert``, ``delete``, ``add`` and ``contains``
  12. ## procedures for ``seq[Rune]`` in this module because the generic variants
  13. ## of these procedures in the system module already work with it.
  14. ##
  15. ## The current version is compatible with Unicode v12.0.0.
  16. ##
  17. ## **See also:**
  18. ## * `strutils module <strutils.html>`_
  19. ## * `unidecode module <unidecode.html>`_
  20. ## * `encodings module <encodings.html>`_
  21. include "system/inclrtl"
  22. import std/strbasics
  23. template toOa(s: string): auto = s.toOpenArray(0, s.high)
  24. proc substr(s: openArray[char] , first, last: int): string =
  25. # Copied substr from system
  26. let first = max(first, 0)
  27. let L = max(min(last, high(s)) - first + 1, 0)
  28. result = newString(L)
  29. for i in 0 .. L-1:
  30. result[i] = s[i+first]
  31. type
  32. RuneImpl = int32 # underlying type of Rune
  33. Rune* = distinct RuneImpl ## \
  34. ## Type that can hold a single Unicode code point.
  35. ##
  36. ## A Rune may be composed with other Runes to a character on the screen.
  37. ## `RuneImpl` is the underlying type used to store Runes, currently `int32`.
  38. template ones(n: untyped): untyped = ((1 shl n)-1)
  39. proc runeLen*(s: openArray[char]): int {.rtl, extern: "nuc$1".} =
  40. ## Returns the number of runes of the string ``s``.
  41. runnableExamples:
  42. let a = "añyóng"
  43. doAssert a.runeLen == 6
  44. ## note: a.len == 8
  45. result = 0
  46. var i = 0
  47. while i < len(s):
  48. if uint(s[i]) <= 127: inc(i)
  49. elif uint(s[i]) shr 5 == 0b110: inc(i, 2)
  50. elif uint(s[i]) shr 4 == 0b1110: inc(i, 3)
  51. elif uint(s[i]) shr 3 == 0b11110: inc(i, 4)
  52. elif uint(s[i]) shr 2 == 0b111110: inc(i, 5)
  53. elif uint(s[i]) shr 1 == 0b1111110: inc(i, 6)
  54. else: inc i
  55. inc(result)
  56. proc runeLenAt*(s: openArray[char], i: Natural): int =
  57. ## Returns the number of bytes the rune starting at ``s[i]`` takes.
  58. ##
  59. ## See also:
  60. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  61. runnableExamples:
  62. let a = "añyóng"
  63. doAssert a.runeLenAt(0) == 1
  64. doAssert a.runeLenAt(1) == 2
  65. if uint(s[i]) <= 127: result = 1
  66. elif uint(s[i]) shr 5 == 0b110: result = 2
  67. elif uint(s[i]) shr 4 == 0b1110: result = 3
  68. elif uint(s[i]) shr 3 == 0b11110: result = 4
  69. elif uint(s[i]) shr 2 == 0b111110: result = 5
  70. elif uint(s[i]) shr 1 == 0b1111110: result = 6
  71. else: result = 1
  72. const replRune = Rune(0xFFFD)
  73. template fastRuneAt*(s: openArray[char] or string, i: int, result: untyped, doInc = true) =
  74. ## Returns the rune ``s[i]`` in ``result``.
  75. ##
  76. ## If ``doInc == true`` (default), ``i`` is incremented by the number
  77. ## of bytes that have been processed.
  78. bind ones
  79. if uint(s[i]) <= 127:
  80. result = Rune(uint(s[i]))
  81. when doInc: inc(i)
  82. elif uint(s[i]) shr 5 == 0b110:
  83. # assert(uint(s[i+1]) shr 6 == 0b10)
  84. if i <= s.len - 2:
  85. result = Rune((uint(s[i]) and (ones(5))) shl 6 or
  86. (uint(s[i+1]) and ones(6)))
  87. when doInc: inc(i, 2)
  88. else:
  89. result = replRune
  90. when doInc: inc(i)
  91. elif uint(s[i]) shr 4 == 0b1110:
  92. # assert(uint(s[i+1]) shr 6 == 0b10)
  93. # assert(uint(s[i+2]) shr 6 == 0b10)
  94. if i <= s.len - 3:
  95. result = Rune((uint(s[i]) and ones(4)) shl 12 or
  96. (uint(s[i+1]) and ones(6)) shl 6 or
  97. (uint(s[i+2]) and ones(6)))
  98. when doInc: inc(i, 3)
  99. else:
  100. result = replRune
  101. when doInc: inc(i)
  102. elif uint(s[i]) shr 3 == 0b11110:
  103. # assert(uint(s[i+1]) shr 6 == 0b10)
  104. # assert(uint(s[i+2]) shr 6 == 0b10)
  105. # assert(uint(s[i+3]) shr 6 == 0b10)
  106. if i <= s.len - 4:
  107. result = Rune((uint(s[i]) and ones(3)) shl 18 or
  108. (uint(s[i+1]) and ones(6)) shl 12 or
  109. (uint(s[i+2]) and ones(6)) shl 6 or
  110. (uint(s[i+3]) and ones(6)))
  111. when doInc: inc(i, 4)
  112. else:
  113. result = replRune
  114. when doInc: inc(i)
  115. elif uint(s[i]) shr 2 == 0b111110:
  116. # assert(uint(s[i+1]) shr 6 == 0b10)
  117. # assert(uint(s[i+2]) shr 6 == 0b10)
  118. # assert(uint(s[i+3]) shr 6 == 0b10)
  119. # assert(uint(s[i+4]) shr 6 == 0b10)
  120. if i <= s.len - 5:
  121. result = Rune((uint(s[i]) and ones(2)) shl 24 or
  122. (uint(s[i+1]) and ones(6)) shl 18 or
  123. (uint(s[i+2]) and ones(6)) shl 12 or
  124. (uint(s[i+3]) and ones(6)) shl 6 or
  125. (uint(s[i+4]) and ones(6)))
  126. when doInc: inc(i, 5)
  127. else:
  128. result = replRune
  129. when doInc: inc(i)
  130. elif uint(s[i]) shr 1 == 0b1111110:
  131. # assert(uint(s[i+1]) shr 6 == 0b10)
  132. # assert(uint(s[i+2]) shr 6 == 0b10)
  133. # assert(uint(s[i+3]) shr 6 == 0b10)
  134. # assert(uint(s[i+4]) shr 6 == 0b10)
  135. # assert(uint(s[i+5]) shr 6 == 0b10)
  136. if i <= s.len - 6:
  137. result = Rune((uint(s[i]) and ones(1)) shl 30 or
  138. (uint(s[i+1]) and ones(6)) shl 24 or
  139. (uint(s[i+2]) and ones(6)) shl 18 or
  140. (uint(s[i+3]) and ones(6)) shl 12 or
  141. (uint(s[i+4]) and ones(6)) shl 6 or
  142. (uint(s[i+5]) and ones(6)))
  143. when doInc: inc(i, 6)
  144. else:
  145. result = replRune
  146. when doInc: inc(i)
  147. else:
  148. result = Rune(uint(s[i]))
  149. when doInc: inc(i)
  150. proc runeAt*(s: openArray[char], i: Natural): Rune =
  151. ## Returns the rune in ``s`` at **byte index** ``i``.
  152. ##
  153. ## See also:
  154. ## * `runeAtPos proc <#runeAtPos,string,int>`_
  155. ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  156. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  157. runnableExamples:
  158. let a = "añyóng"
  159. doAssert a.runeAt(1) == "ñ".runeAt(0)
  160. doAssert a.runeAt(2) == "ñ".runeAt(1)
  161. doAssert a.runeAt(3) == "y".runeAt(0)
  162. fastRuneAt(s, i, result, false)
  163. proc validateUtf8*(s: openArray[char]): int =
  164. ## Returns the position of the invalid byte in ``s`` if the string ``s`` does
  165. ## not hold valid UTF-8 data. Otherwise ``-1`` is returned.
  166. ##
  167. ## See also:
  168. ## * `toUTF8 proc <#toUTF8,Rune>`_
  169. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  170. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  171. var i = 0
  172. let L = s.len
  173. while i < L:
  174. if uint(s[i]) <= 127:
  175. inc(i)
  176. elif uint(s[i]) shr 5 == 0b110:
  177. if uint(s[i]) < 0xc2: return i # Catch overlong ascii representations.
  178. if i+1 < L and uint(s[i+1]) shr 6 == 0b10: inc(i, 2)
  179. else: return i
  180. elif uint(s[i]) shr 4 == 0b1110:
  181. if i+2 < L and uint(s[i+1]) shr 6 == 0b10 and uint(s[i+2]) shr 6 == 0b10:
  182. inc i, 3
  183. else: return i
  184. elif uint(s[i]) shr 3 == 0b11110:
  185. if i+3 < L and uint(s[i+1]) shr 6 == 0b10 and
  186. uint(s[i+2]) shr 6 == 0b10 and
  187. uint(s[i+3]) shr 6 == 0b10:
  188. inc i, 4
  189. else: return i
  190. else:
  191. return i
  192. return -1
  193. template fastToUTF8Copy*(c: Rune, s: var string, pos: int, doInc = true) =
  194. ## Copies UTF-8 representation of ``c`` into the preallocated string ``s``
  195. ## starting at position ``pos``.
  196. ##
  197. ## If ``doInc == true`` (default), ``pos`` is incremented
  198. ## by the number of bytes that have been processed.
  199. ##
  200. ## To be the most efficient, make sure ``s`` is preallocated
  201. ## with an additional amount equal to the byte length of ``c``.
  202. ##
  203. ## See also:
  204. ## * `validateUtf8 proc <#validateUtf8,string>`_
  205. ## * `toUTF8 proc <#toUTF8,Rune>`_
  206. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  207. var i = RuneImpl(c)
  208. if i <=% 127:
  209. s.setLen(pos+1)
  210. s[pos+0] = chr(i)
  211. when doInc: inc(pos)
  212. elif i <=% 0x07FF:
  213. s.setLen(pos+2)
  214. s[pos+0] = chr((i shr 6) or 0b110_00000)
  215. s[pos+1] = chr((i and ones(6)) or 0b10_0000_00)
  216. when doInc: inc(pos, 2)
  217. elif i <=% 0xFFFF:
  218. s.setLen(pos+3)
  219. s[pos+0] = chr(i shr 12 or 0b1110_0000)
  220. s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  221. s[pos+2] = chr(i and ones(6) or 0b10_0000_00)
  222. when doInc: inc(pos, 3)
  223. elif i <=% 0x001FFFFF:
  224. s.setLen(pos+4)
  225. s[pos+0] = chr(i shr 18 or 0b1111_0000)
  226. s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  227. s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  228. s[pos+3] = chr(i and ones(6) or 0b10_0000_00)
  229. when doInc: inc(pos, 4)
  230. elif i <=% 0x03FFFFFF:
  231. s.setLen(pos+5)
  232. s[pos+0] = chr(i shr 24 or 0b111110_00)
  233. s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  234. s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  235. s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  236. s[pos+4] = chr(i and ones(6) or 0b10_0000_00)
  237. when doInc: inc(pos, 5)
  238. elif i <=% 0x7FFFFFFF:
  239. s.setLen(pos+6)
  240. s[pos+0] = chr(i shr 30 or 0b1111110_0)
  241. s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00)
  242. s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  243. s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  244. s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  245. s[pos+5] = chr(i and ones(6) or 0b10_0000_00)
  246. when doInc: inc(pos, 6)
  247. else:
  248. discard # error, exception?
  249. proc toUTF8*(c: Rune): string {.rtl, extern: "nuc$1".} =
  250. ## Converts a rune into its UTF-8 representation.
  251. ##
  252. ## See also:
  253. ## * `validateUtf8 proc <#validateUtf8,string>`_
  254. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  255. ## * `utf8 iterator <#utf8.i,string>`_
  256. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  257. runnableExamples:
  258. let a = "añyóng"
  259. doAssert a.runeAt(1).toUTF8 == "ñ"
  260. result = ""
  261. fastToUTF8Copy(c, result, 0, false)
  262. proc add*(s: var string; c: Rune) =
  263. ## Adds a rune ``c`` to a string ``s``.
  264. runnableExamples:
  265. var s = "abc"
  266. let c = "ä".runeAt(0)
  267. s.add(c)
  268. doAssert s == "abcä"
  269. let pos = s.len
  270. fastToUTF8Copy(c, s, pos, false)
  271. proc `$`*(rune: Rune): string =
  272. ## An alias for `toUTF8 <#toUTF8,Rune>`_.
  273. ##
  274. ## See also:
  275. ## * `validateUtf8 proc <#validateUtf8,string>`_
  276. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  277. rune.toUTF8
  278. proc `$`*(runes: seq[Rune]): string =
  279. ## Converts a sequence of Runes to a string.
  280. ##
  281. ## See also:
  282. ## * `toRunes <#toRunes,string>`_ for a reverse operation
  283. runnableExamples:
  284. let
  285. someString = "öÑ"
  286. someRunes = toRunes(someString)
  287. doAssert $someRunes == someString
  288. result = ""
  289. for rune in runes:
  290. result.add rune
  291. proc runeOffset*(s: openArray[char], pos: Natural, start: Natural = 0): int =
  292. ## Returns the byte position of rune
  293. ## at position ``pos`` in ``s`` with an optional start byte position.
  294. ## Returns the special value -1 if it runs out of the string.
  295. ##
  296. ## **Beware:** This can lead to unoptimized code and slow execution!
  297. ## Most problems can be solved more efficiently by using an iterator
  298. ## or conversion to a seq of Rune.
  299. ##
  300. ## See also:
  301. ## * `runeReverseOffset proc <#runeReverseOffset,string,Positive>`_
  302. runnableExamples:
  303. let a = "añyóng"
  304. doAssert a.runeOffset(1) == 1
  305. doAssert a.runeOffset(3) == 4
  306. doAssert a.runeOffset(4) == 6
  307. var
  308. i = 0
  309. o = start
  310. while i < pos:
  311. o += runeLenAt(s, o)
  312. if o >= s.len:
  313. return -1
  314. inc i
  315. return o
  316. proc runeReverseOffset*(s: openArray[char], rev: Positive): (int, int) =
  317. ## Returns a tuple with the byte offset of the
  318. ## rune at position ``rev`` in ``s``, counting
  319. ## from the end (starting with 1) and the total
  320. ## number of runes in the string.
  321. ##
  322. ## Returns a negative value for offset if there are too few runes in
  323. ## the string to satisfy the request.
  324. ##
  325. ## **Beware:** This can lead to unoptimized code and slow execution!
  326. ## Most problems can be solved more efficiently by using an iterator
  327. ## or conversion to a seq of Rune.
  328. ##
  329. ## See also:
  330. ## * `runeOffset proc <#runeOffset,string,Natural,Natural>`_
  331. var
  332. a = rev.int
  333. o = 0
  334. x = 0
  335. let times = 2*rev.int-s.runeLen # transformed from rev.int - a < s.runeLen - rev.int
  336. while o < s.len:
  337. let r = runeLenAt(s, o)
  338. o += r
  339. if a > times:
  340. x += r
  341. dec a
  342. result = if a > 0: (-a, rev.int-a) else: (x, -a+rev.int)
  343. proc runeAtPos*(s: openArray[char], pos: int): Rune =
  344. ## Returns the rune at position ``pos``.
  345. ##
  346. ## **Beware:** This can lead to unoptimized code and slow execution!
  347. ## Most problems can be solved more efficiently by using an iterator
  348. ## or conversion to a seq of Rune.
  349. ##
  350. ## See also:
  351. ## * `runeAt proc <#runeAt,string,Natural>`_
  352. ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  353. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  354. fastRuneAt(s, runeOffset(s, pos), result, false)
  355. proc runeStrAtPos*(s: openArray[char], pos: Natural): string =
  356. ## Returns the rune at position ``pos`` as UTF8 String.
  357. ##
  358. ## **Beware:** This can lead to unoptimized code and slow execution!
  359. ## Most problems can be solved more efficiently by using an iterator
  360. ## or conversion to a seq of Rune.
  361. ##
  362. ## See also:
  363. ## * `runeAt proc <#runeAt,string,Natural>`_
  364. ## * `runeAtPos proc <#runeAtPos,string,int>`_
  365. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  366. let o = runeOffset(s, pos)
  367. substr(s.toOpenArray(o, (o+runeLenAt(s, o)-1)))
  368. proc runeSubStr*(s: openArray[char], pos: int, len: int = int.high): string =
  369. ## Returns the UTF-8 substring starting at code point ``pos``
  370. ## with ``len`` code points.
  371. ##
  372. ## If ``pos`` or ``len`` is negative they count from
  373. ## the end of the string. If ``len`` is not given it means the longest
  374. ## possible string.
  375. runnableExamples:
  376. let s = "Hänsel ««: 10,00€"
  377. doAssert(runeSubStr(s, 0, 2) == "Hä")
  378. doAssert(runeSubStr(s, 10, 1) == ":")
  379. doAssert(runeSubStr(s, -6) == "10,00€")
  380. doAssert(runeSubStr(s, 10) == ": 10,00€")
  381. doAssert(runeSubStr(s, 12, 5) == "10,00")
  382. doAssert(runeSubStr(s, -6, 3) == "10,")
  383. if pos < 0:
  384. let (o, rl) = runeReverseOffset(s, -pos)
  385. if len >= rl:
  386. result = s.substr(o, s.high)
  387. elif len < 0:
  388. let e = rl + len
  389. if e < 0:
  390. result = ""
  391. else:
  392. result = s.substr(o, runeOffset(s, e-(rl+pos), o)-1)
  393. else:
  394. result = s.substr(o, runeOffset(s, len, o)-1)
  395. else:
  396. let o = runeOffset(s, pos)
  397. if o < 0:
  398. result = ""
  399. elif len == int.high:
  400. result = s.substr(o, s.len-1)
  401. elif len < 0:
  402. let (e, rl) = runeReverseOffset(s, -len)
  403. discard rl
  404. if e <= 0:
  405. result = ""
  406. else:
  407. result = s.substr(o, e-1)
  408. else:
  409. var e = runeOffset(s, len, o)
  410. if e < 0:
  411. e = s.len
  412. result = s.substr(o, e-1)
  413. proc `<=%`*(a, b: Rune): bool =
  414. ## Checks if code point of `a` is smaller or equal to code point of `b`.
  415. runnableExamples:
  416. let
  417. a = "ú".runeAt(0)
  418. b = "ü".runeAt(0)
  419. doAssert a <=% b
  420. return int(a) <=% int(b)
  421. proc `<%`*(a, b: Rune): bool =
  422. ## Checks if code point of `a` is smaller than code point of `b`.
  423. runnableExamples:
  424. let
  425. a = "ú".runeAt(0)
  426. b = "ü".runeAt(0)
  427. doAssert a <% b
  428. return int(a) <% int(b)
  429. proc `==`*(a, b: Rune): bool =
  430. ## Checks if two runes are equal.
  431. return int(a) == int(b)
  432. include "includes/unicode_ranges"
  433. proc binarySearch(c: RuneImpl, tab: openArray[int], len, stride: int): int =
  434. var n = len
  435. var t = 0
  436. while n > 1:
  437. var m = n div 2
  438. var p = t + m*stride
  439. if c >= tab[p]:
  440. t = p
  441. n = n-m
  442. else:
  443. n = m
  444. if n != 0 and c >= tab[t]:
  445. return t
  446. return -1
  447. proc toLower*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  448. ## Converts ``c`` into lower case. This works for any rune.
  449. ##
  450. ## If possible, prefer ``toLower`` over ``toUpper``.
  451. ##
  452. ## See also:
  453. ## * `toUpper proc <#toUpper,Rune>`_
  454. ## * `toTitle proc <#toTitle,Rune>`_
  455. ## * `isLower proc <#isLower,Rune>`_
  456. var c = RuneImpl(c)
  457. var p = binarySearch(c, toLowerRanges, len(toLowerRanges) div 3, 3)
  458. if p >= 0 and c >= toLowerRanges[p] and c <= toLowerRanges[p+1]:
  459. return Rune(c + toLowerRanges[p+2] - 500)
  460. p = binarySearch(c, toLowerSinglets, len(toLowerSinglets) div 2, 2)
  461. if p >= 0 and c == toLowerSinglets[p]:
  462. return Rune(c + toLowerSinglets[p+1] - 500)
  463. return Rune(c)
  464. proc toUpper*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  465. ## Converts ``c`` into upper case. This works for any rune.
  466. ##
  467. ## If possible, prefer ``toLower`` over ``toUpper``.
  468. ##
  469. ## See also:
  470. ## * `toLower proc <#toLower,Rune>`_
  471. ## * `toTitle proc <#toTitle,Rune>`_
  472. ## * `isUpper proc <#isUpper,Rune>`_
  473. var c = RuneImpl(c)
  474. var p = binarySearch(c, toUpperRanges, len(toUpperRanges) div 3, 3)
  475. if p >= 0 and c >= toUpperRanges[p] and c <= toUpperRanges[p+1]:
  476. return Rune(c + toUpperRanges[p+2] - 500)
  477. p = binarySearch(c, toUpperSinglets, len(toUpperSinglets) div 2, 2)
  478. if p >= 0 and c == toUpperSinglets[p]:
  479. return Rune(c + toUpperSinglets[p+1] - 500)
  480. return Rune(c)
  481. proc toTitle*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  482. ## Converts ``c`` to title case.
  483. ##
  484. ## See also:
  485. ## * `toLower proc <#toLower,Rune>`_
  486. ## * `toUpper proc <#toUpper,Rune>`_
  487. ## * `isTitle proc <#isTitle,Rune>`_
  488. var c = RuneImpl(c)
  489. var p = binarySearch(c, toTitleSinglets, len(toTitleSinglets) div 2, 2)
  490. if p >= 0 and c == toTitleSinglets[p]:
  491. return Rune(c + toTitleSinglets[p+1] - 500)
  492. return Rune(c)
  493. proc isLower*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  494. ## Returns true if ``c`` is a lower case rune.
  495. ##
  496. ## If possible, prefer ``isLower`` over ``isUpper``.
  497. ##
  498. ## See also:
  499. ## * `toLower proc <#toLower,Rune>`_
  500. ## * `isUpper proc <#isUpper,Rune>`_
  501. ## * `isTitle proc <#isTitle,Rune>`_
  502. var c = RuneImpl(c)
  503. # Note: toUpperRanges is correct here!
  504. var p = binarySearch(c, toUpperRanges, len(toUpperRanges) div 3, 3)
  505. if p >= 0 and c >= toUpperRanges[p] and c <= toUpperRanges[p+1]:
  506. return true
  507. p = binarySearch(c, toUpperSinglets, len(toUpperSinglets) div 2, 2)
  508. if p >= 0 and c == toUpperSinglets[p]:
  509. return true
  510. proc isUpper*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  511. ## Returns true if ``c`` is a upper case rune.
  512. ##
  513. ## If possible, prefer ``isLower`` over ``isUpper``.
  514. ##
  515. ## See also:
  516. ## * `toUpper proc <#toUpper,Rune>`_
  517. ## * `isLower proc <#isLower,Rune>`_
  518. ## * `isTitle proc <#isTitle,Rune>`_
  519. ## * `isAlpha proc <#isAlpha,Rune>`_
  520. ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  521. var c = RuneImpl(c)
  522. # Note: toLowerRanges is correct here!
  523. var p = binarySearch(c, toLowerRanges, len(toLowerRanges) div 3, 3)
  524. if p >= 0 and c >= toLowerRanges[p] and c <= toLowerRanges[p+1]:
  525. return true
  526. p = binarySearch(c, toLowerSinglets, len(toLowerSinglets) div 2, 2)
  527. if p >= 0 and c == toLowerSinglets[p]:
  528. return true
  529. proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  530. ## Returns true if ``c`` is an *alpha* rune (i.e., a letter).
  531. ##
  532. ## See also:
  533. ## * `isLower proc <#isLower,Rune>`_
  534. ## * `isTitle proc <#isTitle,Rune>`_
  535. ## * `isAlpha proc <#isAlpha,Rune>`_
  536. ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  537. ## * `isCombining proc <#isCombining,Rune>`_
  538. if isUpper(c) or isLower(c):
  539. return true
  540. var c = RuneImpl(c)
  541. var p = binarySearch(c, alphaRanges, len(alphaRanges) div 2, 2)
  542. if p >= 0 and c >= alphaRanges[p] and c <= alphaRanges[p+1]:
  543. return true
  544. p = binarySearch(c, alphaSinglets, len(alphaSinglets), 1)
  545. if p >= 0 and c == alphaSinglets[p]:
  546. return true
  547. proc isTitle*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  548. ## Returns true if ``c`` is a Unicode titlecase code point.
  549. ##
  550. ## See also:
  551. ## * `toTitle proc <#toTitle,Rune>`_
  552. ## * `isLower proc <#isLower,Rune>`_
  553. ## * `isUpper proc <#isUpper,Rune>`_
  554. ## * `isAlpha proc <#isAlpha,Rune>`_
  555. ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  556. return isUpper(c) and isLower(c)
  557. proc isWhiteSpace*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  558. ## Returns true if ``c`` is a Unicode whitespace code point.
  559. ##
  560. ## See also:
  561. ## * `isLower proc <#isLower,Rune>`_
  562. ## * `isUpper proc <#isUpper,Rune>`_
  563. ## * `isTitle proc <#isTitle,Rune>`_
  564. ## * `isAlpha proc <#isAlpha,Rune>`_
  565. var c = RuneImpl(c)
  566. var p = binarySearch(c, spaceRanges, len(spaceRanges) div 2, 2)
  567. if p >= 0 and c >= spaceRanges[p] and c <= spaceRanges[p+1]:
  568. return true
  569. proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  570. ## Returns true if ``c`` is a Unicode combining code unit.
  571. ##
  572. ## See also:
  573. ## * `isLower proc <#isLower,Rune>`_
  574. ## * `isUpper proc <#isUpper,Rune>`_
  575. ## * `isTitle proc <#isTitle,Rune>`_
  576. ## * `isAlpha proc <#isAlpha,Rune>`_
  577. var c = RuneImpl(c)
  578. # Optimized to return false immediately for ASCII
  579. return c >= 0x0300 and (c <= 0x036f or
  580. (c >= 0x1ab0 and c <= 0x1aff) or
  581. (c >= 0x1dc0 and c <= 0x1dff) or
  582. (c >= 0x20d0 and c <= 0x20ff) or
  583. (c >= 0xfe20 and c <= 0xfe2f))
  584. template runeCheck(s, runeProc) =
  585. ## Common code for isAlpha and isSpace.
  586. result = if len(s) == 0: false else: true
  587. var
  588. i = 0
  589. rune: Rune
  590. while i < len(s) and result:
  591. fastRuneAt(s, i, rune, doInc = true)
  592. result = runeProc(rune) and result
  593. proc isAlpha*(s: openArray[char]): bool {.noSideEffect,
  594. rtl, extern: "nuc$1Str".} =
  595. ## Returns true if ``s`` contains all alphabetic runes.
  596. runnableExamples:
  597. let a = "añyóng"
  598. doAssert a.isAlpha
  599. runeCheck(s, isAlpha)
  600. proc isSpace*(s: openArray[char]): bool {.noSideEffect,
  601. rtl, extern: "nuc$1Str".} =
  602. ## Returns true if ``s`` contains all whitespace runes.
  603. runnableExamples:
  604. let a = "\t\l \v\r\f"
  605. doAssert a.isSpace
  606. runeCheck(s, isWhiteSpace)
  607. template convertRune(s, runeProc) =
  608. ## Convert runes in ``s`` using ``runeProc`` as the converter.
  609. result = newString(len(s))
  610. var
  611. i = 0
  612. resultIndex = 0
  613. rune: Rune
  614. while i < len(s):
  615. fastRuneAt(s, i, rune, doInc = true)
  616. rune = runeProc(rune)
  617. fastToUTF8Copy(rune, result, resultIndex, doInc = true)
  618. proc toUpper*(s: openArray[char]): string {.noSideEffect,
  619. rtl, extern: "nuc$1Str".} =
  620. ## Converts ``s`` into upper-case runes.
  621. runnableExamples:
  622. doAssert toUpper("abγ") == "ABΓ"
  623. convertRune(s, toUpper)
  624. proc toLower*(s: openArray[char]): string {.noSideEffect,
  625. rtl, extern: "nuc$1Str".} =
  626. ## Converts ``s`` into lower-case runes.
  627. runnableExamples:
  628. doAssert toLower("ABΓ") == "abγ"
  629. convertRune(s, toLower)
  630. proc swapCase*(s: openArray[char]): string {.noSideEffect,
  631. rtl, extern: "nuc$1".} =
  632. ## Swaps the case of runes in ``s``.
  633. ##
  634. ## Returns a new string such that the cases of all runes
  635. ## are swapped if possible.
  636. runnableExamples:
  637. doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA"
  638. var
  639. i = 0
  640. resultIndex = 0
  641. rune: Rune
  642. result = newString(len(s))
  643. while i < len(s):
  644. fastRuneAt(s, i, rune)
  645. if rune.isUpper():
  646. rune = rune.toLower()
  647. elif rune.isLower():
  648. rune = rune.toUpper()
  649. fastToUTF8Copy(rune, result, resultIndex, doInc = true)
  650. proc capitalize*(s: openArray[char]): string {.noSideEffect,
  651. rtl, extern: "nuc$1".} =
  652. ## Converts the first character of ``s`` into an upper-case rune.
  653. runnableExamples:
  654. doAssert capitalize("βeta") == "Βeta"
  655. if len(s) == 0:
  656. return ""
  657. var
  658. rune: Rune
  659. i = 0
  660. fastRuneAt(s, i, rune, doInc = true)
  661. result = $toUpper(rune) & substr(s.toOpenArray(i, s.high))
  662. when not defined(nimHasEffectsOf):
  663. {.pragma: effectsOf.}
  proc translate*(s: openArray[char], replacements: proc(key: string): string): string {.
    rtl, extern: "nuc$1", effectsOf: replacements.} =
    ## Rewrites ``s`` word by word: every maximal run of non-whitespace runes
    ## is passed to ``replacements`` and its return value is emitted instead.
    ## Whitespace runes are copied through unchanged.
    ##
    ## ``replacements`` is any proc that takes a word and returns
    ## a new word to fill its place.
    runnableExamples:
      proc wordToNumber(s: string): string =
        case s
        of "one": "1"
        of "two": "2"
        else: s
      let a = "one two three four"
      doAssert a.translate(wordToNumber) == "1 2 three four"

    # Start with room for the input's byte length; the buffer only has to
    # grow when the replacements make the text longer.
    result = newStringOfCap(s.len)
    var
      pos = 0            # byte offset of the next rune to decode
      runeStart = 0      # byte offset where the current rune began
      wordBegin = 0      # byte offset where the pending word began
      insideWord = false
      current: Rune

    while pos < s.len:
      runeStart = pos
      fastRuneAt(s, pos, current)

      if current.isWhiteSpace():
        if insideWord:
          # The pending word ends right before this whitespace rune.
          let word = substr(s.toOpenArray(wordBegin, runeStart - 1))
          result.add(replacements(word))
          insideWord = false
        result.add($current)
      elif not insideWord:
        # First rune of a new word: remember where it starts.
        insideWord = true
        wordBegin = runeStart

    if insideWord and wordBegin < s.len:
      # The input ended in the middle of a word; flush it.
      let word = substr(s.toOpenArray(wordBegin, s.high))
      result.add(replacements(word))
  proc title*(s: openArray[char]): string {.noSideEffect,
    rtl, extern: "nuc$1".} =
    ## Title-cases ``s``.
    ##
    ## The first rune of every whitespace-separated word is upper-cased;
    ## all other runes are copied as they are.
    runnableExamples:
      doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma"

    result = newString(s.len)
    var
      readPos = 0
      writePos = 0
      atWordStart = true
      current: Rune

    while readPos < s.len:
      fastRuneAt(s, readPos, current)
      if current.isWhiteSpace():
        # The next non-whitespace rune begins a new word.
        atWordStart = true
      elif atWordStart:
        current = current.toUpper()
        atWordStart = false
      fastToUTF8Copy(current, result, writePos, doInc = true)
  iterator runes*(s: openArray[char]): Rune =
    ## Yields every rune of ``s`` in order, decoding from the first byte
    ## to the last.
    var
      offset = 0
      current: Rune
    while offset < s.len:
      fastRuneAt(s, offset, current, true)
      yield current
  iterator utf8*(s: openArray[char]): string =
    ## Yields each rune of ``s`` as its own UTF-8 encoded string.
    ##
    ## See also:
    ## * `validateUtf8 proc <#validateUtf8,string>`_
    ## * `toUTF8 proc <#toUTF8,Rune>`_
    ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
    ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
    var start = 0
    while start < s.len:
      # ``stop`` is the offset just past the rune that begins at ``start``.
      let stop = start + runeLenAt(s, start)
      yield substr(s.toOpenArray(start, stop - 1))
      start = stop
  proc toRunes*(s: openArray[char]): seq[Rune] =
    ## Decodes ``s`` and returns its runes as a sequence, in order.
    ##
    ## See also:
    ## * `$ proc <#$,Rune>`_ for a reverse operation
    runnableExamples:
      let a = toRunes("aáä")
      doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)]

    result = @[]
    for decoded in runes(s):
      add(result, decoded)
  proc cmpRunesIgnoreCase*(a, b: openArray[char]): int {.rtl, extern: "nuc$1".} =
    ## Compares two UTF-8 strings rune by rune and ignores the case. Returns:
    ##
    ## | 0 if a == b
    ## | < 0 if a < b
    ## | > 0 if a > b
    ##
    ## Both runes of each pair are lower-cased with ``toLower`` before they
    ## are compared. If one input is a case-insensitive prefix of the other,
    ## the shorter input sorts first.
    var
      i = 0
      j = 0
      ar, br: Rune
    while i < a.len and j < b.len:
      fastRuneAt(a, i, ar)
      fastRuneAt(b, j, br)
      result = int(toLower(ar)) - int(toLower(br))
      if result != 0: return
    # Compare what is left unread rather than the total byte lengths: two
    # runes that are equal ignoring case can have UTF-8 encodings of
    # different sizes, which must not make otherwise equal inputs differ.
    result = (a.len - i) - (b.len - j)
  proc reversed*(s: openArray[char]): string =
    ## Returns the reverse of ``s``, interpreting it as runes.
    ##
    ## Unicode combining characters are correctly interpreted as well: a
    ## rune and the combining runes that directly follow it are moved as one
    ## unit, so the bytes inside such a unit keep their order.
    runnableExamples:
      assert reversed("Reverse this!") == "!siht esreveR"
      assert reversed("先秦兩漢") == "漢兩秦先"
      assert reversed("as⃝df̅") == "f̅ds⃝a"
      assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
      # input that starts with a combining rune keeps all of its bytes
      assert reversed("\u0301a") == "a\u0301"

    var
      i = 0
      lastI = 0
      newPos = len(s) - 1  # next byte of ``result`` to fill, moving backwards
      blockPos = -1        # index of the last byte of ``s`` already copied;
                           # -1 means nothing copied yet, so byte 0 is not
                           # skipped when ``s`` begins with a combining rune
      r: Rune

    template reverseUntil(pos) =
      # Copies the pending bytes ``s[blockPos+1 .. pos-1]`` to the end of the
      # still unfilled part of ``result``, preserving their byte order.
      var j = pos - 1
      while j > blockPos:
        result[newPos] = s[j]
        dec j
        dec newPos
      blockPos = pos - 1

    result = newString(len(s))

    while i < len(s):
      lastI = i
      fastRuneAt(s, i, r, true)
      if not isCombining(r):
        # A non-combining rune starts at ``lastI``: flush what precedes it.
        reverseUntil(lastI)

    # Flush the final unit.
    reverseUntil(len(s))
  proc graphemeLen*(s: openArray[char]; i: Natural): Natural =
    ## Number of bytes taken by the rune decoded at byte index ``i`` plus
    ## all combining runes that directly follow it. Returns 0 when ``i`` is
    ## not inside ``s``.
    runnableExamples:
      let a = "añyóng"
      doAssert a.graphemeLen(1) == 2 ## ñ
      doAssert a.graphemeLen(2) == 1
      doAssert a.graphemeLen(4) == 2 ## ó

    var
      cursor = i.int
      base, follower: Rune
    if cursor >= s.len:
      return
    fastRuneAt(s, cursor, base, true)
    result = cursor - i
    while cursor < s.len:
      fastRuneAt(s, cursor, follower, true)
      if isCombining(follower):
        # The combining rune belongs to the unit; extend the length.
        result = cursor - i
      else:
        break
  proc lastRune*(s: openArray[char]; last: int): (Rune, int) =
    ## Decodes the rune that ends at byte index ``last``, i.e. the last rune
    ## of ``s[0..last]``. Returns that rune together with its length in bytes.
    if s[last] <= chr(127):
      return (Rune(s[last]), 1)
    # Step left over UTF-8 continuation bytes (bit pattern ``10xxxxxx``).
    var back = 0
    while last - back >= 0 and uint(s[last - back]) shr 6 == 0b10:
      inc(back)
    var decoded: Rune
    fastRuneAt(s, last - back, decoded, false)
    result = (decoded, back + 1)
  proc size*(r: Rune): int {.noSideEffect.} =
    ## Returns how many bytes the rune ``r`` takes, chosen from the numeric
    ## range its value falls into (1 to 6); values above ``0x7FFFFFFF``
    ## report 1.
    runnableExamples:
      let a = toRunes "aá"
      doAssert size(a[0]) == 1
      doAssert size(a[1]) == 2

    let code = r.uint32
    result =
      if code <= 0x007F'u32: 1
      elif code <= 0x07FF'u32: 2
      elif code <= 0xFFFF'u32: 3
      elif code <= 0x1FFFFF'u32: 4
      elif code <= 0x3FFFFFF'u32: 5
      elif code <= 0x7FFFFFFF'u32: 6
      else: 1
  # --------- Private helpers for the different split separators -----------
  proc stringHasSep(s: openArray[char], index: int, seps: openArray[Rune]): bool =
    ## True when the rune decoded at byte ``index`` of ``s`` is one of ``seps``.
    var current: Rune
    fastRuneAt(s, index, current, false)
    result = current in seps
  proc stringHasSep(s: openArray[char], index: int, sep: Rune): bool =
    ## True when the rune decoded at byte ``index`` of ``s`` equals ``sep``.
    var current: Rune
    fastRuneAt(s, index, current, false)
    result = current == sep
  template splitCommon(s, sep, maxsplit: untyped) =
    ## Common code for split procedures.
    ##
    ## Expands inside an iterator body and yields the pieces of ``s`` that
    ## lie between separators. ``sep`` is whatever ``stringHasSep`` accepts
    ## (a single ``Rune`` or a list of runes). An empty ``s`` yields nothing.
    let
      sLen = len(s)
    var
      last = 0
      # Remaining number of splits. It is decremented after every piece; a
      # negative ``maxsplit`` never reaches zero, so it imposes no limit.
      splits = maxsplit
    if sLen > 0:
      while last <= sLen:
        var first = last
        # Advance rune by rune until a separator or the end of the input.
        while last < sLen and not stringHasSep(s, last, sep):
          inc(last, runeLenAt(s, last))
        # Split budget used up: the rest of the input becomes the last piece.
        if splits == 0: last = sLen
        yield substr(s.toOpenArray(first, (last - 1)))
        if splits == 0: break
        dec(splits)
        # Step over the separator rune. At the end of the input, step by one
        # so that ``last`` exceeds ``sLen`` and the outer loop terminates.
        inc(last, if last < sLen: runeLenAt(s, last) else: 1)
  iterator split*(s: openArray[char], seps: openArray[Rune] = unicodeSpaces,
                  maxsplit: int = -1): string =
    ## Splits the unicode string ``s`` into substrings using a group of separators.
    ##
    ## Every rune contained in ``seps`` ends the current substring, so two
    ## adjacent separators produce an empty substring between them.
    ## ``maxsplit`` is forwarded to the shared ``splitCommon`` template
    ## together with ``s`` and ``seps``.
    runnableExamples:
      import std/sequtils

      assert toSeq("hÃllo\lthis\lis an\texample\l是".split) ==
        @["hÃllo", "this", "is", "an", "example", "是"]

      # And the following code splits the same string using a sequence of Runes.
      assert toSeq(split("añyóng:hÃllo;是$example", ";:$".toRunes)) ==
        @["añyóng", "hÃllo", "是", "example"]

      # example with a `Rune` separator and unused one `;`:
      assert toSeq(split("ab是de:f:", ";:是".toRunes)) == @["ab", "de", "f", ""]

      # Another example that splits a string containing a date.
      let date = "2012-11-20T22:08:08.398990"

      assert toSeq(split(date, " -:T".toRunes)) ==
        @["2012", "11", "20", "22", "08", "08.398990"]

    splitCommon(s, seps, maxsplit)
  iterator splitWhitespace*(s: openArray[char]): string =
    ## Splits a unicode string at whitespace runes.
    ##
    ## Uses ``unicodeSpaces`` as the separator set and passes ``-1`` as the
    ## split limit to the shared ``splitCommon`` template.
    splitCommon(s, unicodeSpaces, -1)
  template accResult(iter: untyped) =
    ## Resets the enclosing routine's ``result`` to an empty sequence and
    ## appends every item produced by ``iter`` to it.
    result = @[]
    for item in iter:
      result.add(item)
  proc splitWhitespace*(s: openArray[char]): seq[string] {.noSideEffect,
    rtl, extern: "ncuSplitWhitespace".} =
    ## Collects everything the `splitWhitespace <#splitWhitespace.i,string>`_
    ## iterator yields for ``s`` into a sequence of substrings.
    result = @[]
    for piece in splitWhitespace(s):
      add(result, piece)
  iterator split*(s: openArray[char], sep: Rune, maxsplit: int = -1): string =
    ## Splits the unicode string ``s`` into substrings using a single separator.
    ## Substrings are separated by the rune ``sep``.
    ##
    ## Adjacent separators produce empty substrings, as the example shows.
    ## ``maxsplit`` is forwarded to the shared ``splitCommon`` template.
    runnableExamples:
      import std/sequtils

      assert toSeq(split(";;hÃllo;this;is;an;;example;;;是", ";".runeAt(0))) ==
        @["", "", "hÃllo", "this", "is", "an", "", "example", "", "", "是"]

    splitCommon(s, sep, maxsplit)
  proc split*(s: openArray[char], seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1):
      seq[string] {.noSideEffect, rtl, extern: "nucSplitRunes".} =
    ## Sequence-returning form of the
    ## `split iterator <#split.i,string,openArray[Rune],int>`_: gathers every
    ## substring that iterator yields for the same arguments.
    result = @[]
    for piece in split(s, seps, maxsplit):
      add(result, piece)
  proc split*(s: openArray[char], sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect,
    rtl, extern: "nucSplitRune".} =
    ## Sequence-returning form of the
    ## `split iterator <#split.i,string,Rune,int>`_: gathers every substring
    ## that iterator yields for the same arguments.
    result = @[]
    for piece in split(s, sep, maxsplit):
      add(result, piece)
  proc strip*(s: openArray[char], leading = true, trailing = true,
              runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect,
              rtl, extern: "nucStrip".} =
    ## Strips leading or trailing ``runes`` from ``s`` and returns
    ## the resulting string.
    ##
    ## If ``leading`` is true (default), leading ``runes`` are stripped.
    ## If ``trailing`` is true (default), trailing ``runes`` are stripped.
    ## If both are false, the string is returned unchanged.
    ##
    ## An input that consists only of ``runes`` becomes ``""`` as soon as
    ## either ``leading`` or ``trailing`` is true.
    runnableExamples:
      let a = "\táñyóng "
      doAssert a.strip == "áñyóng"
      doAssert a.strip(leading = false) == "\táñyóng"
      doAssert a.strip(trailing = false) == "áñyóng "
      doAssert " \t ".strip(leading = false) == ""

    var
      sI = 0          # index of the first byte to keep
      eI = len(s) - 1 # index of the last byte to keep

    if leading:
      var
        i = 0
        xI: int # byte offset at which the rune of this iteration starts
        rune: Rune
      while i < len(s):
        xI = i
        fastRuneAt(s, i, rune)
        sI = i # Assume to start from next rune
        if not runes.contains(rune):
          sI = xI # Go back to where the current rune starts
          break

    if trailing:
      var
        i = eI
        xI: int
        rune: Rune
        keepFound = false # set once a rune that must be kept was seen
      while i >= 0:
        xI = i
        # Decode at ``i``; ``xI`` becomes the offset just past that rune.
        fastRuneAt(s, xI, rune)
        # ``i`` may point into the middle of a multi-byte rune. Try earlier
        # start offsets for as long as decoding there still reaches ``xI``.
        var yI = i - 1
        while yI >= 0:
          var
            yIend = yI
            pRune: Rune
          fastRuneAt(s, yIend, pRune)
          if yIend < xI: break
          i = yI
          rune = pRune
          dec(yI)
        if not runes.contains(rune):
          eI = xI - 1
          keepFound = true
          break
        dec(i)
      if not keepFound:
        # Every rune was strippable. Without this, ``eI`` would still point
        # at the last byte and, with ``leading = false``, the whole input
        # would be returned unchanged.
        eI = sI - 1

    let newLen = eI - sI + 1
    result = newStringOfCap(newLen)
    if newLen > 0:
      result.add substr(s.toOpenArray(sI, eI))
  proc repeat*(c: Rune, count: Natural): string {.noSideEffect,
    rtl, extern: "nucRepeatRune".} =
    ## Builds a string made of the rune ``c`` repeated ``count`` times.
    ##
    ## The returned string will have a rune-length of ``count``.
    runnableExamples:
      let a = "ñ".runeAt(0)
      doAssert a.repeat(5) == "ñññññ"

    let encoded = $c
    result = newStringOfCap(count * encoded.len)
    var remaining: int = count
    while remaining > 0:
      result.add encoded
      dec remaining
  proc align*(s: openArray[char], count: Natural, padding = ' '.Rune): string {.
    noSideEffect, rtl, extern: "nucAlignString".} =
    ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length
    ## of ``count``.
    ##
    ## ``padding`` characters (by default spaces) are added before ``s`` resulting in
    ## right alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is
    ## returned unchanged. If you need to left align a string use the `alignLeft
    ## proc <#alignLeft,string,Natural>`_.
    runnableExamples:
      assert align("abc", 4) == " abc"
      assert align("a", 0) == "a"
      assert align("1232", 6) == "  1232"
      assert align("1232", 6, '#'.Rune) == "##1232"
      assert align("Åge", 5) == "  Åge"
      assert align("×", 4, '_'.Rune) == "___×"

    let sLen = s.runeLen
    if sLen < count:
      let
        padStr = $padding
        spaces = count - sLen
      # Reserve exactly what is needed: the bytes of ``s`` plus the bytes of
      # the padding runes (the same formula ``alignLeft`` uses).
      result = newStringOfCap(s.len + spaces * padStr.len)
      for i in 0 ..< spaces: result.add padStr
      result.add s
    else:
      result = s.substr
  proc alignLeft*(s: openArray[char], count: Natural, padding = ' '.Rune): string {.
      noSideEffect.} =
    ## Left-aligns a unicode string ``s`` with ``padding``, so that it has a
    ## rune-length of ``count``.
    ##
    ## ``padding`` characters (by default spaces) are added after ``s`` resulting in
    ## left alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is
    ## returned unchanged. If you need to right align a string use the `align
    ## proc <#align,string,Natural>`_.
    runnableExamples:
      assert alignLeft("abc", 4) == "abc "
      assert alignLeft("a", 0) == "a"
      assert alignLeft("1232", 6) == "1232  "
      assert alignLeft("1232", 6, '#'.Rune) == "1232##"
      assert alignLeft("Åge", 5) == "Åge  "
      assert alignLeft("×", 4, '_'.Rune) == "×___"

    let sLen = s.runeLen
    if sLen < count:
      let padStr = $padding
      # Bytes of ``s`` plus the bytes of the missing padding runes.
      result = newStringOfCap(s.len + (count - sLen) * padStr.len)
      result.add s
      for i in sLen ..< count:
        result.add padStr
    else:
      result = s.substr
  proc runeLen*(s: string): int {.inline.} =
    ## Counts the runes in the string ``s`` by forwarding to the
    ## ``openArray[char]`` overload.
    runnableExamples:
      let a = "añyóng"
      doAssert a.runeLen == 6
      ## note: a.len == 8
    result = runeLen(toOa(s))
  proc runeLenAt*(s: string, i: Natural): int {.inline.} =
    ## Byte length of the rune that starts at ``s[i]``; forwards to the
    ## ``openArray[char]`` overload.
    ##
    ## See also:
    ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
    runnableExamples:
      let a = "añyóng"
      doAssert a.runeLenAt(0) == 1
      doAssert a.runeLenAt(1) == 2
    result = runeLenAt(toOa(s), i)
  proc runeAt*(s: string, i: Natural): Rune {.inline.} =
    ## Decodes and returns the rune found at **byte index** ``i`` of ``s``.
    ##
    ## See also:
    ## * `runeAtPos proc <#runeAtPos,string,int>`_
    ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
    ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
    runnableExamples:
      let a = "añyóng"
      doAssert a.runeAt(1) == "ñ".runeAt(0)
      doAssert a.runeAt(2) == "ñ".runeAt(1)
      doAssert a.runeAt(3) == "y".runeAt(0)
    # ``doInc = false``: only decode, do not advance the index.
    fastRuneAt(s, i, result, doInc = false)
  proc validateUtf8*(s: string): int {.inline.} =
    ## Checks whether ``s`` holds valid UTF-8 data. Returns the position of
    ## the invalid byte if it does not, and ``-1`` otherwise.
    ##
    ## See also:
    ## * `toUTF8 proc <#toUTF8,Rune>`_
    ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
    ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
    result = validateUtf8(toOa(s))
  proc runeOffset*(s: string, pos: Natural, start: Natural = 0): int {.inline.} =
    ## Byte position of the rune at rune position ``pos`` in ``s``, with an
    ## optional start byte position ``start``.
    ## Returns the special value -1 if it runs out of the string.
    ##
    ## **Beware:** This can lead to unoptimized code and slow execution!
    ## Most problems can be solved more efficiently by using an iterator
    ## or conversion to a seq of Rune.
    ##
    ## See also:
    ## * `runeReverseOffset proc <#runeReverseOffset,string,Positive>`_
    runnableExamples:
      let a = "añyóng"
      doAssert a.runeOffset(1) == 1
      doAssert a.runeOffset(3) == 4
      doAssert a.runeOffset(4) == 6
    result = runeOffset(toOa(s), pos, start)
  proc runeReverseOffset*(s: string, rev: Positive): (int, int) {.inline.} =
    ## Returns a tuple holding the byte offset of the rune at position
    ## ``rev`` in ``s``, counted from the end (starting with 1), and the
    ## total number of runes in the string.
    ##
    ## The offset is negative if there are too few runes in the string to
    ## satisfy the request.
    ##
    ## **Beware:** This can lead to unoptimized code and slow execution!
    ## Most problems can be solved more efficiently by using an iterator
    ## or conversion to a seq of Rune.
    ##
    ## See also:
    ## * `runeOffset proc <#runeOffset,string,Natural,Natural>`_
    result = runeReverseOffset(toOa(s), rev)
  proc runeAtPos*(s: string, pos: int): Rune {.inline.} =
    ## Returns the rune at position ``pos``.
    ##
    ## **Beware:** This can lead to unoptimized code and slow execution!
    ## Most problems can be solved more efficiently by using an iterator
    ## or conversion to a seq of Rune.
    ##
    ## See also:
    ## * `runeAt proc <#runeAt,string,Natural>`_
    ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
    ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
    # ``fastRuneAt`` is a template, so an expression passed as its index is
    # substituted at every use inside it. Compute the byte offset once so the
    # linear ``runeOffset`` scan is not repeated.
    let offset = runeOffset(s, pos)
    fastRuneAt(toOa(s), offset, result, false)
  proc runeStrAtPos*(s: string, pos: Natural): string {.inline.} =
    ## Returns the rune at rune position ``pos`` as a UTF-8 string.
    ##
    ## **Beware:** This can lead to unoptimized code and slow execution!
    ## Most problems can be solved more efficiently by using an iterator
    ## or conversion to a seq of Rune.
    ##
    ## See also:
    ## * `runeAt proc <#runeAt,string,Natural>`_
    ## * `runeAtPos proc <#runeAtPos,string,int>`_
    ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
    let first = runeOffset(s, pos)
    # Index of the last byte that belongs to the rune starting at ``first``.
    let last = first + runeLenAt(s, first) - 1
    result = substr(s.toOpenArray(first, last))
  proc runeSubStr*(s: string, pos: int, len: int = int.high): string {.inline.} =
    ## Returns the UTF-8 substring starting at code point ``pos``
    ## with ``len`` code points.
    ##
    ## If ``pos`` or ``len`` is negative they count from
    ## the end of the string. If ``len`` is not given it means the longest
    ## possible string.
    runnableExamples:
      # Note the two spaces after "Hänsel": they put ':' at rune position 10.
      let s = "Hänsel  ««: 10,00€"
      doAssert(runeSubStr(s, 0, 2) == "Hä")
      doAssert(runeSubStr(s, 10, 1) == ":")
      doAssert(runeSubStr(s, -6) == "10,00€")
      doAssert(runeSubStr(s, 10) == ": 10,00€")
      doAssert(runeSubStr(s, 12, 5) == "10,00")
      doAssert(runeSubStr(s, -6, 3) == "10,")
    runeSubStr(toOa(s), pos, len)
  proc isAlpha*(s: string): bool {.noSideEffect, inline.} =
    ## Returns true if ``s`` contains all alphabetic runes; forwards to the
    ## ``openArray[char]`` overload.
    runnableExamples:
      let a = "añyóng"
      doAssert a.isAlpha
    result = isAlpha(toOa(s))
  proc isSpace*(s: string): bool {.noSideEffect, inline.} =
    ## Returns true if ``s`` contains all whitespace runes; forwards to the
    ## ``openArray[char]`` overload.
    runnableExamples:
      let a = "\t\l \v\r\f"
      doAssert a.isSpace
    result = isSpace(toOa(s))
  proc toUpper*(s: string): string {.noSideEffect, inline.} =
    ## Returns ``s`` converted into upper-case runes; forwards to the
    ## ``openArray[char]`` overload.
    runnableExamples:
      doAssert toUpper("abγ") == "ABΓ"
    result = toUpper(toOa(s))
  proc toLower*(s: string): string {.noSideEffect, inline.} =
    ## Returns ``s`` converted into lower-case runes; forwards to the
    ## ``openArray[char]`` overload.
    runnableExamples:
      doAssert toLower("ABΓ") == "abγ"
    result = toLower(toOa(s))
  proc swapCase*(s: string): string {.noSideEffect, inline.} =
    ## Swaps the case of runes in ``s``.
    ##
    ## The result is a new string in which the case of every rune is
    ## swapped where possible.
    runnableExamples:
      doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA"
    result = swapCase(toOa(s))
  proc capitalize*(s: string): string {.noSideEffect.} =
    ## Returns ``s`` with its first character converted into an upper-case
    ## rune; forwards to the ``openArray[char]`` overload.
    runnableExamples:
      doAssert capitalize("βeta") == "Βeta"
    result = capitalize(toOa(s))
  proc translate*(s: string, replacements: proc(key: string): string): string {.effectsOf: replacements, inline.} =
    ## Translates the words of a string, using the ``replacements`` proc to
    ## substitute each word inside ``s`` with its replacement.
    ##
    ## ``replacements`` is any proc that takes a word and returns
    ## a new word to fill its place.
    runnableExamples:
      proc wordToNumber(s: string): string =
        case s
        of "one": "1"
        of "two": "2"
        else: s
      let a = "one two three four"
      doAssert a.translate(wordToNumber) == "1 2 three four"
    result = translate(toOa(s), replacements)
  proc title*(s: string): string {.noSideEffect, inline.} =
    ## Converts ``s`` to a unicode title.
    ##
    ## The result is a new string in which the first character of each
    ## word inside ``s`` is capitalized.
    runnableExamples:
      doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma"
    result = title(toOa(s))
  iterator runes*(s: string): Rune =
    ## Yields every rune of the string ``s``, in order, by forwarding to the
    ## ``openArray[char]`` iterator.
    for decoded in runes(toOa(s)):
      yield decoded
  iterator utf8*(s: string): string =
    ## Yields each rune of the string ``s`` as a UTF-8 encoded string, by
    ## forwarding to the ``openArray[char]`` iterator.
    ##
    ## See also:
    ## * `validateUtf8 proc <#validateUtf8,string>`_
    ## * `toUTF8 proc <#toUTF8,Rune>`_
    ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
    ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
    for encoded in utf8(toOa(s)):
      yield encoded
  proc toRunes*(s: string): seq[Rune] {.inline.} =
    ## Returns a sequence containing the Runes in ``s``; forwards to the
    ## ``openArray[char]`` overload.
    ##
    ## See also:
    ## * `$ proc <#$,Rune>`_ for a reverse operation
    runnableExamples:
      let a = toRunes("aáä")
      doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)]
    result = toRunes(toOa(s))
  proc cmpRunesIgnoreCase*(a, b: string): int {.inline.} =
    ## Case-insensitive comparison of two UTF-8 strings; forwards to the
    ## ``openArray[char]`` overload. Returns:
    ##
    ## | 0 if a == b
    ## | < 0 if a < b
    ## | > 0 if a > b
    result = cmpRunesIgnoreCase(a.toOa(), b.toOa())
  proc reversed*(s: string): string {.inline.} =
    ## Returns ``s`` reversed rune by rune; forwards to the
    ## ``openArray[char]`` overload.
    ##
    ## Unicode combining characters are correctly interpreted as well.
    runnableExamples:
      assert reversed("Reverse this!") == "!siht esreveR"
      assert reversed("先秦兩漢") == "漢兩秦先"
      assert reversed("as⃝df̅") == "f̅ds⃝a"
      assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
    result = reversed(toOa(s))
  proc graphemeLen*(s: string; i: Natural): Natural {.inline.} =
    ## The number of bytes belonging to byte index ``s[i]``, including any
    ## following combining code unit; forwards to the ``openArray[char]``
    ## overload.
    runnableExamples:
      let a = "añyóng"
      doAssert a.graphemeLen(1) == 2 ## ñ
      doAssert a.graphemeLen(2) == 1
      doAssert a.graphemeLen(4) == 2 ## ó
    result = graphemeLen(toOa(s), i)
  proc lastRune*(s: string; last: int): (Rune, int) {.inline.} =
    ## Returns the last rune of ``s[0..last]`` together with its length in
    ## bytes; forwards to the ``openArray[char]`` overload.
    result = lastRune(toOa(s), last)
  iterator split*(s: string, seps: openArray[Rune] = unicodeSpaces,
                  maxsplit: int = -1): string =
    ## Splits the unicode string ``s`` into substrings using a group of separators.
    ##
    ## Every rune contained in ``seps`` ends the current substring, so two
    ## adjacent separators produce an empty substring between them.
    ## ``maxsplit`` is forwarded to the shared ``splitCommon`` template
    ## together with ``seps`` and ``toOa(s)``.
    runnableExamples:
      import std/sequtils

      assert toSeq("hÃllo\lthis\lis an\texample\l是".split) ==
        @["hÃllo", "this", "is", "an", "example", "是"]

      # And the following code splits the same string using a sequence of Runes.
      assert toSeq(split("añyóng:hÃllo;是$example", ";:$".toRunes)) ==
        @["añyóng", "hÃllo", "是", "example"]

      # example with a `Rune` separator and unused one `;`:
      assert toSeq(split("ab是de:f:", ";:是".toRunes)) == @["ab", "de", "f", ""]

      # Another example that splits a string containing a date.
      let date = "2012-11-20T22:08:08.398990"

      assert toSeq(split(date, " -:T".toRunes)) ==
        @["2012", "11", "20", "22", "08", "08.398990"]

    splitCommon(toOa(s), seps, maxsplit)
  iterator splitWhitespace*(s: string): string =
    ## Splits a unicode string at whitespace runes.
    ##
    ## Uses ``unicodeSpaces`` as the separator set and passes ``-1`` as the
    ## split limit to the shared ``splitCommon`` template.
    splitCommon(s.toOa(), unicodeSpaces, -1)
  proc splitWhitespace*(s: string): seq[string] {.noSideEffect, inline.}=
    ## Collects everything the `splitWhitespace <#splitWhitespace.i,string>`_
    ## iterator yields for ``s`` into a sequence of substrings.
    result = @[]
    for piece in splitWhitespace(toOa(s)):
      add(result, piece)
  iterator split*(s: string, sep: Rune, maxsplit: int = -1): string =
    ## Splits the unicode string ``s`` into substrings using a single separator.
    ## Substrings are separated by the rune ``sep``.
    ##
    ## Adjacent separators produce empty substrings, as the example shows.
    ## ``maxsplit`` is forwarded to the shared ``splitCommon`` template.
    runnableExamples:
      import std/sequtils

      assert toSeq(split(";;hÃllo;this;is;an;;example;;;是", ";".runeAt(0))) ==
        @["", "", "hÃllo", "this", "is", "an", "", "example", "", "", "是"]

    splitCommon(toOa(s), sep, maxsplit)
  proc split*(s: string, seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1):
      seq[string] {.noSideEffect, inline.} =
    ## Sequence-returning form of the
    ## `split iterator <#split.i,string,openArray[Rune],int>`_: gathers every
    ## substring that iterator yields for the same arguments.
    result = @[]
    for piece in split(toOa(s), seps, maxsplit):
      add(result, piece)
  proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect, inline.} =
    ## Sequence-returning form of the
    ## `split iterator <#split.i,string,Rune,int>`_: gathers every substring
    ## that iterator yields for the same arguments.
    result = @[]
    for piece in split(toOa(s), sep, maxsplit):
      add(result, piece)
  proc strip*(s: string, leading = true, trailing = true,
              runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect, inline.} =
    ## Returns ``s`` with leading and/or trailing ``runes`` removed; forwards
    ## to the ``openArray[char]`` overload.
    ##
    ## If ``leading`` is true (default), leading ``runes`` are stripped.
    ## If ``trailing`` is true (default), trailing ``runes`` are stripped.
    ## If both are false, the string is returned unchanged.
    runnableExamples:
      let a = "\táñyóng "
      doAssert a.strip == "áñyóng"
      doAssert a.strip(leading = false) == "\táñyóng"
      doAssert a.strip(trailing = false) == "áñyóng "
    result = strip(toOa(s), leading, trailing, runes)
  proc align*(s: string, count: Natural, padding = ' '.Rune): string {.noSideEffect, inline.} =
    ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length
    ## of ``count``.
    ##
    ## ``padding`` characters (by default spaces) are added before ``s`` resulting in
    ## right alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is
    ## returned unchanged. If you need to left align a string use the `alignLeft
    ## proc <#alignLeft,string,Natural>`_.
    runnableExamples:
      assert align("abc", 4) == " abc"
      assert align("a", 0) == "a"
      assert align("1232", 6) == "  1232"
      assert align("1232", 6, '#'.Rune) == "##1232"
      assert align("Åge", 5) == "  Åge"
      assert align("×", 4, '_'.Rune) == "___×"
    align(toOa(s), count, padding)
  proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {.noSideEffect, inline.} =
    ## Left-aligns a unicode string ``s`` with ``padding``, so that it has a
    ## rune-length of ``count``.
    ##
    ## ``padding`` characters (by default spaces) are added after ``s`` resulting in
    ## left alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is
    ## returned unchanged. If you need to right align a string use the `align
    ## proc <#align,string,Natural>`_.
    runnableExamples:
      assert alignLeft("abc", 4) == "abc "
      assert alignLeft("a", 0) == "a"
      assert alignLeft("1232", 6) == "1232  "
      assert alignLeft("1232", 6, '#'.Rune) == "1232##"
      assert alignLeft("Åge", 5) == "Åge  "
      assert alignLeft("×", 4, '_'.Rune) == "×___"
    alignLeft(toOa(s), count, padding)