highlite.nim 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## Source highlighter for programming or markup languages.
## Currently only a few languages are supported; other languages may be added.
  11. ## The interface supports one language nested in another.
  12. ##
  13. ## You can use this to build your own syntax highlighting, check this example:
  14. ##
##   ```Nim
##   let code = """for x in $int.high: echo x.ord mod 2 == 0"""
##   var toknizr: GeneralTokenizer
##   initGeneralTokenizer(toknizr, code)
##   while true:
##     getNextToken(toknizr, langNim)
##     case toknizr.kind
##     of gtEof: break  # End Of File (or string)
##     of gtWhitespace:
##       echo gtWhitespace # Maybe you want "visible" whitespaces?.
##       echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
##     of gtOperator:
##       echo gtOperator # Maybe you want Operators to use a specific color?.
##       echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
##     # of gtSomeSymbol: syntaxHighlight("Comic Sans", "bold", "99px", "pink")
##     else:
##       echo toknizr.kind # All the kinds of tokens can be processed here.
##       echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
##   ```
  34. ##
  35. ## The proc `getSourceLanguage` can get the language `enum` from a string:
  36. ## ```Nim
  37. ## for l in ["C", "c++", "jAvA", "Nim", "c#"]: echo getSourceLanguage(l)
  38. ## ```
  39. ##
  40. ## There is also a `Cmd` pseudo-language supported, which is a simple generic
  41. ## shell/cmdline tokenizer (UNIX shell/Powershell/Windows Command):
  42. ## no escaping, no programming language constructs besides variable definition
  43. ## at the beginning of line. It supports these operators:
  44. ## ```Cmd
  45. ## & && | || ( ) '' "" ; # for comments
  46. ## ```
  47. ##
  48. ## Instead of escaping always use quotes like here
  49. ## `nimgrep --ext:'nim|nims' file.name`:cmd: shows how to input ``|``.
  50. ## Any argument that contains ``.`` or ``/`` or ``\`` will be treated
  51. ## as a file or directory.
  52. ##
  53. ## In addition to `Cmd` there is also `Console` language for
  54. ## displaying interactive sessions.
  55. ## Lines with a command should start with ``$``, other lines are considered
  56. ## as program output.
  57. import
  58. std/strutils
  59. from std/algorithm import binarySearch
  60. when defined(nimPreviewSlimSystem):
  61. import std/[assertions, syncio]
  62. type
  63. SourceLanguage* = enum
  64. langNone, langNim, langCpp, langCsharp, langC, langJava,
  65. langYaml, langPython, langCmd, langConsole
  66. TokenClass* = enum
  67. gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber,
  68. gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit,
  69. gtLongStringLit, gtCharLit, gtEscapeSequence, # escape sequence like \xff
  70. gtOperator, gtPunctuation, gtComment, gtLongComment, gtRegularExpression,
  71. gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler,
  72. gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel,
  73. gtReference, gtPrompt, gtProgramOutput, gtProgram, gtOption, gtOther
  74. GeneralTokenizer* = object of RootObj
  75. kind*: TokenClass
  76. start*, length*: int
  77. buf: cstring
  78. pos: int
  79. state: TokenClass
  80. lang: SourceLanguage
  81. const
  82. sourceLanguageToStr*: array[SourceLanguage, string] = ["none",
  83. "Nim", "C++", "C#", "C", "Java", "Yaml", "Python", "Cmd", "Console"]
  84. sourceLanguageToAlpha*: array[SourceLanguage, string] = ["none",
  85. "Nim", "cpp", "csharp", "C", "Java", "Yaml", "Python", "Cmd", "Console"]
  86. ## list of languages spelled with alpabetic characters
  87. tokenClassToStr*: array[TokenClass, string] = ["Eof", "None", "Whitespace",
  88. "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber",
  89. "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit",
  90. "EscapeSequence", "Operator", "Punctuation", "Comment", "LongComment",
  91. "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData",
  92. "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink",
  93. "Label", "Reference", "Prompt", "ProgramOutput",
  94. # start from lower-case if there is a corresponding RST role (see rst.nim)
  95. "program", "option",
  96. "Other"]
  97. # The following list comes from doc/keywords.txt, make sure it is
  98. # synchronized with this array by running the module itself as a test case.
  99. nimKeywords = ["addr", "and", "as", "asm", "bind", "block",
  100. "break", "case", "cast", "concept", "const", "continue", "converter",
  101. "defer", "discard", "distinct", "div", "do",
  102. "elif", "else", "end", "enum", "except", "export",
  103. "finally", "for", "from", "func",
  104. "if", "import", "in", "include",
  105. "interface", "is", "isnot", "iterator", "let", "macro", "method",
  106. "mixin", "mod", "nil", "not", "notin", "object", "of", "or", "out", "proc",
  107. "ptr", "raise", "ref", "return", "shl", "shr", "static",
  108. "template", "try", "tuple", "type", "using", "var", "when", "while",
  109. "xor", "yield"]
  110. proc getSourceLanguage*(name: string): SourceLanguage =
  111. for i in succ(low(SourceLanguage)) .. high(SourceLanguage):
  112. if cmpIgnoreStyle(name, sourceLanguageToStr[i]) == 0:
  113. return i
  114. if cmpIgnoreStyle(name, sourceLanguageToAlpha[i]) == 0:
  115. return i
  116. result = langNone
  117. proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) =
  118. g.buf = buf
  119. g.kind = low(TokenClass)
  120. g.start = 0
  121. g.length = 0
  122. g.state = low(TokenClass)
  123. g.lang = low(SourceLanguage)
  124. g.pos = 0
  125. proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: string) =
  126. initGeneralTokenizer(g, cstring(buf))
  127. proc deinitGeneralTokenizer*(g: var GeneralTokenizer) =
  128. discard
  129. proc nimGetKeyword(id: string): TokenClass =
  130. for k in nimKeywords:
  131. if cmpIgnoreStyle(id, k) == 0: return gtKeyword
  132. result = gtIdentifier
  133. when false:
  134. var i = getIdent(id)
  135. if (i.id >= ord(tokKeywordLow) - ord(tkSymbol)) and
  136. (i.id <= ord(tokKeywordHigh) - ord(tkSymbol)):
  137. result = gtKeyword
  138. else:
  139. result = gtIdentifier
  140. proc nimNumberPostfix(g: var GeneralTokenizer, position: int): int =
  141. var pos = position
  142. if g.buf[pos] == '\'':
  143. inc(pos)
  144. case g.buf[pos]
  145. of 'f', 'F':
  146. g.kind = gtFloatNumber
  147. inc(pos)
  148. if g.buf[pos] in {'0'..'9'}: inc(pos)
  149. if g.buf[pos] in {'0'..'9'}: inc(pos)
  150. of 'i', 'I':
  151. inc(pos)
  152. if g.buf[pos] in {'0'..'9'}: inc(pos)
  153. if g.buf[pos] in {'0'..'9'}: inc(pos)
  154. else:
  155. discard
  156. result = pos
  157. proc nimNumber(g: var GeneralTokenizer, position: int): int =
  158. const decChars = {'0'..'9', '_'}
  159. var pos = position
  160. g.kind = gtDecNumber
  161. while g.buf[pos] in decChars: inc(pos)
  162. if g.buf[pos] == '.':
  163. g.kind = gtFloatNumber
  164. inc(pos)
  165. while g.buf[pos] in decChars: inc(pos)
  166. if g.buf[pos] in {'e', 'E'}:
  167. g.kind = gtFloatNumber
  168. inc(pos)
  169. if g.buf[pos] in {'+', '-'}: inc(pos)
  170. while g.buf[pos] in decChars: inc(pos)
  171. result = nimNumberPostfix(g, pos)
  172. const
  173. OpChars = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
  174. '|', '=', '%', '&', '$', '@', '~', ':'}
  175. proc isKeyword(x: openArray[string], y: string): int =
  176. binarySearch(x, y)
  177. proc nimNextToken(g: var GeneralTokenizer, keywords: openArray[string] = @[]) =
  178. const
  179. hexChars = {'0'..'9', 'A'..'F', 'a'..'f', '_'}
  180. octChars = {'0'..'7', '_'}
  181. binChars = {'0'..'1', '_'}
  182. SymChars = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
  183. var pos = g.pos
  184. g.start = g.pos
  185. if g.state == gtStringLit:
  186. if g.buf[pos] == '\\':
  187. g.kind = gtEscapeSequence
  188. inc(pos)
  189. case g.buf[pos]
  190. of 'x', 'X':
  191. inc(pos)
  192. if g.buf[pos] in hexChars: inc(pos)
  193. if g.buf[pos] in hexChars: inc(pos)
  194. of '0'..'9':
  195. while g.buf[pos] in {'0'..'9'}: inc(pos)
  196. of '\0':
  197. g.state = gtNone
  198. else: inc(pos)
  199. else:
  200. g.kind = gtStringLit
  201. while true:
  202. case g.buf[pos]
  203. of '\\':
  204. break
  205. of '\0', '\r', '\n':
  206. g.state = gtNone
  207. break
  208. of '\"':
  209. inc(pos)
  210. g.state = gtNone
  211. break
  212. else: inc(pos)
  213. else:
  214. case g.buf[pos]
  215. of ' ', '\t'..'\r':
  216. g.kind = gtWhitespace
  217. while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
  218. of '#':
  219. g.kind = gtComment
  220. inc(pos)
  221. var isDoc = false
  222. if g.buf[pos] == '#':
  223. inc(pos)
  224. isDoc = true
  225. if g.buf[pos] == '[' and g.lang == langNim:
  226. g.kind = gtLongComment
  227. var nesting = 0
  228. while true:
  229. case g.buf[pos]
  230. of '\0': break
  231. of '#':
  232. if isDoc:
  233. if g.buf[pos+1] == '#' and g.buf[pos+2] == '[':
  234. inc nesting
  235. elif g.buf[pos+1] == '[':
  236. inc nesting
  237. inc pos
  238. of ']':
  239. if isDoc:
  240. if g.buf[pos+1] == '#' and g.buf[pos+2] == '#':
  241. if nesting == 0:
  242. inc(pos, 3)
  243. break
  244. dec nesting
  245. elif g.buf[pos+1] == '#':
  246. if nesting == 0:
  247. inc(pos, 2)
  248. break
  249. dec nesting
  250. inc pos
  251. else:
  252. inc pos
  253. else:
  254. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  255. of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF':
  256. var id = ""
  257. while g.buf[pos] in SymChars + {'_'}:
  258. add(id, g.buf[pos])
  259. inc(pos)
  260. if (g.buf[pos] == '\"'):
  261. if (g.buf[pos + 1] == '\"') and (g.buf[pos + 2] == '\"'):
  262. inc(pos, 3)
  263. g.kind = gtLongStringLit
  264. while true:
  265. case g.buf[pos]
  266. of '\0':
  267. break
  268. of '\"':
  269. inc(pos)
  270. if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and
  271. g.buf[pos+2] != '\"':
  272. inc(pos, 2)
  273. break
  274. else: inc(pos)
  275. else:
  276. g.kind = gtRawData
  277. inc(pos)
  278. while not (g.buf[pos] in {'\0', '\n', '\r'}):
  279. if g.buf[pos] == '"' and g.buf[pos+1] != '"': break
  280. inc(pos)
  281. if g.buf[pos] == '\"': inc(pos)
  282. else:
  283. if g.lang == langNim:
  284. g.kind = nimGetKeyword(id)
  285. elif isKeyword(keywords, id) >= 0:
  286. g.kind = gtKeyword
  287. of '0':
  288. inc(pos)
  289. case g.buf[pos]
  290. of 'b', 'B':
  291. g.kind = gtBinNumber
  292. inc(pos)
  293. while g.buf[pos] in binChars: inc(pos)
  294. pos = nimNumberPostfix(g, pos)
  295. of 'x', 'X':
  296. g.kind = gtHexNumber
  297. inc(pos)
  298. while g.buf[pos] in hexChars: inc(pos)
  299. pos = nimNumberPostfix(g, pos)
  300. of 'o', 'O':
  301. g.kind = gtOctNumber
  302. inc(pos)
  303. while g.buf[pos] in octChars: inc(pos)
  304. pos = nimNumberPostfix(g, pos)
  305. else: pos = nimNumber(g, pos)
  306. of '1'..'9':
  307. pos = nimNumber(g, pos)
  308. of '\'':
  309. inc(pos)
  310. if g.kind != gtPunctuation:
  311. g.kind = gtCharLit
  312. while true:
  313. case g.buf[pos]
  314. of '\0', '\r', '\n':
  315. break
  316. of '\'':
  317. inc(pos)
  318. break
  319. of '\\':
  320. inc(pos, 2)
  321. else: inc(pos)
  322. of '\"':
  323. inc(pos)
  324. if (g.buf[pos] == '\"') and (g.buf[pos + 1] == '\"'):
  325. inc(pos, 2)
  326. g.kind = gtLongStringLit
  327. while true:
  328. case g.buf[pos]
  329. of '\0':
  330. break
  331. of '\"':
  332. inc(pos)
  333. if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and
  334. g.buf[pos+2] != '\"':
  335. inc(pos, 2)
  336. break
  337. else: inc(pos)
  338. else:
  339. g.kind = gtStringLit
  340. while true:
  341. case g.buf[pos]
  342. of '\0', '\r', '\n':
  343. break
  344. of '\"':
  345. inc(pos)
  346. break
  347. of '\\':
  348. g.state = g.kind
  349. break
  350. else: inc(pos)
  351. of '(', ')', '[', ']', '{', '}', '`', ':', ',', ';':
  352. inc(pos)
  353. g.kind = gtPunctuation
  354. of '\0':
  355. g.kind = gtEof
  356. else:
  357. if g.buf[pos] in OpChars:
  358. g.kind = gtOperator
  359. while g.buf[pos] in OpChars: inc(pos)
  360. else:
  361. inc(pos)
  362. g.kind = gtNone
  363. g.length = pos - g.pos
  364. if g.kind != gtEof and g.state != gtNone and g.length <= 0:
  365. assert false, "nimNextToken: produced an empty token"
  366. g.pos = pos
  367. proc generalNumber(g: var GeneralTokenizer, position: int): int =
  368. const decChars = {'0'..'9'}
  369. var pos = position
  370. g.kind = gtDecNumber
  371. while g.buf[pos] in decChars: inc(pos)
  372. if g.buf[pos] == '.':
  373. g.kind = gtFloatNumber
  374. inc(pos)
  375. while g.buf[pos] in decChars: inc(pos)
  376. if g.buf[pos] in {'e', 'E'}:
  377. g.kind = gtFloatNumber
  378. inc(pos)
  379. if g.buf[pos] in {'+', '-'}: inc(pos)
  380. while g.buf[pos] in decChars: inc(pos)
  381. result = pos
  382. proc generalStrLit(g: var GeneralTokenizer, position: int): int =
  383. const
  384. decChars = {'0'..'9'}
  385. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  386. var pos = position
  387. g.kind = gtStringLit
  388. var c = g.buf[pos]
  389. inc(pos) # skip " or '
  390. while true:
  391. case g.buf[pos]
  392. of '\0':
  393. break
  394. of '\\':
  395. inc(pos)
  396. case g.buf[pos]
  397. of '\0':
  398. break
  399. of '0'..'9':
  400. while g.buf[pos] in decChars: inc(pos)
  401. of 'x', 'X':
  402. inc(pos)
  403. if g.buf[pos] in hexChars: inc(pos)
  404. if g.buf[pos] in hexChars: inc(pos)
  405. else: inc(pos, 2)
  406. else:
  407. if g.buf[pos] == c:
  408. inc(pos)
  409. break
  410. else:
  411. inc(pos)
  412. result = pos
  413. type
  414. TokenizerFlag = enum
  415. hasPreprocessor, hasNestedComments
  416. TokenizerFlags = set[TokenizerFlag]
  417. proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string],
  418. flags: TokenizerFlags) =
  419. const
  420. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  421. octChars = {'0'..'7'}
  422. binChars = {'0'..'1'}
  423. symChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '\x80'..'\xFF'}
  424. var pos = g.pos
  425. g.start = g.pos
  426. if g.state == gtStringLit:
  427. g.kind = gtStringLit
  428. while true:
  429. case g.buf[pos]
  430. of '\\':
  431. g.kind = gtEscapeSequence
  432. inc(pos)
  433. case g.buf[pos]
  434. of 'x', 'X':
  435. inc(pos)
  436. if g.buf[pos] in hexChars: inc(pos)
  437. if g.buf[pos] in hexChars: inc(pos)
  438. of '0'..'9':
  439. while g.buf[pos] in {'0'..'9'}: inc(pos)
  440. of '\0':
  441. g.state = gtNone
  442. else: inc(pos)
  443. break
  444. of '\0', '\r', '\n':
  445. g.state = gtNone
  446. break
  447. of '\"':
  448. inc(pos)
  449. g.state = gtNone
  450. break
  451. else: inc(pos)
  452. else:
  453. case g.buf[pos]
  454. of ' ', '\t'..'\r':
  455. g.kind = gtWhitespace
  456. while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
  457. of '/':
  458. inc(pos)
  459. if g.buf[pos] == '/':
  460. g.kind = gtComment
  461. while not (g.buf[pos] in {'\0', '\n', '\r'}): inc(pos)
  462. elif g.buf[pos] == '*':
  463. g.kind = gtLongComment
  464. var nested = 0
  465. inc(pos)
  466. while true:
  467. case g.buf[pos]
  468. of '*':
  469. inc(pos)
  470. if g.buf[pos] == '/':
  471. inc(pos)
  472. if nested == 0: break
  473. of '/':
  474. inc(pos)
  475. if g.buf[pos] == '*':
  476. inc(pos)
  477. if hasNestedComments in flags: inc(nested)
  478. of '\0':
  479. break
  480. else: inc(pos)
  481. else:
  482. g.kind = gtOperator
  483. while g.buf[pos] in OpChars: inc(pos)
  484. of '#':
  485. inc(pos)
  486. if hasPreprocessor in flags:
  487. g.kind = gtPreprocessor
  488. while g.buf[pos] in {' ', '\t'}: inc(pos)
  489. while g.buf[pos] in symChars: inc(pos)
  490. else:
  491. g.kind = gtOperator
  492. of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF':
  493. var id = ""
  494. while g.buf[pos] in symChars:
  495. add(id, g.buf[pos])
  496. inc(pos)
  497. if isKeyword(keywords, id) >= 0: g.kind = gtKeyword
  498. else: g.kind = gtIdentifier
  499. of '0':
  500. inc(pos)
  501. case g.buf[pos]
  502. of 'b', 'B':
  503. inc(pos)
  504. while g.buf[pos] in binChars: inc(pos)
  505. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  506. of 'x', 'X':
  507. inc(pos)
  508. while g.buf[pos] in hexChars: inc(pos)
  509. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  510. of '0'..'7':
  511. inc(pos)
  512. while g.buf[pos] in octChars: inc(pos)
  513. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  514. else:
  515. pos = generalNumber(g, pos)
  516. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  517. of '1'..'9':
  518. pos = generalNumber(g, pos)
  519. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  520. of '\'':
  521. pos = generalStrLit(g, pos)
  522. g.kind = gtCharLit
  523. of '\"':
  524. inc(pos)
  525. g.kind = gtStringLit
  526. while true:
  527. case g.buf[pos]
  528. of '\0':
  529. break
  530. of '\"':
  531. inc(pos)
  532. break
  533. of '\\':
  534. g.state = g.kind
  535. break
  536. else: inc(pos)
  537. of '(', ')', '[', ']', '{', '}', ':', ',', ';', '.':
  538. inc(pos)
  539. g.kind = gtPunctuation
  540. of '\0':
  541. g.kind = gtEof
  542. else:
  543. if g.buf[pos] in OpChars:
  544. g.kind = gtOperator
  545. while g.buf[pos] in OpChars: inc(pos)
  546. else:
  547. inc(pos)
  548. g.kind = gtNone
  549. g.length = pos - g.pos
  550. if g.kind != gtEof and g.length <= 0:
  551. assert false, "clikeNextToken: produced an empty token"
  552. g.pos = pos
  553. proc cNextToken(g: var GeneralTokenizer) =
  554. const
  555. keywords: array[0..36, string] = ["_Bool", "_Complex", "_Imaginary", "auto",
  556. "break", "case", "char", "const", "continue", "default", "do", "double",
  557. "else", "enum", "extern", "float", "for", "goto", "if", "inline", "int",
  558. "long", "register", "restrict", "return", "short", "signed", "sizeof",
  559. "static", "struct", "switch", "typedef", "union", "unsigned", "void",
  560. "volatile", "while"]
  561. clikeNextToken(g, keywords, {hasPreprocessor})
  562. proc cppNextToken(g: var GeneralTokenizer) =
  563. const
  564. keywords: array[0..47, string] = ["asm", "auto", "break", "case", "catch",
  565. "char", "class", "const", "continue", "default", "delete", "do", "double",
  566. "else", "enum", "extern", "float", "for", "friend", "goto", "if",
  567. "inline", "int", "long", "new", "operator", "private", "protected",
  568. "public", "register", "return", "short", "signed", "sizeof", "static",
  569. "struct", "switch", "template", "this", "throw", "try", "typedef",
  570. "union", "unsigned", "virtual", "void", "volatile", "while"]
  571. clikeNextToken(g, keywords, {hasPreprocessor})
  572. proc csharpNextToken(g: var GeneralTokenizer) =
  573. const
  574. keywords: array[0..76, string] = ["abstract", "as", "base", "bool", "break",
  575. "byte", "case", "catch", "char", "checked", "class", "const", "continue",
  576. "decimal", "default", "delegate", "do", "double", "else", "enum", "event",
  577. "explicit", "extern", "false", "finally", "fixed", "float", "for",
  578. "foreach", "goto", "if", "implicit", "in", "int", "interface", "internal",
  579. "is", "lock", "long", "namespace", "new", "null", "object", "operator",
  580. "out", "override", "params", "private", "protected", "public", "readonly",
  581. "ref", "return", "sbyte", "sealed", "short", "sizeof", "stackalloc",
  582. "static", "string", "struct", "switch", "this", "throw", "true", "try",
  583. "typeof", "uint", "ulong", "unchecked", "unsafe", "ushort", "using",
  584. "virtual", "void", "volatile", "while"]
  585. clikeNextToken(g, keywords, {hasPreprocessor})
  586. proc javaNextToken(g: var GeneralTokenizer) =
  587. const
  588. keywords: array[0..52, string] = ["abstract", "assert", "boolean", "break",
  589. "byte", "case", "catch", "char", "class", "const", "continue", "default",
  590. "do", "double", "else", "enum", "extends", "false", "final", "finally",
  591. "float", "for", "goto", "if", "implements", "import", "instanceof", "int",
  592. "interface", "long", "native", "new", "null", "package", "private",
  593. "protected", "public", "return", "short", "static", "strictfp", "super",
  594. "switch", "synchronized", "this", "throw", "throws", "transient", "true",
  595. "try", "void", "volatile", "while"]
  596. clikeNextToken(g, keywords, {})
  597. proc yamlPlainStrLit(g: var GeneralTokenizer, pos: var int) =
  598. g.kind = gtStringLit
  599. while g.buf[pos] notin {'\0', '\t'..'\r', ',', ']', '}'}:
  600. if g.buf[pos] == ':' and
  601. g.buf[pos + 1] in {'\0', '\t'..'\r', ' '}:
  602. break
  603. inc(pos)
  604. proc yamlPossibleNumber(g: var GeneralTokenizer, pos: var int) =
  605. g.kind = gtNone
  606. if g.buf[pos] == '-': inc(pos)
  607. if g.buf[pos] == '0': inc(pos)
  608. elif g.buf[pos] in '1'..'9':
  609. inc(pos)
  610. while g.buf[pos] in {'0'..'9'}: inc(pos)
  611. else: yamlPlainStrLit(g, pos)
  612. if g.kind == gtNone:
  613. if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
  614. g.kind = gtDecNumber
  615. elif g.buf[pos] == '.':
  616. inc(pos)
  617. if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
  618. else:
  619. while g.buf[pos] in {'0'..'9'}: inc(pos)
  620. if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
  621. g.kind = gtFloatNumber
  622. if g.kind == gtNone:
  623. if g.buf[pos] in {'e', 'E'}:
  624. inc(pos)
  625. if g.buf[pos] in {'-', '+'}: inc(pos)
  626. if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
  627. else:
  628. while g.buf[pos] in {'0'..'9'}: inc(pos)
  629. if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
  630. g.kind = gtFloatNumber
  631. else: yamlPlainStrLit(g, pos)
  632. else: yamlPlainStrLit(g, pos)
  633. while g.buf[pos] notin {'\0', ',', ']', '}', '\n', '\r'}:
  634. inc(pos)
  635. if g.buf[pos] notin {'\t'..'\r', ' ', ',', ']', '}'}:
  636. yamlPlainStrLit(g, pos)
  637. break
  638. # theoretically, we would need to parse indentation (like with block scalars)
  639. # because of possible multiline flow scalars that start with number-like
  640. # content, but that is far too troublesome. I think it is fine that the
  641. # highlighter is sloppy here.
  642. proc yamlNextToken(g: var GeneralTokenizer) =
  643. const
  644. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  645. var pos = g.pos
  646. g.start = g.pos
  647. if g.state == gtStringLit:
  648. g.kind = gtStringLit
  649. while true:
  650. case g.buf[pos]
  651. of '\\':
  652. if pos != g.pos: break
  653. g.kind = gtEscapeSequence
  654. inc(pos)
  655. case g.buf[pos]
  656. of 'x':
  657. inc(pos)
  658. for i in 1..2:
  659. if g.buf[pos] in hexChars: inc(pos)
  660. break
  661. of 'u':
  662. inc(pos)
  663. for i in 1..4:
  664. if g.buf[pos] in hexChars: inc(pos)
  665. break
  666. of 'U':
  667. inc(pos)
  668. for i in 1..8:
  669. if g.buf[pos] in hexChars: inc(pos)
  670. break
  671. else: inc(pos)
  672. break
  673. of '\0':
  674. g.state = gtOther
  675. break
  676. of '\"':
  677. inc(pos)
  678. g.state = gtOther
  679. break
  680. else: inc(pos)
  681. elif g.state == gtCharLit:
  682. # abusing gtCharLit as single-quoted string lit
  683. g.kind = gtStringLit
  684. inc(pos) # skip the starting '
  685. while true:
  686. case g.buf[pos]
  687. of '\'':
  688. inc(pos)
  689. if g.buf[pos] == '\'':
  690. inc(pos)
  691. g.kind = gtEscapeSequence
  692. else: g.state = gtOther
  693. break
  694. else: inc(pos)
  695. elif g.state == gtCommand:
  696. # gtCommand means 'block scalar header'
  697. case g.buf[pos]
  698. of ' ', '\t':
  699. g.kind = gtWhitespace
  700. while g.buf[pos] in {' ', '\t'}: inc(pos)
  701. of '#':
  702. g.kind = gtComment
  703. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  704. of '\n', '\r': discard
  705. else:
  706. # illegal here. just don't parse a block scalar
  707. g.kind = gtNone
  708. g.state = gtOther
  709. if g.buf[pos] in {'\n', '\r'} and g.state == gtCommand:
  710. g.state = gtLongStringLit
  711. elif g.state == gtLongStringLit:
  712. # beware, this is the only token where we actually have to parse
  713. # indentation.
  714. g.kind = gtLongStringLit
  715. # first, we have to find the parent indentation of the block scalar, so that
  716. # we know when to stop
  717. assert g.buf[pos] in {'\n', '\r'}
  718. var lookbehind = pos - 1
  719. var headerStart = -1
  720. while lookbehind >= 0 and g.buf[lookbehind] notin {'\n', '\r'}:
  721. if headerStart == -1 and g.buf[lookbehind] in {'|', '>'}:
  722. headerStart = lookbehind
  723. dec(lookbehind)
  724. assert headerStart != -1
  725. var indentation = 1
  726. while g.buf[lookbehind + indentation] == ' ': inc(indentation)
  727. if g.buf[lookbehind + indentation] in {'|', '>'}:
  728. # when the header is alone in a line, this line does not show the parent's
  729. # indentation, so we must go further. search the first previous line with
  730. # non-whitespace content.
  731. while lookbehind >= 0 and g.buf[lookbehind] in {'\n', '\r'}:
  732. dec(lookbehind)
  733. while lookbehind >= 0 and
  734. g.buf[lookbehind] in {' ', '\t'}: dec(lookbehind)
  735. # now, find the beginning of the line...
  736. while lookbehind >= 0 and g.buf[lookbehind] notin {'\n', '\r'}:
  737. dec(lookbehind)
  738. # ... and its indentation
  739. indentation = 1
  740. while g.buf[lookbehind + indentation] == ' ': inc(indentation)
  741. if lookbehind == -1: indentation = 0 # top level
  742. elif g.buf[lookbehind + 1] == '-' and g.buf[lookbehind + 2] == '-' and
  743. g.buf[lookbehind + 3] == '-' and
  744. g.buf[lookbehind + 4] in {'\t'..'\r', ' '}:
  745. # this is a document start, therefore, we are at top level
  746. indentation = 0
  747. # because lookbehind was at newline char when calculating indentation, we're
  748. # off by one. fix that. top level's parent will have indentation of -1.
  749. let parentIndentation = indentation - 1
  750. # find first content
  751. while g.buf[pos] in {' ', '\n', '\r'}:
  752. if g.buf[pos] == ' ': inc(indentation)
  753. else: indentation = 0
  754. inc(pos)
  755. var minIndentation = indentation
  756. # for stupid edge cases, we must check whether an explicit indentation depth
  757. # is given at the header.
  758. while g.buf[headerStart] in {'>', '|', '+', '-'}: inc(headerStart)
  759. if g.buf[headerStart] in {'0'..'9'}:
  760. minIndentation = min(minIndentation, ord(g.buf[headerStart]) - ord('0'))
  761. # process content lines
  762. while indentation > parentIndentation and g.buf[pos] != '\0':
  763. if (indentation < minIndentation and g.buf[pos] == '#') or
  764. (indentation == 0 and g.buf[pos] == '.' and g.buf[pos + 1] == '.' and
  765. g.buf[pos + 2] == '.' and
  766. g.buf[pos + 3] in {'\0', '\t'..'\r', ' '}):
  767. # comment after end of block scalar, or end of document
  768. break
  769. minIndentation = min(indentation, minIndentation)
  770. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  771. while g.buf[pos] in {' ', '\n', '\r'}:
  772. if g.buf[pos] == ' ': inc(indentation)
  773. else: indentation = 0
  774. inc(pos)
  775. g.state = gtOther
  776. elif g.state == gtOther:
  777. # gtOther means 'inside YAML document'
  778. case g.buf[pos]
  779. of ' ', '\t'..'\r':
  780. g.kind = gtWhitespace
  781. while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
  782. of '#':
  783. g.kind = gtComment
  784. inc(pos)
  785. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  786. of '-':
  787. inc(pos)
  788. if g.buf[pos] in {'\0', ' ', '\t'..'\r'}:
  789. g.kind = gtPunctuation
  790. elif g.buf[pos] == '-' and
  791. (pos == 1 or g.buf[pos - 2] in {'\n', '\r'}): # start of line
  792. inc(pos)
  793. if g.buf[pos] == '-' and g.buf[pos + 1] in {'\0', '\t'..'\r', ' '}:
  794. inc(pos)
  795. g.kind = gtKeyword
  796. else: yamlPossibleNumber(g, pos)
  797. else: yamlPossibleNumber(g, pos)
  798. of '.':
  799. if pos == 0 or g.buf[pos - 1] in {'\n', '\r'}:
  800. inc(pos)
  801. for i in 1..2:
  802. if g.buf[pos] != '.': break
  803. inc(pos)
  804. if pos == g.start + 3:
  805. g.kind = gtKeyword
  806. g.state = gtNone
  807. else: yamlPlainStrLit(g, pos)
  808. else: yamlPlainStrLit(g, pos)
  809. of '?':
  810. inc(pos)
  811. if g.buf[pos] in {'\0', ' ', '\t'..'\r'}:
  812. g.kind = gtPunctuation
  813. else: yamlPlainStrLit(g, pos)
  814. of ':':
  815. inc(pos)
  816. if g.buf[pos] in {'\0', '\t'..'\r', ' ', '\'', '\"'} or
  817. (pos > 0 and g.buf[pos - 2] in {'}', ']', '\"', '\''}):
  818. g.kind = gtPunctuation
  819. else: yamlPlainStrLit(g, pos)
  820. of '[', ']', '{', '}', ',':
  821. inc(pos)
  822. g.kind = gtPunctuation
  823. of '\"':
  824. inc(pos)
  825. g.state = gtStringLit
  826. g.kind = gtStringLit
  827. of '\'':
  828. g.state = gtCharLit
  829. g.kind = gtNone
  830. of '!':
  831. g.kind = gtTagStart
  832. inc(pos)
  833. if g.buf[pos] == '<':
  834. # literal tag (e.g. `!<tag:yaml.org,2002:str>`)
  835. while g.buf[pos] notin {'\0', '>', '\t'..'\r', ' '}: inc(pos)
  836. if g.buf[pos] == '>': inc(pos)
  837. else:
  838. while g.buf[pos] in {'A'..'Z', 'a'..'z', '0'..'9', '-'}: inc(pos)
  839. case g.buf[pos]
  840. of '!':
  841. # prefixed tag (e.g. `!!str`)
  842. inc(pos)
  843. while g.buf[pos] notin
  844. {'\0', '\t'..'\r', ' ', ',', '[', ']', '{', '}'}: inc(pos)
  845. of '\0', '\t'..'\r', ' ': discard
  846. else:
  847. # local tag (e.g. `!nim:system:int`)
  848. while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
  849. of '&':
  850. g.kind = gtLabel
  851. while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
  852. of '*':
  853. g.kind = gtReference
  854. while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
  855. of '|', '>':
  856. # this can lead to incorrect tokenization when | or > appear inside flow
  857. # content. checking whether we're inside flow content is not
  858. # chomsky type-3, so we won't do that here.
  859. g.kind = gtCommand
  860. g.state = gtCommand
  861. inc(pos)
  862. while g.buf[pos] in {'0'..'9', '+', '-'}: inc(pos)
  863. of '0'..'9': yamlPossibleNumber(g, pos)
  864. of '\0': g.kind = gtEof
  865. else: yamlPlainStrLit(g, pos)
  866. else:
  867. # outside document
  868. case g.buf[pos]
  869. of '%':
  870. if pos == 0 or g.buf[pos - 1] in {'\n', '\r'}:
  871. g.kind = gtDirective
  872. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  873. else:
  874. g.state = gtOther
  875. yamlPlainStrLit(g, pos)
  876. of ' ', '\t'..'\r':
  877. g.kind = gtWhitespace
  878. while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
  879. of '#':
  880. g.kind = gtComment
  881. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  882. of '\0': g.kind = gtEof
  883. else:
  884. g.kind = gtNone
  885. g.state = gtOther
  886. g.length = pos - g.pos
  887. g.pos = pos
  888. proc pythonNextToken(g: var GeneralTokenizer) =
  889. const
  890. keywords: array[0..34, string] = [
  891. "False", "None", "True", "and", "as", "assert", "async", "await",
  892. "break", "class", "continue", "def", "del", "elif", "else", "except",
  893. "finally", "for", "from", "global", "if", "import", "in", "is", "lambda",
  894. "nonlocal", "not", "or", "pass", "raise", "return", "try", "while",
  895. "with", "yield"]
  896. nimNextToken(g, keywords)
  897. proc cmdNextToken(g: var GeneralTokenizer, dollarPrompt = false) =
  898. var pos = g.pos
  899. g.start = g.pos
  900. if g.state == low(TokenClass):
  901. g.state = if dollarPrompt: gtPrompt else: gtProgram
  902. case g.buf[pos]
  903. of ' ', '\t'..'\r':
  904. g.kind = gtWhitespace
  905. while g.buf[pos] in {' ', '\t'..'\r'}:
  906. if g.buf[pos] == '\n':
  907. g.state = if dollarPrompt: gtPrompt else: gtProgram
  908. inc(pos)
  909. of '\'', '"':
  910. g.kind = gtOption
  911. let q = g.buf[pos]
  912. inc(pos)
  913. while g.buf[pos] notin {q, '\0'}:
  914. inc(pos)
  915. if g.buf[pos] == q: inc(pos)
  916. of '#':
  917. g.kind = gtComment
  918. while g.buf[pos] notin {'\n', '\0'}:
  919. inc(pos)
  920. of '&', '|':
  921. g.kind = gtOperator
  922. inc(pos)
  923. if g.buf[pos] == g.buf[pos-1]: inc(pos)
  924. g.state = gtProgram
  925. of '(':
  926. g.kind = gtOperator
  927. g.state = gtProgram
  928. inc(pos)
  929. of ')':
  930. g.kind = gtOperator
  931. inc(pos)
  932. of ';':
  933. g.state = gtProgram
  934. g.kind = gtOperator
  935. inc(pos)
  936. of '\0': g.kind = gtEof
  937. elif dollarPrompt and g.state == gtPrompt:
  938. if g.buf[pos] == '$' and g.buf[pos+1] in {' ', '\t'}:
  939. g.kind = gtPrompt
  940. inc pos, 2
  941. g.state = gtProgram
  942. else:
  943. g.kind = gtProgramOutput
  944. while g.buf[pos] notin {'\n', '\0'}:
  945. inc(pos)
  946. else:
  947. if g.state == gtProgram:
  948. g.kind = gtProgram
  949. g.state = gtOption
  950. else:
  951. g.kind = gtOption
  952. while g.buf[pos] notin {' ', '\t'..'\r', '&', '|', '(', ')', '\'', '"', '\0'}:
  953. if g.buf[pos] == ';' and g.buf[pos+1] == ' ':
  954. # (check space because ';' can be used inside arguments in Win bat)
  955. break
  956. if g.kind == gtOption and g.buf[pos] in {'/', '\\', '.'}:
  957. g.kind = gtIdentifier # for file/dir name
  958. elif g.kind == gtProgram and g.buf[pos] == '=':
  959. g.kind = gtIdentifier # for env variable setting at beginning of line
  960. g.state = gtProgram
  961. inc(pos)
  962. g.length = pos - g.pos
  963. g.pos = pos
  964. proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
  965. g.lang = lang
  966. case lang
  967. of langNone: assert false
  968. of langNim: nimNextToken(g)
  969. of langCpp: cppNextToken(g)
  970. of langCsharp: csharpNextToken(g)
  971. of langC: cNextToken(g)
  972. of langJava: javaNextToken(g)
  973. of langYaml: yamlNextToken(g)
  974. of langPython: pythonNextToken(g)
  975. of langCmd: cmdNextToken(g)
  976. of langConsole: cmdNextToken(g, dollarPrompt=true)
  977. proc tokenize*(text: string, lang: SourceLanguage): seq[(string, TokenClass)] =
  978. var g: GeneralTokenizer
  979. initGeneralTokenizer(g, text)
  980. var prevPos = 0
  981. while true:
  982. getNextToken(g, lang)
  983. if g.kind == gtEof:
  984. break
  985. var s = text[prevPos ..< g.pos]
  986. result.add (s, g.kind)
  987. prevPos = g.pos
  988. when isMainModule:
  989. var keywords: seq[string]
  990. # Try to work running in both the subdir or at the root.
  991. for filename in ["doc/keywords.txt", "../../../doc/keywords.txt"]:
  992. try:
  993. let input = readFile(filename)
  994. keywords = input.splitWhitespace()
  995. break
  996. except:
  997. echo filename, " not found"
  998. doAssert(keywords.len > 0, "Couldn't read any keywords.txt file!")
  999. for i in 0..min(keywords.len, nimKeywords.len)-1:
  1000. doAssert keywords[i] == nimKeywords[i], "Unexpected keyword"
  1001. doAssert keywords.len == nimKeywords.len, "No matching lengths"