discard """
output: '''Match failed: spam
Match failed: ham'''
joinable: false
"""
# bug #6220
# Test-runner spec above: the two "Match failed" lines are echoed by the
# two deliberately unmatchable inputs parsed at the bottom of this file
# ("spam" against the recursive grammar, "ham" against the repetition one).

import nre
import options
import strutils except isAlpha, isLower, isUpper, isSpace
from unicode import isAlpha, isLower, isUpper, isTitle, isWhiteSpace
import os

# Compile-time trace switch consumed by the debug() template below.
const debugLex = false
# Conditionally echo trace text. `enable` is evaluated with `when`, so it
# must be a compile-time constant and the echo is compiled away entirely
# when tracing is disabled (debugLex = false).
template debug(enable: bool, text: string): typed =
  when enable:
    echo(text)
type
  # A parser closure consumes `text` starting at index `start`, appends
  # any matched nodes to `nodes`, and returns the number of symbols
  # consumed, or -1 on failure.
  Parser[N, T] = proc(text: T, start: int, nodes: var seq[Node[N]]): int {.closure.}
  RuleObj[N, T] = object
    parser: Parser[N, T]   # may be nil until bound via `->`
    kind: N                # node kind this rule tags its matches with
  Rule[N, T] = ref RuleObj[N, T]
  NodeKind = enum
    terminal,
    nonterminal
  # A match result: a half-open span [start, start+length) over the input,
  # tagged with a user-supplied kind N.
  Node*[N] = object of RootObj
    # Uncomment the following lines and the compiler crashes
    # case nodeKind: NodeKind
    # of nonterminal:
    # kids: Node[N]
    # of terminal:
    # discard
    start*: int
    length*: int
    kind*: N
  NonTerminal[N] = object of Node
    children: seq[Node[N]]
  37. proc newRule[N, T](parser: Parser, kind: N): Rule[N, T] =
  38. new(result)
  39. result.parser = parser
  40. result.kind = kind
  41. proc newRule[N, T](kind: N): Rule[N, T] =
  42. new(result)
  43. result.kind = kind
  44. proc initNode[N](start: int, length: int, kind: N): Node[N] =
  45. result.start = start
  46. result.length = length
  47. result.kind = kind
  48. proc initNode[N](start: int, length: int, children: seq[Node[N]], kind: N): NonTerminal[N] =
  49. result.start = start
  50. result.length = length
  51. result.kind = kind
  52. result.children = children
  53. proc substr[T](text: T, first, last: int): T =
  54. text[first .. last]
  55. proc continuesWith[N](text: seq[Node[N]], subtext: seq[N], start: Natural): bool =
  56. let length = len(text)
  57. var pos = 0
  58. while pos < len(subtext):
  59. let textpos = start + pos
  60. if textpos == len(text):
  61. return false
  62. if text[textpos].kind != subtext[pos].kind:
  63. return false
  64. pos+=1
  65. return true
  66. proc render*[N, T](text: T, nodes: seq[Node[N]]): string =
  67. ## Uses a sequence of Nodes to render a given text string
  68. result = ""
  69. for node in nodes:
  70. result.add("<" & node.value(text) & ">")
  71. proc render*[N, T](rule: Rule[N, T], text: string): string =
  72. ## Uses a rule to render a given text string
  73. render(text, rule.parse(text))
  74. proc render*[N, T](text: T, nodes: seq[Node[N]], source: string): string =
  75. result = ""
  76. for node in nodes:
  77. result.add("[" & node.value(text, source) & "]")
  78. proc render*[N, T, X](rule: Rule[N, T], text: seq[Node[X]], source: string): string =
  79. ## Uses a rule to render a given series of nodes, providing the source string
  80. text.render(rule.parse(text, source = source), source)
  81. proc annotate*[N, T](node: Node[N], text: T): string =
  82. result = "<" & node.value(text) & ":" & $node.kind & ">"
  83. proc annotate*[N, T](nodes: seq[Node[N]], text: T): string =
  84. result = ""
  85. for node in nodes:
  86. result.add(node.annotate(text))
  87. proc annotate*[N, T](rule: Rule[N, T], text: T): string =
  88. annotate(rule.parse(text), text)
  89. proc value*[N, T](node: Node[N], text: T): string =
  90. result = $text.substr(node.start, node.start + node.length - 1)
  91. proc value*[N, X](node: Node[N], text: seq[Node[X]], source: string): string =
  92. result = ""
  93. for n in node.start ..< node.start + node.length:
  94. result &= text[n].annotate(source)
proc parse*[N, T](rule: Rule[N, T], text: T, start = 0, source: string = ""): seq[Node[N]] =
  ## Runs `rule` over `text` from `start` and returns the matched nodes.
  ## Echoes a diagnostic on failure or partial match; the exact
  ## "Match failed: ..." strings are part of this test's expected output.
  result = newSeq[Node[N]]()
  debug(debugLex, "Parsing: " & $text)
  let length = rule.parser(text, start, result)
  when T is string:
    if length == -1:
      echo("Match failed: " & $text)
      result = @[]
    elif length == len(text):
      debug(debugLex, "Matched: " & $text & " => " & $len(result) & " tokens: " & text.render(result))
    else:
      echo("Matched first " & $length & " symbols: " & $text & " => " & $len(result) & " tokens: " & text.render(result))
  else:
    # Non-string inputs (e.g. node sequences) render against `source`.
    if length == -1:
      echo("Match failed: " & $text)
      result = @[]
    elif length == len(text):
      debug(debugLex, "Matched: " & $text & " => " & $len(result) & " tokens: " & text.render(result, source))
    else:
      echo("Matched first " & $length & " symbols: " & $text & " => " & $len(result) & " tokens: " & text.render(result, source))
proc literal*[N, T, P](pattern: P, kind: N): Rule[N, T] =
  ## Rule matching a literal `pattern` at the current position.
  ## P may be a string/seq[N] (multi-symbol match), a char, or — in the
  ## fallback branch — the node-kind type itself.
  let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
    if start == len(text):
      return -1
    doAssert(len(text)>start, "Attempting to match at $#, string length is $# " % [$start, $len(text)])
    when P is string or P is seq[N]:
      debug(debugLex, "Literal[" & $kind & "]: testing " & $pattern & " at " & $start & ": " & $text[start..start+len(pattern)-1])
      if text.continuesWith(pattern, start):
        let node = initNode(start, len(pattern), kind)
        nodes.add(node)
        debug(debugLex, "Literal: matched <" & $text[start ..< start+node.length] & ":" & $node.length & ">" )
        return node.length
    elif P is char:
      debug(debugLex, "Literal[" & $kind & "]: testing " & $pattern & " at " & $start & ": " & $text[start])
      if text[start] == pattern:
        let node = initNode(start, 1, kind)
        nodes.add(node)
        return 1
    else:
      # Fallback: match a single node whose kind equals `pattern`.
      debug(debugLex, "Literal[" & $kind & "]: testing " & $pattern & " at " & $start & ": " & $text[start])
      if text[start].kind == pattern:
        let node = initNode(start, 1, kind)
        nodes.add(node)
        return 1
    return -1
  result = newRule[N, T](parser, kind)
proc token[N, T](pattern: T, kind: N): Rule[N, T] =
  ## Rule matching the nre regular expression `pattern` at `start`.
  ## Only meaningful for string input; other T is a compile-time error.
  when T is not string:
    {.fatal: "Token is only supported for strings".}
  let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
    debug(debugLex, "Token[" & $kind & "]: testing " & pattern & " at " & $start)
    if start == len(text):
      return -1
    doAssert(len(text)>start, "Attempting to match at $#, string length is $# " % [$start, $len(text)])
    # nre.match anchors the pattern at `start`.
    let m = text.match(re(pattern), start)
    if m.isSome:
      let node = initNode(start, len(m.get.match), kind)
      nodes.add(node)
      result = node.length
      debug(debugLex, "Token: matched <" & text[start ..< start+node.length] & ":" & $node.length & ">" )
    else:
      result = -1
  result = newRule[N, T](parser, kind)
  158. proc chartest[N, T, S](testfunc: proc(s: S): bool, kind: N): Rule[N, T] =
  159. let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
  160. if start == len(text):
  161. return -1
  162. doAssert(len(text)>start, "Attempting to match at $#, string length is $# " % [$start, $len(text)])
  163. if testfunc(text[start]):
  164. nodes.add(initNode(start, 1, kind))
  165. result = 1
  166. else:
  167. result = -1
  168. result = newRule[N, T](parser, kind)
proc any*[N, T, S](symbols: T, kind: N): Rule[N, T] =
  ## Rule matching any single symbol contained in `symbols`.
  let test = proc(s: S): bool =
    when S is string:
      # Escape newlines so the trace output stays on one line.
      debug(debugLex, "Any[" & $kind & "]: testing for " & symbols.replace("\n", "\\n").replace("\r", "\\r"))
    else:
      debug(debugLex, "Any[" & $kind & "]: testing for " & $symbols)
    result = s in symbols
  result = chartest[N, T, S](test, kind)
  177. proc ignore*[N, T](rule: Rule[N, T]): Rule[N, T] =
  178. let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
  179. var mynodes = newSeq[Node[N]]()
  180. result = rule.parser(text, start, mynodes)
  181. result = newRule[N, T](parser, rule.kind)
  182. proc combine*[N, T](rule: Rule[N, T], kind: N): Rule[N, T] =
  183. let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
  184. var mynodes = newSeq[Node[N]]()
  185. result = rule.parser(text, start, mynodes)
  186. nodes.add(initNode(start, result, kind))
  187. result = newRule[N, T](parser, kind)
  188. proc build*[N, T](rule: Rule[N, T], kind: N): Rule[N, T] =
  189. let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
  190. var mynodes = newSeq[Node[N]]()
  191. result = rule.parser(text, start, mynodes)
  192. let nonTerminal = initNode(start, result, mynodes, kind)
  193. nodes.add(nonTerminal)
  194. result = newRule[N, T](parser, kind)
proc fail*[N, T](message: string, kind: N): Rule[N, T] =
  ## Rule that never matches: it locates the failure position (line and
  ## column), echoes the offending line with a caret underneath, and
  ## raises a ValueError carrying `message`.
  let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
    let lineno = countLines(text[0..start])
    var startline = start
    var endline = start
    # Scan backwards/forwards from `start` to the enclosing line breaks.
    while startline>0:
      if text[startline] in NewLines:
        break
      startline-=1
    while endline < len(text):
      if text[endline] in NewLines:
        break
      endline+=1
    let charno = start-startline
    echo text.substr(startline, endline)
    echo ' '.repeat(max(charno,0)) & '^'
    raise newException(ValueError, "Position: " & $start & " Line: " & $lineno & ", Symbol: " & $charno & ": " & message)
  result = newRule[N, T](parser, kind)
  213. proc `+`*[N, T](left: Rule[N, T], right: Rule[N, T]): Rule[N, T] =
  214. let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
  215. var mynodes = newSeq[Node[N]]()
  216. doAssert(not isNil(left.parser), "Left hand side parser is nil")
  217. let leftlength = left.parser(text, start, mynodes)
  218. if leftlength == -1:
  219. return leftlength
  220. doAssert(not isNil(right.parser), "Right hand side parser is nil")
  221. let rightlength = right.parser(text, start+leftlength, mynodes)
  222. if rightlength == -1:
  223. return rightlength
  224. result = leftlength + rightlength
  225. nodes.add(mynodes)
  226. result = newRule[N, T](parser, left.kind)
  227. proc `/`*[N, T](left: Rule[N, T], right: Rule[N, T]): Rule[N, T] =
  228. let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
  229. var mynodes = newSeq[Node[N]]()
  230. doAssert(not isNil(left.parser), "Left hand side of / is not fully defined")
  231. let leftlength = left.parser(text, start, mynodes)
  232. if leftlength != -1:
  233. nodes.add(mynodes)
  234. return leftlength
  235. mynodes = newSeq[Node[N]]()
  236. doAssert(not isNil(right.parser), "Right hand side of / is not fully defined")
  237. let rightlength = right.parser(text, start, mynodes)
  238. if rightlength == -1:
  239. return rightlength
  240. nodes.add(mynodes)
  241. return rightlength
  242. result = newRule[N, T](parser, left.kind)
  243. proc `?`*[N, T](rule: Rule[N, T]): Rule[N, T] =
  244. let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
  245. let success = rule.parser(text, start, nodes)
  246. return if success != -1: success else: 0
  247. result = newRule[N, T](parser, rule.kind)
proc `+`*[N, T](rule: Rule[N, T]): Rule[N, T] =
  ## One-or-more: `rule` must match at least once, then repeats greedily.
  let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
    var success = rule.parser(text, start, nodes)
    if success == -1:
      return success
    var total = 0
    # Accumulate consecutive matches until a failure or end of input.
    while success != -1 and start+total < len(text):
      total += success
      success = rule.parser(text, start+total, nodes)
    return total
  result = newRule[N, T](parser, rule.kind)
  259. proc `*`*[N, T](rule: Rule[N, T]): Rule[N, T] =
  260. let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
  261. let success = (+rule).parser(text, start, nodes)
  262. return if success != -1: success else: 0
  263. result = newRule[N, T](parser, rule.kind)
  264. #Note: this consumes - for zero-width lookahead see !
  265. proc `^`*[N, T](rule: Rule[N, T]): Rule[N, T] =
  266. let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
  267. var mynodes = newSeq[Node[N]]()
  268. let success = rule.parser(text, start, mynodes)
  269. return if success == -1: 1 else: -1
  270. result = newRule[N, T](parser, rule.kind)
  271. proc `*`*[N, T](repetitions: int, rule: Rule[N, T]): Rule[N, T] =
  272. let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
  273. var mynodes = newSeq[Node[N]]()
  274. var total = 0
  275. for i in 0..<repetitions:
  276. let success = rule.parser(text, start+total, mynodes)
  277. if success == -1:
  278. return success
  279. else:
  280. total += success
  281. nodes.add(mynodes)
  282. return total
  283. result = newRule[N, T](parser, rule.kind)
  284. # Positive zero-width lookahead
  285. proc `&`*[N, T](rule: Rule[N, T]): Rule[N, T] =
  286. let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
  287. var mynodes = newSeq[Node[N]]()
  288. let success = rule.parser(text, start, mynodes)
  289. return if success != -1: 0 else: -1
  290. result = newRule[N, T](parser, rule.kind)
  291. # Negative zero-width lookahead
  292. proc `!`*[N, T](rule: Rule[N, T]): Rule[N, T] =
  293. let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
  294. var mynodes = newSeq[Node[N]]()
  295. let failure = rule.parser(text, start, mynodes)
  296. return if failure == -1: 0 else: -1
  297. result = newRule[N, T](parser, rule.kind)
proc `/`*[N, T](rule: Rule[N, T]): Rule[N, T] =
  ## Skip-until: advances one symbol at a time until `rule` matches,
  ## emitting a node covering the skipped prefix followed by the match's
  ## own nodes; fails if the end of input is reached first.
  let parser = proc (text: T, start: int, nodes: var seq[Node[N]]): int =
    var mynodes = newSeq[Node[N]]()
    var length = 0
    var success = rule.parser(text, start+length, mynodes)
    while success == -1 and start+length < len(text):
      length += 1
      success = rule.parser(text, start+length, mynodes)
    if start+length >= len(text):
      result = -1
    else:
      # One node for the skipped run, then the nodes of the match itself.
      nodes.add(initNode(start, length, rule.kind))
      nodes.add(mynodes)
      result = length + success
  result = newRule[N, T](parser, rule.kind)
proc `->`*(rule: Rule, production: Rule) =
  ## Binds a forward-declared rule to its production by copying the
  ## production's parser; this is what makes recursive grammars possible.
  doAssert(not isnil(production.parser), "Right hand side of -> is nil - has the rule been defined yet?")
  rule.parser = production.parser
# DSL entry point: instantiates the combinator library for a concrete
# node-kind enum, text type and symbol type, injecting convenience
# constructors and common character-class rules into `code`'s scope.
template grammar*[K](Kind, Text, Symbol: typedesc; default: K, code: untyped): typed {.hint[XDeclaredButNotUsed]: off.} =
  proc newRule(): Rule[Kind, Text] {.inject.} = newRule[Kind, Text](default)
  proc chartest(testfunc: proc(c: Symbol): bool): Rule[Kind, Text] {.inject.} = chartest[Kind, Text, Symbol](testfunc, default)
  proc literal[P](pattern: P, kind: K): Rule[Kind, Text] {.inject.} = literal[Kind, Text, P](pattern, kind)
  proc literal[P](pattern: P): Rule[Kind, Text] {.inject.} = literal[Kind, Text, P](pattern, default)
  when Text is string:
    # Regex tokens are only available for string grammars.
    proc token(pattern: string): Rule[Kind, Text] {.inject.} = token(pattern, default)
  proc fail(message: string): Rule[Kind, Text] {.inject.} = fail[Kind, Text](message, default)
  # Single-symbol character classes.
  let alpha {.inject.} = chartest[Kind, Text, Symbol](isAlphaAscii, default)
  let alphanumeric {.inject.}= chartest[Kind, Text, Symbol](isAlphaNumeric, default)
  let digit {.inject.} = chartest[Kind, Text, Symbol](isDigit, default)
  let lower {.inject.} = chartest[Kind, Text, Symbol](isLowerAscii, default)
  let upper {.inject.} = chartest[Kind, Text, Symbol](isUpperAscii, default)
  let isspace = proc (x: char): bool = x.isSpaceAscii and not (x in NewLines)
  let space {.inject.} = chartest[Kind, Text, Symbol](isspace, default)
  let isnewline = proc (x: char): bool = x in NewLines
  let newline {.inject.} = chartest[Kind, Text, Symbol](isnewline, default)
  # One-or-more runs of each class, collapsed to a single node.
  let alphas {.inject.} = combine(+alpha, default)
  let alphanumerics {.inject.} = combine(+alphanumeric, default)
  let digits {.inject.} = combine(+digit, default)
  let lowers {.inject.} = combine(+lower, default)
  let uppers {.inject.} = combine(+upper, default)
  let spaces {.inject.} = combine(+space, default)
  let newlines {.inject.} = combine(+newline, default)
  proc any(chars: Text): Rule[Kind, Text] {.inject.} = any[Kind, Text, Symbol](chars, default)
  proc combine(rule: Rule[Kind, Text]): Rule[Kind, Text] {.inject.} = combine[Kind, Text](rule, default)
  code
# Convenience overload for string-based grammars (Text=string, Symbol=char).
template grammar*[K](Kind: typedesc; default: K, code: untyped): typed {.hint[XDeclaredButNotUsed]: off.} =
  grammar(Kind, string, char, default, code)
block:
  type DummyKind = enum dkDefault
  grammar(DummyKind, string, char, dkDefault):
    # "ham beans" matches the full token/literal sequence.
    let rule = token("h[a]+m") + ignore(token(r"\s+")) + (literal("eggs") / literal("beans"))
    var text = "ham beans"
    discard rule.parse(text)
    # Recursive grammar: digits nested in balanced parens. "spam" fails,
    # producing the first expected "Match failed: spam" output line.
    var recursive = newRule()
    recursive -> (literal("(") + recursive + literal(")")) / token(r"\d+")
    for test in ["spam", "57", "(25)", "((25))"]:
      discard recursive.parse(test)
    # Repetition operators; "ham" fails because +literal("spam") needs at
    # least one "spam", producing the second expected output line.
    let repeated = +literal("spam") + ?literal("ham") + *literal("salami")
    for test in ["ham", "spam", "spamspamspam", "spamham", "spamsalami", "spamsalamisalami"]:
      discard repeated.parse(test)