parser.go 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. package parser
  2. import "fmt"
  3. import "kumachan/lang/source"
  4. import "kumachan/lang/textual/cst"
  5. import "kumachan/lang/textual/syntax"
  6. import "kumachan/lang/textual/scanner"
  7. /**
  8. * Universal First-Fit Parser
  9. *
  10. * This parser can parse some Unicode text into a Concrete Syntax Tree
  11. * according to the syntax defined in the `syntax` package.
  12. * In this file, the `buildTree()` function describes the
  13. * core logic of the parser; the `Parse()` function is intended
  14. * to be used outside the package as an API.
  15. * Note that the CST generated by this parser is still primitive so that
  16. * further transformation of the CST (into AST) is recommended.
  17. */
  18. func buildTree(root syntax.Id, tokens scanner.Tokens) ([]cst.TreeNode, *Error) {
  19. /**
  20. * This function performs a top-down derivation on the tokens.
  21. *
  22. * A "stack" (considered as a raw CST at same time) is used
  23. * to record state, instead of a recursion procedure.
  24. * Although this function does not contain too much code,
  25. * its logic is extraordinarily complicated.
  26. * Therefore, be careful when modifying the following code.
  27. */
  28. var NameId = syntax.NameTokenId()
  29. var RootId = root
  30. var RootPart = syntax.Part {
  31. Id: RootId,
  32. PartType: syntax.Recursive,
  33. Required: true,
  34. }
  35. var ZeroSpan = scanner.Span { Start: 0, End: 0 }
  36. // prepare the stack (tree), push the root node
  37. var tree = make([]cst.TreeNode, 0)
  38. tree = append(tree, cst.TreeNode {
  39. Part: RootPart, Parent: -1,
  40. Length: 0, Status: cst.Initial,
  41. Tried: 0, Index: 0,
  42. Pos: 0, Amount: 0,
  43. Span: ZeroSpan,
  44. })
  45. // set node pointer (current index) to zero and start looping
  46. var ptr = 0
  47. loop: for {
  48. var node = &tree[ptr]
  49. var id = node.Part.Id
  50. var part_type = node.Part.PartType
  51. switch part_type {
  52. case syntax.Recursive:
  53. if node.Status == cst.Initial {
  54. node.Status = cst.BranchFailed
  55. }
  56. // derivation through a branch failed
  57. if node.Status == cst.BranchFailed {
  58. var rule = syntax.Id2Rule(id)
  59. var num_branches = len(rule.Branches)
  60. // check if all branches have been tried
  61. if node.Tried == num_branches {
  62. // if tried, switch to a final status
  63. if rule.Nullable {
  64. node.Status = cst.Success
  65. node.Length = 0
  66. node.Span = ZeroSpan
  67. } else {
  68. node.Status = cst.Failed
  69. }
  70. }
  71. }
  72. case syntax.MatchToken:
  73. if node.Pos >= len(tokens) { node.Status = cst.Failed; break }
  74. var token_id = tokens[node.Pos].Id
  75. if token_id == id {
  76. node.Status = cst.Success
  77. node.Amount = 1
  78. node.Span = tokens[node.Pos].Span
  79. } else {
  80. node.Status = cst.Failed
  81. }
  82. case syntax.MatchKeyword:
  83. if node.Pos >= len(tokens) { node.Status = cst.Failed; break }
  84. var token = tokens[node.Pos]
  85. var token_id = token.Id
  86. if token_id != NameId { node.Status = cst.Failed; break }
  87. var text = token.Content
  88. var keyword = syntax.Id2ConditionalKeyword(id)
  89. if len(text) != len(keyword) { node.Status = cst.Failed; break }
  90. var equal = true
  91. for i, char := range keyword {
  92. if char != rune(text[i]) {
  93. equal = false
  94. break
  95. }
  96. }
  97. if equal {
  98. node.Status = cst.Success
  99. node.Amount = 1
  100. node.Span = token.Span
  101. } else {
  102. node.Status = cst.Failed
  103. }
  104. default:
  105. panic("invalid part type")
  106. }
  107. // if node is in final status
  108. if node.Status == cst.Success || node.Status == cst.Failed {
  109. // if part_type is Recursive, empty match <=> node.Length == 0
  110. // if part_type is otherwise, empty match <=> node.Amount == 0
  111. // if node.part is required, it should not be empty
  112. if node.Part.Required && ((node.Status == cst.Failed) ||
  113. (node.Length == 0 && node.Amount == 0)) {
  114. // PrintFlatTree(tree)
  115. return tree, &Error {
  116. HasExpectedPart: true,
  117. ExpectedPart: id,
  118. NodeIndex: ptr,
  119. }
  120. }
  121. }
  122. switch node.Status {
  123. case cst.BranchFailed:
  124. // status == BranchFailed => part_type == Recursive
  125. var rule = syntax.Id2Rule(id)
  126. var next = rule.Branches[node.Tried]
  127. node.Tried += 1 // increment the number of tried branches
  128. node.Length = 0 // clear invalid children
  129. // derive through the next branch
  130. var num_parts = len(next.Parts)
  131. for i := 0; i < num_parts; i += 1 {
  132. var part = next.Parts[i]
  133. tree = append(tree, cst.TreeNode {
  134. Part: part, Parent: ptr,
  135. Length: 0, Status: cst.Initial,
  136. Tried: 0, Index: i,
  137. Pos: -1, Amount: 0,
  138. Span: ZeroSpan,
  139. })
  140. }
  141. ptr = (len(tree) - num_parts)
  142. tree[ptr].Pos = node.Pos
  143. node.Status = cst.Pending
  144. case cst.Failed:
  145. var parent_ptr = node.Parent
  146. if parent_ptr < 0 { break loop }
  147. var parent = &tree[parent_ptr]
  148. parent.Status = cst.BranchFailed // notify failure to parent node
  149. tree = tree[0: ptr-(node.Index)] // clear invalid nodes
  150. ptr = parent_ptr // go back to parent node
  151. case cst.Success:
  152. if part_type == syntax.Recursive {
  153. // calculate the number of tokens matched by the node
  154. node.Amount = 0
  155. for i := 0; i < node.Length; i += 1 {
  156. var child = node.Children[i]
  157. node.Amount += tree[child].Amount
  158. }
  159. }
  160. var parent_ptr = node.Parent
  161. if parent_ptr < 0 { break loop }
  162. var parent = &tree[parent_ptr]
  163. parent.Children[parent.Length] = ptr
  164. parent.Length += 1
  165. parent.Span = parent.Span.Merged(node.Span)
  166. if ((ptr+1 < len(tree)) && (tree[ptr+1].Parent == parent_ptr)) {
  167. // if node.part is NOT the last part in the branch,
  168. // go to the node corresponding to the next part
  169. ptr += 1
  170. tree[ptr].Pos = (node.Pos + node.Amount)
  171. } else {
  172. // if node.part is the last part in the branch
  173. // notify success to the parent node and go to it
  174. ptr = parent_ptr
  175. tree[ptr].Status = cst.Success
  176. }
  177. default:
  178. panic("invalid status")
  179. }
  180. }
  181. // check if all the tokens have been matched
  182. var root_node = tree[0]
  183. if root_node.Amount < len(tokens) {
  184. // PrintFlatTree(tree)
  185. var last_token_ptr = 0
  186. var last_token_pos = 0
  187. for i, node := range tree {
  188. switch node.Part.PartType {
  189. case syntax.MatchKeyword,
  190. syntax.MatchToken:
  191. if node.Pos > last_token_pos {
  192. last_token_pos = node.Pos
  193. last_token_ptr = i
  194. }
  195. }
  196. }
  197. return tree, &Error {
  198. HasExpectedPart: false,
  199. NodeIndex: last_token_ptr,
  200. }
  201. }
  202. return tree, nil
  203. }
  204. func Parse(code source.Code, root syntax.Id, name string) (*cst.Tree, *Error) {
  205. var tokens, info, span_map, s_err = scanner.Scan(code)
  206. if s_err != nil { return nil, &Error {
  207. IsScannerError: true,
  208. ScannerError: fmt.Errorf("error scanning '%s': %w", name, s_err),
  209. } }
  210. var nodes, err = buildTree(root, tokens)
  211. var tree = &cst.Tree {
  212. Name: name, Nodes: nodes,
  213. Code: code, Tokens: tokens,
  214. Info: info, SpanMap: span_map,
  215. }
  216. if err != nil {
  217. err.Tree = tree
  218. }
  219. return tree, err
  220. }