123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228 |
- package parser
- import "fmt"
- import "kumachan/lang/source"
- import "kumachan/lang/textual/cst"
- import "kumachan/lang/textual/syntax"
- import "kumachan/lang/textual/scanner"
- /**
- * Universal First-Fit Parser
- *
- * This parser can parse some Unicode text into a Concrete Syntax Tree
- * according to the syntax defined in the `syntax` package.
- * In this file, the `buildTree()` function describes the
- * core logic of the parser; the `Parse()` function is intended
- * to be used outside the package as an API.
- * Note that the CST generated by this parser is still primitive so that
- * further transformation of the CST (into AST) is recommended.
- */
- func buildTree(root syntax.Id, tokens scanner.Tokens) ([]cst.TreeNode, *Error) {
- /**
- * This function performs a top-down derivation on the tokens.
- *
- * A "stack" (considered as a raw CST at same time) is used
- * to record state, instead of a recursion procedure.
- * Although this function does not contain too much code,
- * its logic is extraordinarily complicated.
- * Therefore, be careful when modifying the following code.
- */
- var NameId = syntax.NameTokenId()
- var RootId = root
- var RootPart = syntax.Part {
- Id: RootId,
- PartType: syntax.Recursive,
- Required: true,
- }
- var ZeroSpan = scanner.Span { Start: 0, End: 0 }
- // prepare the stack (tree), push the root node
- var tree = make([]cst.TreeNode, 0)
- tree = append(tree, cst.TreeNode {
- Part: RootPart, Parent: -1,
- Length: 0, Status: cst.Initial,
- Tried: 0, Index: 0,
- Pos: 0, Amount: 0,
- Span: ZeroSpan,
- })
- // set node pointer (current index) to zero and start looping
- var ptr = 0
- loop: for {
- var node = &tree[ptr]
- var id = node.Part.Id
- var part_type = node.Part.PartType
- switch part_type {
- case syntax.Recursive:
- if node.Status == cst.Initial {
- node.Status = cst.BranchFailed
- }
- // derivation through a branch failed
- if node.Status == cst.BranchFailed {
- var rule = syntax.Id2Rule(id)
- var num_branches = len(rule.Branches)
- // check if all branches have been tried
- if node.Tried == num_branches {
- // if tried, switch to a final status
- if rule.Nullable {
- node.Status = cst.Success
- node.Length = 0
- node.Span = ZeroSpan
- } else {
- node.Status = cst.Failed
- }
- }
- }
- case syntax.MatchToken:
- if node.Pos >= len(tokens) { node.Status = cst.Failed; break }
- var token_id = tokens[node.Pos].Id
- if token_id == id {
- node.Status = cst.Success
- node.Amount = 1
- node.Span = tokens[node.Pos].Span
- } else {
- node.Status = cst.Failed
- }
- case syntax.MatchKeyword:
- if node.Pos >= len(tokens) { node.Status = cst.Failed; break }
- var token = tokens[node.Pos]
- var token_id = token.Id
- if token_id != NameId { node.Status = cst.Failed; break }
- var text = token.Content
- var keyword = syntax.Id2ConditionalKeyword(id)
- if len(text) != len(keyword) { node.Status = cst.Failed; break }
- var equal = true
- for i, char := range keyword {
- if char != rune(text[i]) {
- equal = false
- break
- }
- }
- if equal {
- node.Status = cst.Success
- node.Amount = 1
- node.Span = token.Span
- } else {
- node.Status = cst.Failed
- }
- default:
- panic("invalid part type")
- }
- // if node is in final status
- if node.Status == cst.Success || node.Status == cst.Failed {
- // if part_type is Recursive, empty match <=> node.Length == 0
- // if part_type is otherwise, empty match <=> node.Amount == 0
- // if node.part is required, it should not be empty
- if node.Part.Required && ((node.Status == cst.Failed) ||
- (node.Length == 0 && node.Amount == 0)) {
- // PrintFlatTree(tree)
- return tree, &Error {
- HasExpectedPart: true,
- ExpectedPart: id,
- NodeIndex: ptr,
- }
- }
- }
- switch node.Status {
- case cst.BranchFailed:
- // status == BranchFailed => part_type == Recursive
- var rule = syntax.Id2Rule(id)
- var next = rule.Branches[node.Tried]
- node.Tried += 1 // increment the number of tried branches
- node.Length = 0 // clear invalid children
- // derive through the next branch
- var num_parts = len(next.Parts)
- for i := 0; i < num_parts; i += 1 {
- var part = next.Parts[i]
- tree = append(tree, cst.TreeNode {
- Part: part, Parent: ptr,
- Length: 0, Status: cst.Initial,
- Tried: 0, Index: i,
- Pos: -1, Amount: 0,
- Span: ZeroSpan,
- })
- }
- ptr = (len(tree) - num_parts)
- tree[ptr].Pos = node.Pos
- node.Status = cst.Pending
- case cst.Failed:
- var parent_ptr = node.Parent
- if parent_ptr < 0 { break loop }
- var parent = &tree[parent_ptr]
- parent.Status = cst.BranchFailed // notify failure to parent node
- tree = tree[0: ptr-(node.Index)] // clear invalid nodes
- ptr = parent_ptr // go back to parent node
- case cst.Success:
- if part_type == syntax.Recursive {
- // calculate the number of tokens matched by the node
- node.Amount = 0
- for i := 0; i < node.Length; i += 1 {
- var child = node.Children[i]
- node.Amount += tree[child].Amount
- }
- }
- var parent_ptr = node.Parent
- if parent_ptr < 0 { break loop }
- var parent = &tree[parent_ptr]
- parent.Children[parent.Length] = ptr
- parent.Length += 1
- parent.Span = parent.Span.Merged(node.Span)
- if ((ptr+1 < len(tree)) && (tree[ptr+1].Parent == parent_ptr)) {
- // if node.part is NOT the last part in the branch,
- // go to the node corresponding to the next part
- ptr += 1
- tree[ptr].Pos = (node.Pos + node.Amount)
- } else {
- // if node.part is the last part in the branch
- // notify success to the parent node and go to it
- ptr = parent_ptr
- tree[ptr].Status = cst.Success
- }
- default:
- panic("invalid status")
- }
- }
- // check if all the tokens have been matched
- var root_node = tree[0]
- if root_node.Amount < len(tokens) {
- // PrintFlatTree(tree)
- var last_token_ptr = 0
- var last_token_pos = 0
- for i, node := range tree {
- switch node.Part.PartType {
- case syntax.MatchKeyword,
- syntax.MatchToken:
- if node.Pos > last_token_pos {
- last_token_pos = node.Pos
- last_token_ptr = i
- }
- }
- }
- return tree, &Error {
- HasExpectedPart: false,
- NodeIndex: last_token_ptr,
- }
- }
- return tree, nil
- }
- func Parse(code source.Code, root syntax.Id, name string) (*cst.Tree, *Error) {
- var tokens, info, span_map, s_err = scanner.Scan(code)
- if s_err != nil { return nil, &Error {
- IsScannerError: true,
- ScannerError: fmt.Errorf("error scanning '%s': %w", name, s_err),
- } }
- var nodes, err = buildTree(root, tokens)
- var tree = &cst.Tree {
- Name: name, Nodes: nodes,
- Code: code, Tokens: tokens,
- Info: info, SpanMap: span_map,
- }
- if err != nil {
- err.Tree = tree
- }
- return tree, err
- }
|