tokeniser.go 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. package dirty
  2. import (
  3. "bufio"
  4. "fmt"
  5. "io"
  6. )
  7. // todo use reader.Peek()
  // skipped holds a single rune that was read from the input but not consumed
  // by the token being built. The tokeniser functions check it before reading
  // from the reader; the zero rune means nothing is pending.
  var skipped rune
  // tokenType identifies the lexical category of a token.
  type tokenType int

  // Token categories, numbered consecutively from zero.
  const (
  	// UNKNOWN is the zero value; it is what a token{} literal carries.
  	UNKNOWN tokenType = iota
  	// LBRACKET is an opening '('.
  	LBRACKET
  	// RBRACKET is a closing ')'.
  	RBRACKET
  	// STRING is a string opened with a single quote.
  	STRING
  	// STRING_RAW is a raw string opened with a backtick.
  	STRING_RAW
  	// NUMBER is a numeric literal.
  	NUMBER
  	// FLOAT is not produced by the tokeniser functions in this file;
  	// presumably assigned during number parsing (confirm).
  	FLOAT
  	// CONST is a word starting with 't', 'f' or 'n'.
  	CONST
  	// COMMENT is a comment opened with '#'.
  	COMMENT
  	// EOF marks the end of the input.
  	EOF
  )
  22. type token struct {
  23. ttype tokenType
  24. t string
  25. i *int64
  26. f *float64
  27. }
  28. func nextToken(reader *bufio.Reader) (token, error) {
  29. t, finished, e := nextToken_initial(reader)
  30. if finished || e != nil {
  31. return t, e
  32. }
  33. t, e = nextToken_rest(reader, t)
  34. return t, e
  35. }
  36. func nextToken_initial(reader *bufio.Reader) (token, bool, error) {
  37. var (
  38. r rune = 0
  39. err error = nil
  40. t token
  41. )
  42. initialTokenLoop:
  43. for {
  44. if skipped != 0 {
  45. r = skipped
  46. skipped = 0
  47. } else {
  48. r, _, err = reader.ReadRune()
  49. }
  50. //debugf("%c\n", r)
  51. if err != nil {
  52. if err == io.EOF {
  53. t := token{ttype: EOF}
  54. return t, true, nil
  55. }
  56. return token{}, true, fmt.Errorf("while reading: %w", err)
  57. }
  58. switch {
  59. case r == '(':
  60. return token{ttype: LBRACKET}, true, nil
  61. case r == ')':
  62. return token{ttype: RBRACKET}, true, nil
  63. case r == '#':
  64. t = token{ttype: COMMENT}
  65. break initialTokenLoop
  66. case r == '`':
  67. t = token{ttype: STRING_RAW}
  68. break initialTokenLoop
  69. case r == '\'':
  70. t = token{ttype: STRING}
  71. break initialTokenLoop
  72. case r == 't':
  73. t = token{ttype: CONST, t: "t"}
  74. break initialTokenLoop
  75. case r == 'f':
  76. t = token{ttype: CONST, t: "f"}
  77. break initialTokenLoop
  78. case r == 'n':
  79. t = token{ttype: CONST, t: "n"}
  80. break initialTokenLoop
  81. case in(r, []rune{'1', '2', '3', '4', '5', '6', '7', '8', '9', '↊', '↋', '-', '.', '·', ','}):
  82. t = token{ttype: NUMBER, t: string(r)}
  83. break initialTokenLoop
  84. case r == '0':
  85. r, _, err = reader.ReadRune()
  86. //debugf("%c\n", r)
  87. if err != nil {
  88. if err == io.EOF {
  89. t := token{ttype: EOF}
  90. return t, true, nil
  91. }
  92. return token{}, true, fmt.Errorf("while reading: %w", err)
  93. }
  94. switch r {
  95. case 'b':
  96. t = token{ttype: NUMBER, t: "0b"}
  97. break initialTokenLoop
  98. case 'o':
  99. t = token{ttype: NUMBER, t: "0o"}
  100. break initialTokenLoop
  101. case 'x':
  102. t = token{ttype: NUMBER, t: "0x"}
  103. break initialTokenLoop
  104. default:
  105. skipped = r
  106. var zero int64 = 0
  107. return token{ttype: NUMBER, i: &zero}, true, nil
  108. }
  109. case in(r, []rune{' ', '\t', '\n', '\r'}):
  110. continue
  111. default:
  112. return token{}, true, nil
  113. }
  114. }
  115. return t, false, err
  116. }
  117. func nextToken_rest(reader *bufio.Reader, t token) (token, error) {
  118. var (
  119. r rune
  120. err error = nil
  121. escaping bool = false
  122. stringRawIndent string = ""
  123. stringRawIndentSkip string = ""
  124. stringRawState int = 0 // todo enum
  125. )
  126. tokenLoop:
  127. for {
  128. if skipped != 0 {
  129. r = skipped
  130. skipped = 0
  131. } else {
  132. r, _, err = reader.ReadRune()
  133. }
  134. //debugf("%c\n", r)
  135. // todo line, column
  136. if err != nil {
  137. if err == io.EOF {
  138. if t.ttype == STRING || t.ttype == STRING_RAW {
  139. return token{}, NewUnterminatedError("string", t.t)
  140. } else {
  141. return token{ttype: EOF}, nil
  142. }
  143. }
  144. return token{}, fmt.Errorf("while reading: %w", err)
  145. }
  146. switch t.ttype {
  147. case COMMENT:
  148. if r != '\n' {
  149. t.t += string(r)
  150. } else {
  151. break tokenLoop
  152. }
  153. case STRING:
  154. if !escaping && r == '\'' {
  155. t, err = parseString(t)
  156. if err != nil {
  157. return token{}, err
  158. }
  159. break tokenLoop
  160. } else if r == '\n' {
  161. return token{}, NewUnterminatedError("string", t.t)
  162. } else {
  163. t.t += string(r)
  164. }
  165. if escaping {
  166. escaping = false
  167. } else if r == '\\' {
  168. escaping = true
  169. }
  170. case STRING_RAW:
  171. if stringRawState == 0 {
  172. if r != '\n' {
  173. return token{}, NewRawStringError("missing new line after opening `")
  174. } else {
  175. stringRawState = 1
  176. continue
  177. }
  178. }
  179. if stringRawState == 1 {
  180. if r == ' ' || r == '\t' {
  181. stringRawIndent += string(r)
  182. } else {
  183. stringRawState = 2
  184. t.t += string(r)
  185. }
  186. continue
  187. }
  188. if stringRawState == 2 {
  189. // fixme assumes lines ending with \n; get to end of line
  190. if r == '\n' {
  191. stringRawState = 3
  192. stringRawIndentSkip = ""
  193. }
  194. t.t += string(r)
  195. continue
  196. }
  197. if stringRawState == 3 {
  198. if len(stringRawIndentSkip) == 0 && r == '`' {
  199. break tokenLoop
  200. }
  201. if len(stringRawIndentSkip) < len(stringRawIndent) {
  202. stringRawIndentSkip += string(r)
  203. } else {
  204. if stringRawIndent != stringRawIndentSkip {
  205. // todo convert whitespace to escape codes
  206. return token{}, NewRawStringError("Indent ‘" + stringRawIndent + "’ does not begin with ‘" + stringRawIndentSkip + "’")
  207. }
  208. skipped = r
  209. stringRawState = 2
  210. }
  211. }
  212. case CONST:
  213. if t.t[0] == 't' && in(r, []rune{'r', 'u', 'e'}) && len(t.t) < 4 {
  214. t.t += string(r)
  215. continue
  216. }
  217. if t.t[0] == 'f' && in(r, []rune{'a', 'l', 's', 'e'}) && len(t.t) < 5 {
  218. t.t += string(r)
  219. continue
  220. }
  221. if in(r, []rune{'u', 'l'}) && len(t.t) < 4 {
  222. t.t += string(r)
  223. continue
  224. }
  225. skipped = r
  226. t, err = parseConst(t)
  227. break tokenLoop
  228. case NUMBER:
  229. if t.t[0] == '0' && t.t[1] == 'b' && in(r, []rune{'0', '1', ','}) {
  230. t.t += string(r)
  231. continue
  232. }
  233. if t.t[0] == '0' && t.t[1] == 'o' && in(r, []rune{'0', '1', '2', '3', '4', '5', '6', '7'}) {
  234. t.t += string(r)
  235. continue
  236. }
  237. if t.t[0] == '0' && t.t[1] == 'x' && in(r, []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C', 'D', 'E', 'F'}) {
  238. t.t += string(r)
  239. continue
  240. }
  241. if in(r, []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '↊', '↋', ',', '.', '·', 'e', '-'}) {
  242. t.t += string(r)
  243. continue
  244. }
  245. skipped = r
  246. t, err = parseNumber(t) // todo errors that are not CommaError <- NumberError
  247. break tokenLoop
  248. }
  249. }
  250. return t, err
  251. }
  // in reports whether the rune c is one of the runes in expected.
  // An empty or nil expected never matches.
  func in(c rune, expected []rune) bool {
  	for i := 0; i < len(expected); i++ {
  		if expected[i] == c {
  			return true
  		}
  	}
  	return false
  }