lexer.scm 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. ;;; lexer.scm -- lexer for Joy.
  2. ;;; Copyright © 2016 Eric Bavier <bavier@member.fsf.org>
  3. ;;;
  4. ;;; Joy is free software; you can redistribute it and/or modify it under
  5. ;;; the terms of the GNU General Public License as published by the Free
  6. ;;; Software Foundation; either version 3 of the License, or (at your
  7. ;;; option) any later version.
  8. ;;;
  9. ;;; Joy is distributed in the hope that it will be useful, but WITHOUT
  10. ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  11. ;;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
  12. ;;; License for more details.
  13. ;;;
  14. ;;; You should have received a copy of the GNU General Public License
  15. ;;; along with Joy. If not, see <http://www.gnu.org/licenses/>.
  16. ;;; Code:
  17. (define-module (language joy lexer)
  18. #:use-module (ice-9 rdelim)
  19. #:export (get-lexer))
  20. ;;; See j09imp.html for a more thorough description of that prototype of
  21. ;;; a Joy interpreter.
  22. ;;;
  23. ;;; There it says that joy interpreter supports lines starting with '$',
  24. ;;; which are processed by the command shell. Interesting.
  25. (define *keywords*
  26. '(("==" . ==)
  27. ("MODULE" . module)
  28. ("PRIVATE" . private)
  29. ("PUBLIC" . public)
  30. ("DEFINE" . define)
  31. ("END" . end)))
  32. (define integer-regex (make-regexp "^[+-]?[0-9]+$"))
  33. (define float-regex
  34. (make-regexp
  35. "^[+-]?([0-9]+\\.?[0-9]*|[0-9]*\\.?[0-9]+)(e[+-]?[0-9]+)?$"))
  36. (define symbol-allowed-characters
  37. (char-set-difference
  38. ;; We allow #\. because it is handled elsewhere
  39. char-set:graphic (string->char-set "[]{};")))
  40. (define (get-symbol-or-number port)
  41. (let iterate ((result-chars '())
  42. (non-numeric? #f))
  43. (let* ((c (read-char port))
  44. (finish (lambda ()
  45. (let ((result (list->string
  46. (reverse result-chars))))
  47. (values
  48. (cond
  49. ((regexp-exec integer-regex result)
  50. 'integer)
  51. ((regexp-exec float-regex result)
  52. 'float)
  53. (else 'symbol))
  54. result))))
  55. (allowed? (lambda (c)
  56. (char-set-contains?
  57. symbol-allowed-characters c))))
  58. (cond
  59. ((eof-object? c) (finish))
  60. ((char=? c #\\)
  61. (error "character escapes not allowed in symbols"))
  62. ((allowed? c)
  63. (iterate (cons c result-chars)
  64. (or non-numeric?
  65. (not (or (char-numeric? c)
  66. (char=? c #\+)
  67. (char=? c #\-))))))
  68. (else
  69. (unread-char c port)
  70. (finish))))))
  71. (define (char-hex? c)
  72. (and (not (eof-object? c))
  73. (or (char-numeric? c)
  74. (memv c '(#\a #\b #\c #\d #\e #\f))
  75. (memv c '(#\A #\B #\C #\D #\E #\F)))))
  76. (define (digit->number c)
  77. (- (char->integer c) (char->integer #\0)))
  78. (define (hex->number c)
  79. (if (char-numeric? c)
  80. (digit->number c)
  81. (+ 10 (- (char->integer (char-downcase c)) (char->integer #\a)))))
  82. (define (read-escape port)
  83. (let ((c (read-char port)))
  84. (case c
  85. ((#\' #\" #\\) c)
  86. ((#\b) #\bs)
  87. ((#\f) #\np)
  88. ((#\n) #\nl)
  89. ((#\r) #\cr)
  90. ((#\t) #\tab)
  91. ((#\v) #\vt)
  92. ((#\0)
  93. (let ((next (peek-char port)))
  94. (cond
  95. ((eof-object? next) #\nul)
  96. ((char-numeric? next)
  97. (error "octal escape sequences are not supported"))
  98. (else #\nul))))
  99. ((#\x)
  100. (let* ((a (read-char port))
  101. (b (read-char port)))
  102. (cond
  103. ((and (char-hex? a) (char-hex? b))
  104. (integer->char (+ (* 16 (hex->number a)) (hex->number b))))
  105. (else
  106. (error "bad hex character escape")))))
  107. ((#\u)
  108. (let* ((a (read-char port))
  109. (b (read-char port))
  110. (c (read-char port))
  111. (d (read-char port)))
  112. (integer->char (string->number (string a b c d) 16))))
  113. (else
  114. c))))
  115. (define (read-string port)
  116. (let iterate ((chars '()))
  117. (let ((c (read-char port)))
  118. (case c
  119. ((#\")
  120. (list->string (reverse chars)))
  121. ((#\\)
  122. (case (peek-char port)
  123. ((#\newline #\space)
  124. (iterate chars))
  125. (else
  126. (iterate (cons (read-character port) chars)))))
  127. (else
  128. (iterate (cons c chars)))))))
  129. (define (read-character port)
  130. (let ((c (read-char port)))
  131. (case c
  132. ((#\\) (read-escape port))
  133. (else c))))
  134. ;;; Main lexer routine which is given a port and looks for the next
  135. ;;; token.
  136. (define (lex port)
  137. (let ((return (let ((file (if (file-port? port)
  138. (port-filename port)
  139. #f))
  140. (line (1+ (port-line port)))
  141. (column (1+ (port-column port))))
  142. (lambda (token value)
  143. (let ((obj (cons token value)))
  144. (set-source-property! obj 'filename file)
  145. (set-source-property! obj 'line line)
  146. (set-source-property! obj 'column column)
  147. obj))))
  148. ;; Read afterwards so the source-properties are correct above
  149. ;; and actually point to the very character to be read.
  150. (c (read-char port)))
  151. (cond
  152. ;; End of input must be specially marked to the parser.
  153. ((eof-object? c) (return 'eof c))
  154. ;; Explicitely mark newline's so the parser can delimit
  155. ;; expressions with it if necessary.
  156. ((char=? c #\newline) (return 'newline c))
  157. ;; Whitespace, just skip it.
  158. ((char-whitespace? c) (lex port))
  159. (else
  160. (case c
  161. ;; An line comment, skip until end-of-line is found
  162. ((#\#)
  163. (read-line port)
  164. (lex port))
  165. ((#\')
  166. ;; A literal character
  167. (return 'character (read-character port)))
  168. ((#\")
  169. ;; A literal string. Similar to single characters, except
  170. ;; that escaped newline and space are to be completely
  171. ;; ignored.
  172. (return 'string (read-string port)))
  173. ((#\()
  174. (let ((c (read-char port)))
  175. (case c
  176. ;; Multi-line comment, discard until closing "*)"
  177. ((#\*)
  178. (let iterate ()
  179. (let ((c (read-char port)))
  180. (cond
  181. ((eof-object? c)
  182. (error "unexpected end of file in multi-line comment"))
  183. ((char=? c #\*)
  184. (cond
  185. ((char=? (read-char port) #\)) (lex port))
  186. (else (iterate))))
  187. (else (iterate))))))
  188. (else
  189. ;; The #\( could be understood as part of a symbol, but
  190. ;; it seems wiser to reserve it for future use as its
  191. ;; own token.
  192. (unread-char c port)
  193. (return 'paren-open #f)))))
  194. ((#\)) (return 'paren-close c))
  195. ((#\[) (return 'square-open c))
  196. ((#\]) (return 'square-close c))
  197. ((#\{) (return 'bracket-open c))
  198. ((#\}) (return 'bracket-close c))
  199. ((#\;) (return 'semicolon c))
  200. (else
  201. ;; Now only have numeric or symbol input possible.
  202. (unread-char c port)
  203. (call-with-values
  204. (lambda () (get-symbol-or-number port))
  205. (lambda (type str)
  206. (case type
  207. ((symbol)
  208. ;; str could be empty if the first character is already
  209. ;; something not allowed in a symbol (and not escaped)!
  210. ;; Take care about that, it is an error because that
  211. ;; character should have been handled elsewhere or is
  212. ;; invalid in the input.
  213. (cond
  214. ((zero? (string-length str))
  215. (begin
  216. ;; Take it out so the REPL might not get into an
  217. ;; infinite loop with further reading attempts.
  218. (read-char port)
  219. (error "invalid character in input" c)))
  220. ((assoc-ref *keywords* str)
  221. => (lambda (kw) (return kw str)))
  222. (else
  223. (return 'symbol (string->symbol str)))))
  224. ((integer)
  225. (return 'integer (string->number str)))
  226. ((float)
  227. (return 'float (string->number str)))
  228. (else
  229. (error "unexpected numeric/symbol type" type)))))))))))
  230. ;;; Build a lexer thunk for a port. This is the exported routine
  231. ;;; which can be used to create a lexer for the parser to use.
  232. (define (get-lexer port)
  233. (lambda () (lex port)))