123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251 |
- ;;; lexer.scm -- lexer for Joy.
- ;;; Copyright © 2016 Eric Bavier <bavier@member.fsf.org>
- ;;;
- ;;; Joy is free software; you can redistribute it and/or modify it under
- ;;; the terms of the GNU General Public License as published by the Free
- ;;; Software Foundation; either version 3 of the License, or (at your
- ;;; option) any later version.
- ;;;
- ;;; Joy is distributed in the hope that it will be useful, but WITHOUT
- ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- ;;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
- ;;; License for more details.
- ;;;
- ;;; You should have received a copy of the GNU General Public License
- ;;; along with Joy. If not, see <http://www.gnu.org/licenses/>.
- ;;; Code:
- (define-module (language joy lexer)
- #:use-module (ice-9 rdelim)
- #:export (get-lexer))
- ;;; See j09imp.html for a more thorough description of that prototype of
- ;;; a Joy interpreter.
- ;;;
- ;;; There it says that joy interpreter supports lines starting with '$',
- ;;; which are processed by the command shell. Interesting.
- (define *keywords*
- '(("==" . ==)
- ("MODULE" . module)
- ("PRIVATE" . private)
- ("PUBLIC" . public)
- ("DEFINE" . define)
- ("END" . end)))
- (define integer-regex (make-regexp "^[+-]?[0-9]+$"))
- (define float-regex
- (make-regexp
- "^[+-]?([0-9]+\\.?[0-9]*|[0-9]*\\.?[0-9]+)(e[+-]?[0-9]+)?$"))
- (define symbol-allowed-characters
- (char-set-difference
- ;; We allow #\. because it is handled elsewhere
- char-set:graphic (string->char-set "[]{};")))
- (define (get-symbol-or-number port)
- (let iterate ((result-chars '())
- (non-numeric? #f))
- (let* ((c (read-char port))
- (finish (lambda ()
- (let ((result (list->string
- (reverse result-chars))))
- (values
- (cond
- ((regexp-exec integer-regex result)
- 'integer)
- ((regexp-exec float-regex result)
- 'float)
- (else 'symbol))
- result))))
- (allowed? (lambda (c)
- (char-set-contains?
- symbol-allowed-characters c))))
- (cond
- ((eof-object? c) (finish))
- ((char=? c #\\)
- (error "character escapes not allowed in symbols"))
- ((allowed? c)
- (iterate (cons c result-chars)
- (or non-numeric?
- (not (or (char-numeric? c)
- (char=? c #\+)
- (char=? c #\-))))))
- (else
- (unread-char c port)
- (finish))))))
- (define (char-hex? c)
- (and (not (eof-object? c))
- (or (char-numeric? c)
- (memv c '(#\a #\b #\c #\d #\e #\f))
- (memv c '(#\A #\B #\C #\D #\E #\F)))))
- (define (digit->number c)
- (- (char->integer c) (char->integer #\0)))
- (define (hex->number c)
- (if (char-numeric? c)
- (digit->number c)
- (+ 10 (- (char->integer (char-downcase c)) (char->integer #\a)))))
- (define (read-escape port)
- (let ((c (read-char port)))
- (case c
- ((#\' #\" #\\) c)
- ((#\b) #\bs)
- ((#\f) #\np)
- ((#\n) #\nl)
- ((#\r) #\cr)
- ((#\t) #\tab)
- ((#\v) #\vt)
- ((#\0)
- (let ((next (peek-char port)))
- (cond
- ((eof-object? next) #\nul)
- ((char-numeric? next)
- (error "octal escape sequences are not supported"))
- (else #\nul))))
- ((#\x)
- (let* ((a (read-char port))
- (b (read-char port)))
- (cond
- ((and (char-hex? a) (char-hex? b))
- (integer->char (+ (* 16 (hex->number a)) (hex->number b))))
- (else
- (error "bad hex character escape")))))
- ((#\u)
- (let* ((a (read-char port))
- (b (read-char port))
- (c (read-char port))
- (d (read-char port)))
- (integer->char (string->number (string a b c d) 16))))
- (else
- c))))
- (define (read-string port)
- (let iterate ((chars '()))
- (let ((c (read-char port)))
- (case c
- ((#\")
- (list->string (reverse chars)))
- ((#\\)
- (case (peek-char port)
- ((#\newline #\space)
- (iterate chars))
- (else
- (iterate (cons (read-character port) chars)))))
- (else
- (iterate (cons c chars)))))))
- (define (read-character port)
- (let ((c (read-char port)))
- (case c
- ((#\\) (read-escape port))
- (else c))))
- ;;; Main lexer routine which is given a port and looks for the next
- ;;; token.
- (define (lex port)
- (let ((return (let ((file (if (file-port? port)
- (port-filename port)
- #f))
- (line (1+ (port-line port)))
- (column (1+ (port-column port))))
- (lambda (token value)
- (let ((obj (cons token value)))
- (set-source-property! obj 'filename file)
- (set-source-property! obj 'line line)
- (set-source-property! obj 'column column)
- obj))))
- ;; Read afterwards so the source-properties are correct above
- ;; and actually point to the very character to be read.
- (c (read-char port)))
- (cond
- ;; End of input must be specially marked to the parser.
- ((eof-object? c) (return 'eof c))
- ;; Explicitely mark newline's so the parser can delimit
- ;; expressions with it if necessary.
- ((char=? c #\newline) (return 'newline c))
- ;; Whitespace, just skip it.
- ((char-whitespace? c) (lex port))
- (else
- (case c
- ;; An line comment, skip until end-of-line is found
- ((#\#)
- (read-line port)
- (lex port))
- ((#\')
- ;; A literal character
- (return 'character (read-character port)))
- ((#\")
- ;; A literal string. Similar to single characters, except
- ;; that escaped newline and space are to be completely
- ;; ignored.
- (return 'string (read-string port)))
- ((#\()
- (let ((c (read-char port)))
- (case c
- ;; Multi-line comment, discard until closing "*)"
- ((#\*)
- (let iterate ()
- (let ((c (read-char port)))
- (cond
- ((eof-object? c)
- (error "unexpected end of file in multi-line comment"))
- ((char=? c #\*)
- (cond
- ((char=? (read-char port) #\)) (lex port))
- (else (iterate))))
- (else (iterate))))))
- (else
- ;; The #\( could be understood as part of a symbol, but
- ;; it seems wiser to reserve it for future use as its
- ;; own token.
- (unread-char c port)
- (return 'paren-open #f)))))
- ((#\)) (return 'paren-close c))
- ((#\[) (return 'square-open c))
- ((#\]) (return 'square-close c))
- ((#\{) (return 'bracket-open c))
- ((#\}) (return 'bracket-close c))
- ((#\;) (return 'semicolon c))
- (else
- ;; Now only have numeric or symbol input possible.
- (unread-char c port)
- (call-with-values
- (lambda () (get-symbol-or-number port))
- (lambda (type str)
- (case type
- ((symbol)
- ;; str could be empty if the first character is already
- ;; something not allowed in a symbol (and not escaped)!
- ;; Take care about that, it is an error because that
- ;; character should have been handled elsewhere or is
- ;; invalid in the input.
- (cond
- ((zero? (string-length str))
- (begin
- ;; Take it out so the REPL might not get into an
- ;; infinite loop with further reading attempts.
- (read-char port)
- (error "invalid character in input" c)))
- ((assoc-ref *keywords* str)
- => (lambda (kw) (return kw str)))
- (else
- (return 'symbol (string->symbol str)))))
- ((integer)
- (return 'integer (string->number str)))
- ((float)
- (return 'float (string->number str)))
- (else
- (error "unexpected numeric/symbol type" type)))))))))))
- ;;; Build a lexer thunk for a port. This is the exported routine
- ;;; which can be used to create a lexer for the parser to use.
- (define (get-lexer port)
- (lambda () (lex port)))
|