123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181 |
- ;;;; (sxml ssax input-parse) -- a simple lexer
- ;;;;
- ;;;; Copyright (C) 2009 Free Software Foundation, Inc.
- ;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
- ;;;; Written 2003 by Oleg Kiselyov <oleg at pobox dot com> as input-parse.scm.
- ;;;;
- ;;;; This library is free software; you can redistribute it and/or
- ;;;; modify it under the terms of the GNU Lesser General Public
- ;;;; License as published by the Free Software Foundation; either
- ;;;; version 3 of the License, or (at your option) any later version.
- ;;;;
- ;;;; This library is distributed in the hope that it will be useful,
- ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- ;;;; Lesser General Public License for more details.
- ;;;;
- ;;;; You should have received a copy of the GNU Lesser General Public
- ;;;; License along with this library; if not, write to the Free Software
- ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ;;;;
- ;;; Commentary:
- ;;
- ;; A simple lexer.
- ;;
- ;; The procedures in this module surprisingly often suffice to parse an
- ;; input stream. They either skip, or build and return tokens, according
- ;; to inclusion or delimiting semantics. The list of characters to
- ;; expect, include, or to break at may vary from one invocation of a
- ;; function to another. This allows the functions to easily parse even
- ;; context-sensitive languages.
- ;;
- ;; EOF is generally frowned on, and thrown up upon if encountered.
- ;; Exceptions are mentioned specifically. The list of expected
- ;; characters (characters to skip until, or break-characters) may
- ;; include an EOF "character", which is to be coded as the symbol,
- ;; @code{*eof*}.
- ;;
- ;; The input stream to parse is specified as a @dfn{port}, which is
- ;; usually the last (and optional) argument. It defaults to the current
- ;; input port if omitted.
- ;;
- ;; If the parser encounters an error, it will throw an exception to the
- ;; key @code{parser-error}. The arguments will be of the form
- ;; @code{(@var{port} @var{message} @var{specialising-msg}*)}.
- ;;
- ;; The first argument is a port, which typically points to the offending
- ;; character or its neighborhood. You can then use @code{port-column}
- ;; and @code{port-line} to query the current position. @var{message} is
- ;; the description of the error. Other arguments supply more details
- ;; about the problem.
- ;;
- ;;; Code:
- (define-module (sxml ssax input-parse)
- #:use-module (ice-9 rdelim)
- #:export (peek-next-char
- assert-curr-char
- skip-until
- skip-while
- next-token
- next-token-of
- read-text-line
- read-string
- find-string-from-port?))
- (define ascii->char integer->char)
- (define char->ascii char->integer)
- (define char-newline #\newline)
- (define char-return #\return)
- (define inc 1+)
- (define dec 1-)
- ;; rewrite oleg's define-opt into define* style
- (define-macro (define-opt bindings body . body-rest)
- (let* ((rev-bindings (reverse bindings))
- (opt-bindings
- (and (pair? rev-bindings) (pair? (car rev-bindings))
- (eq? 'optional (caar rev-bindings))
- (cdar rev-bindings))))
- (if opt-bindings
- `(define* ,(append (reverse (cons #:optional (cdr rev-bindings)))
- opt-bindings)
- ,body ,@body-rest)
- `(define* ,bindings ,body ,@body-rest))))
- (define (parser-error port message . rest)
- (apply throw 'parser-error port message rest))
- (include-from-path "sxml/upstream/input-parse.scm")
- ;; This version for guile is quite speedy, due to read-delimited (which
- ;; is implemented in C).
- (define-opt (next-token prefix-skipped-chars break-chars
- (optional (comment "") (port (current-input-port))) )
- (let ((delims (list->string (delete '*eof* break-chars))))
- (if (eof-object? (if (null? prefix-skipped-chars)
- (peek-char port)
- (skip-while prefix-skipped-chars port)))
- (if (memq '*eof* break-chars)
- ""
- (parser-error port "EOF while reading a token " comment))
- (let ((token (read-delimited delims port 'peek)))
- (if (and (eof-object? (peek-char port))
- (not (memq '*eof* break-chars)))
- (parser-error port "EOF while reading a token " comment)
- token)))))
- (define-opt (read-text-line (optional (port (current-input-port))) )
- (read-line port))
- ;; Written 1995, 1996 by Oleg Kiselyov (oleg@acm.org)
- ;; Modified 1996, 1997, 1998, 2001 by A. Jaffer (agj@alum.mit.edu)
- ;; Modified 2003 by Steve VanDevender (stevev@hexadecimal.uoregon.edu)
- ;; Modified 2004 Andy Wingo <wingo at pobox dot com>
- ;; This function is from SLIB's strsrch.scm, and is in the public domain.
- (define (find-string-from-port? str <input-port> . max-no-char)
- "Looks for @var{str} in @var{<input-port>}, optionally within the
- first @var{max-no-char} characters."
- (set! max-no-char (if (null? max-no-char) #f (car max-no-char)))
- (letrec
- ((no-chars-read 0)
- (peeked? #f)
- (my-peek-char ; Return a peeked char or #f
- (lambda () (and (or (not (number? max-no-char))
- (< no-chars-read max-no-char))
- (let ((c (peek-char <input-port>)))
- (cond (peeked? c)
- ((eof-object? c) #f)
- ((procedure? max-no-char)
- (set! peeked? #t)
- (if (max-no-char c) #f c))
- ((eqv? max-no-char c) #f)
- (else c))))))
- (next-char (lambda () (set! peeked? #f) (read-char <input-port>)
- (set! no-chars-read (+ 1 no-chars-read))))
- (match-1st-char ; of the string str
- (lambda ()
- (let ((c (my-peek-char)))
- (and c
- (begin (next-char)
- (if (char=? c (string-ref str 0))
- (match-other-chars 1)
- (match-1st-char)))))))
- ;; There has been a partial match, up to the point pos-to-match
- ;; (for example, str[0] has been found in the stream)
- ;; Now look to see if str[pos-to-match] for would be found, too
- (match-other-chars
- (lambda (pos-to-match)
- (if (>= pos-to-match (string-length str))
- no-chars-read ; the entire string has matched
- (let ((c (my-peek-char)))
- (and c
- (if (not (char=? c (string-ref str pos-to-match)))
- (backtrack 1 pos-to-match)
- (begin (next-char)
- (match-other-chars (+ 1 pos-to-match)))))))))
- ;; There had been a partial match, but then a wrong char showed up.
- ;; Before discarding previously read (and matched) characters, we check
- ;; to see if there was some smaller partial match. Note, characters read
- ;; so far (which matter) are those of str[0..matched-substr-len - 1]
- ;; In other words, we will check to see if there is such i>0 that
- ;; substr(str,0,j) = substr(str,i,matched-substr-len)
- ;; where j=matched-substr-len - i
- (backtrack
- (lambda (i matched-substr-len)
- (let ((j (- matched-substr-len i)))
- (if (<= j 0)
- ;; backed off completely to the begining of str
- (match-1st-char)
- (let loop ((k 0))
- (if (>= k j)
- (match-other-chars j) ; there was indeed a shorter match
- (if (char=? (string-ref str k)
- (string-ref str (+ i k)))
- (loop (+ 1 k))
- (backtrack (+ 1 i) matched-substr-len))))))))
- )
- (match-1st-char)))
|