input-parse.scm 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. ;;;; (sxml ssax input-parse) -- a simple lexer
  2. ;;;;
  3. ;;;; Copyright (C) 2009 Free Software Foundation, Inc.
  4. ;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
  5. ;;;; Written 2003 by Oleg Kiselyov <oleg at pobox dot com> as input-parse.scm.
  6. ;;;;
  7. ;;;; This library is free software; you can redistribute it and/or
  8. ;;;; modify it under the terms of the GNU Lesser General Public
  9. ;;;; License as published by the Free Software Foundation; either
  10. ;;;; version 3 of the License, or (at your option) any later version.
  11. ;;;;
  12. ;;;; This library is distributed in the hope that it will be useful,
  13. ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;;;; Lesser General Public License for more details.
  16. ;;;;
  17. ;;;; You should have received a copy of the GNU Lesser General Public
  18. ;;;; License along with this library; if not, write to the Free Software
  19. ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;;;;
  21. ;;; Commentary:
  22. ;;
  23. ;; A simple lexer.
  24. ;;
  25. ;; The procedures in this module surprisingly often suffice to parse an
  26. ;; input stream. They either skip, or build and return tokens, according
  27. ;; to inclusion or delimiting semantics. The list of characters to
  28. ;; expect, include, or to break at may vary from one invocation of a
  29. ;; function to another. This allows the functions to easily parse even
  30. ;; context-sensitive languages.
  31. ;;
  32. ;; EOF is generally frowned on, and thrown up upon if encountered.
  33. ;; Exceptions are mentioned specifically. The list of expected
  34. ;; characters (characters to skip until, or break-characters) may
  35. ;; include an EOF "character", which is to be coded as the symbol,
  36. ;; @code{*eof*}.
  37. ;;
  38. ;; The input stream to parse is specified as a @dfn{port}, which is
  39. ;; usually the last (and optional) argument. It defaults to the current
  40. ;; input port if omitted.
  41. ;;
  42. ;; If the parser encounters an error, it will throw an exception to the
  43. ;; key @code{parser-error}. The arguments will be of the form
  44. ;; @code{(@var{port} @var{message} @var{specialising-msg}*)}.
  45. ;;
  46. ;; The first argument is a port, which typically points to the offending
  47. ;; character or its neighborhood. You can then use @code{port-column}
  48. ;; and @code{port-line} to query the current position. @var{message} is
  49. ;; the description of the error. Other arguments supply more details
  50. ;; about the problem.
  51. ;;
  52. ;;; Code:
  53. (define-module (sxml ssax input-parse)
  54. #:use-module (ice-9 rdelim)
  55. #:export (peek-next-char
  56. assert-curr-char
  57. skip-until
  58. skip-while
  59. next-token
  60. next-token-of
  61. read-text-line
  62. read-string
  63. find-string-from-port?))
  64. (define ascii->char integer->char)
  65. (define char->ascii char->integer)
  66. (define char-newline #\newline)
  67. (define char-return #\return)
  68. (define inc 1+)
  69. (define dec 1-)
  70. ;; rewrite oleg's define-opt into define* style
  71. (define-macro (define-opt bindings body . body-rest)
  72. (let* ((rev-bindings (reverse bindings))
  73. (opt-bindings
  74. (and (pair? rev-bindings) (pair? (car rev-bindings))
  75. (eq? 'optional (caar rev-bindings))
  76. (cdar rev-bindings))))
  77. (if opt-bindings
  78. `(define* ,(append (reverse (cons #:optional (cdr rev-bindings)))
  79. opt-bindings)
  80. ,body ,@body-rest)
  81. `(define* ,bindings ,body ,@body-rest))))
  82. (define (parser-error port message . rest)
  83. (apply throw 'parser-error port message rest))
  84. (include-from-path "sxml/upstream/input-parse.scm")
  85. ;; This version for guile is quite speedy, due to read-delimited (which
  86. ;; is implemented in C).
  87. (define-opt (next-token prefix-skipped-chars break-chars
  88. (optional (comment "") (port (current-input-port))) )
  89. (let ((delims (list->string (delete '*eof* break-chars))))
  90. (if (eof-object? (if (null? prefix-skipped-chars)
  91. (peek-char port)
  92. (skip-while prefix-skipped-chars port)))
  93. (if (memq '*eof* break-chars)
  94. ""
  95. (parser-error port "EOF while reading a token " comment))
  96. (let ((token (read-delimited delims port 'peek)))
  97. (if (and (eof-object? (peek-char port))
  98. (not (memq '*eof* break-chars)))
  99. (parser-error port "EOF while reading a token " comment)
  100. token)))))
  101. (define-opt (read-text-line (optional (port (current-input-port))) )
  102. (read-line port))
  103. ;; Written 1995, 1996 by Oleg Kiselyov (oleg@acm.org)
  104. ;; Modified 1996, 1997, 1998, 2001 by A. Jaffer (agj@alum.mit.edu)
  105. ;; Modified 2003 by Steve VanDevender (stevev@hexadecimal.uoregon.edu)
  106. ;; Modified 2004 Andy Wingo <wingo at pobox dot com>
  107. ;; This function is from SLIB's strsrch.scm, and is in the public domain.
  108. (define (find-string-from-port? str <input-port> . max-no-char)
  109. "Looks for @var{str} in @var{<input-port>}, optionally within the
  110. first @var{max-no-char} characters."
  111. (set! max-no-char (if (null? max-no-char) #f (car max-no-char)))
  112. (letrec
  113. ((no-chars-read 0)
  114. (peeked? #f)
  115. (my-peek-char ; Return a peeked char or #f
  116. (lambda () (and (or (not (number? max-no-char))
  117. (< no-chars-read max-no-char))
  118. (let ((c (peek-char <input-port>)))
  119. (cond (peeked? c)
  120. ((eof-object? c) #f)
  121. ((procedure? max-no-char)
  122. (set! peeked? #t)
  123. (if (max-no-char c) #f c))
  124. ((eqv? max-no-char c) #f)
  125. (else c))))))
  126. (next-char (lambda () (set! peeked? #f) (read-char <input-port>)
  127. (set! no-chars-read (+ 1 no-chars-read))))
  128. (match-1st-char ; of the string str
  129. (lambda ()
  130. (let ((c (my-peek-char)))
  131. (and c
  132. (begin (next-char)
  133. (if (char=? c (string-ref str 0))
  134. (match-other-chars 1)
  135. (match-1st-char)))))))
  136. ;; There has been a partial match, up to the point pos-to-match
  137. ;; (for example, str[0] has been found in the stream)
  138. ;; Now look to see if str[pos-to-match] for would be found, too
  139. (match-other-chars
  140. (lambda (pos-to-match)
  141. (if (>= pos-to-match (string-length str))
  142. no-chars-read ; the entire string has matched
  143. (let ((c (my-peek-char)))
  144. (and c
  145. (if (not (char=? c (string-ref str pos-to-match)))
  146. (backtrack 1 pos-to-match)
  147. (begin (next-char)
  148. (match-other-chars (+ 1 pos-to-match)))))))))
  149. ;; There had been a partial match, but then a wrong char showed up.
  150. ;; Before discarding previously read (and matched) characters, we check
  151. ;; to see if there was some smaller partial match. Note, characters read
  152. ;; so far (which matter) are those of str[0..matched-substr-len - 1]
  153. ;; In other words, we will check to see if there is such i>0 that
  154. ;; substr(str,0,j) = substr(str,i,matched-substr-len)
  155. ;; where j=matched-substr-len - i
  156. (backtrack
  157. (lambda (i matched-substr-len)
  158. (let ((j (- matched-substr-len i)))
  159. (if (<= j 0)
  160. ;; backed off completely to the begining of str
  161. (match-1st-char)
  162. (let loop ((k 0))
  163. (if (>= k j)
  164. (match-other-chars j) ; there was indeed a shorter match
  165. (if (char=? (string-ref str k)
  166. (string-ref str (+ i k)))
  167. (loop (+ 1 k))
  168. (backtrack (+ 1 i) matched-substr-len))))))))
  169. )
  170. (match-1st-char)))