input-parse.scm 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. ;****************************************************************************
  2. ; Simple Parsing of input
  3. ;
  4. ; The following simple functions surprisingly often suffice to parse
  5. ; an input stream. They either skip, or build and return tokens,
  6. ; according to inclusion or delimiting semantics. The list of
  7. ; characters to expect, include, or to break at may vary from one
  8. ; invocation of a function to another. This allows the functions to
  9. ; easily parse even context-sensitive languages.
  10. ;
  11. ; EOF is generally frowned on, and thrown up upon if encountered.
  12. ; Exceptions are mentioned specifically. The list of expected characters
  13. ; (characters to skip until, or break-characters) may include an EOF
  14. ; "character", which is to be coded as symbol *eof*
  15. ;
  16. ; The input stream to parse is specified as a PORT, which is usually
  17. ; the last (and optional) argument. It defaults to the current input
  18. ; port if omitted.
  19. ;
  20. ; IMPORT
  21. ; This package relies on a function parser-error, which must be defined
  22. ; by a user of the package. The function has the following signature:
  23. ; parser-error PORT MESSAGE SPECIALISING-MSG*
  24. ; Many procedures of this package call parser-error to report a parsing
  25. ; error. The first argument is a port, which typically points to the
  26. ; offending character or its neighborhood. Most of the Scheme systems
  27. ; let the user query a PORT for the current position. MESSAGE is the
  28. ; description of the error. Other arguments supply more details about
  29. ; the problem.
  30. ; myenv.scm, myenv-bigloo.scm or a similar prelude is assumed.
  31. ; From SRFI-13, string-concatenate-reverse
  32. ; If a particular implementation lacks SRFI-13 support, please
  33. ; include the file srfi-13-local.scm
  34. ;
  35. ; $Id: input-parse.scm,v 1.7 2004/07/07 16:02:31 sperber Exp $
  36. ;------------------------------------------------------------------------
  37. ; -- procedure+: peek-next-char [PORT]
  38. ; advances to the next character in the PORT and peeks at it.
  39. ; This function is useful when parsing LR(1)-type languages
  40. ; (one-char-read-ahead).
  41. ; The optional argument PORT defaults to the current input port.
  42. (define-opt (peek-next-char (optional (port (current-input-port))))
  43. (read-char port)
  44. (peek-char port))
  45. ;------------------------------------------------------------------------
  46. ; -- procedure+: assert-curr-char CHAR-LIST STRING [PORT]
  47. ; Reads a character from the PORT and looks it up
  48. ; in the CHAR-LIST of expected characters
  49. ; If the read character was found among expected, it is returned
  50. ; Otherwise, the procedure writes a nasty message using STRING
  51. ; as a comment, and quits.
  52. ; The optional argument PORT defaults to the current input port.
  53. ;
  54. (define-opt (assert-curr-char expected-chars comment
  55. (optional (port (current-input-port))))
  56. (let ((c (read-char port)))
  57. (if (memv c expected-chars) c
  58. (parser-error port "Wrong character " c
  59. " (0x" (if (eof-object? c) "*eof*"
  60. (number->string (char->integer c) 16)) ") "
  61. comment ". " expected-chars " expected"))))
  62. ; -- procedure+: skip-until CHAR-LIST [PORT]
  63. ; Reads and skips characters from the PORT until one of the break
  64. ; characters is encountered. This break character is returned.
  65. ; The break characters are specified as the CHAR-LIST. This list
  66. ; may include EOF, which is to be coded as a symbol *eof*
  67. ;
  68. ; -- procedure+: skip-until NUMBER [PORT]
  69. ; Skips the specified NUMBER of characters from the PORT and returns #f
  70. ;
  71. ; The optional argument PORT defaults to the current input port.
  72. (define-opt (skip-until arg (optional (port (current-input-port))) )
  73. (cond
  74. ((number? arg) ; skip 'arg' characters
  75. (do ((i arg (dec i)))
  76. ((not (positive? i)) #f)
  77. (if (eof-object? (read-char port))
  78. (parser-error port "Unexpected EOF while skipping "
  79. arg " characters"))))
  80. (else ; skip until break-chars (=arg)
  81. (let loop ((c (read-char port)))
  82. (cond
  83. ((memv c arg) c)
  84. ((eof-object? c)
  85. (if (memq '*eof* arg) c
  86. (parser-error port "Unexpected EOF while skipping until " arg)))
  87. (else (loop (read-char port))))))))
  88. ; -- procedure+: skip-while CHAR-LIST [PORT]
  89. ; Reads characters from the PORT and disregards them,
  90. ; as long as they are mentioned in the CHAR-LIST.
  91. ; The first character (which may be EOF) peeked from the stream
  92. ; that is NOT a member of the CHAR-LIST is returned. This character
  93. ; is left on the stream.
  94. ; The optional argument PORT defaults to the current input port.
  95. (define-opt (skip-while skip-chars (optional (port (current-input-port))) )
  96. (do ((c (peek-char port) (peek-char port)))
  97. ((not (memv c skip-chars)) c)
  98. (read-char port)))
  99. ; whitespace const
  100. ;------------------------------------------------------------------------
  101. ; Stream tokenizers
  102. ; -- procedure+:
  103. ; next-token PREFIX-CHAR-LIST BREAK-CHAR-LIST [COMMENT-STRING] [PORT]
  104. ; skips any number of the prefix characters (members of the
  105. ; PREFIX-CHAR-LIST), if any, and reads the sequence of characters
  106. ; up to (but not including) a break character, one of the
  107. ; BREAK-CHAR-LIST.
  108. ; The string of characters thus read is returned.
  109. ; The break character is left on the input stream
  110. ; The list of break characters may include EOF, which is to be coded as
  111. ; a symbol *eof*. Otherwise, EOF is fatal, generating an error message
  112. ; including a specified COMMENT-STRING (if any)
  113. ;
  114. ; The optional argument PORT defaults to the current input port.
  115. ;
  116. ; Note: since we can't tell offhand how large the token being read is
  117. ; going to be, we make a guess, pre-allocate a string, and grow it by
  118. ; quanta if necessary. The quantum is always the length of the string
  119. ; before it was extended the last time. Thus the algorithm does
  120. ; a Fibonacci-type extension, which has been proven optimal.
  121. ; Note, explicit port specification in read-char, peek-char helps.
  122. ; Procedure: input-parse:init-buffer
  123. ; returns an initial buffer for next-token* procedures.
  124. ; The input-parse:init-buffer may allocate a new buffer per each invocation:
  125. ; (define (input-parse:init-buffer) (make-string 32))
  126. ; Size 32 turns out to be fairly good, on average.
  127. ; That policy is good only when a Scheme system is multi-threaded with
  128. ; preemptive scheduling, or when a Scheme system supports shared substrings.
  129. ; In all the other cases, it's better for input-parse:init-buffer to
  130. ; return the same static buffer. next-token* functions return a copy
  131. ; (a substring) of accumulated data, so the same buffer can be reused.
  132. ; We shouldn't worry about an incoming token being too large:
  133. ; next-token will use another chunk automatically. Still,
  134. ; the best size for the static buffer is to allow most of the tokens to fit in.
  135. ; Using a static buffer _dramatically_ reduces the amount of produced garbage
  136. ; (e.g., during XML parsing).
  137. (define input-parse:init-buffer
  138. (let ((buffer (make-string 512)))
  139. (lambda () buffer)))
  140. ; See a better version below
  141. (define-opt (next-token-old prefix-skipped-chars break-chars
  142. (optional (comment "") (port (current-input-port))) )
  143. (let* ((buffer (input-parse:init-buffer))
  144. (curr-buf-len (string-length buffer))
  145. (quantum curr-buf-len))
  146. (let loop ((i 0) (c (skip-while prefix-skipped-chars port)))
  147. (cond
  148. ((memv c break-chars) (substring buffer 0 i))
  149. ((eof-object? c)
  150. (if (memq '*eof* break-chars)
  151. (substring buffer 0 i) ; was EOF expected?
  152. (parser-error port "EOF while reading a token " comment)))
  153. (else
  154. (if (>= i curr-buf-len) ; make space for i-th char in buffer
  155. (begin ; -> grow the buffer by the quantum
  156. (set! buffer (string-append buffer (make-string quantum)))
  157. (set! quantum curr-buf-len)
  158. (set! curr-buf-len (string-length buffer))))
  159. (string-set! buffer i c)
  160. (read-char port) ; move to the next char
  161. (loop (inc i) (peek-char port))
  162. )))))
  163. ; A better version of next-token, which accumulates the characters
  164. ; in chunks, and later on reverse-concatenates them, using
  165. ; SRFI-13 if available.
  166. ; The overhead of copying characters is only 100% (or even smaller: bulk
  167. ; string copying might be well-optimised), compared to the (hypothetical)
  168. ; circumstance if we had known the size of the token beforehand.
  169. ; For small tokens, the code performs just as above. For large
  170. ; tokens, we expect an improvement. Note, the code also has no
  171. ; assignments.
  172. ; See next-token-comp.scm
  173. (define-opt (next-token prefix-skipped-chars break-chars
  174. (optional (comment "") (port (current-input-port))) )
  175. (let outer ((buffer (input-parse:init-buffer)) (filled-buffer-l '())
  176. (c (skip-while prefix-skipped-chars port)))
  177. (let ((curr-buf-len (string-length buffer)))
  178. (let loop ((i 0) (c c))
  179. (cond
  180. ((memv c break-chars)
  181. (if (null? filled-buffer-l) (substring buffer 0 i)
  182. (string-concatenate-reverse filled-buffer-l buffer i)))
  183. ((eof-object? c)
  184. (if (memq '*eof* break-chars) ; was EOF expected?
  185. (if (null? filled-buffer-l) (substring buffer 0 i)
  186. (string-concatenate-reverse filled-buffer-l buffer i))
  187. (parser-error port "EOF while reading a token " comment)))
  188. ((>= i curr-buf-len)
  189. (outer (make-string curr-buf-len)
  190. (cons buffer filled-buffer-l) c))
  191. (else
  192. (string-set! buffer i c)
  193. (read-char port) ; move to the next char
  194. (loop (inc i) (peek-char port))))))))
  195. ; -- procedure+: next-token-of INC-CHARSET [PORT]
  196. ; Reads characters from the PORT that belong to the list of characters
  197. ; INC-CHARSET. The reading stops at the first character which is not
  198. ; a member of the set. This character is left on the stream.
  199. ; All the read characters are returned in a string.
  200. ;
  201. ; -- procedure+: next-token-of PRED [PORT]
  202. ; Reads characters from the PORT for which PRED (a procedure of one
  203. ; argument) returns non-#f. The reading stops at the first character
  204. ; for which PRED returns #f. That character is left on the stream.
  205. ; All the results of evaluating of PRED up to #f are returned in a
  206. ; string.
  207. ;
  208. ; PRED is a procedure that takes one argument (a character
  209. ; or the EOF object) and returns a character or #f. The returned
  210. ; character does not have to be the same as the input argument
  211. ; to the PRED. For example,
  212. ; (next-token-of (lambda (c)
  213. ; (cond ((eof-object? c) #f)
  214. ; ((char-alphabetic? c) (char-downcase c))
  215. ; (else #f))))
  216. ; will try to read an alphabetic token from the current
  217. ; input port, and return it in lower case.
  218. ;
  219. ; The optional argument PORT defaults to the current input port.
  220. ;
  221. ; This procedure is similar to next-token but only it implements
  222. ; an inclusion rather than delimiting semantics.
  223. (define-opt (next-token-of incl-list/pred
  224. (optional (port (current-input-port))) )
  225. (let* ((buffer (input-parse:init-buffer))
  226. (curr-buf-len (string-length buffer)))
  227. (if (procedure? incl-list/pred)
  228. (let outer ((buffer buffer) (filled-buffer-l '()))
  229. (let loop ((i 0))
  230. (if (>= i curr-buf-len) ; make sure we have space
  231. (outer (make-string curr-buf-len) (cons buffer filled-buffer-l))
  232. (let ((c (incl-list/pred (peek-char port))))
  233. (if c
  234. (begin
  235. (string-set! buffer i c)
  236. (read-char port) ; move to the next char
  237. (loop (inc i)))
  238. ; incl-list/pred decided it had had enough
  239. (if (null? filled-buffer-l) (substring buffer 0 i)
  240. (string-concatenate-reverse filled-buffer-l buffer i)))))))
  241. ; incl-list/pred is a list of allowed characters
  242. (let outer ((buffer buffer) (filled-buffer-l '()))
  243. (let loop ((i 0))
  244. (if (>= i curr-buf-len) ; make sure we have space
  245. (outer (make-string curr-buf-len) (cons buffer filled-buffer-l))
  246. (let ((c (peek-char port)))
  247. (cond
  248. ((not (memv c incl-list/pred))
  249. (if (null? filled-buffer-l) (substring buffer 0 i)
  250. (string-concatenate-reverse filled-buffer-l buffer i)))
  251. (else
  252. (string-set! buffer i c)
  253. (read-char port) ; move to the next char
  254. (loop (inc i))))))))
  255. )))
  256. ; -- procedure+: read-text-line [PORT]
  257. ; Reads one line of text from the PORT, and returns it as a string.
  258. ; A line is a (possibly empty) sequence of characters terminated
  259. ; by CR, CRLF or LF (or even the end of file).
  260. ; The terminating character (or CRLF combination) is removed from
  261. ; the input stream. The terminating character(s) is not a part
  262. ; of the return string either.
  263. ; If EOF is encountered before any character is read, the return
  264. ; value is EOF.
  265. ;
  266. ; The optional argument PORT defaults to the current input port.
  267. (define *read-line-breaks* (list char-newline char-return '*eof*))
  268. (define-opt (read-text-line (optional (port (current-input-port))) )
  269. (if (eof-object? (peek-char port)) (peek-char port)
  270. (let* ((line
  271. (next-token '() *read-line-breaks*
  272. "reading a line" port))
  273. (c (read-char port))) ; must be either \n or \r or EOF
  274. (and (eqv? c char-return) (eqv? (peek-char port) #\newline)
  275. (read-char port)) ; skip \n that follows \r
  276. line)))
  277. ; -- procedure+: read-string N [PORT]
  278. ; Reads N characters from the PORT, and returns them in a string.
  279. ; If EOF is encountered before N characters are read, a shorter string
  280. ; will be returned.
  281. ; If N is not positive, an empty string will be returned.
  282. ; The optional argument PORT defaults to the current input port.
  283. (define-opt (read-string n (optional (port (current-input-port))) )
  284. (if (not (positive? n)) ""
  285. (let ((buffer (make-string n)))
  286. (let loop ((i 0) (c (read-char port)))
  287. (if (eof-object? c) (substring buffer 0 i)
  288. (let ((i1 (inc i)))
  289. (string-set! buffer i c)
  290. (if (= i1 n) buffer
  291. (loop i1 (read-char port)))))))))