string-utils.scm 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. ;;;; (texinfo string-utils) -- text filling and wrapping
  2. ;;;;
  3. ;;;; Copyright (C) 2009, 2013 Free Software Foundation, Inc.
  4. ;;;; Copyright (C) 2003 Richard Todd
  5. ;;;;
  6. ;;;; This library is free software; you can redistribute it and/or
  7. ;;;; modify it under the terms of the GNU Lesser General Public
  8. ;;;; License as published by the Free Software Foundation; either
  9. ;;;; version 3 of the License, or (at your option) any later version.
  10. ;;;;
  11. ;;;; This library is distributed in the hope that it will be useful,
  12. ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. ;;;; Lesser General Public License for more details.
  15. ;;;;
  16. ;;;; You should have received a copy of the GNU Lesser General Public
  17. ;;;; License along with this library; if not, write to the Free Software
  18. ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. ;;;;
  20. ;;; Commentary:
  21. ;; Module @samp{(texinfo string-utils)} provides various string-related
  22. ;; functions useful to Guile's texinfo support.
  23. ;;; Code:
  24. (define-module (texinfo string-utils)
  25. #:use-module (srfi srfi-13)
  26. #:use-module (srfi srfi-14)
  27. #:export (escape-special-chars
  28. transform-string
  29. expand-tabs
  30. center-string
  31. left-justify-string
  32. right-justify-string
  33. collapse-repeated-chars
  34. make-text-wrapper
  35. fill-string
  36. string->wrapped-lines))
  (define* (transform-string str match? replace #:optional (start #f) (end #f))
    "Return a new string built from @var{str} in which every character
selected by @var{match?} has been replaced according to @var{replace}.

@var{match?} may be a procedure, a character, a string, or a boolean.
If @var{match?} is a procedure, it is called with a single character
and should return a true value for matches.  If @var{match?} is a
character, it is compared to each string character using
@code{char=?}.  If @var{match?} is a string, any character found in
that string is considered a match.  @code{#t} makes every character a
match, and @code{#f} makes none match.  Any other kind of @var{match?}
throws @code{bad-type}.

If @var{replace} is a procedure, it is called with the matched
character as its argument, and the returned value is sent to the
output string via @samp{display}.  If @var{replace} is anything else,
it is itself sent to the output string via @samp{display}.

Note that the replacement for a matched character does not need to be
a single character.  That is what differentiates this procedure from
@samp{string-map}, and what makes it useful for applications such as
converting @samp{#\\&} to @samp{\"&amp;\"} in web page text.  Some other
procedures in this module are just wrappers around common uses of
@samp{transform-string}.  Transformations not possible with this
procedure should probably be done with regular expressions.

If @var{start} and @var{end} are given, they control which portion of
the string undergoes transformation.  The entire input string is still
output, though.  So, if @var{start} is @samp{5}, then the first five
characters of @var{str} will still appear in the returned string.

@lisp
; these two are equivalent...
 (transform-string str #\\space #\\-) ; change all spaces to -'s
 (transform-string str (lambda (c) (char=? #\\space c)) #\\-)
@end lisp"
    (let* (;; Normalize MATCH? into a one-argument predicate.
           (matches?
            (cond ((char? match?) (lambda (c) (char=? match? c)))
                  ((procedure? match?) match?)
                  ((string? match?) (lambda (c) (string-index match? c)))
                  ((boolean? match?) (lambda (c) match?))
                  (else
                   (throw 'bad-type "expected #t, char, string, or procedure"))))
           (len (string-length str))
           ;; Bounds of the region that actually gets transformed.
           (from (or start 0))
           (to (or end len)))
      (call-with-output-string
        (lambda (port)
          ;; EMIT writes the replacement for one matched character.
          (let ((emit (if (procedure? replace)
                          (lambda (c) (display (replace c) port))
                          (lambda (c) (display replace port)))))
            ;; Copy whatever precedes START verbatim.
            (if (and start (<= start len))
                (display (substring str 0 start) port))
            ;; Walk the selected region, replacing matches and copying
            ;; everything else through unchanged.
            (string-for-each
             (lambda (c)
               (if (matches? c)
                   (emit c)
                   (write-char c port)))
             str from to)
            ;; Copy whatever follows END verbatim.
            (if (and end (< end len))
                (display (substring str end) port)))))))
  (define* (expand-tabs str #:optional (tab-size 8))
    "Return a copy of @var{str} in which every tab character has been
replaced by @var{tab-size} spaces.  @var{tab-size} defaults to 8.

Each tab is replaced by the same number of spaces, whatever column it
appears in.  Assuming a tab size of 8, the result is the same as that
of:
@lisp
 (transform-string str #\\tab \"        \")
@end lisp"
    ;; Cut the string at every tab, then glue the pieces back together
    ;; with a run of TAB-SIZE spaces between them.
    (let ((filler (make-string tab-size #\space)))
      (string-join (string-split str #\tab) filler)))
  (define (escape-special-chars str special-chars escape-char)
    "Return a copy of @var{str} in which every special character is
preceded by @var{escape-char}.

@var{special-chars} is either a single character, or a string made up
of all the characters that need escaping.

@lisp
;; make a string regexp-safe...
 (escape-special-chars \"***(Example String)***\"
                      \"[]()/*.\"
                      #\\\\)
=> \"\\\\*\\\\*\\\\*\\\\(Example String\\\\)\\\\*\\\\*\\\\*\"

;; also can escape a single char...
 (escape-special-chars \"richardt@@vzavenue.net\"
                      #\\@@
                      #\\@@)
=> \"richardt@@@@vzavenue.net\"
@end lisp"
    (let ((special?
           ;; A lone character is compared directly; anything else is
           ;; treated as a string holding the set of special characters.
           (if (char? special-chars)
               (lambda (c) (char=? c special-chars))
               (lambda (c) (string-index special-chars c))))
          ;; A match is rewritten as the escape character followed by the
          ;; matched character itself.
          (escaped
           (lambda (c) (string escape-char c))))
      (transform-string str special? escaped)))
  (define* (center-string str #:optional (width 80) (chr #\space) (rchr #f))
    "Return @var{str} centered in a field of @var{width} characters.
Padding on the left is made of @var{chr}, which defaults to
@samp{#\\space}.  Padding on the right is made of @var{rchr} when it is
provided, and of @var{chr} otherwise.  When the padding cannot be split
evenly, the extra character goes on the right.  The default
@var{width} is 80.  The string is never truncated: when it is already
@var{width} characters long or longer, it is returned as is.

@lisp
 (center-string \"Richard Todd\" 24)
=> \"      Richard Todd      \"

 (center-string \" Richard Todd \" 24 #\\=)
=> \"===== Richard Todd =====\"

 (center-string \" Richard Todd \" 24 #\\< #\\>)
=> \"<<<<< Richard Todd >>>>>\"
@end lisp"
    (let* (;; Number of padding characters needed overall (may be <= 0).
           (gap (- width (string-length str)))
           (half (max (quotient gap 2) 0))
           ;; The right-hand fill is CHR unless the caller supplied RCHR.
           (right-fill (or rchr chr))
           (left-pad (make-string half chr))
           ;; Reuse the left padding when both sides use the same character.
           (right-pad (if (char=? right-fill chr)
                          left-pad
                          (make-string half right-fill))))
      (if (positive? gap)
          (string-append left-pad
                         str
                         right-pad
                         ;; An odd gap leaves one character over; it goes
                         ;; on the right.
                         (if (odd? gap) (string right-fill) ""))
          str)))
  (define* (left-justify-string str #:optional (width 80) (chr #\space))
    "@code{left-justify-string str [width chr]}.
Return @var{str} followed by as many copies of @var{chr} as it takes to
fill a field of @var{width} characters, so that the text is left
justified.  The default @var{width} is 80.  The default @var{chr} is
@samp{#\\space}.  Unlike @samp{string-pad-right} from srfi-13, the
string is never truncated: when it is already @var{width} characters
long or longer, it is returned as is."
    ;; The padding is empty exactly when STR already fills the field.
    (let ((padding (make-string (max 0 (- width (string-length str))) chr)))
      (if (string-null? padding)
          str
          (string-append str padding))))
  (define* (right-justify-string str #:optional (width 80) (chr #\space))
    "Return @var{str} preceded by as many copies of @var{chr} as it takes
to fill a field of @var{width} characters, so that the text is right
justified.  The default @var{width} is 80.  The default @var{chr} is
@samp{#\\space}.  Unlike @samp{string-pad} from srfi-13, the string is
never truncated: when it is already @var{width} characters long or
longer, it is returned as is."
    ;; The padding is empty exactly when STR already fills the field.
    (let ((padding (make-string (max 0 (- width (string-length str))) chr)))
      (if (string-null? padding)
          str
          (string-append padding str))))
  (define* (collapse-repeated-chars str #:optional (chr #\space) (num 1))
    "Return a copy of @var{str} in which every run of consecutive
@var{chr} characters has been shortened to at most @var{num}
characters.  The default value for @var{chr} is @samp{#\\space}, and
the default value for @var{num} is 1.

@lisp
 (collapse-repeated-chars \"H  e  l  l  o\")
=> \"H e l l o\"
 (collapse-repeated-chars \"H--e--l--l--o\" #\\-)
=> \"H-e-l-l-o\"
 (collapse-repeated-chars \"H-e--l---l----o\" #\\- 2)
=> \"H-e--l--l--o\"
@end lisp"
    (let* (;; Number of CHR characters seen in the current run beyond the
           ;; first one; -1 while we are not inside a run at all.
           (repeats -1)
           ;; Stateful matcher: true for the characters of a run that
           ;; have to go.  The first character of a run is always kept.
           (surplus?
            (lambda (c)
              (cond ((char=? c chr)
                     (set! repeats (+ repeats 1))
                     (and (> repeats 0) (>= repeats num)))
                    (else
                     (set! repeats -1)
                     #f)))))
      ;; Transform the string with the stateful matcher, replacing every
      ;; surplus character with nothing.
      (transform-string str surplus? "")))
  ;; Split STR into a list of segments, each made of the whitespace that
  ;; precedes a word followed by the word itself:
  ;;   <ws non-ws> <ws non-ws> etc..
  ;; Whitespace at the very end of STR belongs to no segment and is
  ;; dropped.
  (define (split-by-single-words str)
    (let scan ((from 0)
               (segments '()))
      (let* (;; Where the next word starts, or #f if only whitespace is left.
             (word-start (string-skip str char-set:whitespace from))
             ;; Where that word ends, or #f if it runs to the end of STR.
             (word-end (and word-start
                            (string-index str char-set:whitespace word-start))))
        (cond
         ;; Nothing but whitespace remains...we are done.
         ((not word-start) (reverse segments))
         ;; The word reaches the end of the string...it is the last segment.
         ((not word-end) (reverse (cons (substring str from) segments)))
         ;; Otherwise keep the segment and resume right after the word.
         (else
          (scan word-end
                (cons (substring str from word-end) segments)))))))
  (define (end-of-sentence? str)
    "Return #t when STR likely denotes the end of a sentence, that is,
when it is at least two characters long, its last character is a
period, and the character before that is not a period."
    (and (> (string-length str) 1)
         (string-suffix? "." str)
         ;; Two trailing periods look like an ellipsis rather than a full
         ;; stop.
         (not (string-suffix? ".." str))))
  (define* (make-text-wrapper #:key
                              (line-width 80)
                              (expand-tabs? #t)
                              (tab-width 8)
                              (collapse-whitespace? #t)
                              (subsequent-indent "")
                              (initial-indent "")
                              (break-long-words? #t))
    "Return a procedure that will split a string into lines according to
the given parameters.

@table @code
@item #:line-width
This is the target length used when deciding where to wrap lines.
Default is 80.

@item #:expand-tabs?
Boolean describing whether tabs in the input should be expanded.
Default is #t.

@item #:tab-width
If tabs are expanded, this will be the number of spaces to which they
expand.  Default is 8.

@item #:collapse-whitespace?
Boolean describing whether the whitespace inside the existing text
should be collapsed or not.  Default is #t.

If text is already well-formatted, and is just being wrapped to fit in
a different width, then set this to @samp{#f}.  This way, many common
text conventions (such as two spaces between sentences) can be
preserved if in the original text.  If the input text spacing cannot
be trusted, then leave this setting at the default: repeated spaces
are collapsed down to a single space, and one extra space is inserted
after a period that ends a sentence.  That extra space is taken into
account when deciding whether the next word still fits on the line.

@item #:initial-indent
Defines a string that will be put in front of the first line of
wrapped text.  Default is the empty string, @samp{\"\"}.

@item #:subsequent-indent
Defines a string that will be put in front of all lines of wrapped
text, except the first one.  Default is the empty string, @samp{\"\"}.

@item #:break-long-words?
If a single word is too big to fit on a line, this setting tells the
wrapper what to do.  Defaults to #t, which will break up long words.
At least one character of the word is placed on every line, even when
the indentation leaves no room for it, so that wrapping always
terminates.  When set to #f, the line will be allowed, even though it
is longer than the defined @code{#:line-width}.
@end table

The return value is a procedure of one argument, the input string,
which returns a list of strings, where each element of the list is one
line."
    (lambda (str)
      (let* (;; Newlines are plain whitespace as far as wrapping goes.
             (text (transform-string str
                                     (lambda (c) (char=? c #\newline))
                                     #\space))
             ;; Expand tabs if they wanted us to...
             (text (if expand-tabs?
                       (expand-tabs text tab-width)
                       text))
             ;; Collapse runs of spaces if they wanted us to...
             (text (if collapse-whitespace?
                       (collapse-repeated-chars text)
                       text))
             ;; Drop any whitespace from the front...
             (text (string-trim text)))
        ;; Now start breaking the text into lines.  ANS holds the finished
        ;; lines in reverse order, LINE is the line under construction and
        ;; COUNT is the number of words already placed on LINE.
        (let loop ((ans '())
                   (words (split-by-single-words text))
                   (line initial-indent)
                   (count 0))
          (if (null? words)
              ;; Out of words?  ...done!
              (reverse (if (> count 0)
                           (cons line ans)
                           ans))
              ;; Not out of words...keep going...
              (let* ((length-left (- line-width (string-length line)))
                     ;; The first word of a line loses its leading
                     ;; whitespace.
                     (next-word (if (= count 0)
                                    (string-trim (car words))
                                    (car words)))
                     ;; What would actually be appended to LINE: the word,
                     ;; preceded by an extra space after the period that
                     ;; ends a sentence.  Measuring this piece rather than
                     ;; the bare word keeps the line within LINE-WIDTH.
                     (piece (if (and collapse-whitespace?
                                     (end-of-sentence? line))
                                (string-append " " next-word)
                                next-word)))
                (cond
                 ;; Does the next entry fit?
                 ((<= (string-length piece) length-left)
                  (loop ans
                        (cdr words)
                        (string-append line piece)
                        (+ count 1)))
                 ;; It didn't fit...is there already at least one word on
                 ;; the line?  Then try the word again on a fresh line.
                 ((> count 0)
                  (loop (cons line ans)
                        words
                        subsequent-indent
                        0))
                 ;; It didn't fit...and it's the first word.  Were we told
                 ;; to break up long words?
                 (break-long-words?
                  ;; Break the word at the limit.  Always take at least one
                  ;; character, so that an indent as wide as the line can
                  ;; neither stall the loop nor produce a negative index,
                  ;; and never take more than the word has.
                  (let* ((split (min (string-length next-word)
                                     (max 1 length-left)))
                         (leftover (substring next-word split)))
                    (loop (cons (string-append line
                                               (substring next-word 0 split))
                                ans)
                          ;; Only queue the rest of the word if there is one.
                          (if (string-null? leftover)
                              (cdr words)
                              (cons leftover (cdr words)))
                          subsequent-indent
                          0)))
                 ;; It's the first word and we *shouldn't* break long
                 ;; words: let the line run over.
                 (else
                  (loop (cons (string-append line next-word)
                              ans)
                        (cdr words)
                        subsequent-indent
                        0)))))))))
  (define (string->wrapped-lines str . kwargs)
    "@code{string->wrapped-lines str keywds ...}.  Wrap the text given in
string @var{str} according to the parameters provided in @var{keywds},
or the default settings for those that are not given.  Return a list of
strings representing the formatted lines.  Valid keyword arguments are
discussed in @code{make-text-wrapper}."
    ;; Build a wrapper configured by the keyword arguments, then run it
    ;; once on STR.
    (let ((wrap (apply make-text-wrapper kwargs)))
      (wrap str)))
  (define (fill-string str . kwargs)
    "Wrap the text given in string @var{str} according to the parameters
provided in @var{kwargs}, or the default settings for those that are
not given.  Return a single string holding the wrapped text, with a
newline between consecutive lines.  Valid keyword arguments are
discussed in @code{make-text-wrapper}."
    (let ((lines (apply string->wrapped-lines str kwargs)))
      ;; `string-join' defaults to the infix grammar: the delimiter goes
      ;; between the lines only.
      (string-join lines "\n")))
  358. 'infix))