quote.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456
  1. // Copyright 2009 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:generate go run makeisprint.go -output isprint.go
  5. package strconv
  6. import (
  7. "unicode/utf8"
  8. )
  9. const lowerhex = "0123456789abcdef"
  10. func quoteWith(s string, quote byte, ASCIIonly bool) string {
  11. var runeTmp [utf8.UTFMax]byte
  12. buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
  13. buf = append(buf, quote)
  14. for width := 0; len(s) > 0; s = s[width:] {
  15. r := rune(s[0])
  16. width = 1
  17. if r >= utf8.RuneSelf {
  18. r, width = utf8.DecodeRuneInString(s)
  19. }
  20. if width == 1 && r == utf8.RuneError {
  21. buf = append(buf, `\x`...)
  22. buf = append(buf, lowerhex[s[0]>>4])
  23. buf = append(buf, lowerhex[s[0]&0xF])
  24. continue
  25. }
  26. if r == rune(quote) || r == '\\' { // always backslashed
  27. buf = append(buf, '\\')
  28. buf = append(buf, byte(r))
  29. continue
  30. }
  31. if ASCIIonly {
  32. if r < utf8.RuneSelf && IsPrint(r) {
  33. buf = append(buf, byte(r))
  34. continue
  35. }
  36. } else if IsPrint(r) {
  37. n := utf8.EncodeRune(runeTmp[:], r)
  38. buf = append(buf, runeTmp[:n]...)
  39. continue
  40. }
  41. switch r {
  42. case '\a':
  43. buf = append(buf, `\a`...)
  44. case '\b':
  45. buf = append(buf, `\b`...)
  46. case '\f':
  47. buf = append(buf, `\f`...)
  48. case '\n':
  49. buf = append(buf, `\n`...)
  50. case '\r':
  51. buf = append(buf, `\r`...)
  52. case '\t':
  53. buf = append(buf, `\t`...)
  54. case '\v':
  55. buf = append(buf, `\v`...)
  56. default:
  57. switch {
  58. case r < ' ':
  59. buf = append(buf, `\x`...)
  60. buf = append(buf, lowerhex[s[0]>>4])
  61. buf = append(buf, lowerhex[s[0]&0xF])
  62. case r > utf8.MaxRune:
  63. r = 0xFFFD
  64. fallthrough
  65. case r < 0x10000:
  66. buf = append(buf, `\u`...)
  67. for s := 12; s >= 0; s -= 4 {
  68. buf = append(buf, lowerhex[r>>uint(s)&0xF])
  69. }
  70. default:
  71. buf = append(buf, `\U`...)
  72. for s := 28; s >= 0; s -= 4 {
  73. buf = append(buf, lowerhex[r>>uint(s)&0xF])
  74. }
  75. }
  76. }
  77. }
  78. buf = append(buf, quote)
  79. return string(buf)
  80. }
  81. // Quote returns a double-quoted Go string literal representing s. The
  82. // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
  83. // control characters and non-printable characters as defined by
  84. // IsPrint.
  85. func Quote(s string) string {
  86. return quoteWith(s, '"', false)
  87. }
  88. // AppendQuote appends a double-quoted Go string literal representing s,
  89. // as generated by Quote, to dst and returns the extended buffer.
  90. func AppendQuote(dst []byte, s string) []byte {
  91. return append(dst, Quote(s)...)
  92. }
  93. // QuoteToASCII returns a double-quoted Go string literal representing s.
  94. // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
  95. // non-ASCII characters and non-printable characters as defined by IsPrint.
  96. func QuoteToASCII(s string) string {
  97. return quoteWith(s, '"', true)
  98. }
  99. // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
  100. // as generated by QuoteToASCII, to dst and returns the extended buffer.
  101. func AppendQuoteToASCII(dst []byte, s string) []byte {
  102. return append(dst, QuoteToASCII(s)...)
  103. }
  104. // QuoteRune returns a single-quoted Go character literal representing the
  105. // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
  106. // for control characters and non-printable characters as defined by IsPrint.
  107. func QuoteRune(r rune) string {
  108. // TODO: avoid the allocation here.
  109. return quoteWith(string(r), '\'', false)
  110. }
  111. // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
  112. // as generated by QuoteRune, to dst and returns the extended buffer.
  113. func AppendQuoteRune(dst []byte, r rune) []byte {
  114. return append(dst, QuoteRune(r)...)
  115. }
  116. // QuoteRuneToASCII returns a single-quoted Go character literal representing
  117. // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
  118. // \u0100) for non-ASCII characters and non-printable characters as defined
  119. // by IsPrint.
  120. func QuoteRuneToASCII(r rune) string {
  121. // TODO: avoid the allocation here.
  122. return quoteWith(string(r), '\'', true)
  123. }
  124. // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
  125. // as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
  126. func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
  127. return append(dst, QuoteRuneToASCII(r)...)
  128. }
  129. // CanBackquote reports whether the string s can be represented
  130. // unchanged as a single-line backquoted string without control
  131. // characters other than tab.
  132. func CanBackquote(s string) bool {
  133. for len(s) > 0 {
  134. r, wid := utf8.DecodeRuneInString(s)
  135. s = s[wid:]
  136. if wid > 1 {
  137. if r == '\ufeff' {
  138. return false // BOMs are invisible and should not be quoted.
  139. }
  140. continue // All other multibyte runes are correctly encoded and assumed printable.
  141. }
  142. if r == utf8.RuneError {
  143. return false
  144. }
  145. if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
  146. return false
  147. }
  148. }
  149. return true
  150. }
  151. func unhex(b byte) (v rune, ok bool) {
  152. c := rune(b)
  153. switch {
  154. case '0' <= c && c <= '9':
  155. return c - '0', true
  156. case 'a' <= c && c <= 'f':
  157. return c - 'a' + 10, true
  158. case 'A' <= c && c <= 'F':
  159. return c - 'A' + 10, true
  160. }
  161. return
  162. }
  163. // UnquoteChar decodes the first character or byte in the escaped string
  164. // or character literal represented by the string s.
  165. // It returns four values:
  166. //
  167. // 1) value, the decoded Unicode code point or byte value;
  168. // 2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
  169. // 3) tail, the remainder of the string after the character; and
  170. // 4) an error that will be nil if the character is syntactically valid.
  171. //
  172. // The second argument, quote, specifies the type of literal being parsed
  173. // and therefore which escaped quote character is permitted.
  174. // If set to a single quote, it permits the sequence \' and disallows unescaped '.
  175. // If set to a double quote, it permits \" and disallows unescaped ".
  176. // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
  177. func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
  178. // easy cases
  179. switch c := s[0]; {
  180. case c == quote && (quote == '\'' || quote == '"'):
  181. err = ErrSyntax
  182. return
  183. case c >= utf8.RuneSelf:
  184. r, size := utf8.DecodeRuneInString(s)
  185. return r, true, s[size:], nil
  186. case c != '\\':
  187. return rune(s[0]), false, s[1:], nil
  188. }
  189. // hard case: c is backslash
  190. if len(s) <= 1 {
  191. err = ErrSyntax
  192. return
  193. }
  194. c := s[1]
  195. s = s[2:]
  196. switch c {
  197. case 'a':
  198. value = '\a'
  199. case 'b':
  200. value = '\b'
  201. case 'f':
  202. value = '\f'
  203. case 'n':
  204. value = '\n'
  205. case 'r':
  206. value = '\r'
  207. case 't':
  208. value = '\t'
  209. case 'v':
  210. value = '\v'
  211. case 'x', 'u', 'U':
  212. n := 0
  213. switch c {
  214. case 'x':
  215. n = 2
  216. case 'u':
  217. n = 4
  218. case 'U':
  219. n = 8
  220. }
  221. var v rune
  222. if len(s) < n {
  223. err = ErrSyntax
  224. return
  225. }
  226. for j := 0; j < n; j++ {
  227. x, ok := unhex(s[j])
  228. if !ok {
  229. err = ErrSyntax
  230. return
  231. }
  232. v = v<<4 | x
  233. }
  234. s = s[n:]
  235. if c == 'x' {
  236. // single-byte string, possibly not UTF-8
  237. value = v
  238. break
  239. }
  240. if v > utf8.MaxRune {
  241. err = ErrSyntax
  242. return
  243. }
  244. value = v
  245. multibyte = true
  246. case '0', '1', '2', '3', '4', '5', '6', '7':
  247. v := rune(c) - '0'
  248. if len(s) < 2 {
  249. err = ErrSyntax
  250. return
  251. }
  252. for j := 0; j < 2; j++ { // one digit already; two more
  253. x := rune(s[j]) - '0'
  254. if x < 0 || x > 7 {
  255. err = ErrSyntax
  256. return
  257. }
  258. v = (v << 3) | x
  259. }
  260. s = s[2:]
  261. if v > 255 {
  262. err = ErrSyntax
  263. return
  264. }
  265. value = v
  266. case '\\':
  267. value = '\\'
  268. case '\'', '"':
  269. if c != quote {
  270. err = ErrSyntax
  271. return
  272. }
  273. value = rune(c)
  274. default:
  275. err = ErrSyntax
  276. return
  277. }
  278. tail = s
  279. return
  280. }
  281. // Unquote interprets s as a single-quoted, double-quoted,
  282. // or backquoted Go string literal, returning the string value
  283. // that s quotes. (If s is single-quoted, it would be a Go
  284. // character literal; Unquote returns the corresponding
  285. // one-character string.)
  286. func Unquote(s string) (t string, err error) {
  287. n := len(s)
  288. if n < 2 {
  289. return "", ErrSyntax
  290. }
  291. quote := s[0]
  292. if quote != s[n-1] {
  293. return "", ErrSyntax
  294. }
  295. s = s[1 : n-1]
  296. if quote == '`' {
  297. if contains(s, '`') {
  298. return "", ErrSyntax
  299. }
  300. return s, nil
  301. }
  302. if quote != '"' && quote != '\'' {
  303. return "", ErrSyntax
  304. }
  305. if contains(s, '\n') {
  306. return "", ErrSyntax
  307. }
  308. // Is it trivial? Avoid allocation.
  309. if !contains(s, '\\') && !contains(s, quote) {
  310. switch quote {
  311. case '"':
  312. return s, nil
  313. case '\'':
  314. r, size := utf8.DecodeRuneInString(s)
  315. if size == len(s) && (r != utf8.RuneError || size != 1) {
  316. return s, nil
  317. }
  318. }
  319. }
  320. var runeTmp [utf8.UTFMax]byte
  321. buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
  322. for len(s) > 0 {
  323. c, multibyte, ss, err := UnquoteChar(s, quote)
  324. if err != nil {
  325. return "", err
  326. }
  327. s = ss
  328. if c < utf8.RuneSelf || !multibyte {
  329. buf = append(buf, byte(c))
  330. } else {
  331. n := utf8.EncodeRune(runeTmp[:], c)
  332. buf = append(buf, runeTmp[:n]...)
  333. }
  334. if quote == '\'' && len(s) != 0 {
  335. // single-quoted must be single character
  336. return "", ErrSyntax
  337. }
  338. }
  339. return string(buf), nil
  340. }
  341. // contains reports whether the string contains the byte c.
  342. func contains(s string, c byte) bool {
  343. for i := 0; i < len(s); i++ {
  344. if s[i] == c {
  345. return true
  346. }
  347. }
  348. return false
  349. }
  350. // bsearch16 returns the smallest i such that a[i] >= x.
  351. // If there is no such i, bsearch16 returns len(a).
  352. func bsearch16(a []uint16, x uint16) int {
  353. i, j := 0, len(a)
  354. for i < j {
  355. h := i + (j-i)/2
  356. if a[h] < x {
  357. i = h + 1
  358. } else {
  359. j = h
  360. }
  361. }
  362. return i
  363. }
  364. // bsearch32 returns the smallest i such that a[i] >= x.
  365. // If there is no such i, bsearch32 returns len(a).
  366. func bsearch32(a []uint32, x uint32) int {
  367. i, j := 0, len(a)
  368. for i < j {
  369. h := i + (j-i)/2
  370. if a[h] < x {
  371. i = h + 1
  372. } else {
  373. j = h
  374. }
  375. }
  376. return i
  377. }
  378. // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
  379. // to give the same answer. It allows this package not to depend on unicode,
  380. // and therefore not pull in all the Unicode tables. If the linker were better
  381. // at tossing unused tables, we could get rid of this implementation.
  382. // That would be nice.
  383. // IsPrint reports whether the rune is defined as printable by Go, with
  384. // the same definition as unicode.IsPrint: letters, numbers, punctuation,
  385. // symbols and ASCII space.
  386. func IsPrint(r rune) bool {
  387. // Fast check for Latin-1
  388. if r <= 0xFF {
  389. if 0x20 <= r && r <= 0x7E {
  390. // All the ASCII is printable from space through DEL-1.
  391. return true
  392. }
  393. if 0xA1 <= r && r <= 0xFF {
  394. // Similarly for ¡ through ÿ...
  395. return r != 0xAD // ...except for the bizarre soft hyphen.
  396. }
  397. return false
  398. }
  399. // Same algorithm, either on uint16 or uint32 value.
  400. // First, find first i such that isPrint[i] >= x.
  401. // This is the index of either the start or end of a pair that might span x.
  402. // The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
  403. // If we find x in a range, make sure x is not in isNotPrint list.
  404. if 0 <= r && r < 1<<16 {
  405. rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
  406. i := bsearch16(isPrint, rr)
  407. if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
  408. return false
  409. }
  410. j := bsearch16(isNotPrint, rr)
  411. return j >= len(isNotPrint) || isNotPrint[j] != rr
  412. }
  413. rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
  414. i := bsearch32(isPrint, rr)
  415. if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
  416. return false
  417. }
  418. if r >= 0x20000 {
  419. return true
  420. }
  421. r -= 0x10000
  422. j := bsearch16(isNotPrint, uint16(r))
  423. return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
  424. }