armor_decoder.go 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. package amp
  2. import (
  3. "bufio"
  4. "bytes"
  5. "encoding/base64"
  6. "fmt"
  7. "io"
  8. "golang.org/x/net/html"
  9. )
  10. // ErrUnknownVersion is the error returned when the first character inside the
  11. // element encoding (but outside the base64 encoding) is not '0'.
  12. type ErrUnknownVersion byte
  13. func (err ErrUnknownVersion) Error() string {
  14. return fmt.Sprintf("unknown armor version indicator %+q", byte(err))
  15. }
  16. func isASCIIWhitespace(b byte) bool {
  17. switch b {
  18. // https://infra.spec.whatwg.org/#ascii-whitespace
  19. case '\x09', '\x0a', '\x0c', '\x0d', '\x20':
  20. return true
  21. default:
  22. return false
  23. }
  24. }
  25. func splitASCIIWhitespace(data []byte, atEOF bool) (advance int, token []byte, err error) {
  26. var i, j int
  27. // Skip initial whitespace.
  28. for i = 0; i < len(data); i++ {
  29. if !isASCIIWhitespace(data[i]) {
  30. break
  31. }
  32. }
  33. // Look for next whitespace.
  34. for j = i; j < len(data); j++ {
  35. if isASCIIWhitespace(data[j]) {
  36. return j + 1, data[i:j], nil
  37. }
  38. }
  39. // We reached the end of data without finding more whitespace. Only
  40. // consider it a token if we are at EOF.
  41. if atEOF && i < j {
  42. return j, data[i:j], nil
  43. }
  44. // Otherwise, request more data.
  45. return i, nil, nil
  46. }
  47. func decodeToWriter(w io.Writer, r io.Reader) (int64, error) {
  48. tokenizer := html.NewTokenizer(r)
  49. // Set a memory limit on token sizes, otherwise the tokenizer will
  50. // buffer text indefinitely if it is not broken up by other token types.
  51. tokenizer.SetMaxBuf(elementSizeLimit)
  52. active := false
  53. total := int64(0)
  54. for {
  55. tt := tokenizer.Next()
  56. switch tt {
  57. case html.ErrorToken:
  58. err := tokenizer.Err()
  59. if err == io.EOF {
  60. err = nil
  61. }
  62. if err == nil && active {
  63. return total, fmt.Errorf("missing </pre> tag")
  64. }
  65. return total, err
  66. case html.TextToken:
  67. if active {
  68. // Re-join the separate chunks of text and
  69. // feed them to the decoder.
  70. scanner := bufio.NewScanner(bytes.NewReader(tokenizer.Text()))
  71. scanner.Split(splitASCIIWhitespace)
  72. for scanner.Scan() {
  73. n, err := w.Write(scanner.Bytes())
  74. total += int64(n)
  75. if err != nil {
  76. return total, err
  77. }
  78. }
  79. if err := scanner.Err(); err != nil {
  80. return total, err
  81. }
  82. }
  83. case html.StartTagToken:
  84. tn, _ := tokenizer.TagName()
  85. if string(tn) == "pre" {
  86. if active {
  87. // nesting not allowed
  88. return total, fmt.Errorf("unexpected %s", tokenizer.Token())
  89. }
  90. active = true
  91. }
  92. case html.EndTagToken:
  93. tn, _ := tokenizer.TagName()
  94. if string(tn) == "pre" {
  95. if !active {
  96. // stray end tag
  97. return total, fmt.Errorf("unexpected %s", tokenizer.Token())
  98. }
  99. active = false
  100. }
  101. }
  102. }
  103. }
  104. // NewArmorDecoder returns a new AMP armor decoder.
  105. func NewArmorDecoder(r io.Reader) (io.Reader, error) {
  106. pr, pw := io.Pipe()
  107. go func() {
  108. _, err := decodeToWriter(pw, r)
  109. pw.CloseWithError(err)
  110. }()
  111. // The first byte inside the element encoding is a server–client
  112. // protocol version indicator.
  113. var version [1]byte
  114. _, err := pr.Read(version[:])
  115. if err != nil {
  116. pr.CloseWithError(err)
  117. return nil, err
  118. }
  119. switch version[0] {
  120. case '0':
  121. return base64.NewDecoder(base64.StdEncoding, pr), nil
  122. default:
  123. err := ErrUnknownVersion(version[0])
  124. pr.CloseWithError(err)
  125. return nil, err
  126. }
  127. }