recognizer.go 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. package chardet
  2. type recognizer interface {
  3. Match(*recognizerInput) recognizerOutput
  4. }
  5. type recognizerOutput Result
  6. type recognizerInput struct {
  7. raw []byte
  8. input []byte
  9. tagStripped bool
  10. byteStats []int
  11. hasC1Bytes bool
  12. }
  13. func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput {
  14. input, stripped := mayStripInput(raw, stripTag)
  15. byteStats := computeByteStats(input)
  16. return &recognizerInput{
  17. raw: raw,
  18. input: input,
  19. tagStripped: stripped,
  20. byteStats: byteStats,
  21. hasC1Bytes: computeHasC1Bytes(byteStats),
  22. }
  23. }
  24. func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
  25. const inputBufferSize = 8192
  26. out = make([]byte, 0, inputBufferSize)
  27. var badTags, openTags int32
  28. var inMarkup bool = false
  29. stripped = false
  30. if stripTag {
  31. stripped = true
  32. for _, c := range raw {
  33. if c == '<' {
  34. if inMarkup {
  35. badTags += 1
  36. }
  37. inMarkup = true
  38. openTags += 1
  39. }
  40. if !inMarkup {
  41. out = append(out, c)
  42. if len(out) >= inputBufferSize {
  43. break
  44. }
  45. }
  46. if c == '>' {
  47. inMarkup = false
  48. }
  49. }
  50. }
  51. if openTags < 5 || openTags/5 < badTags || (len(out) < 100 && len(raw) > 600) {
  52. limit := len(raw)
  53. if limit > inputBufferSize {
  54. limit = inputBufferSize
  55. }
  56. out = make([]byte, limit)
  57. copy(out, raw[:limit])
  58. stripped = false
  59. }
  60. return
  61. }
  62. func computeByteStats(input []byte) []int {
  63. r := make([]int, 256)
  64. for _, c := range input {
  65. r[c] += 1
  66. }
  67. return r
  68. }
  69. func computeHasC1Bytes(byteStats []int) bool {
  70. for _, count := range byteStats[0x80 : 0x9F+1] {
  71. if count > 0 {
  72. return true
  73. }
  74. }
  75. return false
  76. }