detector.go 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. // Package chardet ports character set detection from ICU.
  2. package chardet
  3. import (
  4. "errors"
  5. "sort"
  6. )
  7. // Result contains all the information that charset detector gives.
  8. type Result struct {
  9. // IANA name of the detected charset.
  10. Charset string
  11. // IANA name of the detected language. It may be empty for some charsets.
  12. Language string
  13. // Confidence of the Result. Scale from 1 to 100. The bigger, the more confident.
  14. Confidence int
  15. }
  16. // Detector implements charset detection.
  17. type Detector struct {
  18. recognizers []recognizer
  19. stripTag bool
  20. }
  21. // List of charset recognizers
  22. var recognizers = []recognizer{
  23. newRecognizer_utf8(),
  24. newRecognizer_utf16be(),
  25. newRecognizer_utf16le(),
  26. newRecognizer_utf32be(),
  27. newRecognizer_utf32le(),
  28. newRecognizer_8859_1_en(),
  29. newRecognizer_8859_1_da(),
  30. newRecognizer_8859_1_de(),
  31. newRecognizer_8859_1_es(),
  32. newRecognizer_8859_1_fr(),
  33. newRecognizer_8859_1_it(),
  34. newRecognizer_8859_1_nl(),
  35. newRecognizer_8859_1_no(),
  36. newRecognizer_8859_1_pt(),
  37. newRecognizer_8859_1_sv(),
  38. newRecognizer_8859_2_cs(),
  39. newRecognizer_8859_2_hu(),
  40. newRecognizer_8859_2_pl(),
  41. newRecognizer_8859_2_ro(),
  42. newRecognizer_8859_5_ru(),
  43. newRecognizer_8859_6_ar(),
  44. newRecognizer_8859_7_el(),
  45. newRecognizer_8859_8_I_he(),
  46. newRecognizer_8859_8_he(),
  47. newRecognizer_windows_1251(),
  48. newRecognizer_windows_1256(),
  49. newRecognizer_KOI8_R(),
  50. newRecognizer_8859_9_tr(),
  51. newRecognizer_sjis(),
  52. newRecognizer_gb_18030(),
  53. newRecognizer_euc_jp(),
  54. newRecognizer_euc_kr(),
  55. newRecognizer_big5(),
  56. newRecognizer_2022JP(),
  57. newRecognizer_2022KR(),
  58. newRecognizer_2022CN(),
  59. newRecognizer_IBM424_he_rtl(),
  60. newRecognizer_IBM424_he_ltr(),
  61. newRecognizer_IBM420_ar_rtl(),
  62. newRecognizer_IBM420_ar_ltr(),
  63. }
  64. // NewTextDetector creates a Detector for plain text.
  65. func NewTextDetector() *Detector {
  66. return &Detector{recognizers, false}
  67. }
  68. // NewHtmlDetector creates a Detector for Html.
  69. func NewHtmlDetector() *Detector {
  70. return &Detector{recognizers, true}
  71. }
  72. var (
  73. NotDetectedError = errors.New("Charset not detected.")
  74. )
  75. // DetectBest returns the Result with highest Confidence.
  76. func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
  77. var all []Result
  78. if all, err = d.DetectAll(b); err == nil {
  79. r = &all[0]
  80. }
  81. return
  82. }
  83. // DetectAll returns all Results which have non-zero Confidence. The Results are sorted by Confidence in descending order.
  84. func (d *Detector) DetectAll(b []byte) ([]Result, error) {
  85. input := newRecognizerInput(b, d.stripTag)
  86. outputChan := make(chan recognizerOutput)
  87. for _, r := range d.recognizers {
  88. go matchHelper(r, input, outputChan)
  89. }
  90. outputs := make([]recognizerOutput, 0, len(d.recognizers))
  91. for i := 0; i < len(d.recognizers); i++ {
  92. o := <-outputChan
  93. if o.Confidence > 0 {
  94. outputs = append(outputs, o)
  95. }
  96. }
  97. if len(outputs) == 0 {
  98. return nil, NotDetectedError
  99. }
  100. sort.Sort(recognizerOutputs(outputs))
  101. dedupOutputs := make([]Result, 0, len(outputs))
  102. foundCharsets := make(map[string]struct{}, len(outputs))
  103. for _, o := range outputs {
  104. if _, found := foundCharsets[o.Charset]; !found {
  105. dedupOutputs = append(dedupOutputs, Result(o))
  106. foundCharsets[o.Charset] = struct{}{}
  107. }
  108. }
  109. if len(dedupOutputs) == 0 {
  110. return nil, NotDetectedError
  111. }
  112. return dedupOutputs, nil
  113. }
  114. func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
  115. outputChan <- r.Match(input)
  116. }
  117. type recognizerOutputs []recognizerOutput
  118. func (r recognizerOutputs) Len() int { return len(r) }
  119. func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
  120. func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }