123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- // Package chardet ports character set detection from ICU.
- package chardet
- import (
- "errors"
- "sort"
- )
// Result describes a single charset guess produced by a Detector.
type Result struct {
	// Charset is the IANA name of the detected charset.
	Charset string

	// Language is the name of the detected language. Some charsets do not
	// carry language information, in which case this is empty.
	Language string

	// Confidence expresses how certain the guess is on a scale from 1 to
	// 100; larger values mean more certainty.
	Confidence int
}
- // Detector implements charset detection.
- type Detector struct {
- recognizers []recognizer
- stripTag bool
- }
- // List of charset recognizers
- var recognizers = []recognizer{
- newRecognizer_utf8(),
- newRecognizer_utf16be(),
- newRecognizer_utf16le(),
- newRecognizer_utf32be(),
- newRecognizer_utf32le(),
- newRecognizer_8859_1_en(),
- newRecognizer_8859_1_da(),
- newRecognizer_8859_1_de(),
- newRecognizer_8859_1_es(),
- newRecognizer_8859_1_fr(),
- newRecognizer_8859_1_it(),
- newRecognizer_8859_1_nl(),
- newRecognizer_8859_1_no(),
- newRecognizer_8859_1_pt(),
- newRecognizer_8859_1_sv(),
- newRecognizer_8859_2_cs(),
- newRecognizer_8859_2_hu(),
- newRecognizer_8859_2_pl(),
- newRecognizer_8859_2_ro(),
- newRecognizer_8859_5_ru(),
- newRecognizer_8859_6_ar(),
- newRecognizer_8859_7_el(),
- newRecognizer_8859_8_I_he(),
- newRecognizer_8859_8_he(),
- newRecognizer_windows_1251(),
- newRecognizer_windows_1256(),
- newRecognizer_KOI8_R(),
- newRecognizer_8859_9_tr(),
- newRecognizer_sjis(),
- newRecognizer_gb_18030(),
- newRecognizer_euc_jp(),
- newRecognizer_euc_kr(),
- newRecognizer_big5(),
- newRecognizer_2022JP(),
- newRecognizer_2022KR(),
- newRecognizer_2022CN(),
- newRecognizer_IBM424_he_rtl(),
- newRecognizer_IBM424_he_ltr(),
- newRecognizer_IBM420_ar_rtl(),
- newRecognizer_IBM420_ar_ltr(),
- }
- // NewTextDetector creates a Detector for plain text.
- func NewTextDetector() *Detector {
- return &Detector{recognizers, false}
- }
- // NewHtmlDetector creates a Detector for Html.
- func NewHtmlDetector() *Detector {
- return &Detector{recognizers, true}
- }
// NotDetectedError is the sentinel error reported when detection produces no
// usable result.
var NotDetectedError = errors.New("Charset not detected.")
- // DetectBest returns the Result with highest Confidence.
- func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
- var all []Result
- if all, err = d.DetectAll(b); err == nil {
- r = &all[0]
- }
- return
- }
- // DetectAll returns all Results which have non-zero Confidence. The Results are sorted by Confidence in descending order.
- func (d *Detector) DetectAll(b []byte) ([]Result, error) {
- input := newRecognizerInput(b, d.stripTag)
- outputChan := make(chan recognizerOutput)
- for _, r := range d.recognizers {
- go matchHelper(r, input, outputChan)
- }
- outputs := make([]recognizerOutput, 0, len(d.recognizers))
- for i := 0; i < len(d.recognizers); i++ {
- o := <-outputChan
- if o.Confidence > 0 {
- outputs = append(outputs, o)
- }
- }
- if len(outputs) == 0 {
- return nil, NotDetectedError
- }
- sort.Sort(recognizerOutputs(outputs))
- dedupOutputs := make([]Result, 0, len(outputs))
- foundCharsets := make(map[string]struct{}, len(outputs))
- for _, o := range outputs {
- if _, found := foundCharsets[o.Charset]; !found {
- dedupOutputs = append(dedupOutputs, Result(o))
- foundCharsets[o.Charset] = struct{}{}
- }
- }
- if len(dedupOutputs) == 0 {
- return nil, NotDetectedError
- }
- return dedupOutputs, nil
- }
- func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
- outputChan <- r.Match(input)
- }
- type recognizerOutputs []recognizerOutput
- func (r recognizerOutputs) Len() int { return len(r) }
- func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
- func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
|