123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104 |
- package chardet
- import (
- "bytes"
- )
// Byte order marks (BOMs) used to recognise UTF-16 and UTF-32 input.
var (
	// UTF-16 BOMs, two bytes each.
	utf16beBom = []byte{0xFE, 0xFF}
	utf16leBom = []byte{0xFF, 0xFE}

	// UTF-32 BOMs, four bytes each. The little-endian one starts with the
	// same two bytes as utf16leBom.
	utf32beBom = []byte{0x00, 0x00, 0xFE, 0xFF}
	utf32leBom = []byte{0xFF, 0xFE, 0x00, 0x00}
)
// recognizerUtf16be recognises UTF-16BE input. It has no fields.
type recognizerUtf16be struct{}

// newRecognizer_utf16be returns a new UTF-16BE recognizer.
func newRecognizer_utf16be() *recognizerUtf16be {
	return new(recognizerUtf16be)
}
- func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
- output = recognizerOutput{
- Charset: "UTF-16BE",
- }
- if bytes.HasPrefix(input.raw, utf16beBom) {
- output.Confidence = 100
- }
- return
- }
// recognizerUtf16le recognises UTF-16LE input. It has no fields.
type recognizerUtf16le struct{}

// newRecognizer_utf16le returns a new UTF-16LE recognizer.
func newRecognizer_utf16le() *recognizerUtf16le {
	return new(recognizerUtf16le)
}
- func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
- output = recognizerOutput{
- Charset: "UTF-16LE",
- }
- if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
- output.Confidence = 100
- }
- return
- }
// recognizerUtf32 recognises one byte order of UTF-32; the byte order is
// determined by the values supplied for its fields.
type recognizerUtf32 struct {
	// name is reported as the charset of every match.
	name string
	// bom is the byte order mark looked for at the start of the input.
	bom []byte
	// decodeChar turns the first four bytes of input into a 32-bit value.
	decodeChar func(input []byte) uint32
}
// decodeUtf32be assembles the first four bytes of input into a 32-bit value,
// most significant byte first. It panics if input holds fewer than four bytes;
// any bytes beyond the fourth are ignored.
func decodeUtf32be(input []byte) uint32 {
	_ = input[3] // panic up front when fewer than four bytes are available
	var c uint32
	for _, b := range input[:4] {
		c = c<<8 | uint32(b)
	}
	return c
}
// decodeUtf32le assembles the first four bytes of input into a 32-bit value,
// least significant byte first. It panics if input holds fewer than four
// bytes; any bytes beyond the fourth are ignored.
func decodeUtf32le(input []byte) uint32 {
	var c uint32
	// Walk from the most significant byte (index 3) down to the least.
	for i := 3; i >= 0; i-- {
		c = c<<8 | uint32(input[i])
	}
	return c
}
- func newRecognizer_utf32be() *recognizerUtf32 {
- return &recognizerUtf32{
- "UTF-32BE",
- utf32beBom,
- decodeUtf32be,
- }
- }
- func newRecognizer_utf32le() *recognizerUtf32 {
- return &recognizerUtf32{
- "UTF-32LE",
- utf32leBom,
- decodeUtf32le,
- }
- }
- func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
- output = recognizerOutput{
- Charset: r.name,
- }
- hasBom := bytes.HasPrefix(input.raw, r.bom)
- var numValid, numInvalid uint32
- for b := input.raw; len(b) >= 4; b = b[4:] {
- if c := r.decodeChar(b); c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
- numInvalid++
- } else {
- numValid++
- }
- }
- if hasBom && numInvalid == 0 {
- output.Confidence = 100
- } else if hasBom && numValid > numInvalid*10 {
- output.Confidence = 80
- } else if numValid > 3 && numInvalid == 0 {
- output.Confidence = 100
- } else if numValid > 0 && numInvalid == 0 {
- output.Confidence = 80
- } else if numValid > numInvalid*10 {
- output.Confidence = 25
- }
- return
- }
|