-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathchardet.go
63 lines (56 loc) · 1.44 KB
/
chardet.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
package chardet
import (
"github.com/wlynxg/chardet/consts"
"github.com/wlynxg/chardet/probe"
"sort"
)
// Detect runs a fresh universal detector, created with
// consts.UnknownLangFilter, over buf and returns the detector's result.
func Detect(buf []byte) Result {
	detector := NewUniversalDetector(consts.UnknownLangFilter)
	detector.Feed(buf)
	return detector.GetResult()
}
// DetectAll returns the candidate encodings of buf.
//
// When the detector ends in consts.HighByteInputState, every probe (group
// probes are expanded into their member probes) whose confidence is strictly
// greater than the detector's MinimumThreshold contributes one Result. The
// candidates are returned sorted by descending confidence; candidates with
// equal confidence keep the order in which their probes were visited.
//
// In any other input state, or when no probe clears the threshold, the
// returned slice contains only the detector's overall result.
func DetectAll(buf []byte) []Result {
	d := NewUniversalDetector(consts.UnknownLangFilter)
	d.Feed(buf)

	// Obtain the overall result before inspecting the individual probes so the
	// call order on the detector stays Feed -> GetResult -> probe queries.
	fallback := []Result{d.GetResult()}
	if d.inputState != consts.HighByteInputState {
		return fallback
	}

	// Flatten group probes so that each member probe is scored on its own.
	var probes []probe.Probe
	for _, p := range d.charsetProbes {
		if group, ok := p.(*probe.CharSetGroupProbe); ok {
			probes = append(probes, group.Probes()...)
			continue
		}
		probes = append(probes, p)
	}

	var results []Result
	for _, p := range probes {
		// Read the confidence once so the value that is compared against the
		// threshold is exactly the value that is reported.
		confidence := p.GetConfidence()
		if confidence > d.MinimumThreshold {
			name := p.CharSetName()
			if d.hasWinBytes {
				// Use Windows encoding name instead of ISO-8859 if we saw any
				// extra Windows-specific bytes.
				// NOTE(review): the lookup uses the probe's name verbatim;
				// confirm IsoWinMap keys use the same casing as CharSetName.
				if winName, ok := d.IsoWinMap[name]; ok {
					name = winName
				}
			}
			results = append(results, Result{
				Encoding:   name,
				Confidence: confidence,
				Language:   p.Language(),
			})
		}
	}

	if len(results) == 0 {
		return fallback
	}

	// A stable sort keeps equally confident candidates in probe order, so the
	// output ordering does not depend on the sort algorithm's internals.
	sort.SliceStable(results, func(i, j int) bool {
		return results[i].Confidence > results[j].Confidence
	})
	return results
}