Skip to content

Commit af5e329

Browse files
committed
fix
1 parent 31086b4 commit af5e329

File tree

8 files changed

+81
-126
lines changed

8 files changed

+81
-126
lines changed

modules/charset/charset.go

Lines changed: 42 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,10 @@ package charset
55

66
import (
77
"bytes"
8-
"fmt"
98
"io"
109
"strings"
1110
"unicode/utf8"
1211

13-
"code.gitea.io/gitea/modules/log"
1412
"code.gitea.io/gitea/modules/setting"
1513
"code.gitea.io/gitea/modules/util"
1614

@@ -23,118 +21,98 @@ import (
2321
var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'}
2422

2523
type ConvertOpts struct {
26-
KeepBOM bool
24+
KeepBOM bool
25+
ErrorReplacement []byte
26+
ErrorReturnOrigin bool
2727
}
2828

2929
// ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible
3030
func ToUTF8WithFallbackReader(rd io.Reader, opts ConvertOpts) io.Reader {
31-
buf := make([]byte, 2048)
31+
buf := make([]byte, 16*1024)
3232
n, err := util.ReadAtMost(rd, buf)
3333
if err != nil {
34-
return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd)
34+
// a read error occurred: return the bytes read so far plus the rest of the reader, without any processing
35+
return io.MultiReader(bytes.NewReader(buf[:n]), rd)
3536
}
3637

37-
charsetLabel, err := DetectEncoding(buf[:n])
38-
if err != nil || charsetLabel == "UTF-8" {
39-
return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd)
38+
charsetLabel, _ := DetectEncoding(buf[:n])
39+
if charsetLabel == "UTF-8" {
40+
// is utf-8, try to remove BOM and read it as-is
41+
return io.MultiReader(bytes.NewReader(maybeRemoveBOM(buf[:n], opts)), rd)
4042
}
4143

4244
encoding, _ := charset.Lookup(charsetLabel)
4345
if encoding == nil {
46+
// unknown charset, don't do any processing
4447
return io.MultiReader(bytes.NewReader(buf[:n]), rd)
4548
}
4649

50+
// convert from charset to utf-8
4751
return transform.NewReader(
48-
io.MultiReader(
49-
bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)),
50-
rd,
51-
),
52+
io.MultiReader(bytes.NewReader(buf[:n]), rd),
5253
encoding.NewDecoder(),
5354
)
5455
}
5556

56-
// ToUTF8 converts content to UTF8 encoding
57-
func ToUTF8(content []byte, opts ConvertOpts) (string, error) {
58-
charsetLabel, err := DetectEncoding(content)
59-
if err != nil {
60-
return "", err
61-
} else if charsetLabel == "UTF-8" {
62-
return string(MaybeRemoveBOM(content, opts)), nil
63-
}
64-
65-
encoding, _ := charset.Lookup(charsetLabel)
66-
if encoding == nil {
67-
return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel)
68-
}
69-
70-
// If there is an error, we concatenate the nicely decoded part and the
71-
// original left over. This way we won't lose much data.
72-
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
73-
if err != nil {
74-
result = append(result, content[n:]...)
75-
}
76-
77-
result = MaybeRemoveBOM(result, opts)
78-
79-
return string(result), err
80-
}
81-
8257
// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
8358
func ToUTF8WithFallback(content []byte, opts ConvertOpts) []byte {
8459
bs, _ := io.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content), opts))
8560
return bs
8661
}
8762

88-
// ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible
89-
func ToUTF8DropErrors(content []byte, opts ConvertOpts) []byte {
90-
charsetLabel, err := DetectEncoding(content)
91-
if err != nil || charsetLabel == "UTF-8" {
92-
return MaybeRemoveBOM(content, opts)
63+
func ToUTF8DropErrors(content []byte) []byte {
64+
return ToUTF8(content, ConvertOpts{ErrorReplacement: []byte{' '}})
65+
}
66+
67+
func ToUTF8(content []byte, opts ConvertOpts) []byte {
68+
charsetLabel, _ := DetectEncoding(content)
69+
if charsetLabel == "UTF-8" {
70+
return maybeRemoveBOM(content, opts)
9371
}
9472

9573
encoding, _ := charset.Lookup(charsetLabel)
9674
if encoding == nil {
75+
setting.PanicInDevOrTesting("unknown detected charset %q, it shouldn't happen", charsetLabel)
9776
return content
9877
}
9978

100-
// We ignore any non-decodable parts from the file.
101-
// Some parts might be lost
10279
var decoded []byte
10380
decoder := encoding.NewDecoder()
10481
idx := 0
105-
for {
82+
for idx < len(content) {
10683
result, n, err := transform.Bytes(decoder, content[idx:])
10784
decoded = append(decoded, result...)
10885
if err == nil {
10986
break
11087
}
111-
decoded = append(decoded, ' ')
112-
idx = idx + n + 1
113-
if idx >= len(content) {
114-
break
88+
if opts.ErrorReturnOrigin {
89+
return content
11590
}
91+
if opts.ErrorReplacement == nil {
92+
decoded = append(decoded, content[idx+n])
93+
} else {
94+
decoded = append(decoded, opts.ErrorReplacement...)
95+
}
96+
idx += n + 1
11697
}
117-
118-
return MaybeRemoveBOM(decoded, opts)
98+
return maybeRemoveBOM(decoded, opts)
11999
}
120100

121-
// MaybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false
122-
func MaybeRemoveBOM(content []byte, opts ConvertOpts) []byte {
101+
// maybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false
102+
func maybeRemoveBOM(content []byte, opts ConvertOpts) []byte {
123103
if opts.KeepBOM {
124104
return content
125105
}
126-
if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) {
127-
return content[3:]
128-
}
129-
return content
106+
return bytes.TrimPrefix(content, UTF8BOM)
130107
}
131108

132109
// DetectEncoding detects the encoding of content.
133-
func DetectEncoding(content []byte) (string, error) {
110+
// It always returns a detected or guessed encoding string, whether or not an error occurs.
111+
func DetectEncoding(content []byte) (encoding string, _ error) {
134112
// First we check if the content represents valid utf8 content excepting a truncated character at the end.
135113

136114
// Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do
137-
// instead we walk backwards from the end to trim off a the incomplete character
115+
// instead we walk backwards from the end to trim off the incomplete character
138116
toValidate := content
139117
end := len(toValidate) - 1
140118

@@ -150,8 +128,8 @@ func DetectEncoding(content []byte) (string, error) {
150128
// Incomplete 3 byte extension e.g. 💩 <f0><9f><92><a9> which has been truncated to <f0><9f><92>
151129
toValidate = toValidate[:end-2]
152130
}
131+
153132
if utf8.Valid(toValidate) {
154-
log.Debug("Detected encoding: utf-8 (fast)")
155133
return "UTF-8", nil
156134
}
157135

@@ -171,14 +149,10 @@ func DetectEncoding(content []byte) (string, error) {
171149
detectContent = content
172150
}
173151

174-
// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
152+
// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie-break
175153
results, err := textDetector.DetectAll(detectContent)
176154
if err != nil {
177-
if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 {
178-
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
179-
return setting.Repository.AnsiCharset, nil
180-
}
181-
return "", err
155+
return util.IfZero(setting.Repository.AnsiCharset, "UTF-8"), err
182156
}
183157

184158
topConfidence := results[0].Confidence
@@ -202,10 +176,8 @@ func DetectEncoding(content []byte) (string, error) {
202176

203177
// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
204178
if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
205-
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
206179
return setting.Repository.AnsiCharset, err
207180
}
208181

209-
log.Debug("Detected encoding: %s", topResult.Charset)
210-
return topResult.Charset, err
182+
return topResult.Charset, nil
211183
}

modules/charset/charset_test.go

Lines changed: 26 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@ func resetDefaultCharsetsOrder() {
3131
}
3232

3333
func TestMaybeRemoveBOM(t *testing.T) {
34-
res := MaybeRemoveBOM([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
34+
res := maybeRemoveBOM([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
3535
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
3636

37-
res = MaybeRemoveBOM([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
37+
res = maybeRemoveBOM([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
3838
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
3939
}
4040

@@ -45,63 +45,54 @@ func TestToUTF8(t *testing.T) {
4545
// locale, so some conversions might behave differently. For that reason, we don't
4646
// depend on particular conversions but on expected behaviors.
4747

48-
res, err := ToUTF8([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
49-
assert.NoError(t, err)
50-
assert.Equal(t, "ABC", res)
48+
res := ToUTF8([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
49+
assert.Equal(t, "ABC", string(res))
5150

5251
// "áéíóú"
53-
res, err = ToUTF8([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
54-
assert.NoError(t, err)
52+
res = ToUTF8([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
5553
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
5654

5755
// "áéíóú"
58-
res, err = ToUTF8([]byte{
56+
res = ToUTF8([]byte{
5957
0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
6058
0xc3, 0xba,
6159
}, ConvertOpts{})
62-
assert.NoError(t, err)
6360
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
6461

65-
res, err = ToUTF8([]byte{
62+
res = ToUTF8([]byte{
6663
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
6764
0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
6865
}, ConvertOpts{})
69-
assert.NoError(t, err)
7066
stringMustStartWith(t, "Hola,", res)
7167
stringMustEndWith(t, "AAA.", res)
7268

73-
res, err = ToUTF8([]byte{
69+
res = ToUTF8([]byte{
7470
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
7571
0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
7672
}, ConvertOpts{})
77-
assert.NoError(t, err)
7873
stringMustStartWith(t, "Hola,", res)
7974
stringMustEndWith(t, "AAA.", res)
8075

81-
res, err = ToUTF8([]byte{
76+
res = ToUTF8([]byte{
8277
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
8378
0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
8479
}, ConvertOpts{})
85-
assert.NoError(t, err)
8680
stringMustStartWith(t, "Hola,", res)
8781
stringMustEndWith(t, "AAA.", res)
8882

8983
// Japanese (Shift-JIS)
9084
// 日属秘ぞしちゅ。
91-
res, err = ToUTF8([]byte{
85+
res = ToUTF8([]byte{
9286
0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
9387
0xBF, 0x82, 0xE3, 0x81, 0x42,
9488
}, ConvertOpts{})
95-
assert.NoError(t, err)
9689
assert.Equal(t, []byte{
9790
0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
9891
0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
99-
},
100-
[]byte(res))
92+
}, res)
10193

102-
res, err = ToUTF8([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{})
103-
assert.NoError(t, err)
104-
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res))
94+
res = ToUTF8([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{})
95+
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
10596
}
10697

10798
func TestToUTF8WithFallback(t *testing.T) {
@@ -153,43 +144,44 @@ func TestToUTF8WithFallback(t *testing.T) {
153144

154145
func TestToUTF8DropErrors(t *testing.T) {
155146
resetDefaultCharsetsOrder()
147+
156148
// "ABC"
157-
res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
149+
res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43})
158150
assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
159151

160152
// "áéíóú"
161-
res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
153+
res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
162154
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
163155

164156
// UTF8 BOM + "áéíóú"
165-
res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
157+
res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
166158
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
167159

168160
// "Hola, así cómo ños"
169-
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}, ConvertOpts{})
161+
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
170162
assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73}, res[:8])
171163
assert.Equal(t, []byte{0x73}, res[len(res)-1:])
172164

173165
// "Hola, así cómo "
174166
minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
175167

176-
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}, ConvertOpts{})
168+
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
177169
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
178170
assert.Equal(t, minmatch, res[0:len(minmatch)])
179171

180-
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}, ConvertOpts{})
172+
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
181173
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
182174
assert.Equal(t, minmatch, res[0:len(minmatch)])
183175

184176
// Japanese (Shift-JIS)
185177
// "日属秘ぞしちゅ。"
186-
res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}, ConvertOpts{})
178+
res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
187179
assert.Equal(t, []byte{
188180
0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
189181
0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
190182
}, res)
191183

192-
res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{})
184+
res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00})
193185
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
194186
}
195187

@@ -231,12 +223,12 @@ func TestDetectEncoding(t *testing.T) {
231223
assert.Error(t, err)
232224
}
233225

234-
func stringMustStartWith(t *testing.T, expected, value string) {
235-
assert.Equal(t, expected, value[:len(expected)])
226+
func stringMustStartWith(t *testing.T, expected string, value []byte) {
227+
assert.Equal(t, expected, string(value[:len(expected)]))
236228
}
237229

238-
func stringMustEndWith(t *testing.T, expected, value string) {
239-
assert.Equal(t, expected, value[len(value)-len(expected):])
230+
func stringMustEndWith(t *testing.T, expected string, value []byte) {
231+
assert.Equal(t, expected, string(value[len(value)-len(expected):]))
240232
}
241233

242234
func TestToUTF8WithFallbackReader(t *testing.T) {

modules/httplib/serve.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ import (
1919
charsetModule "code.gitea.io/gitea/modules/charset"
2020
"code.gitea.io/gitea/modules/container"
2121
"code.gitea.io/gitea/modules/httpcache"
22-
"code.gitea.io/gitea/modules/log"
2322
"code.gitea.io/gitea/modules/setting"
2423
"code.gitea.io/gitea/modules/typesniffer"
2524
"code.gitea.io/gitea/modules/util"
@@ -109,11 +108,7 @@ func setServeHeadersByFile(r *http.Request, w http.ResponseWriter, mineBuf []byt
109108
}
110109

111110
if isPlain {
112-
charset, err := charsetModule.DetectEncoding(mineBuf)
113-
if err != nil {
114-
log.Error("Detect raw file %s charset failed: %v, using by default utf-8", opts.Filename, err)
115-
charset = "utf-8"
116-
}
111+
charset, _ := charsetModule.DetectEncoding(mineBuf)
117112
opts.ContentTypeCharset = strings.ToLower(charset)
118113
}
119114

modules/indexer/code/bleve/bleve.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
203203
RepoID: repo.ID,
204204
CommitID: commitSha,
205205
Filename: update.Filename,
206-
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
206+
Content: string(charset.ToUTF8DropErrors(fileContents)),
207207
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
208208
UpdatedAt: time.Now().UTC(),
209209
})

0 commit comments

Comments
 (0)