Skip to content

Commit c1ebc84

Browse files
committed
zip: Simplify encoding config; add empty header matching
1 parent 2932322 commit c1ebc84

File tree

1 file changed

+18
-65
lines changed

1 file changed

+18
-65
lines changed

zip.go

Lines changed: 18 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,12 @@ import (
1212
"strings"
1313

1414
szip "github.com/STARRY-S/zip"
15+
"golang.org/x/text/encoding"
1516

1617
"github.com/dsnet/compress/bzip2"
1718
"github.com/klauspost/compress/zip"
1819
"github.com/klauspost/compress/zstd"
1920
"github.com/ulikunitz/xz"
20-
"golang.org/x/text/encoding"
21-
"golang.org/x/text/encoding/charmap"
22-
"golang.org/x/text/encoding/japanese"
23-
"golang.org/x/text/encoding/korean"
24-
"golang.org/x/text/encoding/simplifiedchinese"
25-
"golang.org/x/text/encoding/traditionalchinese"
26-
"golang.org/x/text/encoding/unicode"
2721
)
2822

2923
func init() {
@@ -80,7 +74,7 @@ type Zip struct {
8074
// For files in zip archives that do not have UTF-8
8175
// encoded filenames and comments, specify the character
8276
// encoding here. A nil value leaves names and comments undecoded.
83-
TextEncoding string
77+
TextEncoding encoding.Encoding
8478
}
8579

8680
func (z Zip) Extension() string { return ".zip" }
@@ -94,11 +88,16 @@ func (z Zip) Match(_ context.Context, filename string, stream io.Reader) (MatchR
9488
}
9589

9690
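// match file header
// NOTE(review): each readAtMost call presumably advances stream, so only the first signature is compared against the leading bytes; confirm, and consider reading once before the loop.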
97-
buf, err := readAtMost(stream, len(zipHeader))
98-
if err != nil {
99-
return mr, err
91+
for _, hdr := range zipHeaders {
92+
buf, err := readAtMost(stream, len(hdr))
93+
if err != nil {
94+
return mr, err
95+
}
96+
if bytes.Equal(buf, hdr) {
97+
mr.ByStream = true
98+
break
99+
}
100100
}
101-
mr.ByStream = bytes.Equal(buf, zipHeader)
102101

103102
return mr, nil
104103
}
@@ -255,13 +254,14 @@ func (z Zip) Extract(ctx context.Context, sourceArchive io.Reader, handleFile Fi
255254
// It is a no-op if the text is already UTF-8 encoded or if z.TextEncoding
256255
// is nil (i.e. no encoding was specified).
257256
func (z Zip) decodeText(hdr *zip.FileHeader) {
258-
if hdr.NonUTF8 && z.TextEncoding != "" {
259-
filename, err := decodeText(hdr.Name, z.TextEncoding)
257+
if hdr.NonUTF8 && z.TextEncoding != nil {
258+
dec := z.TextEncoding.NewDecoder()
259+
filename, err := dec.String(hdr.Name)
260260
if err == nil {
261261
hdr.Name = filename
262262
}
263263
if hdr.Comment != "" {
264-
comment, err := decodeText(hdr.Comment, z.TextEncoding)
264+
comment, err := dec.String(hdr.Comment)
265265
if err == nil {
266266
hdr.Comment = comment
267267
}
@@ -384,58 +384,11 @@ var compressedFormats = map[string]struct{}{
384384
".zipx": {},
385385
}
386386

387-
var encodings = map[string]encoding.Encoding{
388-
"ibm866": charmap.CodePage866,
389-
"iso8859_2": charmap.ISO8859_2,
390-
"iso8859_3": charmap.ISO8859_3,
391-
"iso8859_4": charmap.ISO8859_4,
392-
"iso8859_5": charmap.ISO8859_5,
393-
"iso8859_6": charmap.ISO8859_6,
394-
"iso8859_7": charmap.ISO8859_7,
395-
"iso8859_8": charmap.ISO8859_8,
396-
"iso8859_8I": charmap.ISO8859_8I,
397-
"iso8859_10": charmap.ISO8859_10,
398-
"iso8859_13": charmap.ISO8859_13,
399-
"iso8859_14": charmap.ISO8859_14,
400-
"iso8859_15": charmap.ISO8859_15,
401-
"iso8859_16": charmap.ISO8859_16,
402-
"koi8r": charmap.KOI8R,
403-
"koi8u": charmap.KOI8U,
404-
"macintosh": charmap.Macintosh,
405-
"windows874": charmap.Windows874,
406-
"windows1250": charmap.Windows1250,
407-
"windows1251": charmap.Windows1251,
408-
"windows1252": charmap.Windows1252,
409-
"windows1253": charmap.Windows1253,
410-
"windows1254": charmap.Windows1254,
411-
"windows1255": charmap.Windows1255,
412-
"windows1256": charmap.Windows1256,
413-
"windows1257": charmap.Windows1257,
414-
"windows1258": charmap.Windows1258,
415-
"macintoshcyrillic": charmap.MacintoshCyrillic,
416-
"gbk": simplifiedchinese.GBK,
417-
"gb18030": simplifiedchinese.GB18030,
418-
"big5": traditionalchinese.Big5,
419-
"eucjp": japanese.EUCJP,
420-
"iso2022jp": japanese.ISO2022JP,
421-
"shiftjis": japanese.ShiftJIS,
422-
"euckr": korean.EUCKR,
423-
"utf16be": unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
424-
"utf16le": unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
387+
// zipHeaders lists the leading byte signatures that Match accepts as a zip stream.
var zipHeaders = [][]byte{
388+
[]byte("PK\x03\x04"), // archive with entries: local file header signature
389+
[]byte("PK\x05\x06"), // empty archive: end of central directory signature
425390
}
426391

427-
// decodeText returns UTF-8 encoded text from the given charset.
428-
// Thanks to @zxdvd for contributing non-UTF-8 encoding logic in
429-
// #149, and to @pashifika for helping in #305.
430-
func decodeText(input, charset string) (string, error) {
431-
if enc, ok := encodings[charset]; ok {
432-
return enc.NewDecoder().String(input)
433-
}
434-
return "", fmt.Errorf("unrecognized charset %s", charset)
435-
}
436-
437-
var zipHeader = []byte("PK\x03\x04") // NOTE: headers of empty zip files might end with 0x05,0x06 or 0x06,0x06 instead of 0x03,0x04
438-
439392
// Interface guards
440393
var (
441394
_ Archiver = Zip{}

0 commit comments

Comments
 (0)