Skip to content

Commit 8394279

Browse files
duonglaiquang authored and rbri committed
EncodingSniffer: fix missing encoding label conversions
This commit brings EncodingSniffer.ENCODING_FROM_LABEL up to date with the latest at https://encoding.spec.whatwg.org/#names-and-labels
1 parent 1a9032f commit 8394279

File tree

1 file changed

+14
-2
lines changed

1 file changed

+14
-2
lines changed

src/main/java/org/htmlunit/util/EncodingSniffer.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,16 +90,19 @@ public final class EncodingSniffer {
9090
private static final byte[] WHITESPACE = {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E};
9191
private static final byte[] COMMENT_END = {'-', '-', '>'};
9292

93-
/** <a href="http://encoding.spec.whatwg.org/#encodings">Reference</a> */
93+
/** <a href="https://encoding.spec.whatwg.org/#names-and-labels">Encoding names and labels</a> */
9494
private static final Map<String, String> ENCODING_FROM_LABEL;
9595
static {
9696
ENCODING_FROM_LABEL = new HashMap<>();
9797

9898
// The Encoding
9999
// ------------
100100
ENCODING_FROM_LABEL.put("unicode-1-1-utf-8", "utf-8");
101+
ENCODING_FROM_LABEL.put("unicode11utf8", "utf-8");
102+
ENCODING_FROM_LABEL.put("unicode20utf8", "utf-8");
101103
ENCODING_FROM_LABEL.put("utf-8", "utf-8");
102104
ENCODING_FROM_LABEL.put("utf8", "utf-8");
105+
ENCODING_FROM_LABEL.put("x-unicode20utf8", "utf-8");
103106

104107
// Legacy single-byte encodings
105108
// ----------------------------
@@ -367,8 +370,9 @@ public final class EncodingSniffer {
367370
ENCODING_FROM_LABEL.put("csiso2022jp", "iso-2022-jp");
368371
ENCODING_FROM_LABEL.put("iso-2022-jp", "iso-2022-jp");
369372

370-
// iso-2022-jp
373+
// shift_jis
371374
ENCODING_FROM_LABEL.put("csshiftjis", "shift_jis");
375+
ENCODING_FROM_LABEL.put("ms932", "shift_jis");
372376
ENCODING_FROM_LABEL.put("ms_kanji", "shift_jis");
373377
ENCODING_FROM_LABEL.put("shift-jis", "shift_jis");
374378
ENCODING_FROM_LABEL.put("shift_jis", "shift_jis");
@@ -396,14 +400,22 @@ public final class EncodingSniffer {
396400

397401
// replacement
398402
ENCODING_FROM_LABEL.put("csiso2022kr", "replacement");
403+
ENCODING_FROM_LABEL.put("hz-gb-2312", "replacement");
399404
ENCODING_FROM_LABEL.put("iso-2022-cn", "replacement");
400405
ENCODING_FROM_LABEL.put("iso-2022-cn-ext", "replacement");
401406
ENCODING_FROM_LABEL.put("iso-2022-kr", "replacement");
407+
ENCODING_FROM_LABEL.put("replacement", "replacement");
402408

403409
// utf-16be
410+
ENCODING_FROM_LABEL.put("unicodefffe", "utf-16be");
404411
ENCODING_FROM_LABEL.put("utf-16be", "utf-16be");
405412

406413
// utf-16le
414+
ENCODING_FROM_LABEL.put("csunicode", "utf-16le");
415+
ENCODING_FROM_LABEL.put("iso-10646-ucs-2", "utf-16le");
416+
ENCODING_FROM_LABEL.put("ucs-2", "utf-16le");
417+
ENCODING_FROM_LABEL.put("unicode", "utf-16le");
418+
ENCODING_FROM_LABEL.put("unicodefeff", "utf-16le");
407419
ENCODING_FROM_LABEL.put("utf-16", "utf-16le");
408420
ENCODING_FROM_LABEL.put("utf-16le", "utf-16le");
409421

0 commit comments

Comments
 (0)