Skip to content

Commit 7f1f591

Browse files
committed
Implement a decoding tokenizer
Signed-off-by: Simon Wülker <[email protected]>
1 parent 6ef6986 commit 7f1f591

File tree

18 files changed

+452
-36
lines changed

18 files changed

+452
-36
lines changed

html5ever/Cargo.toml

+3
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@ readme = "../README.md"
1313
rust-version.workspace = true
1414

1515
[features]
16+
default = ["encoding"]
1617
trace_tokenizer = []
18+
encoding = ["dep:encoding_rs", "markup5ever/encoding"]
1719

1820
[dependencies]
1921
log = "0.4"
2022
mac = "0.1"
2123
markup5ever = { version = "0.15", path = "../markup5ever" }
2224
match_token = { workspace = true }
25+
encoding_rs = { version = "0.8", optional = true }
2326

2427
[dev-dependencies]
2528
criterion = "0.5"

html5ever/examples/noop-tokenize.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ use std::cell::RefCell;
1515
use std::io;
1616

1717
use html5ever::tendril::*;
18-
use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer};
18+
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer};
19+
use markup5ever::buffer_queue::BufferQueue;
1920

2021
/// In our case, our sink only contains a tokens vector
2122
struct Sink(RefCell<Vec<Token>>);

html5ever/examples/tokenize.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@ use std::cell::Cell;
1313
use std::io;
1414

1515
use html5ever::tendril::*;
16-
use html5ever::tokenizer::BufferQueue;
1716
use html5ever::tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken};
1817
use html5ever::tokenizer::{
1918
ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
2019
};
20+
use markup5ever::buffer_queue::BufferQueue;
2121

2222
#[derive(Clone)]
2323
struct TokenPrinter {

html5ever/src/tokenizer/char_ref/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88
// except according to those terms.
99

1010
use super::{TokenSink, Tokenizer};
11-
use crate::buffer_queue::BufferQueue;
1211
use crate::data;
1312
use crate::tendril::StrTendril;
1413

1514
use log::debug;
1615
use mac::format_if;
16+
use markup5ever::buffer_queue::BufferQueue;
1717
use std::borrow::Cow::Borrowed;
1818
use std::char::from_u32;
1919

html5ever/src/tokenizer/interface.rs

+2
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ pub enum TokenSinkResult<Handle> {
7777
Script(Handle),
7878
Plaintext,
7979
RawData(states::RawKind),
80+
#[cfg(feature = "encoding")]
81+
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
8082
}
8183

8284
/// Types which can receive tokens from the tokenizer.

html5ever/src/tokenizer/mod.rs

+35-8
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,18 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
2222
use self::char_ref::{CharRef, CharRefTokenizer};
2323

2424
use crate::util::str::lower_ascii_letter;
25-
2625
use log::{debug, trace};
2726
use mac::format_if;
28-
use markup5ever::{namespace_url, ns, small_char_set, TokenizerResult};
27+
use markup5ever::{
28+
buffer_queue::BufferQueue, namespace_url, ns, small_char_set, InputSink, InputSinkResult,
29+
TokenizerResult,
30+
};
2931
use std::borrow::Cow::{self, Borrowed};
3032
use std::cell::{Cell, RefCell, RefMut};
3133
use std::collections::BTreeMap;
32-
use std::mem;
34+
use std::{iter, mem};
3335

34-
pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
36+
pub use crate::buffer_queue::{FromSet, NotFromSet, SetResult};
3537
use crate::tendril::StrTendril;
3638
use crate::{Attribute, LocalName, QualName, SmallCharSet};
3739

@@ -43,6 +45,8 @@ pub enum ProcessResult<Handle> {
4345
Continue,
4446
Suspend,
4547
Script(Handle),
48+
#[cfg(feature = "encoding")]
49+
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
4650
}
4751

4852
fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
@@ -357,6 +361,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
357361
ProcessResult::Continue => (),
358362
ProcessResult::Suspend => break,
359363
ProcessResult::Script(node) => return TokenizerResult::Script(node),
364+
#[cfg(feature = "encoding")]
365+
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
366+
return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
367+
},
360368
}
361369
}
362370
} else {
@@ -365,6 +373,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
365373
ProcessResult::Continue => (),
366374
ProcessResult::Suspend => break,
367375
ProcessResult::Script(node) => return TokenizerResult::Script(node),
376+
#[cfg(feature = "encoding")]
377+
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
378+
return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
379+
},
368380
}
369381
}
370382
}
@@ -445,6 +457,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
445457
self.state.set(states::RawData(kind));
446458
ProcessResult::Continue
447459
},
460+
#[cfg(feature = "encoding")]
461+
TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding) => {
462+
ProcessResult::MaybeChangeEncodingAndStartOver(encoding)
463+
},
448464
}
449465
}
450466

@@ -1448,6 +1464,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
14481464
ProcessResult::Continue => (),
14491465
ProcessResult::Suspend => break,
14501466
ProcessResult::Script(_) => unreachable!(),
1467+
#[cfg(feature = "encoding")]
1468+
ProcessResult::MaybeChangeEncodingAndStartOver(_) => unreachable!(),
14511469
}
14521470
}
14531471

@@ -1575,13 +1593,24 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
15751593
}
15761594
}
15771595

1596+
impl<Sink> InputSink for Tokenizer<Sink>
1597+
where
1598+
Sink: TokenSink,
1599+
{
1600+
type Handle = Sink::Handle;
1601+
1602+
fn feed(&self, input: &BufferQueue) -> impl Iterator<Item = InputSinkResult<Self::Handle>> {
1603+
iter::from_fn(|| self.feed(input).into())
1604+
}
1605+
}
1606+
15781607
#[cfg(test)]
15791608
#[allow(non_snake_case)]
15801609
mod test {
15811610
use super::option_push; // private items
1582-
use crate::tendril::{SliceExt, StrTendril};
1583-
15841611
use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
1612+
use crate::tendril::{SliceExt, StrTendril};
1613+
use crate::LocalName;
15851614

15861615
use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
15871616
use super::interface::{EndTag, StartTag, Tag, TagKind};
@@ -1590,8 +1619,6 @@ mod test {
15901619
use markup5ever::buffer_queue::BufferQueue;
15911620
use std::cell::RefCell;
15921621

1593-
use crate::LocalName;
1594-
15951622
// LinesMatch implements the TokenSink trait. It is used for testing to see
15961623
// if current_line is being updated when process_token is called. The lines
15971624
// vector is a collection of the line numbers that each token is on.

html5ever/src/tree_builder/mod.rs

+4
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,10 @@ where
392392
assert!(more_tokens.is_empty());
393393
return tokenizer::TokenSinkResult::RawData(k);
394394
},
395+
#[cfg(feature = "encoding")]
396+
MaybeChangeEncodingAndStartOver(encoding) => {
397+
return tokenizer::TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding);
398+
},
395399
}
396400
}
397401
}

html5ever/src/tree_builder/rules.rs

+25-9
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,24 @@
1010
// The tree builder rules, as a single, enormous nested match expression.
1111

1212
use crate::interface::Quirks;
13-
use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
13+
use crate::tokenizer::states::{Rawtext, Rcdata};
1414
use crate::tokenizer::TagKind::{EndTag, StartTag};
1515
use crate::tree_builder::tag_sets::*;
1616
use crate::tree_builder::types::*;
17-
use crate::tree_builder::{
18-
create_element, html_elem, ElemName, NodeOrText::AppendNode, StrTendril, Tag, TreeBuilder,
19-
TreeSink,
20-
};
21-
use crate::QualName;
22-
use markup5ever::{expanded_name, local_name, namespace_url, ns};
17+
use crate::tree_builder::RawKind::ScriptData;
18+
use crate::tree_builder::{html_elem, ElemName, StrTendril, Tag, TreeBuilder, TreeSink};
19+
20+
use markup5ever::interface::create_element;
21+
use markup5ever::interface::NodeOrText::AppendNode;
22+
use markup5ever::{expanded_name, local_name, namespace_url, ns, QualName};
2323
use std::borrow::Cow::Borrowed;
2424

2525
use crate::tendril::SliceExt;
2626
use match_token::match_token;
2727

28+
#[cfg(feature = "encoding")]
29+
use encoding_rs::Encoding;
30+
2831
fn any_not_whitespace(x: &StrTendril) -> bool {
2932
// FIXME: this might be much faster as a byte scan
3033
x.chars().any(|c| !c.is_ascii_whitespace())
@@ -113,8 +116,21 @@ where
113116

114117
<html> => self.step(InBody, token),
115118

116-
tag @ <base> <basefont> <bgsound> <link> <meta> => {
117-
// FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
119+
tag @ <meta> => {
120+
// FIXME: handle <meta http-equiv="Content-Type">
121+
#[cfg(feature = "encoding")]
122+
if let Some(charset) = tag.attrs.iter().find(|a| a.name == QualName::new(None, ns!(html), local_name!("charset"))) {
123+
if let Some(encoding) = Encoding::for_label(charset.value.as_bytes()) {
124+
self.insert_and_pop_element_for(tag);
125+
return MaybeChangeEncodingAndStartOver(encoding);
126+
}
127+
}
128+
129+
self.insert_and_pop_element_for(tag);
130+
DoneAckSelfClosing
131+
},
132+
133+
tag @ <base> <basefont> <bgsound> <link> => {
118134
self.insert_and_pop_element_for(tag);
119135
DoneAckSelfClosing
120136
}

html5ever/src/tree_builder/types.rs

+2
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ pub(crate) enum ProcessResult<Handle> {
7777
Script(Handle),
7878
ToPlaintext,
7979
ToRawData(RawKind),
80+
#[cfg(feature = "encoding")]
81+
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
8082
}
8183

8284
pub(crate) enum FormatEntry<Handle> {

markup5ever/Cargo.toml

+4
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,15 @@ rust-version.workspace = true
1414
[lib]
1515
path = "lib.rs"
1616

17+
[features]
18+
encoding = ["dep:encoding_rs"]
19+
1720
[dependencies]
1821
string_cache = "0.8"
1922
phf = "0.11"
2023
tendril = "0.4"
2124
log = "0.4"
25+
encoding_rs = { version = "0.8", optional = true }
2226

2327
[build-dependencies]
2428
string_cache_codegen = "0.5.4"

markup5ever/encoding.rs

+133
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
// Copyright 2014-2025 The html5ever Project Developers. See the
2+
// COPYRIGHT file at the top-level directory of this distribution.
3+
//
4+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7+
// option. This file may not be copied, modified, or distributed
8+
// except according to those terms.
9+
use encoding_rs::{
    DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8, WINDOWS_1252, X_USER_DEFINED,
};
use tendril::{fmt::Bytes, Tendril};

use crate::buffer_queue::BufferQueue;
14+
15+
/// <https://html.spec.whatwg.org/#concept-encoding-confidence>
16+
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
17+
pub enum Confidence {
18+
Tentative,
19+
Certain,
20+
Irrelevant,
21+
}
22+
23+
pub struct Decoder {
24+
inner: encoding_rs::Decoder,
25+
confidence: Confidence,
26+
}
27+
28+
impl Decoder {
29+
pub fn new(encoding: &'static Encoding, confidence: Confidence) -> Self {
30+
Self {
31+
inner: encoding.new_decoder(),
32+
confidence,
33+
}
34+
}
35+
36+
pub fn confidence(&self) -> Confidence {
37+
self.confidence
38+
}
39+
40+
/// Returns `None` if the encoding should not be changed and `Some(encoding)` if the current encoding
41+
/// should be changed to `encoding`
42+
pub fn change_the_encoding_to(
43+
&mut self,
44+
mut new_encoding: &'static Encoding,
45+
) -> Option<&'static Encoding> {
46+
let current_encoding = self.inner.encoding();
47+
// Step 1. If the encoding that is already being used to interpret the input stream is UTF-16BE/LE,
48+
// then set the confidence to certain and return. The new encoding is ignored; if it was anything
49+
// but the same encoding, then it would be clearly incorrect.
50+
if current_encoding == UTF_16BE || current_encoding == UTF_16BE {
51+
self.confidence = Confidence::Certain;
52+
return None;
53+
}
54+
55+
// Step 2. If the new encoding is UTF-16BE/LE, then change it to UTF-8.
56+
if new_encoding == UTF_16BE || new_encoding == UTF_16BE {
57+
new_encoding = UTF_8;
58+
}
59+
60+
// Step 3. If the new encoding is x-user-defined, then change it to windows-1252.
61+
if new_encoding == X_USER_DEFINED {
62+
new_encoding = WINDOWS_1252;
63+
}
64+
65+
// Step 4. If the new encoding is identical or equivalent to the encoding that is already being used to interpret
66+
// the input stream, then set the confidence to certain and return. This happens when the encoding information found
67+
// in the file matches what the encoding sniffing algorithm determined to be the encoding, and in the second pass
68+
// through the parser if the first pass found that the encoding sniffing algorithm described in the earlier section
69+
// failed to find the right encoding.
70+
if current_encoding == new_encoding {
71+
self.confidence = Confidence::Certain;
72+
return None;
73+
}
74+
75+
// Step 5. If all the bytes up to the last byte converted by the current decoder have the same
76+
// Unicode interpretations in both the current encoding and the new encoding, and if the user agent
77+
// supports changing the converter on the fly, then the user agent may change to the new converter
78+
// for the encoding on the fly. Set the document's character encoding and the encoding used to convert
79+
// the input stream to the new encoding, set the confidence to certain, and return.
80+
// NOTE: We don't support changing the converter on the fly
81+
82+
// Step 6. Otherwise, restart the navigate algorithm, with historyHandling set to "replace" and
83+
// other inputs kept the same, but this time skip the encoding sniffing algorithm and instead just
84+
// set the encoding to the new encoding and the confidence to certain. Whenever possible, this should
85+
// be done without actually contacting the network layer (the bytes should be re-parsed from memory),
86+
// even if, e.g., the document is marked as not being cacheable. If this is not possible and contacting
87+
// the network layer would involve repeating a request that uses a method other than `GET`, then instead
88+
// set the confidence to certain and ignore the new encoding. The resource will be misinterpreted.
89+
// User agents may notify the user of the situation, to aid in application development.
90+
Some(new_encoding)
91+
}
92+
93+
/// Decode the given chunk with the current encoding. The result will be pushed to the end
94+
/// of the input stream.
95+
pub fn decode(&mut self, chunk: &[u8], last: bool, output: &BufferQueue) {
96+
let mut remaining = chunk;
97+
loop {
98+
let mut out: Tendril<Bytes> = Tendril::new();
99+
let max_len = self
100+
.inner
101+
.max_utf8_buffer_length_without_replacement(remaining.len())
102+
.unwrap_or(8192)
103+
.min(8192);
104+
105+
// SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement is going to initialize
106+
// part of the buffer. We are only going to access the initialized segment.
107+
unsafe {
108+
out.push_uninitialized(max_len as u32);
109+
}
110+
111+
let (result, bytes_read, bytes_written) = self
112+
.inner
113+
.decode_to_utf8_without_replacement(&remaining, &mut out, last);
114+
115+
if bytes_written > 0 {
116+
let bytes_chunk = out.subtendril(0, bytes_written as u32);
117+
118+
// SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement writes valid utf8
119+
let utf8_chunk = unsafe { bytes_chunk.reinterpret_without_validating() };
120+
output.push_back(utf8_chunk);
121+
}
122+
123+
if matches!(result, DecoderResult::Malformed(_, _)) {
124+
output.push_back("\u{FFFD}".into());
125+
}
126+
127+
remaining = &remaining[bytes_read..];
128+
if remaining.is_empty() {
129+
return;
130+
}
131+
}
132+
}
133+
}

0 commit comments

Comments
 (0)