Handle decoding of input in html5ever #590

Draft. Wants to merge 1 commit into base: main.
3 changes: 3 additions & 0 deletions html5ever/Cargo.toml
@@ -13,13 +13,16 @@ readme = "../README.md"
rust-version.workspace = true

[features]
default = ["encoding"]
trace_tokenizer = []
encoding = ["dep:encoding_rs", "markup5ever/encoding"]

[dependencies]
log = "0.4"
mac = "0.1"
markup5ever = { version = "0.16", path = "../markup5ever" }
match_token = { workspace = true }
encoding_rs = { version = "0.8", optional = true }

[dev-dependencies]
criterion = "0.5"
3 changes: 2 additions & 1 deletion html5ever/examples/noop-tokenize.rs
@@ -15,7 +15,8 @@ use std::cell::RefCell;
use std::io;

use html5ever::tendril::*;
use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer};
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer};
use markup5ever::buffer_queue::BufferQueue;

/// In our case, our sink only contains a tokens vector
struct Sink(RefCell<Vec<Token>>);
2 changes: 1 addition & 1 deletion html5ever/examples/tokenize.rs
@@ -13,11 +13,11 @@ use std::cell::Cell;
use std::io;

use html5ever::tendril::*;
use html5ever::tokenizer::BufferQueue;
use html5ever::tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken};
use html5ever::tokenizer::{
ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
};
use markup5ever::buffer_queue::BufferQueue;

#[derive(Clone)]
struct TokenPrinter {
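
For context on the import change in these examples: BufferQueue now comes from markup5ever::buffer_queue instead of being re-exported by the tokenizer module. A minimal sketch of the feed loop the examples drive, assuming the TokenPrinter sink defined in tokenize.rs; the driver function itself is illustrative, not part of this diff:

use html5ever::tendril::StrTendril;
use html5ever::tokenizer::{Tokenizer, TokenizerOpts};
use markup5ever::buffer_queue::BufferQueue;

fn run(sink: TokenPrinter, input: StrTendril) {
    let tok = Tokenizer::new(sink, TokenizerOpts::default());
    // The queue owns the pending input; the tokenizer pulls chunks from it.
    let queue = BufferQueue::default();
    queue.push_back(input);
    let _ = tok.feed(&queue);
    tok.end();
}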
2 changes: 1 addition & 1 deletion html5ever/src/tokenizer/char_ref/mod.rs
@@ -8,12 +8,12 @@
// except according to those terms.

use super::{TokenSink, Tokenizer};
use crate::buffer_queue::BufferQueue;
use crate::data;
use crate::tendril::StrTendril;

use log::debug;
use mac::format_if;
use markup5ever::buffer_queue::BufferQueue;
use std::borrow::Cow::Borrowed;
use std::char::from_u32;

2 changes: 2 additions & 0 deletions html5ever/src/tokenizer/interface.rs
@@ -77,6 +77,8 @@ pub enum TokenSinkResult<Handle> {
Script(Handle),
Plaintext,
RawData(states::RawKind),
#[cfg(feature = "encoding")]
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}

/// Types which can receive tokens from the tokenizer.
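
The new MaybeChangeEncodingAndStartOver variant lets a sink ask the driver to abandon the current parse and restart with a different byte decoder; in this PR the tree builder emits it after seeing <meta charset=...>. A hedged sketch of a custom sink doing the same, purely illustrative and assuming the consumer crate has its own `encoding` feature forwarding to html5ever's plus a direct encoding_rs dependency:

use std::cell::Cell;

use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult};

struct RestartOnce {
    requested: Cell<bool>,
}

impl TokenSink for RestartOnce {
    type Handle = ();

    fn process_token(&self, _token: Token, _line_number: u64) -> TokenSinkResult<()> {
        #[cfg(feature = "encoding")]
        if !self.requested.replace(true) {
            // Ask the caller to re-decode the input as UTF-8 and start over.
            return TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding_rs::UTF_8);
        }
        TokenSinkResult::Continue
    }
}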
43 changes: 35 additions & 8 deletions html5ever/src/tokenizer/mod.rs
@@ -22,16 +22,18 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
use self::char_ref::{CharRef, CharRefTokenizer};

use crate::util::str::lower_ascii_letter;

use log::{debug, trace};
use mac::format_if;
use markup5ever::{namespace_url, ns, small_char_set, TokenizerResult};
use markup5ever::{
buffer_queue::BufferQueue, namespace_url, ns, small_char_set, InputSink, InputSinkResult,
TokenizerResult,
};
use std::borrow::Cow::{self, Borrowed};
use std::cell::{Cell, RefCell, RefMut};
use std::collections::BTreeMap;
use std::mem;
use std::{iter, mem};

pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
pub use crate::buffer_queue::{FromSet, NotFromSet, SetResult};
use crate::tendril::StrTendril;
use crate::{Attribute, LocalName, QualName, SmallCharSet};

@@ -43,6 +45,8 @@ pub enum ProcessResult<Handle> {
Continue,
Suspend,
Script(Handle),
#[cfg(feature = "encoding")]
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}

fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
@@ -357,6 +361,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(node) => return TokenizerResult::Script(node),
#[cfg(feature = "encoding")]
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
},
}
}
} else {
@@ -365,6 +373,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(node) => return TokenizerResult::Script(node),
#[cfg(feature = "encoding")]
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
},
}
}
}
@@ -445,6 +457,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.state.set(states::RawData(kind));
ProcessResult::Continue
},
#[cfg(feature = "encoding")]
TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding) => {
ProcessResult::MaybeChangeEncodingAndStartOver(encoding)
},
}
}

@@ -1448,6 +1464,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(_) => unreachable!(),
#[cfg(feature = "encoding")]
ProcessResult::MaybeChangeEncodingAndStartOver(_) => unreachable!(),
}
}

@@ -1575,13 +1593,24 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
}
}

impl<Sink> InputSink for Tokenizer<Sink>
where
Sink: TokenSink,
{
type Handle = Sink::Handle;

fn feed<'a>(&'a self, input: &'a BufferQueue) -> impl Iterator<Item = InputSinkResult<Self::Handle>> + 'a {
iter::from_fn(|| self.feed(input).into())
}
}

#[cfg(test)]
#[allow(non_snake_case)]
mod test {
use super::option_push; // private items
use crate::tendril::{SliceExt, StrTendril};

use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use crate::tendril::{SliceExt, StrTendril};
use crate::LocalName;

use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
use super::interface::{EndTag, StartTag, Tag, TagKind};
@@ -1590,8 +1619,6 @@ mod test {
use markup5ever::buffer_queue::BufferQueue;
use std::cell::RefCell;

use crate::LocalName;

// LinesMatch implements the TokenSink trait. It is used for testing to see
// if current_line is being updated when process_token is called. The lines
// vector is a collection of the line numbers that each token is on.
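
With the InputSink impl above, the tokenizer can be driven through markup5ever's input-sink machinery, while the plain feed path still works as before. A hedged sketch of how a caller might react to the new result variant, assuming a consumer-side `encoding` feature forwarding to html5ever's and a direct encoding_rs dependency; the restart policy shown is an assumption about intended usage, not code from this PR:

use html5ever::tokenizer::{TokenSink, Tokenizer};
use markup5ever::buffer_queue::BufferQueue;
use markup5ever::TokenizerResult;

#[cfg(feature = "encoding")]
fn feed_until_done<Sink: TokenSink>(
    tok: &Tokenizer<Sink>,
    queue: &BufferQueue,
) -> Option<&'static encoding_rs::Encoding> {
    loop {
        match tok.feed(queue) {
            // All queued input was consumed without interruption.
            TokenizerResult::Done => return None,
            // Script execution is out of scope for this sketch.
            TokenizerResult::Script(_) => continue,
            // The sink saw <meta charset=...>: re-decode the original bytes
            // with the returned encoding and parse again from the beginning.
            TokenizerResult::MaybeChangeEncodingAndStartOver(enc) => return Some(enc),
        }
    }
}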
4 changes: 4 additions & 0 deletions html5ever/src/tree_builder/mod.rs
@@ -394,6 +394,10 @@ where
assert!(more_tokens.is_empty());
return tokenizer::TokenSinkResult::RawData(k);
},
#[cfg(feature = "encoding")]
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
return tokenizer::TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding);
},
}
}
}
34 changes: 25 additions & 9 deletions html5ever/src/tree_builder/rules.rs
@@ -10,21 +10,24 @@
// The tree builder rules, as a single, enormous nested match expression.

use crate::interface::Quirks;
use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
use crate::tokenizer::states::{Rawtext, Rcdata};
use crate::tokenizer::TagKind::{EndTag, StartTag};
use crate::tree_builder::tag_sets::*;
use crate::tree_builder::types::*;
use crate::tree_builder::{
create_element, html_elem, ElemName, NodeOrText::AppendNode, StrTendril, Tag, TreeBuilder,
TreeSink,
};
use crate::QualName;
use markup5ever::{expanded_name, local_name, namespace_url, ns};
use crate::tree_builder::RawKind::ScriptData;
use crate::tree_builder::{html_elem, ElemName, StrTendril, Tag, TreeBuilder, TreeSink};

use markup5ever::interface::create_element;
use markup5ever::interface::NodeOrText::AppendNode;
use markup5ever::{expanded_name, local_name, namespace_url, ns, QualName};
use std::borrow::Cow::Borrowed;

use crate::tendril::SliceExt;
use match_token::match_token;

#[cfg(feature = "encoding")]
use encoding_rs::Encoding;

fn any_not_whitespace(x: &StrTendril) -> bool {
// FIXME: this might be much faster as a byte scan
x.chars().any(|c| !c.is_ascii_whitespace())
@@ -113,8 +116,21 @@

<html> => self.step(InsertionMode::InBody, token),

tag @ <base> <basefont> <bgsound> <link> <meta> => {
// FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
tag @ <meta> => {
// FIXME: handle <meta http-equiv="Content-Type">
#[cfg(feature = "encoding")]
if let Some(charset) = tag.attrs.iter().find(|a| a.name == QualName::new(None, ns!(html), local_name!("charset"))) {
if let Some(encoding) = Encoding::for_label(charset.value.as_bytes()) {
self.insert_and_pop_element_for(tag);
return ProcessResult::MaybeChangeEncodingAndStartOver(encoding);
}
}

self.insert_and_pop_element_for(tag);
ProcessResult::DoneAckSelfClosing
},

tag @ <base> <basefont> <bgsound> <link> => {
self.insert_and_pop_element_for(tag);
ProcessResult::DoneAckSelfClosing
}
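
The <meta charset=...> arm above leans on encoding_rs::Encoding::for_label, which resolves the attribute value against the Encoding Standard's label table, matching case-insensitively and handling aliases. A small illustrative check, assuming a direct dev-dependency on encoding_rs; it is not part of this diff:

#[test]
fn charset_labels_resolve() {
    use encoding_rs::{Encoding, UTF_8, WINDOWS_1252};

    // Labels are matched case-insensitively.
    assert_eq!(Encoding::for_label(b"UTF-8"), Some(UTF_8));
    // "latin1" is an alias for windows-1252 in the Encoding Standard.
    assert_eq!(Encoding::for_label(b"latin1"), Some(WINDOWS_1252));
    // Unknown labels yield None, so the tree builder falls back to the plain
    // <meta> handling and no restart is requested.
    assert_eq!(Encoding::for_label(b"not-a-charset"), None);
}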
2 changes: 2 additions & 0 deletions html5ever/src/tree_builder/types.rs
@@ -70,6 +70,8 @@ pub(crate) enum ProcessResult<Handle> {
Script(Handle),
ToPlaintext,
ToRawData(RawKind),
#[cfg(feature = "encoding")]
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}

pub(crate) enum FormatEntry<Handle> {
4 changes: 4 additions & 0 deletions markup5ever/Cargo.toml
@@ -14,11 +14,15 @@ rust-version.workspace = true
[lib]
path = "lib.rs"

[features]
encoding = ["dep:encoding_rs"]

[dependencies]
string_cache = "0.8"
phf = "0.11"
tendril = "0.4"
log = "0.4"
encoding_rs = { version = "0.8", optional = true }

[build-dependencies]
string_cache_codegen = "0.5.4"
133 changes: 133 additions & 0 deletions markup5ever/encoding.rs
@@ -0,0 +1,133 @@
// Copyright 2014-2025 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8, WINDOWS_1252, X_USER_DEFINED};
use tendril::{fmt::Bytes, Tendril};

use crate::buffer_queue::BufferQueue;

/// <https://html.spec.whatwg.org/#concept-encoding-confidence>
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Confidence {
Tentative,
Certain,
Irrelevant,
}

pub struct Decoder {
inner: encoding_rs::Decoder,
confidence: Confidence,
}

impl Decoder {
pub fn new(encoding: &'static Encoding, confidence: Confidence) -> Self {
Self {
inner: encoding.new_decoder(),
confidence,
}
}

pub fn confidence(&self) -> Confidence {
self.confidence
}

/// Returns `None` if the encoding should not be changed and `Some(encoding)` if the current encoding
/// should be changed to `encoding`
pub fn change_the_encoding_to(
&mut self,
mut new_encoding: &'static Encoding,
) -> Option<&'static Encoding> {
let current_encoding = self.inner.encoding();
// Step 1. If the encoding that is already being used to interpret the input stream is UTF-16BE/LE,
// then set the confidence to certain and return. The new encoding is ignored; if it was anything
// but the same encoding, then it would be clearly incorrect.
if current_encoding == UTF_16BE || current_encoding == UTF_16LE {
self.confidence = Confidence::Certain;
return None;
}

// Step 2. If the new encoding is UTF-16BE/LE, then change it to UTF-8.
if new_encoding == UTF_16BE || new_encoding == UTF_16LE {
new_encoding = UTF_8;
}

// Step 3. If the new encoding is x-user-defined, then change it to windows-1252.
if new_encoding == X_USER_DEFINED {
new_encoding = WINDOWS_1252;
}

// Step 4. If the new encoding is identical or equivalent to the encoding that is already being used to interpret
// the input stream, then set the confidence to certain and return. This happens when the encoding information found
// in the file matches what the encoding sniffing algorithm determined to be the encoding, and in the second pass
// through the parser if the first pass found that the encoding sniffing algorithm described in the earlier section
// failed to find the right encoding.
if current_encoding == new_encoding {
self.confidence = Confidence::Certain;
return None;
}

// Step 5. If all the bytes up to the last byte converted by the current decoder have the same
// Unicode interpretations in both the current encoding and the new encoding, and if the user agent
// supports changing the converter on the fly, then the user agent may change to the new converter
// for the encoding on the fly. Set the document's character encoding and the encoding used to convert
// the input stream to the new encoding, set the confidence to certain, and return.
// NOTE: We don't support changing the converter on the fly

// Step 6. Otherwise, restart the navigate algorithm, with historyHandling set to "replace" and
// other inputs kept the same, but this time skip the encoding sniffing algorithm and instead just
// set the encoding to the new encoding and the confidence to certain. Whenever possible, this should
// be done without actually contacting the network layer (the bytes should be re-parsed from memory),
// even if, e.g., the document is marked as not being cacheable. If this is not possible and contacting
// the network layer would involve repeating a request that uses a method other than `GET`, then instead
// set the confidence to certain and ignore the new encoding. The resource will be misinterpreted.
// User agents may notify the user of the situation, to aid in application development.
Some(new_encoding)
}

/// Decode the given chunk with the current encoding. The result will be pushed to the end
/// of the input stream.
pub fn decode(&mut self, chunk: &[u8], last: bool, output: &BufferQueue) {
let mut remaining = chunk;
loop {
let mut out: Tendril<Bytes> = Tendril::new();
let max_len = self
.inner
.max_utf8_buffer_length_without_replacement(remaining.len())
.unwrap_or(8192)
.min(8192);

// SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement is going to initialize
// part of the buffer. We are only going to access the initialized segment.
unsafe {
out.push_uninitialized(max_len as u32);
}

let (result, bytes_read, bytes_written) = self
.inner
.decode_to_utf8_without_replacement(&remaining, &mut out, last);

if bytes_written > 0 {
let bytes_chunk = out.subtendril(0, bytes_written as u32);

// SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement writes valid utf8
let utf8_chunk = unsafe { bytes_chunk.reinterpret_without_validating() };
output.push_back(utf8_chunk);
}

if matches!(result, DecoderResult::Malformed(_, _)) {
output.push_back("\u{FFFD}".into());
}

remaining = &remaining[bytes_read..];
if remaining.is_empty() {
return;
}
}
}
}
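
Taken together, the new Decoder sits in front of a BufferQueue: raw bytes go in, UTF-8 tendrils come out, and change_the_encoding_to implements the spec's "changing the encoding" steps when a late <meta charset> is found. A hedged usage sketch, assuming the module is exported as markup5ever::encoding and the `encoding` feature is enabled; the driver shown is an assumption about intended wiring, not code from this PR:

#[cfg(feature = "encoding")]
fn decode_document(bytes: &[u8]) {
    use encoding_rs::UTF_8;
    use markup5ever::buffer_queue::BufferQueue;
    use markup5ever::encoding::{Confidence, Decoder};

    // Tentative: the encoding was only sniffed. A BOM or a transport-level
    // charset would justify Confidence::Certain instead.
    let mut decoder = Decoder::new(UTF_8, Confidence::Tentative);
    let queue = BufferQueue::default();

    // One chunk here for brevity; a network driver would call `decode`
    // repeatedly with `last = false` and finish with `last = true`.
    decoder.decode(bytes, true, &queue);

    // If the tokenizer later returns MaybeChangeEncodingAndStartOver(enc):
    // if let Some(new_encoding) = decoder.change_the_encoding_to(enc) {
    //     // Throw away the parse so far and re-decode `bytes` with `new_encoding`.
    // }
}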