Skip to content

Commit fddd90e

Browse files
committed
Bump html5ever to its current stable version and adjust our usage accordingly
1 parent 7d422d8 commit fddd90e

File tree

5 files changed

+103
-58
lines changed

5 files changed

+103
-58
lines changed

Cargo.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scraper/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ readme = "README.md"
1616
ahash = "0.8.0"
1717
cssparser = "0.34.0"
1818
ego-tree = "0.9.0"
19-
html5ever = "0.27.0"
19+
html5ever = "0.29.0"
2020
indexmap = { version = "2.6.0", optional = true }
2121
precomputed-hash = "0.1.1"
2222
selectors = "0.26.0"

scraper/src/html/mod.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ use tendril::TendrilSink;
1616
use crate::selector::Selector;
1717
use crate::{ElementRef, Node};
1818

19+
pub use tree_sink::HtmlTreeSink;
20+
1921
/// An HTML tree.
2022
///
2123
/// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the
@@ -67,22 +69,23 @@ impl Html {
6769
/// # fn main() {
6870
/// # let document = "";
6971
/// use html5ever::driver::{self, ParseOpts};
70-
/// use scraper::Html;
72+
/// use scraper::{Html, HtmlTreeSink};
7173
/// use tendril::TendrilSink;
7274
///
73-
/// let parser = driver::parse_document(Html::new_document(), ParseOpts::default());
75+
/// let parser = driver::parse_document(HtmlTreeSink::new(Html::new_document()), ParseOpts::default());
7476
/// let html = parser.one(document);
7577
/// # }
7678
/// ```
7779
pub fn parse_document(document: &str) -> Self {
78-
let parser = driver::parse_document(Self::new_document(), Default::default());
80+
let parser =
81+
driver::parse_document(HtmlTreeSink::new(Self::new_document()), Default::default());
7982
parser.one(document)
8083
}
8184

8285
/// Parses a string of HTML as a fragment.
8386
pub fn parse_fragment(fragment: &str) -> Self {
8487
let parser = driver::parse_fragment(
85-
Self::new_fragment(),
88+
HtmlTreeSink::new(Self::new_fragment()),
8689
Default::default(),
8790
QualName::new(None, ns!(html), local_name!("body")),
8891
Vec::new(),

scraper/src/html/tree_sink.rs

Lines changed: 90 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -5,34 +5,47 @@ use ego_tree::NodeId;
55
use html5ever::tendril::StrTendril;
66
use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
77
use html5ever::Attribute;
8-
use html5ever::{ExpandedName, QualName};
8+
use html5ever::QualName;
99
use std::borrow::Cow;
10+
use std::cell::{Ref, RefCell};
11+
12+
/// Wraps `Html` instances as sinks to drive parsing
13+
#[derive(Debug)]
14+
pub struct HtmlTreeSink(RefCell<Html>);
15+
16+
impl HtmlTreeSink {
17+
/// Wrap an `Html` instance as a sink to drive parsing
18+
pub fn new(html: Html) -> Self {
19+
Self(RefCell::new(html))
20+
}
21+
}
1022

1123
/// Note: does not support the `<template>` element.
12-
impl TreeSink for Html {
13-
type Output = Self;
24+
impl TreeSink for HtmlTreeSink {
25+
type Output = Html;
1426
type Handle = NodeId;
27+
type ElemName<'a> = Ref<'a, QualName>;
1528

16-
fn finish(self) -> Self {
17-
self
29+
fn finish(self) -> Html {
30+
self.0.into_inner()
1831
}
1932

2033
// Signal a parse error.
21-
fn parse_error(&mut self, msg: Cow<'static, str>) {
34+
fn parse_error(&self, msg: Cow<'static, str>) {
2235
#[cfg(feature = "errors")]
23-
self.errors.push(msg);
36+
self.0.borrow_mut().errors.push(msg);
2437
#[cfg(not(feature = "errors"))]
2538
let _ = msg;
2639
}
2740

2841
// Set the document's quirks mode.
29-
fn set_quirks_mode(&mut self, mode: QuirksMode) {
30-
self.quirks_mode = mode;
42+
fn set_quirks_mode(&self, mode: QuirksMode) {
43+
self.0.borrow_mut().quirks_mode = mode;
3144
}
3245

3346
// Get a handle to the Document node.
34-
fn get_document(&mut self) -> Self::Handle {
35-
self.tree.root().id()
47+
fn get_document(&self) -> Self::Handle {
48+
self.0.borrow().tree.root().id()
3649
}
3750

3851
// Do two handles refer to the same node?
@@ -43,15 +56,17 @@ impl TreeSink for Html {
4356
// What is the name of this element?
4457
//
4558
// Should never be called on a non-element node; feel free to panic!.
46-
fn elem_name(&self, target: &Self::Handle) -> ExpandedName {
47-
self.tree
48-
.get(*target)
49-
.unwrap()
50-
.value()
51-
.as_element()
52-
.unwrap()
53-
.name
54-
.expanded()
59+
fn elem_name<'a>(&'a self, target: &Self::Handle) -> Ref<'a, QualName> {
60+
Ref::map(self.0.borrow(), |this| {
61+
&this
62+
.tree
63+
.get(*target)
64+
.unwrap()
65+
.value()
66+
.as_element()
67+
.unwrap()
68+
.name
69+
})
5570
}
5671

5772
// Create an element.
@@ -60,14 +75,15 @@ impl TreeSink for Html {
6075
// associated document fragment called the "template contents" should also be created. Later
6176
// calls to self.get_template_contents() with that given element return it.
6277
fn create_element(
63-
&mut self,
78+
&self,
6479
name: QualName,
6580
attrs: Vec<Attribute>,
6681
_flags: ElementFlags,
6782
) -> Self::Handle {
6883
let fragment = name.expanded() == expanded_name!(html "template");
6984

70-
let mut node = self.tree.orphan(Node::Element(Element::new(name, attrs)));
85+
let mut this = self.0.borrow_mut();
86+
let mut node = this.tree.orphan(Node::Element(Element::new(name, attrs)));
7187

7288
if fragment {
7389
node.append(Node::Fragment);
@@ -77,8 +93,10 @@ impl TreeSink for Html {
7793
}
7894

7995
// Create a comment node.
80-
fn create_comment(&mut self, text: StrTendril) -> Self::Handle {
81-
self.tree
96+
fn create_comment(&self, text: StrTendril) -> Self::Handle {
97+
self.0
98+
.borrow_mut()
99+
.tree
82100
.orphan(Node::Comment(Comment {
83101
comment: make_tendril(text),
84102
}))
@@ -87,7 +105,7 @@ impl TreeSink for Html {
87105

88106
// Append a DOCTYPE element to the Document node.
89107
fn append_doctype_to_document(
90-
&mut self,
108+
&self,
91109
name: StrTendril,
92110
public_id: StrTendril,
93111
system_id: StrTendril,
@@ -100,15 +118,20 @@ impl TreeSink for Html {
100118
public_id,
101119
system_id,
102120
};
103-
self.tree.root_mut().append(Node::Doctype(doctype));
121+
self.0
122+
.borrow_mut()
123+
.tree
124+
.root_mut()
125+
.append(Node::Doctype(doctype));
104126
}
105127

106128
// Append a node as the last child of the given node. If this would produce adjacent sibling
107129
// text nodes, it should concatenate the text instead.
108130
//
109131
// The child node will not already have a parent.
110-
fn append(&mut self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
111-
let mut parent = self.tree.get_mut(*parent).unwrap();
132+
fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
133+
let mut this = self.0.borrow_mut();
134+
let mut parent = this.tree.get_mut(*parent).unwrap();
112135

113136
match child {
114137
NodeOrText::AppendNode(id) => {
@@ -141,16 +164,14 @@ impl TreeSink for Html {
141164
// also a text node, the two should be merged, as in the behavior of append.
142165
//
143166
// NB: new_node may have an old parent, from which it should be removed.
144-
fn append_before_sibling(
145-
&mut self,
146-
sibling: &Self::Handle,
147-
new_node: NodeOrText<Self::Handle>,
148-
) {
167+
fn append_before_sibling(&self, sibling: &Self::Handle, new_node: NodeOrText<Self::Handle>) {
168+
let mut this = self.0.borrow_mut();
169+
149170
if let NodeOrText::AppendNode(id) = new_node {
150-
self.tree.get_mut(id).unwrap().detach();
171+
this.tree.get_mut(id).unwrap().detach();
151172
}
152173

153-
let mut sibling = self.tree.get_mut(*sibling).unwrap();
174+
let mut sibling = this.tree.get_mut(*sibling).unwrap();
154175
if sibling.parent().is_some() {
155176
match new_node {
156177
NodeOrText::AppendNode(id) => {
@@ -180,22 +201,25 @@ impl TreeSink for Html {
180201
}
181202

182203
// Detach the given node from its parent.
183-
fn remove_from_parent(&mut self, target: &Self::Handle) {
184-
self.tree.get_mut(*target).unwrap().detach();
204+
fn remove_from_parent(&self, target: &Self::Handle) {
205+
self.0.borrow_mut().tree.get_mut(*target).unwrap().detach();
185206
}
186207

187208
// Remove all the children from node and append them to new_parent.
188-
fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle) {
189-
self.tree
209+
fn reparent_children(&self, node: &Self::Handle, new_parent: &Self::Handle) {
210+
self.0
211+
.borrow_mut()
212+
.tree
190213
.get_mut(*new_parent)
191214
.unwrap()
192215
.reparent_from_id_append(*node);
193216
}
194217

195218
// Add each attribute to the given element, if no attribute with that name already exists. The
196219
// tree builder promises this will never be called with anything other than an element.
197-
fn add_attrs_if_missing(&mut self, target: &Self::Handle, attrs: Vec<Attribute>) {
198-
let mut node = self.tree.get_mut(*target).unwrap();
220+
fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<Attribute>) {
221+
let mut this = self.0.borrow_mut();
222+
let mut node = this.tree.get_mut(*target).unwrap();
199223
let element = match *node.value() {
200224
Node::Element(ref mut e) => e,
201225
_ => unreachable!(),
@@ -213,18 +237,27 @@ impl TreeSink for Html {
213237
//
214238
// The tree builder promises this will never be called with anything other than a template
215239
// element.
216-
fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle {
217-
self.tree.get(*target).unwrap().first_child().unwrap().id()
240+
fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle {
241+
self.0
242+
.borrow()
243+
.tree
244+
.get(*target)
245+
.unwrap()
246+
.first_child()
247+
.unwrap()
248+
.id()
218249
}
219250

220251
// Mark a HTML <script> element as "already started".
221-
fn mark_script_already_started(&mut self, _node: &Self::Handle) {}
252+
fn mark_script_already_started(&self, _node: &Self::Handle) {}
222253

223254
// Create Processing Instruction.
224-
fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Self::Handle {
255+
fn create_pi(&self, target: StrTendril, data: StrTendril) -> Self::Handle {
225256
let target = make_tendril(target);
226257
let data = make_tendril(data);
227-
self.tree
258+
self.0
259+
.borrow_mut()
260+
.tree
228261
.orphan(Node::ProcessingInstruction(ProcessingInstruction {
229262
target,
230263
data,
@@ -233,12 +266,21 @@ impl TreeSink for Html {
233266
}
234267

235268
fn append_based_on_parent_node(
236-
&mut self,
269+
&self,
237270
element: &Self::Handle,
238271
prev_element: &Self::Handle,
239272
child: NodeOrText<Self::Handle>,
240273
) {
241-
if self.tree.get(*element).unwrap().parent().is_some() {
274+
let has_parent = self
275+
.0
276+
.borrow()
277+
.tree
278+
.get(*element)
279+
.unwrap()
280+
.parent()
281+
.is_some();
282+
283+
if has_parent {
242284
self.append_before_sibling(element, child)
243285
} else {
244286
self.append(prev_element, child)

scraper/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@
139139
extern crate html5ever;
140140

141141
pub use crate::element_ref::ElementRef;
142-
pub use crate::html::Html;
142+
pub use crate::html::{Html, HtmlTreeSink};
143143
pub use crate::node::Node;
144144
pub use crate::selector::Selector;
145145

0 commit comments

Comments
 (0)