
Commit 7c8a9a6

Author: bors-servo · authored Jan 21, 2017
Auto merge of #112 - servo:perf, r=SimonSapin
Some perf tweaks to the tokenizer. This makes parsing quite a bit faster, by stripping out the UTF-8 logic and using lookup tables instead of branching everywhere. We may be able to tweak it a bit more (sometimes the table may be overkill? I don't know). I've written the table macro so you can skip it easily if you want.

In any case, benchmark results:

Before:

> test tests::big_stylesheet ... bench: 10,392,017 ns/iter (+/- 1,954,644)
> test tests::unquoted_url ... bench: 261,854 ns/iter (+/- 53,335)

After:

> test tests::big_stylesheet ... bench: 8,638,215 ns/iter (+/- 381,980)
> test tests::unquoted_url ... bench: 211,863 ns/iter (+/- 73,418)

Which is quite good if you ask me.

Reviewed at: https://reviewable.io/reviews/servo/rust-cssparser/112
2 parents: 664111f + 21f8573 · commit 7c8a9a6
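To illustrate the core idea (a minimal sketch with invented names, not code from this patch): instead of re-testing each input byte against a chain of range patterns, classification becomes a single indexed load from a precomputed 256-entry table.

#[derive(Clone, Copy, PartialEq, Debug)]
enum Class { Letter, Digit, Other }

// Build the lookup table once: each byte value maps to its class.
fn build_table() -> [Class; 256] {
    let mut table = [Class::Other; 256];
    for b in b'a'..=b'z' { table[b as usize] = Class::Letter; }
    for b in b'A'..=b'Z' { table[b as usize] = Class::Letter; }
    for b in b'0'..=b'9' { table[b as usize] = Class::Digit; }
    table
}

// Dispatch is one indexed load instead of a comparison chain like
// `match b { b'a'..=b'z' | b'A'..=b'Z' => ..., b'0'..=b'9' => ..., _ => ... }`.
fn classify(table: &[Class; 256], byte: u8) -> Class {
    table[byte as usize]
}

fn main() {
    let table = build_table();
    assert_eq!(classify(&table, b'x'), Class::Letter);
    assert_eq!(classify(&table, b'7'), Class::Digit);
    assert_eq!(classify(&table, b'!'), Class::Other);
}

The match_byte! macro introduced in this commit goes one step further: it maps each byte directly to the match arm that should handle it, as the generated code shape shown later makes explicit.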

File tree: 7 files changed, +538 −162 lines

‎.travis.yml

+3 lines

@@ -9,6 +9,9 @@ script:
 - cargo test --verbose
 - cargo doc --verbose
 - cargo test --features heapsize
+- cargo test --features dummy_match_byte
+- if [ "$TRAVIS_RUST_VERSION" == "nightly" ]; then cargo test --features bench; fi
+- if [ "$TRAVIS_RUST_VERSION" == "nightly" ]; then cargo test --features "bench dummy_match_byte"; fi
 
 notifications:
   webhooks: http://build.servo.org:54856/travis

‎Cargo.toml

+6 lines

@@ -10,6 +10,7 @@ repository = "https://github.com/servo/rust-cssparser"
 readme = "README.md"
 keywords = ["css", "syntax", "parser"]
 license = "MPL-2.0"
+build = "build.rs"
 
 
 [dev-dependencies]
@@ -22,7 +23,12 @@ heapsize = {version = ">=0.1.1, <0.4.0", optional = true}
 matches = "0.1"
 serde = {version = ">=0.6.6, <0.9", optional = true}
 
+[build-dependencies]
+syn = { version = "0.10.6", features = ["full", "visit"]}
+quote = "0.3"
+
 [features]
 serde-serialization = [ "serde" ]
 heap_size = [ "heapsize" ]
 bench = []
+dummy_match_byte = []

‎build.rs

new file, +40 lines

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#[macro_use] extern crate quote;
extern crate syn;

use std::env;
use std::path::Path;


#[cfg(feature = "dummy_match_byte")]
mod codegen {
    use std::path::Path;
    pub fn main(_: &Path) {}
}

#[cfg(not(feature = "dummy_match_byte"))]
#[path = "src/macros/mod.rs"]
mod macros;

#[cfg(not(feature = "dummy_match_byte"))]
mod codegen {
    use macros;
    use std::env;
    use std::path::Path;

    pub fn main(tokenizer_rs: &Path) {
        macros::match_byte::expand(tokenizer_rs,
                                   &Path::new(&env::var("OUT_DIR").unwrap()).join("tokenizer.rs"));
    }
}

fn main() {
    let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
    let tokenizer_rs = Path::new(&manifest_dir).join("src/tokenizer.rs");
    codegen::main(&tokenizer_rs);
    println!("cargo:rerun-if-changed={}", tokenizer_rs.display());
}

‎src/lib.rs

+18 lines

@@ -137,7 +137,25 @@ macro_rules! match_ignore_ascii_case {
 }
 
 mod rules_and_declarations;
+
+#[cfg(feature = "dummy_match_byte")]
+macro_rules! match_byte {
+    ($value:expr, $($rest:tt)* ) => {
+        match $value {
+            $(
+                $rest
+            )+
+        }
+    };
+}
+
+#[cfg(feature = "dummy_match_byte")]
 mod tokenizer;
+
+#[cfg(not(feature = "dummy_match_byte"))]
+mod tokenizer {
+    include!(concat!(env!("OUT_DIR"), "/tokenizer.rs"));
+}
 mod parser;
 mod from_bytes;
 mod color;
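Under the dummy_match_byte feature, the match_byte! shim above forwards its arms to a plain match, so a call site like the following hypothetical one (names invented; the real call sites live in src/tokenizer.rs) compiles as an ordinary byte match. Without the feature, build.rs rewrites the same call site into the table-driven form before the crate is compiled. Note the `...` inclusive-range syntax, which is what the expander recognizes (2017-era Rust; spelled `..=` today):

enum Kind { WhiteSpace, Digit, Other }

fn kind(byte: u8) -> Kind {
    // With dummy_match_byte this expands to literally `match byte { ... }`.
    match_byte! { byte,
        b' ' | b'\t' | b'\n' => { Kind::WhiteSpace }
        b'0'...b'9' => { Kind::Digit }
        _ => { Kind::Other }
    }
}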

‎src/macros/match_byte.rs

new file, +271 lines

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

use quote::{ToTokens, Tokens};
use std::fs::File;
use std::io::{Read, Write};
use std::path::Path;
use std::vec;
use std::iter;
use syn;

pub fn expand(from: &Path, to: &Path) {
    let mut source = String::new();
    File::open(from).unwrap().read_to_string(&mut source).unwrap();
    let tts = syn::parse_token_trees(&source).expect("Parsing rules.rs module");
    let mut tokens = Tokens::new();
    tokens.append_all(expand_tts(tts));

    let code = tokens.to_string().replace("{ ", "{\n").replace(" }", "\n}");
    File::create(to).unwrap().write_all(code.as_bytes()).unwrap();
}

fn expand_tts(tts: Vec<syn::TokenTree>) -> Vec<syn::TokenTree> {
    use syn::*;
    let mut expanded = Vec::new();
    let mut tts = tts.into_iter();
    while let Some(tt) = tts.next() {
        match tt {
            TokenTree::Token(Token::Ident(ident)) => {
                if ident != "match_byte" {
                    expanded.push(TokenTree::Token(Token::Ident(ident)));
                    continue;
                }

                // `match_byte` must be followed by `!`; otherwise emit the
                // tokens unchanged.
                match tts.next() {
                    Some(TokenTree::Token(Token::Not)) => {},
                    other => {
                        expanded.push(TokenTree::Token(Token::Ident(ident)));
                        if let Some(other) = other {
                            expanded.push(other);
                        }
                        continue;
                    }
                }

                let tts = match tts.next() {
                    Some(TokenTree::Delimited(Delimited { tts, .. })) => tts,
                    other => {
                        expanded.push(TokenTree::Token(Token::Ident(ident)));
                        expanded.push(TokenTree::Token(Token::Not));
                        if let Some(other) = other {
                            expanded.push(other);
                        }
                        continue;
                    }
                };

                let (to_be_matched, table, cases, wildcard_binding) = parse_match_bytes_macro(tts);
                let expr = expand_match_bytes_macro(to_be_matched,
                                                    &table,
                                                    cases,
                                                    wildcard_binding);

                // Re-parse the generated text and recurse, to handle nested
                // match_byte! invocations.
                let tts = syn::parse_token_trees(&expr)
                    .expect("parsing macro expansion as token trees");
                expanded.extend(expand_tts(tts));
            }
            TokenTree::Delimited(Delimited { delim, tts }) => {
                expanded.push(TokenTree::Delimited(Delimited {
                    delim: delim,
                    tts: expand_tts(tts),
                }))
            }
            other => expanded.push(other),
        }
    }
    expanded
}

/// Parses a token tree corresponding to the `match_byte` macro.
///
/// ## Example
///
/// ```rust
/// match_byte! { tokenizer.next_byte_unchecked(),
///     b'a'...b'z' => { ... }
///     b'0'...b'9' => { ... }
///     b'\n' | b'\\' => { ... }
///     foo => { ... }
/// }
/// ```
///
/// Returns:
///
/// * The token tree that contains the expression to be matched (in this case
///   `tokenizer.next_byte_unchecked()`).
///
/// * The table with the different cases per byte; each entry in the table is
///   a non-zero integer identifying the arm of the match expression that
///   handles that byte.
///
/// * The list of cases containing the expansion of the arms of the match
///   expression.
///
/// * An optional identifier to which the wildcard pattern is bound (`foo` in
///   this case).
fn parse_match_bytes_macro(tts: Vec<syn::TokenTree>) -> (Vec<syn::TokenTree>, [u8; 256], Vec<Case>, Option<syn::Ident>) {
    let mut tts = tts.into_iter();

    // Grab the thing we're matching, until we find a comma.
    let mut left_hand_side = vec![];
    loop {
        match tts.next() {
            Some(syn::TokenTree::Token(syn::Token::Comma)) => break,
            Some(other) => left_hand_side.push(other),
            None => panic!("Expected not to run out of tokens looking for a comma"),
        }
    }

    let mut cases = vec![];
    let mut table = [0u8; 256];

    let mut tts = tts.peekable();
    let mut case_id: u8 = 1;
    let mut binding = None;
    while tts.len() > 0 {
        cases.push(parse_case(&mut tts, &mut table, &mut binding, case_id));

        // Allow an optional comma between cases.
        match tts.peek() {
            Some(&syn::TokenTree::Token(syn::Token::Comma)) => {
                tts.next();
            },
            _ => {},
        }

        case_id += 1;
    }

    (left_hand_side, table, cases, binding)
}

#[derive(Debug)]
struct Case(Vec<syn::TokenTree>);

/// Parses a single `pattern => expression` arm and returns the case, filling
/// in the table with the case id for every byte that matched.
///
/// The `binding` parameter is the identifier that is used by the wildcard
/// pattern.
fn parse_case(tts: &mut iter::Peekable<vec::IntoIter<syn::TokenTree>>,
              table: &mut [u8; 256],
              binding: &mut Option<syn::Ident>,
              case_id: u8)
              -> Case {
    // The last byte checked, as part of this pattern, to properly detect
    // ranges.
    let mut last_byte: Option<u8> = None;

    // Loop through the pattern, filling the table with bytes.
    loop {
        match tts.next() {
            Some(syn::TokenTree::Token(syn::Token::Literal(syn::Lit::Byte(byte)))) => {
                table[byte as usize] = case_id;
                last_byte = Some(byte);
            }
            Some(syn::TokenTree::Token(syn::Token::BinOp(syn::BinOpToken::Or))) => {
                last_byte = None; // This pattern is over.
            },
            Some(syn::TokenTree::Token(syn::Token::DotDotDot)) => {
                assert!(last_byte.is_some(), "Expected closed range!");
                match tts.next() {
                    Some(syn::TokenTree::Token(syn::Token::Literal(syn::Lit::Byte(byte)))) => {
                        // `..` excludes the end, so the upper bound is set
                        // separately below. Bytes already claimed by an
                        // earlier arm keep their case id.
                        for b in last_byte.take().unwrap()..byte {
                            if table[b as usize] == 0 {
                                table[b as usize] = case_id;
                            }
                        }
                        if table[byte as usize] == 0 {
                            table[byte as usize] = case_id;
                        }
                    }
                    other => panic!("Expected closed range, got: {:?}", other),
                }
            },
            Some(syn::TokenTree::Token(syn::Token::FatArrow)) => break,
            Some(syn::TokenTree::Token(syn::Token::Ident(ident))) => {
                assert_eq!(last_byte, None, "I don't support ranges with identifiers!");
                assert_eq!(*binding, None);
                // A wildcard binding claims every byte not already claimed.
                for byte in table.iter_mut() {
                    if *byte == 0 {
                        *byte = case_id;
                    }
                }
                *binding = Some(ident)
            }
            Some(syn::TokenTree::Token(syn::Token::Underscore)) => {
                assert_eq!(last_byte, None);
                for byte in table.iter_mut() {
                    if *byte == 0 {
                        *byte = case_id;
                    }
                }
            },
            other => panic!("Expected literal byte, got: {:?}", other),
        }
    }

    match tts.next() {
        Some(syn::TokenTree::Delimited(syn::Delimited { delim: syn::DelimToken::Brace, tts })) => {
            Case(tts)
        }
        other => panic!("Expected case with braces after fat arrow, got: {:?}", other),
    }
}

fn expand_match_bytes_macro(to_be_matched: Vec<syn::TokenTree>,
                            table: &[u8; 256],
                            cases: Vec<Case>,
                            binding: Option<syn::Ident>)
                            -> String {
    use std::fmt::Write;

    assert!(!to_be_matched.is_empty());
    assert!(table.iter().all(|b| *b != 0), "Incomplete pattern? Bogus code!");

    // We build the expression with text since it's easier.
    let mut expr = "{\n".to_owned();
    expr.push_str("enum Case {\n");
    for (i, _) in cases.iter().enumerate() {
        write!(&mut expr, "Case{} = {},", i + 1, i + 1).unwrap();
    }
    expr.push_str("}\n"); // enum Case

    expr.push_str("static __CASES: [Case; 256] = [");
    for byte in table.iter() {
        write!(&mut expr, "Case::Case{}, ", *byte).unwrap();
    }
    expr.push_str("];\n");

    let mut tokens = Tokens::new();
    let to_be_matched = syn::Delimited {
        delim: if binding.is_some() { syn::DelimToken::Brace } else { syn::DelimToken::Paren },
        tts: to_be_matched
    };
    to_be_matched.to_tokens(&mut tokens);

    if let Some(ref binding) = binding {
        write!(&mut expr, "let {} = {};\n", binding.to_string(), tokens.as_str()).unwrap();
    }

    write!(&mut expr, "match __CASES[{} as usize] {{", match binding {
        Some(binding) => binding.to_string(),
        None => tokens.to_string(),
    }).unwrap();

    for (i, case) in cases.into_iter().enumerate() {
        let mut case_tokens = Tokens::new();
        let case = syn::Delimited {
            delim: syn::DelimToken::Brace,
            tts: case.0
        };
        case.to_tokens(&mut case_tokens);
        write!(&mut expr, "Case::Case{} => {},\n", i + 1, case_tokens.as_str()).unwrap();
    }
    expr.push_str("}\n"); // match

    expr.push_str("}\n"); // top

    expr
}
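For reference, here is roughly what expand_match_bytes_macro emits for a three-arm call site like the hypothetical kind() sketched earlier (reconstructed from the string-building code above; the 256 table entries are elided and real output spacing differs):

{
    enum Case {
        Case1 = 1,
        Case2 = 2,
        Case3 = 3,
    }
    static __CASES: [Case; 256] = [
        Case::Case3, Case::Case3, /* ... one entry per byte value 0-255 ... */ Case::Case3,
    ];
    // No wildcard binding in this example, so the matched expression is
    // parenthesized and indexed directly.
    match __CASES[(byte) as usize] {
        Case::Case1 => { Kind::WhiteSpace },
        Case::Case2 => { Kind::Digit },
        Case::Case3 => { Kind::Other },
    }
}

Dispatch therefore costs one load from __CASES plus a jump on a small dense enum, no matter how many patterns the original match_byte! listed.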

‎src/macros/mod.rs

new file, +5 lines

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

pub mod match_byte;

‎src/tokenizer.rs

+195 −162 lines

Large diff, not rendered by default. Per the commit message, this is where the tokenizer is reworked to operate on raw bytes (dropping the per-char UTF-8 handling), with dispatch going through match_byte!.
