Skip to content

Commit 3005d89

Browse files
committed
Add a fast path for the data state using SSE2
Signed-off-by: Simon Wülker <[email protected]>
1 parent a1486b0 commit 3005d89

File tree

2 files changed

+162
-2
lines changed

2 files changed

+162
-2
lines changed

html5ever/src/tokenizer/mod.rs

+145-1
Original file line numberDiff line numberDiff line change
@@ -700,7 +700,39 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
700700
match self.state.get() {
701701
//§ data-state
702702
states::Data => loop {
703-
match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
703+
let set = small_char_set!('\r' '\0' '&' '<' '\n');
704+
705+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
706+
let set_result =
707+
if !(self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get())
708+
&& is_x86_feature_detected!("sse2")
709+
{
710+
let front_buffer = input.peek_front_chunk_mut();
711+
let Some(mut front_buffer) = front_buffer else {
712+
return ProcessResult::Suspend;
713+
};
714+
715+
// SAFETY:
716+
// This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above
717+
let result = unsafe { self.data_state_sse2_fast_path(&mut front_buffer) };
718+
719+
if front_buffer.is_empty() {
720+
drop(front_buffer);
721+
input.pop_front();
722+
}
723+
724+
result
725+
} else {
726+
self.pop_except_from(input, set)
727+
};
728+
729+
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
730+
let set_result = self.pop_except_from(input, set);
731+
732+
let Some(set_result) = set_result else {
733+
return ProcessResult::Suspend;
734+
};
735+
match set_result {
704736
FromSet('\0') => {
705737
self.bad_char_error();
706738
go!(self: emit '\0')
@@ -1752,6 +1784,118 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
17521784
states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
17531785
}
17541786
}
1787+
1788+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1789+
#[target_feature(enable = "sse2")]
1790+
/// Implements the [data state] with SIMD instructions.
1791+
///
1792+
/// The algorithm implemented is the naive SIMD approach described [here].
1793+
///
1794+
/// ### SAFETY:
1795+
/// Calling this function on a CPU that does not support SSE2 causes undefined behaviour.
1796+
///
1797+
/// [data state]: https://html.spec.whatwg.org/#data-state
1798+
/// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1799+
unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
1800+
#[cfg(target_arch = "x86")]
1801+
use std::arch::x86::__m128i;
1802+
#[cfg(target_arch = "x86_64")]
1803+
use std::arch::x86_64::{
1804+
__m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1805+
_mm_set1_epi8,
1806+
};
1807+
1808+
debug_assert!(!input.is_empty());
1809+
1810+
let quote_mask = _mm_set1_epi8('<' as i8);
1811+
let escape_mask = _mm_set1_epi8('&' as i8);
1812+
let carriage_return_mask = _mm_set1_epi8('\r' as i8);
1813+
let zero_mask = _mm_set1_epi8('\0' as i8);
1814+
let newline_mask = _mm_set1_epi8('\n' as i8);
1815+
1816+
let raw_bytes: &[u8] = &input.as_bytes();
1817+
let start = raw_bytes.as_ptr();
1818+
1819+
const STRIDE: usize = 16;
1820+
let mut i = 0;
1821+
let mut n_newlines = 0;
1822+
while i + STRIDE <= raw_bytes.len() {
1823+
// Load a 16 byte chunk from the input
1824+
let data = _mm_loadu_si128(start.offset(i as isize) as *const __m128i);
1825+
1826+
// Compare the chunk against each mask
1827+
let quotes = _mm_cmpeq_epi8(data, quote_mask);
1828+
let escapes = _mm_cmpeq_epi8(data, escape_mask);
1829+
let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
1830+
let zeros = _mm_cmpeq_epi8(data, zero_mask);
1831+
let newlines = _mm_cmpeq_epi8(data, newline_mask);
1832+
1833+
// Combine all test results and create a bitmask from them.
1834+
// Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
1835+
let test_result = _mm_or_si128(
1836+
_mm_or_si128(quotes, zeros),
1837+
_mm_or_si128(escapes, carriage_returns),
1838+
);
1839+
let bitmask = _mm_movemask_epi8(test_result);
1840+
let newline_mask = _mm_movemask_epi8(newlines);
1841+
1842+
if (bitmask != 0) {
1843+
// We have reached one of the characters that cause the state machine to transition
1844+
let position = if cfg!(target_endian = "little") {
1845+
bitmask.trailing_zeros() as usize
1846+
} else {
1847+
bitmask.leading_zeros() as usize
1848+
};
1849+
1850+
n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64;
1851+
i += position;
1852+
break;
1853+
} else {
1854+
n_newlines += newline_mask.count_ones() as u64;
1855+
}
1856+
1857+
i += STRIDE;
1858+
}
1859+
1860+
// Process any remaining bytes (less than STRIDE)
1861+
while let Some(c) = raw_bytes.get(i) {
1862+
if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
1863+
break;
1864+
}
1865+
if *c == b'\n' {
1866+
n_newlines += 1;
1867+
}
1868+
1869+
i += 1;
1870+
}
1871+
1872+
let set_result = if i == 0 {
1873+
let c = input.pop_front_char().unwrap();
1874+
debug_assert!(matches!(c, '<' | '&' | '\r' | '\0'));
1875+
1876+
// FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1877+
// Still, it would be nice to not have to do that.
1878+
// The same is true for the unwrap call.
1879+
let preprocessed_char = self
1880+
.get_preprocessed_char(c, &BufferQueue::default())
1881+
.unwrap();
1882+
SetResult::FromSet(preprocessed_char)
1883+
} else {
1884+
debug_assert!(
1885+
input.len() >= i,
1886+
"Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
1887+
i,
1888+
input.len()
1889+
);
1890+
let consumed_chunk = input.unsafe_subtendril(0, i as u32);
1891+
input.unsafe_pop_front(i as u32);
1892+
SetResult::NotFromSet(consumed_chunk)
1893+
};
1894+
1895+
self.current_line.set(self.current_line.get() + n_newlines);
1896+
1897+
Some(set_result)
1898+
}
17551899
}
17561900

17571901
#[cfg(test)]

markup5ever/util/buffer_queue.rs

+17-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@
1818
//!
1919
//! [`BufferQueue`]: struct.BufferQueue.html
2020
21-
use std::{cell::RefCell, collections::VecDeque, mem};
21+
use std::{
22+
cell::{RefCell, RefMut},
23+
collections::VecDeque,
24+
mem,
25+
};
2226

2327
use tendril::StrTendril;
2428

@@ -246,6 +250,18 @@ impl BufferQueue {
246250
&mut *other.buffers.borrow_mut(),
247251
);
248252
}
253+
254+
pub unsafe fn peek_front_chunk_mut(&self) -> Option<RefMut<StrTendril>> {
255+
let buffers = self.buffers.borrow_mut();
256+
if buffers.is_empty() {
257+
return None;
258+
}
259+
260+
let front_buffer = RefMut::map(buffers, |buffers| {
261+
buffers.front_mut().expect("there is at least one buffer")
262+
});
263+
Some(front_buffer)
264+
}
249265
}
250266

251267
#[cfg(test)]

0 commit comments

Comments
 (0)