@@ -700,7 +700,39 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
700
700
match self . state . get ( ) {
701
701
//§ data-state
702
702
states:: Data => loop {
703
- match pop_except_from ! ( self , input, small_char_set!( '\r' '\0' '&' '<' '\n' ) ) {
703
+ let set = small_char_set ! ( '\r' '\0' '&' '<' '\n' ) ;
704
+
705
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
706
+ let set_result =
707
+ if !( self . opts . exact_errors || self . reconsume . get ( ) || self . ignore_lf . get ( ) )
708
+ && is_x86_feature_detected ! ( "sse2" )
709
+ {
710
+ let front_buffer = input. peek_front_chunk_mut ( ) ;
711
+ let Some ( mut front_buffer) = front_buffer else {
712
+ return ProcessResult :: Suspend ;
713
+ } ;
714
+
715
+ // SAFETY:
716
+ // This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above
717
+ let result = unsafe { self . data_state_sse2_fast_path ( & mut front_buffer) } ;
718
+
719
+ if front_buffer. is_empty ( ) {
720
+ drop ( front_buffer) ;
721
+ input. pop_front ( ) ;
722
+ }
723
+
724
+ result
725
+ } else {
726
+ self . pop_except_from ( input, set)
727
+ } ;
728
+
729
+ #[ cfg( not( any( target_arch = "x86" , target_arch = "x86_64" ) ) ) ]
730
+ let set_result = self . pop_except_from ( input, set) ;
731
+
732
+ let Some ( set_result) = set_result else {
733
+ return ProcessResult :: Suspend ;
734
+ } ;
735
+ match set_result {
704
736
FromSet ( '\0' ) => {
705
737
self . bad_char_error ( ) ;
706
738
go ! ( self : emit '\0' )
@@ -1752,6 +1784,118 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
1752
1784
states:: CdataSectionEnd => go ! ( self : push_temp ']' ; push_temp ']' ; to CdataSection ) ,
1753
1785
}
1754
1786
}
1787
+
1788
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
1789
+ #[ target_feature( enable = "sse2" ) ]
1790
+ /// Implements the [data state] with SIMD instructions.
1791
+ ///
1792
+ /// The algorithm implemented is the naive SIMD approach described [here].
1793
+ ///
1794
+ /// ### SAFETY:
1795
+ /// Calling this function on a CPU that does not support SSE2 causes undefined behaviour.
1796
+ ///
1797
+ /// [data state]: https://html.spec.whatwg.org/#data-state
1798
+ /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1799
+ unsafe fn data_state_sse2_fast_path ( & self , input : & mut StrTendril ) -> Option < SetResult > {
1800
+ #[ cfg( target_arch = "x86" ) ]
1801
+ use std:: arch:: x86:: __m128i;
1802
+ #[ cfg( target_arch = "x86_64" ) ]
1803
+ use std:: arch:: x86_64:: {
1804
+ __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1805
+ _mm_set1_epi8,
1806
+ } ;
1807
+
1808
+ debug_assert ! ( !input. is_empty( ) ) ;
1809
+
1810
+ let quote_mask = _mm_set1_epi8 ( '<' as i8 ) ;
1811
+ let escape_mask = _mm_set1_epi8 ( '&' as i8 ) ;
1812
+ let carriage_return_mask = _mm_set1_epi8 ( '\r' as i8 ) ;
1813
+ let zero_mask = _mm_set1_epi8 ( '\0' as i8 ) ;
1814
+ let newline_mask = _mm_set1_epi8 ( '\n' as i8 ) ;
1815
+
1816
+ let raw_bytes: & [ u8 ] = & input. as_bytes ( ) ;
1817
+ let start = raw_bytes. as_ptr ( ) ;
1818
+
1819
+ const STRIDE : usize = 16 ;
1820
+ let mut i = 0 ;
1821
+ let mut n_newlines = 0 ;
1822
+ while i + STRIDE <= raw_bytes. len ( ) {
1823
+ // Load a 16 byte chunk from the input
1824
+ let data = _mm_loadu_si128 ( start. offset ( i as isize ) as * const __m128i ) ;
1825
+
1826
+ // Compare the chunk against each mask
1827
+ let quotes = _mm_cmpeq_epi8 ( data, quote_mask) ;
1828
+ let escapes = _mm_cmpeq_epi8 ( data, escape_mask) ;
1829
+ let carriage_returns = _mm_cmpeq_epi8 ( data, carriage_return_mask) ;
1830
+ let zeros = _mm_cmpeq_epi8 ( data, zero_mask) ;
1831
+ let newlines = _mm_cmpeq_epi8 ( data, newline_mask) ;
1832
+
1833
+ // Combine all test results and create a bitmask from them.
1834
+ // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
1835
+ let test_result = _mm_or_si128 (
1836
+ _mm_or_si128 ( quotes, zeros) ,
1837
+ _mm_or_si128 ( escapes, carriage_returns) ,
1838
+ ) ;
1839
+ let bitmask = _mm_movemask_epi8 ( test_result) ;
1840
+ let newline_mask = _mm_movemask_epi8 ( newlines) ;
1841
+
1842
+ if ( bitmask != 0 ) {
1843
+ // We have reached one of the characters that cause the state machine to transition
1844
+ let position = if cfg ! ( target_endian = "little" ) {
1845
+ bitmask. trailing_zeros ( ) as usize
1846
+ } else {
1847
+ bitmask. leading_zeros ( ) as usize
1848
+ } ;
1849
+
1850
+ n_newlines += ( newline_mask & ( ( 1 << position) - 1 ) ) . count_ones ( ) as u64 ;
1851
+ i += position;
1852
+ break ;
1853
+ } else {
1854
+ n_newlines += newline_mask. count_ones ( ) as u64 ;
1855
+ }
1856
+
1857
+ i += STRIDE ;
1858
+ }
1859
+
1860
+ // Process any remaining bytes (less than STRIDE)
1861
+ while let Some ( c) = raw_bytes. get ( i) {
1862
+ if matches ! ( * c, b'<' | b'&' | b'\r' | b'\0' ) {
1863
+ break ;
1864
+ }
1865
+ if * c == b'\n' {
1866
+ n_newlines += 1 ;
1867
+ }
1868
+
1869
+ i += 1 ;
1870
+ }
1871
+
1872
+ let set_result = if i == 0 {
1873
+ let c = input. pop_front_char ( ) . unwrap ( ) ;
1874
+ debug_assert ! ( matches!( c, '<' | '&' | '\r' | '\0' ) ) ;
1875
+
1876
+ // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1877
+ // Still, it would be nice to not have to do that.
1878
+ // The same is true for the unwrap call.
1879
+ let preprocessed_char = self
1880
+ . get_preprocessed_char ( c, & BufferQueue :: default ( ) )
1881
+ . unwrap ( ) ;
1882
+ SetResult :: FromSet ( preprocessed_char)
1883
+ } else {
1884
+ debug_assert ! (
1885
+ input. len( ) >= i,
1886
+ "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long" ,
1887
+ i,
1888
+ input. len( )
1889
+ ) ;
1890
+ let consumed_chunk = input. unsafe_subtendril ( 0 , i as u32 ) ;
1891
+ input. unsafe_pop_front ( i as u32 ) ;
1892
+ SetResult :: NotFromSet ( consumed_chunk)
1893
+ } ;
1894
+
1895
+ self . current_line . set ( self . current_line . get ( ) + n_newlines) ;
1896
+
1897
+ Some ( set_result)
1898
+ }
1755
1899
}
1756
1900
1757
1901
#[ cfg( test) ]
0 commit comments