Skip to content

Commit c844076

Browse files
committed
Add ParserState method to get current utf16 position
This adds a `utf16_position` method on `ParserState`, exposing a `current_position` field that we compute. The implementation closely follows what was done to compute the UTF-16 column position.
1 parent c3314d1 commit c844076

File tree

4 files changed

+49
-22
lines changed

4 files changed

+49
-22
lines changed

src/parser.rs

+7
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ use std::ops::Range;
1818
pub struct ParserState {
1919
pub(crate) position: usize,
2020
pub(crate) current_line_start_position: usize,
21+
pub(crate) current_position: usize,
2122
pub(crate) current_line_number: u32,
2223
pub(crate) at_start_of: Option<BlockType>,
2324
}
@@ -37,6 +38,12 @@ impl ParserState {
3738
column: (self.position - self.current_line_start_position + 1) as u32,
3839
}
3940
}
41+
42+
/// The position from the start of the input, counted in UTF-16 code units
43+
#[inline]
44+
pub fn utf16_position(&self) -> u32 {
45+
self.current_position as u32
46+
}
4047
}
4148

4249
/// When parsing until a given token, sometimes the caller knows that parsing is going to restart

src/size_of_tests.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,11 @@ size_of_test!(token, Token, 32);
4242
size_of_test!(std_cow_str, std::borrow::Cow<'static, str>, 24, 32);
4343
size_of_test!(cow_rc_str, CowRcStr, 16);
4444

45-
size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 72);
46-
size_of_test!(parser_input, crate::parser::ParserInput, 136);
45+
size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 80);
46+
size_of_test!(parser_input, crate::parser::ParserInput, 152);
4747
size_of_test!(parser, crate::parser::Parser, 16);
4848
size_of_test!(source_position, crate::SourcePosition, 8);
49-
size_of_test!(parser_state, crate::ParserState, 24);
49+
size_of_test!(parser_state, crate::ParserState, 32);
5050

5151
size_of_test!(basic_parse_error, crate::BasicParseError, 40, 48);
5252
size_of_test!(parse_error_lower_bound, crate::ParseError<()>, 40, 48);

src/tests.rs

+20-17
Original file line numberDiff line numberDiff line change
@@ -1276,7 +1276,7 @@ fn roundtrip_percentage_token() {
12761276
}
12771277

12781278
#[test]
1279-
fn utf16_columns() {
1279+
fn utf16_columns_and_positions() {
12801280
// This particular test serves two purposes. First, it checks
12811281
// that the column number computations are correct. Second, it
12821282
// checks that tokenizer code paths correctly differentiate
@@ -1287,24 +1287,26 @@ fn utf16_columns() {
12871287
// the column is in units of UTF-16, the 4-byte sequence results
12881288
// in two columns.
12891289
let tests = vec![
1290-
("", 1),
1291-
("ascii", 6),
1292-
("/*QΡ✈🆒*/", 10),
1293-
("'QΡ✈🆒*'", 9),
1294-
("\"\\\"'QΡ✈🆒*'", 12),
1295-
("\\Q\\Ρ\\\\🆒", 10),
1296-
("QΡ✈🆒", 6),
1297-
("QΡ✈🆒\\Q\\Ρ\\\\🆒", 15),
1298-
("newline\r\nQΡ✈🆒", 6),
1299-
("url(QΡ✈🆒\\Q\\Ρ\\\\🆒)", 20),
1300-
("url(QΡ✈🆒)", 11),
1301-
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒)", 16),
1302-
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒", 15),
1303-
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒 x", 17),
1304-
("QΡ✈🆒()", 8),
1290+
("", 1, 0),
1291+
("ascii", 6, 5),
1292+
("/*QΡ✈🆒*/", 10, 9),
1293+
("/*QΡ✈\r\n🆒*/", 5, 11),
1294+
("'QΡ✈🆒*'", 9, 8),
1295+
("\"\\\"'QΡ✈🆒*'", 12, 11),
1296+
("\\Q\\Ρ\\\\🆒", 10, 9),
1297+
("QΡ✈🆒", 6, 5),
1298+
("QΡ✈🆒\\Q\\Ρ\\\\🆒", 15, 14),
1299+
("newline\r\nQΡ✈🆒", 6, 14),
1300+
("url(QΡ✈🆒\\Q\\Ρ\\\\🆒)", 20, 19),
1301+
("url(QΡ✈🆒)", 11, 10),
1302+
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒)", 16, 21),
1303+
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒", 15, 20),
1304+
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒 x", 17, 22),
1305+
("url( \tQ)", 10, 9),
1306+
("QΡ✈🆒()", 8, 7),
13051307
// Test that under/over-flow of current_line_start_position is
13061308
// handled properly; see the special case in consume_4byte_intro.
1307-
("🆒", 3),
1309+
("🆒", 3, 2),
13081310
];
13091311

13101312
for test in tests {
@@ -1329,6 +1331,7 @@ fn utf16_columns() {
13291331

13301332
// Check the resulting column.
13311333
assert_eq!(parser.current_source_location().column, test.1);
1334+
assert_eq!(parser.state().utf16_position(), test.2, "test: {}", test.0);
13321335
}
13331336
}
13341337

src/tokenizer.rs

+19-2
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ pub struct Tokenizer<'a> {
214214
/// ensure that computing the column will give the result in units
215215
/// of UTF-16 characters.
216216
current_line_start_position: usize,
217+
current_position: usize,
217218
current_line_number: u32,
218219
var_or_env_functions: SeenStatus,
219220
source_map_url: Option<&'a str>,
@@ -234,6 +235,7 @@ impl<'a> Tokenizer<'a> {
234235
input,
235236
position: 0,
236237
current_line_start_position: 0,
238+
current_position: 0,
237239
current_line_number: 0,
238240
var_or_env_functions: SeenStatus::DontCare,
239241
source_map_url: None,
@@ -296,6 +298,7 @@ impl<'a> Tokenizer<'a> {
296298
ParserState {
297299
position: self.position,
298300
current_line_start_position: self.current_line_start_position,
301+
current_position: self.current_position,
299302
current_line_number: self.current_line_number,
300303
at_start_of: None,
301304
}
@@ -305,6 +308,7 @@ impl<'a> Tokenizer<'a> {
305308
pub fn reset(&mut self, state: &ParserState) {
306309
self.position = state.position;
307310
self.current_line_start_position = state.current_line_start_position;
311+
self.current_position = state.current_position;
308312
self.current_line_number = state.current_line_number;
309313
}
310314

@@ -368,6 +372,7 @@ impl<'a> Tokenizer<'a> {
368372
debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
369373
}
370374
}
375+
self.current_position = self.current_position.wrapping_add(n);
371376
self.position += n
372377
}
373378

@@ -390,6 +395,7 @@ impl<'a> Tokenizer<'a> {
390395
// This takes two UTF-16 characters to represent, so we
391396
// actually have an undercount.
392397
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
398+
self.current_position = self.current_position.wrapping_add(2);
393399
self.position += 1;
394400
}
395401

@@ -415,10 +421,13 @@ impl<'a> Tokenizer<'a> {
415421
// This takes two UTF-16 characters to represent, so we
416422
// actually have an undercount.
417423
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
424+
self.current_position = self.current_position.wrapping_add(2);
418425
} else if byte & 0xC0 == 0x80 {
419426
// Note that due to the special case for the 4-byte
420427
// sequence intro, we must use wrapping add here.
421428
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
429+
} else {
430+
self.current_position = self.current_position.wrapping_add(1);
422431
}
423432
}
424433

@@ -434,8 +443,10 @@ impl<'a> Tokenizer<'a> {
434443
let byte = self.next_byte_unchecked();
435444
debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
436445
self.position += 1;
446+
self.current_position = self.current_position.wrapping_add(1);
437447
if byte == b'\r' && self.next_byte() == Some(b'\n') {
438448
self.position += 1;
449+
self.current_position = self.current_position.wrapping_add(1);
439450
}
440451
self.current_line_start_position = self.position;
441452
self.current_line_number += 1;
@@ -454,9 +465,11 @@ impl<'a> Tokenizer<'a> {
454465
self.position += len_utf8;
455466
// Note that due to the special case for the 4-byte sequence
456467
// intro, we must use wrapping add here.
468+
let len_utf16 = c.len_utf16();
457469
self.current_line_start_position = self
458470
.current_line_start_position
459-
.wrapping_add(len_utf8 - c.len_utf16());
471+
.wrapping_add(len_utf8 - len_utf16);
472+
self.current_position = self.current_position.wrapping_add(len_utf16);
460473
c
461474
}
462475

@@ -1147,12 +1160,16 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
11471160
}
11481161
};
11491162
match_byte! { b,
1150-
b' ' | b'\t' => {},
1163+
b' ' | b'\t' => {
1164+
tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
1165+
},
11511166
b'\n' | b'\x0C' => {
11521167
newlines += 1;
11531168
last_newline = offset;
1169+
tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
11541170
}
11551171
b'\r' => {
1172+
tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
11561173
if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
11571174
newlines += 1;
11581175
last_newline = offset;

0 commit comments

Comments
 (0)