Skip to content

Commit c844076

Browse files
committed
Add ParserState method to get current utf16 position
This adds a `utf16_position` method on `ParserState`, exposing a `current_position` field that we compute. The implementation closely follows what was done to compute the UTF-16 column position.
1 parent c3314d1 commit c844076

File tree

4 files changed

+49
-22
lines changed

4 files changed

+49
-22
lines changed

src/parser.rs

+7
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ use std::ops::Range;
1818
pub struct ParserState {
1919
pub(crate) position: usize,
2020
pub(crate) current_line_start_position: usize,
21+
pub(crate) current_position: usize,
2122
pub(crate) current_line_number: u32,
2223
pub(crate) at_start_of: Option<BlockType>,
2324
}
@@ -37,6 +38,12 @@ impl ParserState {
3738
column: (self.position - self.current_line_start_position + 1) as u32,
3839
}
3940
}
41+
42+
/// The position from the start of the input, counted in UTF-16 code units
43+
#[inline]
44+
pub fn utf16_position(&self) -> u32 {
45+
self.current_position as u32
46+
}
4047
}
4148

4249
/// When parsing until a given token, sometimes the caller knows that parsing is going to restart

src/size_of_tests.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,11 @@ size_of_test!(token, Token, 32);
4242
size_of_test!(std_cow_str, std::borrow::Cow<'static, str>, 24, 32);
4343
size_of_test!(cow_rc_str, CowRcStr, 16);
4444

45-
size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 72);
46-
size_of_test!(parser_input, crate::parser::ParserInput, 136);
45+
size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 80);
46+
size_of_test!(parser_input, crate::parser::ParserInput, 152);
4747
size_of_test!(parser, crate::parser::Parser, 16);
4848
size_of_test!(source_position, crate::SourcePosition, 8);
49-
size_of_test!(parser_state, crate::ParserState, 24);
49+
size_of_test!(parser_state, crate::ParserState, 32);
5050

5151
size_of_test!(basic_parse_error, crate::BasicParseError, 40, 48);
5252
size_of_test!(parse_error_lower_bound, crate::ParseError<()>, 40, 48);

src/tests.rs

+20-17
Original file line numberDiff line numberDiff line change
@@ -1276,7 +1276,7 @@ fn roundtrip_percentage_token() {
12761276
}
12771277

12781278
#[test]
1279-
fn utf16_columns() {
1279+
fn utf16_columns_and_positions() {
12801280
// This particular test serves two purposes. First, it checks
12811281
// that the column number computations are correct. Second, it
12821282
// checks that tokenizer code paths correctly differentiate
@@ -1287,24 +1287,26 @@ fn utf16_columns() {
12871287
// the column is in units of UTF-16, the 4-byte sequence results
12881288
// in two columns.
12891289
let tests = vec![
1290-
("", 1),
1291-
("ascii", 6),
1292-
("/*QΡ✈🆒*/", 10),
1293-
("'QΡ✈🆒*'", 9),
1294-
("\"\\\"'QΡ✈🆒*'", 12),
1295-
("\\Q\\Ρ\\\\🆒", 10),
1296-
("QΡ✈🆒", 6),
1297-
("QΡ✈🆒\\Q\\Ρ\\\\🆒", 15),
1298-
("newline\r\nQΡ✈🆒", 6),
1299-
("url(QΡ✈🆒\\Q\\Ρ\\\\🆒)", 20),
1300-
("url(QΡ✈🆒)", 11),
1301-
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒)", 16),
1302-
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒", 15),
1303-
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒 x", 17),
1304-
("QΡ✈🆒()", 8),
1290+
("", 1, 0),
1291+
("ascii", 6, 5),
1292+
("/*QΡ✈🆒*/", 10, 9),
1293+
("/*QΡ✈\r\n🆒*/", 5, 11),
1294+
("'QΡ✈🆒*'", 9, 8),
1295+
("\"\\\"'QΡ✈🆒*'", 12, 11),
1296+
("\\Q\\Ρ\\\\🆒", 10, 9),
1297+
("QΡ✈🆒", 6, 5),
1298+
("QΡ✈🆒\\Q\\Ρ\\\\🆒", 15, 14),
1299+
("newline\r\nQΡ✈🆒", 6, 14),
1300+
("url(QΡ✈🆒\\Q\\Ρ\\\\🆒)", 20, 19),
1301+
("url(QΡ✈🆒)", 11, 10),
1302+
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒)", 16, 21),
1303+
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒", 15, 20),
1304+
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒 x", 17, 22),
1305+
("url( \tQ)", 10, 9),
1306+
("QΡ✈🆒()", 8, 7),
13051307
// Test that under/over-flow of current_line_start_position is
13061308
// handled properly; see the special case in consume_4byte_intro.
1307-
("🆒", 3),
1309+
("🆒", 3, 2),
13081310
];
13091311

13101312
for test in tests {
@@ -1329,6 +1331,7 @@ fn utf16_columns() {
13291331

13301332
// Check the resulting column.
13311333
assert_eq!(parser.current_source_location().column, test.1);
1334+
assert_eq!(parser.state().utf16_position(), test.2, "test: {}", test.0);
13321335
}
13331336
}
13341337

src/tokenizer.rs

+19-2
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ pub struct Tokenizer<'a> {
214214
/// ensure that computing the column will give the result in units
215215
/// of UTF-16 characters.
216216
current_line_start_position: usize,
217+
current_position: usize,
217218
current_line_number: u32,
218219
var_or_env_functions: SeenStatus,
219220
source_map_url: Option<&'a str>,
@@ -234,6 +235,7 @@ impl<'a> Tokenizer<'a> {
234235
input,
235236
position: 0,
236237
current_line_start_position: 0,
238+
current_position: 0,
237239
current_line_number: 0,
238240
var_or_env_functions: SeenStatus::DontCare,
239241
source_map_url: None,
@@ -296,6 +298,7 @@ impl<'a> Tokenizer<'a> {
296298
ParserState {
297299
position: self.position,
298300
current_line_start_position: self.current_line_start_position,
301+
current_position: self.current_position,
299302
current_line_number: self.current_line_number,
300303
at_start_of: None,
301304
}
@@ -305,6 +308,7 @@ impl<'a> Tokenizer<'a> {
305308
pub fn reset(&mut self, state: &ParserState) {
306309
self.position = state.position;
307310
self.current_line_start_position = state.current_line_start_position;
311+
self.current_position = state.current_position;
308312
self.current_line_number = state.current_line_number;
309313
}
310314

@@ -368,6 +372,7 @@ impl<'a> Tokenizer<'a> {
368372
debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
369373
}
370374
}
375+
self.current_position = self.current_position.wrapping_add(n);
371376
self.position += n
372377
}
373378

@@ -390,6 +395,7 @@ impl<'a> Tokenizer<'a> {
390395
// This takes two UTF-16 characters to represent, so we
391396
// actually have an undercount.
392397
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
398+
self.current_position = self.current_position.wrapping_add(2);
393399
self.position += 1;
394400
}
395401

@@ -415,10 +421,13 @@ impl<'a> Tokenizer<'a> {
415421
// This takes two UTF-16 characters to represent, so we
416422
// actually have an undercount.
417423
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
424+
self.current_position = self.current_position.wrapping_add(2);
418425
} else if byte & 0xC0 == 0x80 {
419426
// Note that due to the special case for the 4-byte
420427
// sequence intro, we must use wrapping add here.
421428
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
429+
} else {
430+
self.current_position = self.current_position.wrapping_add(1);
422431
}
423432
}
424433

@@ -434,8 +443,10 @@ impl<'a> Tokenizer<'a> {
434443
let byte = self.next_byte_unchecked();
435444
debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
436445
self.position += 1;
446+
self.current_position = self.current_position.wrapping_add(1);
437447
if byte == b'\r' && self.next_byte() == Some(b'\n') {
438448
self.position += 1;
449+
self.current_position = self.current_position.wrapping_add(1);
439450
}
440451
self.current_line_start_position = self.position;
441452
self.current_line_number += 1;
@@ -454,9 +465,11 @@ impl<'a> Tokenizer<'a> {
454465
self.position += len_utf8;
455466
// Note that due to the special case for the 4-byte sequence
456467
// intro, we must use wrapping add here.
468+
let len_utf16 = c.len_utf16();
457469
self.current_line_start_position = self
458470
.current_line_start_position
459-
.wrapping_add(len_utf8 - c.len_utf16());
471+
.wrapping_add(len_utf8 - len_utf16);
472+
self.current_position = self.current_position.wrapping_add(len_utf16);
460473
c
461474
}
462475

@@ -1147,12 +1160,16 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
11471160
}
11481161
};
11491162
match_byte! { b,
1150-
b' ' | b'\t' => {},
1163+
b' ' | b'\t' => {
1164+
tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
1165+
},
11511166
b'\n' | b'\x0C' => {
11521167
newlines += 1;
11531168
last_newline = offset;
1169+
tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
11541170
}
11551171
b'\r' => {
1172+
tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
11561173
if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
11571174
newlines += 1;
11581175
last_newline = offset;

0 commit comments

Comments
 (0)