Skip to content

Commit 75823d5

Browse files
committed
Fix incorrect reading of CDATA and comments when the end sequence crosses a chunk boundary in the buffered reader
The bug was introduced in f2b99f0
1 parent e052a46 commit 75823d5

File tree

4 files changed

+39
-9
lines changed

4 files changed

+39
-9
lines changed

src/reader/buffered_reader.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ macro_rules! impl_buffered_source {
117117
// somewhere sane rather than at the EOF
118118
Ok(n) if n.is_empty() => return Err(bang_type.to_err()),
119119
Ok(available) => {
120-
if let Some((consumed, used)) = bang_type.parse(available, read) {
120+
if let Some((consumed, used)) = bang_type.parse(buf, available) {
121121
buf.extend_from_slice(consumed);
122122

123123
self $(.$reader)? .consume(used);

src/reader/mod.rs

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -742,25 +742,50 @@ impl BangType {
742742

743743
/// If element is finished, returns its content up to `>` symbol and
744744
/// an index of this symbol, otherwise returns `None`
745+
///
746+
/// # Parameters
747+
/// - `buf`: buffer with data consumed on previous iterations
748+
/// - `chunk`: data read on current iteration and not yet consumed from reader
745749
#[inline(always)]
746-
fn parse<'b>(&self, chunk: &'b [u8], offset: usize) -> Option<(&'b [u8], usize)> {
750+
fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
747751
for i in memchr::memchr_iter(b'>', chunk) {
748752
match self {
749753
// Need to read at least 6 symbols (`!---->`) for properly finished comment
750754
// <!----> - XML comment
751755
// 012345 - i
752-
Self::Comment => {
753-
if offset + i > 4 && chunk[..i].ends_with(b"--") {
756+
Self::Comment if buf.len() + i > 4 => {
757+
if chunk[..i].ends_with(b"--") {
754758
// We cannot strip last `--` from the buffer because we need it in case of
755759
// check_comments enabled option. XML standard requires that comment
756760
// will not end with `--->` sequence because this is a special case of
757761
// `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
758762
return Some((&chunk[..i], i + 1)); // +1 for `>`
759763
}
764+
// End sequence `-|->` was split at |
765+
// buf --/ \-- chunk
766+
if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
767+
return Some((&chunk[..i], i + 1)); // +1 for `>`
768+
}
769+
// End sequence `--|>` was split at |
770+
// buf --/ \-- chunk
771+
if i == 0 && buf.ends_with(b"--") {
772+
return Some((&[], i + 1)); // +1 for `>`
773+
}
760774
}
775+
Self::Comment => {}
761776
Self::CData => {
762777
if chunk[..i].ends_with(b"]]") {
763-
return Some((&chunk[..i - 2], i + 1)); // +1 for `>`
778+
return Some((&chunk[..i], i + 1)); // +1 for `>`
779+
}
780+
// End sequence `]|]>` was split at |
781+
// buf --/ \-- chunk
782+
if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
783+
return Some((&chunk[..i], i + 1)); // +1 for `>`
784+
}
785+
// End sequence `]]|>` was split at |
786+
// buf --/ \-- chunk
787+
if i == 0 && buf.ends_with(b"]]") {
788+
return Some((&[], i + 1)); // +1 for `>`
764789
}
765790
}
766791
Self::DocType => {
@@ -1021,7 +1046,7 @@ mod test {
10211046
$(.$await)?
10221047
.unwrap()
10231048
.map(|(ty, data)| (ty, Bytes(data))),
1024-
Some((BangType::CData, Bytes(b"![CDATA[")))
1049+
Some((BangType::CData, Bytes(b"![CDATA[]]")))
10251050
);
10261051
assert_eq!(position, 11);
10271052
}
@@ -1042,7 +1067,7 @@ mod test {
10421067
$(.$await)?
10431068
.unwrap()
10441069
.map(|(ty, data)| (ty, Bytes(data))),
1045-
Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content")))
1070+
Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]")))
10461071
);
10471072
assert_eq!(position, 28);
10481073
}

src/reader/parser.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ impl Parser {
9090
let len = buf.len();
9191
match bang_type {
9292
BangType::Comment if buf.starts_with(b"!--") => {
93+
debug_assert!(buf.ends_with(b"--"));
9394
if self.check_comments {
9495
// search if '--' not in comments
9596
if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2])
@@ -105,7 +106,11 @@ impl Parser {
105106
)))
106107
}
107108
BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
108-
Ok(Event::CData(BytesCData::wrap(&buf[8..], self.decoder())))
109+
debug_assert!(buf.ends_with(b"]]"));
110+
Ok(Event::CData(BytesCData::wrap(
111+
&buf[8..len - 2],
112+
self.decoder(),
113+
)))
109114
}
110115
BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
111116
let start = buf[8..]

src/reader/slice_reader.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
289289

290290
let bang_type = BangType::new(self[1..].first().copied())?;
291291

292-
if let Some((bytes, i)) = bang_type.parse(self, 0) {
292+
if let Some((bytes, i)) = bang_type.parse(&[], self) {
293293
*position += i;
294294
*self = &self[i..];
295295
return Ok(Some((bang_type, bytes)));

0 commit comments

Comments
 (0)