Skip to content

Commit a10b1c3

Browse files
authored
Merge pull request #471 from Mingun/fix-buffered-parsing
Fix #469 - parsing from buffered reader
2 parents f8b292b + 75823d5 commit a10b1c3

File tree

5 files changed

+202
-12
lines changed

5 files changed

+202
-12
lines changed

src/reader/async_tokio.rs

+7-1
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
377377
#[cfg(test)]
378378
mod test {
379379
use super::TokioAdapter;
380-
use crate::reader::test::check;
380+
use crate::reader::test::{check, small_buffers};
381381

382382
check!(
383383
#[tokio::test]
@@ -387,4 +387,10 @@ mod test {
387387
&mut Vec::new(),
388388
async, await
389389
);
390+
391+
small_buffers!(
392+
#[tokio::test]
393+
read_event_into_async: tokio::io::BufReader<_>,
394+
async, await
395+
);
390396
}

src/reader/buffered_reader.rs

+7-2
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ macro_rules! impl_buffered_source {
117117
// somewhere sane rather than at the EOF
118118
Ok(n) if n.is_empty() => return Err(bang_type.to_err()),
119119
Ok(available) => {
120-
if let Some((consumed, used)) = bang_type.parse(available, read) {
120+
if let Some((consumed, used)) = bang_type.parse(buf, available) {
121121
buf.extend_from_slice(consumed);
122122

123123
self $(.$reader)? .consume(used);
@@ -406,7 +406,7 @@ impl Reader<BufReader<File>> {
406406

407407
#[cfg(test)]
408408
mod test {
409-
use crate::reader::test::check;
409+
use crate::reader::test::{check, small_buffers};
410410
use crate::reader::XmlSource;
411411

412412
/// Default buffer constructor just passes the byte array from the test
@@ -422,6 +422,11 @@ mod test {
422422
&mut Vec::new()
423423
);
424424

425+
small_buffers!(
426+
#[test]
427+
read_event_into: std::io::BufReader<_>
428+
);
429+
425430
#[cfg(feature = "encoding")]
426431
mod encoding {
427432
use crate::events::Event;

src/reader/mod.rs

+181-7
Original file line numberDiff line numberDiff line change
@@ -742,25 +742,50 @@ impl BangType {
742742

743743
/// If element is finished, returns its content up to `>` symbol and
744744
/// an index of this symbol, otherwise returns `None`
745+
///
746+
/// # Parameters
747+
/// - `buf`: buffer with data consumed on previous iterations
748+
/// - `chunk`: data read on current iteration and not yet consumed from reader
745749
#[inline(always)]
746-
fn parse<'b>(&self, chunk: &'b [u8], offset: usize) -> Option<(&'b [u8], usize)> {
750+
fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
747751
for i in memchr::memchr_iter(b'>', chunk) {
748752
match self {
749753
// Need to read at least 6 symbols (`!---->`) for properly finished comment
750754
// <!----> - XML comment
751755
// 012345 - i
752-
Self::Comment => {
753-
if offset + i > 4 && chunk[..i].ends_with(b"--") {
756+
Self::Comment if buf.len() + i > 4 => {
757+
if chunk[..i].ends_with(b"--") {
754758
// We cannot strip last `--` from the buffer because we need it in case of
755759
// check_comments enabled option. XML standard requires that comment
756760
// will not end with `--->` sequence because this is a special case of
757761
// `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
758762
return Some((&chunk[..i], i + 1)); // +1 for `>`
759763
}
764+
// End sequence `-|->` was split at |
765+
// buf --/ \-- chunk
766+
if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
767+
return Some((&chunk[..i], i + 1)); // +1 for `>`
768+
}
769+
// End sequence `--|>` was split at |
770+
// buf --/ \-- chunk
771+
if i == 0 && buf.ends_with(b"--") {
772+
return Some((&[], i + 1)); // +1 for `>`
773+
}
760774
}
775+
Self::Comment => {}
761776
Self::CData => {
762777
if chunk[..i].ends_with(b"]]") {
763-
return Some((&chunk[..i - 2], i + 1)); // +1 for `>`
778+
return Some((&chunk[..i], i + 1)); // +1 for `>`
779+
}
780+
// End sequence `]|]>` was split at |
781+
// buf --/ \-- chunk
782+
if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
783+
return Some((&chunk[..i], i + 1)); // +1 for `>`
784+
}
785+
// End sequence `]]|>` was split at |
786+
// buf --/ \-- chunk
787+
if i == 0 && buf.ends_with(b"]]") {
788+
return Some((&[], i + 1)); // +1 for `>`
764789
}
765790
}
766791
Self::DocType => {
@@ -1021,7 +1046,7 @@ mod test {
10211046
$(.$await)?
10221047
.unwrap()
10231048
.map(|(ty, data)| (ty, Bytes(data))),
1024-
Some((BangType::CData, Bytes(b"![CDATA[")))
1049+
Some((BangType::CData, Bytes(b"![CDATA[]]")))
10251050
);
10261051
assert_eq!(position, 11);
10271052
}
@@ -1042,7 +1067,7 @@ mod test {
10421067
$(.$await)?
10431068
.unwrap()
10441069
.map(|(ty, data)| (ty, Bytes(data))),
1045-
Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content")))
1070+
Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]")))
10461071
);
10471072
assert_eq!(position, 28);
10481073
}
@@ -1751,8 +1776,157 @@ mod test {
17511776
};
17521777
}
17531778

1754-
// Export a macro for the child modules:
1779+
/// Tests for https://github.com/tafia/quick-xml/issues/469
1780+
macro_rules! small_buffers {
1781+
(
1782+
#[$test:meta]
1783+
$read_event:ident: $BufReader:ty
1784+
$(, $async:ident, $await:ident)?
1785+
) => {
1786+
mod small_buffers {
1787+
use crate::events::{BytesCData, BytesDecl, BytesStart, BytesText, Event};
1788+
use crate::reader::Reader;
1789+
use pretty_assertions::assert_eq;
1790+
1791+
#[$test]
1792+
$($async)? fn decl() {
1793+
let xml = "<?xml ?>";
1794+
// ^^^^^^^ data that fit into buffer
1795+
let size = xml.match_indices("?>").next().unwrap().0 + 1;
1796+
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1797+
let mut reader = Reader::from_reader(br);
1798+
let mut buf = Vec::new();
1799+
1800+
assert_eq!(
1801+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1802+
Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
1803+
);
1804+
assert_eq!(
1805+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1806+
Event::Eof
1807+
);
1808+
}
1809+
1810+
#[$test]
1811+
$($async)? fn pi() {
1812+
let xml = "<?pi?>";
1813+
// ^^^^^ data that fit into buffer
1814+
let size = xml.match_indices("?>").next().unwrap().0 + 1;
1815+
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1816+
let mut reader = Reader::from_reader(br);
1817+
let mut buf = Vec::new();
1818+
1819+
assert_eq!(
1820+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1821+
Event::PI(BytesText::new("pi"))
1822+
);
1823+
assert_eq!(
1824+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1825+
Event::Eof
1826+
);
1827+
}
1828+
1829+
#[$test]
1830+
$($async)? fn empty() {
1831+
let xml = "<empty/>";
1832+
// ^^^^^^^ data that fit into buffer
1833+
let size = xml.match_indices("/>").next().unwrap().0 + 1;
1834+
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1835+
let mut reader = Reader::from_reader(br);
1836+
let mut buf = Vec::new();
1837+
1838+
assert_eq!(
1839+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1840+
Event::Empty(BytesStart::new("empty"))
1841+
);
1842+
assert_eq!(
1843+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1844+
Event::Eof
1845+
);
1846+
}
1847+
1848+
#[$test]
1849+
$($async)? fn cdata1() {
1850+
let xml = "<![CDATA[cdata]]>";
1851+
// ^^^^^^^^^^^^^^^ data that fit into buffer
1852+
let size = xml.match_indices("]]>").next().unwrap().0 + 1;
1853+
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1854+
let mut reader = Reader::from_reader(br);
1855+
let mut buf = Vec::new();
1856+
1857+
assert_eq!(
1858+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1859+
Event::CData(BytesCData::new("cdata"))
1860+
);
1861+
assert_eq!(
1862+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1863+
Event::Eof
1864+
);
1865+
}
1866+
1867+
#[$test]
1868+
$($async)? fn cdata2() {
1869+
let xml = "<![CDATA[cdata]]>";
1870+
// ^^^^^^^^^^^^^^^^ data that fit into buffer
1871+
let size = xml.match_indices("]]>").next().unwrap().0 + 2;
1872+
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1873+
let mut reader = Reader::from_reader(br);
1874+
let mut buf = Vec::new();
1875+
1876+
assert_eq!(
1877+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1878+
Event::CData(BytesCData::new("cdata"))
1879+
);
1880+
assert_eq!(
1881+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1882+
Event::Eof
1883+
);
1884+
}
1885+
1886+
#[$test]
1887+
$($async)? fn comment1() {
1888+
let xml = "<!--comment-->";
1889+
// ^^^^^^^^^^^^ data that fit into buffer
1890+
let size = xml.match_indices("-->").next().unwrap().0 + 1;
1891+
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1892+
let mut reader = Reader::from_reader(br);
1893+
let mut buf = Vec::new();
1894+
1895+
assert_eq!(
1896+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1897+
Event::Comment(BytesText::new("comment"))
1898+
);
1899+
assert_eq!(
1900+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1901+
Event::Eof
1902+
);
1903+
}
1904+
1905+
#[$test]
1906+
$($async)? fn comment2() {
1907+
let xml = "<!--comment-->";
1908+
// ^^^^^^^^^^^^^ data that fit into buffer
1909+
let size = xml.match_indices("-->").next().unwrap().0 + 2;
1910+
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1911+
let mut reader = Reader::from_reader(br);
1912+
let mut buf = Vec::new();
1913+
1914+
assert_eq!(
1915+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1916+
Event::Comment(BytesText::new("comment"))
1917+
);
1918+
assert_eq!(
1919+
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1920+
Event::Eof
1921+
);
1922+
}
1923+
}
1924+
};
1925+
}
1926+
1927+
// Export macros for the child modules:
17551928
// - buffered_reader
17561929
// - slice_reader
17571930
pub(super) use check;
1931+
pub(super) use small_buffers;
17581932
}

src/reader/parser.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ impl Parser {
9090
let len = buf.len();
9191
match bang_type {
9292
BangType::Comment if buf.starts_with(b"!--") => {
93+
debug_assert!(buf.ends_with(b"--"));
9394
if self.check_comments {
9495
// search if '--' not in comments
9596
if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2])
@@ -105,7 +106,11 @@ impl Parser {
105106
)))
106107
}
107108
BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
108-
Ok(Event::CData(BytesCData::wrap(&buf[8..], self.decoder())))
109+
debug_assert!(buf.ends_with(b"]]"));
110+
Ok(Event::CData(BytesCData::wrap(
111+
&buf[8..len - 2],
112+
self.decoder(),
113+
)))
109114
}
110115
BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
111116
let start = buf[8..]

src/reader/slice_reader.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
289289

290290
let bang_type = BangType::new(self[1..].first().copied())?;
291291

292-
if let Some((bytes, i)) = bang_type.parse(self, 0) {
292+
if let Some((bytes, i)) = bang_type.parse(&[], self) {
293293
*position += i;
294294
*self = &self[i..];
295295
return Ok(Some((bang_type, bytes)));

0 commit comments

Comments
 (0)