Skip to content

Commit ea93eeb

Browse files
committed
Remove the ability to use SliceReader with raw bytes.
In the near future, decoding will be performed automatically as the input is read. If the input has an unknown encoding, it must be decoded first, necessitating a buffer. Therefore, only the buffered implementation can be used for `Reader::from_bytes()`. If the encoding of the bytes is known up-front, you can decode them up-front and subsequently use `Reader::from_str()` if desired.
1 parent 559d0e8 commit ea93eeb

File tree

10 files changed

+158
-156
lines changed

10 files changed

+158
-156
lines changed

Changelog.md

+6-1
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,12 @@
138138
- [#423]: Removed `BytesText::from_plain` because it internally did escaping of a byte array,
139139
but since now escaping works on strings. Use `BytesText::from_plain_str` instead
140140
- [#425]: Split the internal implementation of `Reader` into multiple files to better separate the
141-
buffered and unbuffered implementations. The buffered methods, e.g. `read_event_into(&mut buf)`,
141+
buffered and unbuffered implementations. The unbuffered methods, e.g. `read_event()`,
142142
will no longer be available when reading from a slice.
143+
- [#436]: When using `Reader` with raw bytes, a buffered parsing implementation will always be used.
144+
If using `Reader::from_str()`, the reader will borrow directly from the `&str`. If you have a byte
145+
array known to be valid UTF-8, it is recommended to convert it to `&str` first, which will enable
146+
the unbuffered (borrowing) implementation.
143147

144148
### New Tests
145149

@@ -171,6 +175,7 @@
171175
[#421]: https://github.com/tafia/quick-xml/pull/421
172176
[#423]: https://github.com/tafia/quick-xml/pull/423
173177
[#425]: https://github.com/tafia/quick-xml/pull/425
178+
[#436]: https://github.com/tafia/quick-xml/pull/436
174179

175180
## 0.23.0 -- 2022-05-08
176181

src/de/mod.rs

+25-30
Original file line numberDiff line numberDiff line change
@@ -306,8 +306,8 @@ where
306306
}
307307

308308
/// Deserialize from a reader. This method will do internal copies of data
309-
/// readed from `reader`. If you want have a `&[u8]` or `&str` input and want
310-
/// to borrow as much as possible, use [`from_slice`] or [`from_str`]
309+
read from `reader`. If you have a `&str` input and want
310+
/// to borrow as much as possible, use [`from_str`]
311311
pub fn from_reader<R, T>(reader: R) -> Result<T, DeError>
312312
where
313313
R: BufRead,
@@ -685,17 +685,7 @@ where
685685
impl<'de> Deserializer<'de, SliceReader<'de>> {
686686
/// Create new deserializer that will borrow data from the specified string
687687
pub fn from_str(s: &'de str) -> Self {
688-
Self::from_borrowing_reader(Reader::from_str(s))
689-
}
690-
691-
/// Create new deserializer that will borrow data from the specified byte array
692-
pub fn from_slice(bytes: &'de [u8]) -> Self {
693-
Self::from_borrowing_reader(Reader::from_bytes(bytes))
694-
}
695-
696-
/// Create new deserializer that will borrow data from the specified borrowing reader
697-
#[inline]
698-
fn from_borrowing_reader(mut reader: Reader<crate::SliceReader<'de>>) -> Self {
688+
let mut reader = Reader::from_str(s);
699689
reader
700690
.expand_empty_elements(true)
701691
.check_end_names(true)
@@ -726,6 +716,13 @@ where
726716
}
727717
}
728718

719+
impl<'de> Deserializer<'de, IoReader<&'de [u8]>> {
720+
/// Create new deserializer that will read data from the specified byte array through a buffered reader (the data is copied, not borrowed)
721+
pub fn from_slice(bytes: &'de [u8]) -> Self {
722+
Self::from_reader(bytes)
723+
}
724+
}
725+
729726
impl<'de, 'a, R> de::Deserializer<'de> for &'a mut Deserializer<'de, R>
730727
where
731728
R: XmlRead<'de>,
@@ -970,10 +967,10 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
970967
}
971968
}
972969

973-
/// XML input source that reads from a slice of bytes and can borrow from it.
970+
/// XML input source that reads from a `&str` and can borrow from it.
974971
///
975972
/// You cannot create it, it is created automatically when you call
976-
/// [`Deserializer::from_str`] or [`Deserializer::from_slice`]
973+
/// [`Deserializer::from_str`]
977974
pub struct SliceReader<'de> {
978975
reader: Reader<crate::SliceReader<'de>>,
979976
}
@@ -1025,8 +1022,8 @@ mod tests {
10251022
/// Checks that `peek()` and `read()` behave correctly after `skip()`
10261023
#[test]
10271024
fn read_and_peek() {
1028-
let mut de = Deserializer::from_slice(
1029-
br#"
1025+
let mut de = Deserializer::from_str(
1026+
r#"
10301027
<root>
10311028
<inner>
10321029
text
@@ -1166,8 +1163,8 @@ mod tests {
11661163
/// Checks that `read_to_end()` behaves correctly after `skip()`
11671164
#[test]
11681165
fn read_to_end() {
1169-
let mut de = Deserializer::from_slice(
1170-
br#"
1166+
let mut de = Deserializer::from_str(
1167+
r#"
11711168
<root>
11721169
<skip>
11731170
text
@@ -1270,8 +1267,8 @@ mod tests {
12701267
item: Vec<()>,
12711268
}
12721269

1273-
let mut de = Deserializer::from_slice(
1274-
br#"
1270+
let mut de = Deserializer::from_str(
1271+
r#"
12751272
<any-name>
12761273
<item/>
12771274
<another-item>
@@ -1296,8 +1293,8 @@ mod tests {
12961293
fn read_to_end() {
12971294
use crate::de::DeEvent::*;
12981295

1299-
let mut de = Deserializer::from_slice(
1300-
br#"
1296+
let mut de = Deserializer::from_str(
1297+
r#"
13011298
<root>
13021299
<tag a="1"><tag>text</tag>content</tag>
13031300
<tag a="2"><![CDATA[cdata content]]></tag>
@@ -1343,15 +1340,14 @@ mod tests {
13431340
<item name="hello" source="world.rs">Some text</item>
13441341
<item2/>
13451342
<item3 value="world" />
1346-
"##
1347-
.as_bytes();
1343+
"##;
13481344

13491345
let mut reader1 = IoReader {
1350-
reader: Reader::from_reader(s),
1346+
reader: Reader::from_reader(s.as_bytes()),
13511347
buf: Vec::new(),
13521348
};
13531349
let mut reader2 = SliceReader {
1354-
reader: Reader::from_bytes(s),
1350+
reader: Reader::from_str(s),
13551351
};
13561352

13571353
loop {
@@ -1373,11 +1369,10 @@ mod tests {
13731369
<item2></item2>
13741370
<item3/>
13751371
<item4 value="world" />
1376-
"##
1377-
.as_bytes();
1372+
"##;
13781373

13791374
let mut reader = SliceReader {
1380-
reader: Reader::from_bytes(s),
1375+
reader: Reader::from_str(s),
13811376
};
13821377

13831378
reader

src/events/mod.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -983,8 +983,9 @@ pub enum Event<'a> {
983983
/// let xml = b"\xEF\xBB\xBF<?xml version='1.0'?>";
984984
/// let mut reader = Reader::from_bytes(xml);
985985
/// let mut events_processed = 0;
986+
/// let mut event_buffer = Vec::new();
986987
/// loop {
987-
/// match reader.read_event() {
988+
/// match reader.read_event_into(&mut event_buffer) {
988989
/// Ok(Event::StartText(e)) => {
989990
/// assert_eq!(events_processed, 0);
990991
/// // Content contains BOM

src/reader/buffered_reader.rs

+58-4
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,14 @@ impl Reader<BufferedReader<BufReader<File>>> {
406406
}
407407
}
408408

409+
/// Builder for reading from a byte slice (`&[u8]`).
410+
impl<'buf> Reader<BufferedReader<&'buf [u8]>> {
411+
/// Creates an XML reader from a byte slice.
412+
pub fn from_bytes(s: &'buf [u8]) -> Self {
413+
Self::from_reader_internal(BufferedReader(s))
414+
}
415+
}
416+
409417
/// Builder for reading from any [`BufRead`].
410418
impl<R: BufRead> Reader<BufferedReader<R>> {
411419
/// Creates an XML reader from any type implementing [`BufRead`].
@@ -652,17 +660,63 @@ mod test {
652660
use super::*;
653661
use crate::reader::test::check;
654662

655-
fn input_from_bytes(bytes: &[u8]) -> BufferedReader<&[u8]> {
656-
BufferedReader(bytes)
663+
fn input_from_str(s: &str) -> BufferedReader<&[u8]> {
664+
BufferedReader(s.as_bytes())
657665
}
658666

659667
fn reader_from_str(s: &str) -> Reader<BufferedReader<&[u8]>> {
660668
Reader::from_reader_internal(BufferedReader(s.as_bytes()))
661669
}
662670

663671
#[allow(dead_code)]
664-
fn reader_from_bytes(s: &[u8]) -> Reader<BufferedReader<&[u8]>> {
665-
Reader::from_reader_internal(BufferedReader(s))
672+
fn reader_from_bytes(bytes: &[u8]) -> Reader<BufferedReader<&[u8]>> {
673+
Reader::from_reader_internal(BufferedReader(bytes))
674+
}
675+
676+
#[cfg(feature = "encoding")]
677+
mod encoding {
678+
use super::reader_from_bytes;
679+
use crate::events::Event;
680+
use encoding_rs::{UTF_16LE, UTF_8, WINDOWS_1251};
681+
682+
mod bytes {
683+
use super::reader_from_bytes;
684+
use super::*;
685+
use pretty_assertions::assert_eq;
686+
687+
/// Checks that encoding is detected by BOM and changed after XML declaration
688+
#[test]
689+
fn bom_detected() {
690+
let mut reader = reader_from_bytes(b"\xFF\xFE<?xml encoding='windows-1251'?>");
691+
let mut buf = Vec::new();
692+
693+
assert_eq!(reader.decoder().encoding(), UTF_8);
694+
reader.read_event_impl(&mut buf).unwrap();
695+
assert_eq!(reader.decoder().encoding(), UTF_16LE);
696+
697+
reader.read_event_impl(&mut buf).unwrap();
698+
assert_eq!(reader.decoder().encoding(), WINDOWS_1251);
699+
700+
assert_eq!(reader.read_event_impl(&mut buf).unwrap(), Event::Eof);
701+
}
702+
703+
/// Checks that encoding is changed by XML declaration, but only once
704+
#[test]
705+
fn xml_declaration() {
706+
let mut reader =
707+
reader_from_bytes(b"<?xml encoding='UTF-16'?><?xml encoding='windows-1251'?>");
708+
let mut buf = Vec::new();
709+
710+
assert_eq!(reader.decoder().encoding(), UTF_8);
711+
reader.read_event_impl(&mut buf).unwrap();
712+
assert_eq!(reader.decoder().encoding(), UTF_16LE);
713+
714+
reader.read_event_impl(&mut buf).unwrap();
715+
assert_eq!(reader.decoder().encoding(), UTF_16LE);
716+
717+
assert_eq!(reader.read_event_impl(&mut buf).unwrap(), Event::Eof);
718+
}
719+
}
666720
}
667721

668722
check!(let mut buf = Vec::new(););

0 commit comments

Comments
 (0)