Skip to content

Commit d34b779

Browse files
authored
Merge pull request #421 from Mingun/unescape
Enforce unescape functions to operate over UTF-8 and fix serde bugs
2 parents 72e11d1 + 7578119 commit d34b779

18 files changed

+378
-298
lines changed

Changelog.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@
4242
- [#363]: Do not generate empty `Event::Text` events
4343
- [#412]: Fix using incorrect encoding if `read_to_end` family of methods or `read_text`
4444
method not found a corresponding end tag and reader has non-UTF-8 encoding
45+
- [#421]: Fix incorrect order of unescape and decode operations for serde deserializer:
46+
decoding should be first, unescape is the second
47+
- [#421]: Fixed unknown bug in serde deserialization of externally tagged enums
48+
when an enum variant represented as a `Text` event (i.e. `<xml>tag</xml>`)
49+
and a document encoding is not an UTF-8
4550

4651
### Misc Changes
4752

@@ -109,6 +114,7 @@
109114
|`read_event_unbuffered` |`read_event`
110115
|`read_to_end_unbuffered` |`read_to_end`
111116
- [#412]: Change `read_to_end*` and `read_text_into` to accept `QName` instead of `AsRef<[u8]>`
117+
112118
- [#415]: Changed custom entity unescaping API to accept closures rather than a mapping of entity to
113119
replacement text. This avoids needing to allocate a map and provides the user with more flexibility.
114120
- [#415]: Renamed many functions following the pattern `unescape_and_decode*` to `decode_and_unescape*`
@@ -118,9 +124,16 @@
118124
`BytesText::unescape()`, `BytesText::unescaped_with()` renamed to `BytesText::unescape_with()`,
119125
`Attribute::escaped_value()` renamed to `Attribute::escape_value()`, and `Attribute::escaped_value_with()`
120126
renamed to `Attribute::escape_value_with()` for consistency across the API.
127+
121128
- [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method
122129
added to all events
123130

131+
- [#421]: `decode_and_unescape*` methods now does one less allocation if unescaping is not required
132+
- [#421]: Removed ability to deserialize byte arrays from serde deserializer.
133+
XML is not able to store binary data directly, you should always use some encoding
134+
scheme, for example, HEX or Base64
135+
- [#421]: All unescaping functions now accepts and returns strings instead of byte slices
136+
124137
### New Tests
125138

126139
- [#9]: Added tests for incorrect nested tags in input
@@ -148,6 +161,7 @@
148161
[#415]: https://github.com/tafia/quick-xml/pull/415
149162
[#416]: https://github.com/tafia/quick-xml/pull/416
150163
[#418]: https://github.com/tafia/quick-xml/pull/418
164+
[#421]: https://github.com/tafia/quick-xml/pull/421
151165

152166
## 0.23.0 -- 2022-05-08
153167

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ loop {
5151
_ => (),
5252
}
5353
},
54-
Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).unwrap()),
54+
Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).unwrap().into_owned()),
5555
Ok(Event::Eof) => break, // exits the loop when reaching end of file
5656
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
5757
_ => (), // There are several other `Event`s we do not consider here

benches/macrobenches.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ fn parse_document(doc: &[u8]) -> XmlResult<()> {
2424
match r.read_event()? {
2525
Event::Start(e) | Event::Empty(e) => {
2626
for attr in e.attributes() {
27-
criterion::black_box(attr?.unescape_value()?);
27+
criterion::black_box(attr?.decode_and_unescape_value(&r)?);
2828
}
2929
}
3030
Event::Text(e) => {
31-
criterion::black_box(e.unescape()?);
31+
criterion::black_box(e.decode_and_unescape(&r)?);
3232
}
3333
Event::CData(e) => {
3434
criterion::black_box(e.into_inner());

benches/microbenches.rs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ use quick_xml::Reader;
88
static SAMPLE: &[u8] = include_bytes!("../tests/documents/sample_rss.xml");
99
static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml");
1010

11-
static LOREM_IPSUM_TEXT: &[u8] =
12-
b"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
11+
static LOREM_IPSUM_TEXT: &str =
12+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
1313
ut labore et dolore magna aliqua. Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.
1414
Risus ultricies tristique nulla aliquet enim tortor at. Fermentum odio eu feugiat pretium nibh ipsum.
1515
Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu cursus
@@ -176,7 +176,7 @@ fn one_event(c: &mut Criterion) {
176176
.check_comments(false)
177177
.trim_text(true);
178178
match r.read_event_into(&mut buf) {
179-
Ok(Event::Comment(ref e)) => nbtxt += e.unescape().unwrap().len(),
179+
Ok(Event::Comment(e)) => nbtxt += e.decode_and_unescape(&r).unwrap().len(),
180180
something_else => panic!("Did not expect {:?}", something_else),
181181
};
182182

@@ -293,7 +293,7 @@ fn escaping(c: &mut Criterion) {
293293

294294
group.bench_function("no_chars_to_escape_long", |b| {
295295
b.iter(|| {
296-
criterion::black_box(escape(LOREM_IPSUM_TEXT));
296+
criterion::black_box(escape(LOREM_IPSUM_TEXT.as_bytes()));
297297
})
298298
});
299299

@@ -345,31 +345,31 @@ fn unescaping(c: &mut Criterion) {
345345

346346
group.bench_function("no_chars_to_unescape_short", |b| {
347347
b.iter(|| {
348-
criterion::black_box(unescape(b"just a bit of text")).unwrap();
348+
criterion::black_box(unescape("just a bit of text")).unwrap();
349349
})
350350
});
351351

352352
group.bench_function("char_reference", |b| {
353353
b.iter(|| {
354-
let text = b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
354+
let text = "prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
355355
criterion::black_box(unescape(text)).unwrap();
356-
let text = b"&#38;&#60;";
356+
let text = "&#38;&#60;";
357357
criterion::black_box(unescape(text)).unwrap();
358358
})
359359
});
360360

361361
group.bench_function("entity_reference", |b| {
362362
b.iter(|| {
363-
let text = b"age &gt; 72 &amp;&amp; age &lt; 21";
363+
let text = "age &gt; 72 &amp;&amp; age &lt; 21";
364364
criterion::black_box(unescape(text)).unwrap();
365-
let text = b"&quot;what&apos;s that?&quot;";
365+
let text = "&quot;what&apos;s that?&quot;";
366366
criterion::black_box(unescape(text)).unwrap();
367367
})
368368
});
369369

370370
group.bench_function("mixed", |b| {
371371
let text =
372-
b"Lorem ipsum dolor sit amet, &amp;consectetur adipiscing elit, sed do eiusmod tempor incididunt
372+
"Lorem ipsum dolor sit amet, &amp;consectetur adipiscing elit, sed do eiusmod tempor incididunt
373373
ut labore et dolore magna aliqua. Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.
374374
Risus ultricies &quot;tristique nulla aliquet enim tortor&quot; at. Fermentum odio eu feugiat pretium
375375
nibh ipsum. Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu

examples/custom_entities.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,15 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
2828
reader.trim_text(true);
2929

3030
let mut buf = Vec::new();
31-
let mut custom_entities: HashMap<Vec<u8>, String> = HashMap::new();
31+
let mut custom_entities: HashMap<String, String> = HashMap::new();
3232
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;
3333

3434
loop {
3535
match reader.read_event_into(&mut buf) {
3636
Ok(Event::DocType(ref e)) => {
3737
for cap in entity_re.captures_iter(&e) {
3838
custom_entities.insert(
39-
cap[1].to_vec(),
39+
reader.decoder().decode(&cap[1])?.into_owned(),
4040
reader.decoder().decode(&cap[2])?.into_owned(),
4141
);
4242
}
@@ -51,6 +51,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
5151
custom_entities.get(ent).map(|s| s.as_str())
5252
})
5353
.unwrap()
54+
.into_owned()
5455
})
5556
.collect::<Vec<_>>();
5657
println!("attributes values: {:?}", attributes);

src/de/escape.rs

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
//! Serde `Deserializer` module
22
33
use crate::de::deserialize_bool;
4-
use crate::{errors::serialize::DeError, errors::Error, escape::unescape, reader::Decoder};
4+
use crate::errors::serialize::DeError;
5+
use crate::escape::unescape;
6+
use crate::reader::Decoder;
57
use serde::de::{DeserializeSeed, EnumAccess, VariantAccess, Visitor};
68
use serde::{self, forward_to_deserialize_any, serde_if_integer128};
79
use std::borrow::Cow;
@@ -30,13 +32,6 @@ impl<'a> EscapedDeserializer<'a> {
3032
escaped,
3133
}
3234
}
33-
fn unescaped(&self) -> Result<Cow<[u8]>, DeError> {
34-
if self.escaped {
35-
unescape(&self.escaped_value).map_err(|e| DeError::InvalidXml(Error::EscapeError(e)))
36-
} else {
37-
Ok(Cow::Borrowed(&self.escaped_value))
38-
}
39-
}
4035
}
4136

4237
macro_rules! deserialize_num {
@@ -66,20 +61,31 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
6661
where
6762
V: Visitor<'de>,
6863
{
69-
let unescaped = self.unescaped()?;
70-
let value = self.decoder.decode(&unescaped)?;
71-
72-
visitor.visit_str(&value)
64+
let decoded = self.decoder.decode(&self.escaped_value)?;
65+
if self.escaped {
66+
match unescape(&decoded)? {
67+
Cow::Borrowed(s) => visitor.visit_str(s),
68+
Cow::Owned(s) => visitor.visit_string(s),
69+
}
70+
} else {
71+
match decoded {
72+
Cow::Borrowed(s) => visitor.visit_str(s),
73+
Cow::Owned(s) => visitor.visit_string(s),
74+
}
75+
}
7376
}
7477

75-
fn deserialize_bytes<V>(self, visitor: V) -> Result<V::Value, Self::Error>
78+
/// Returns [`DeError::Unsupported`]
79+
fn deserialize_bytes<V>(self, _visitor: V) -> Result<V::Value, Self::Error>
7680
where
7781
V: Visitor<'de>,
7882
{
79-
let v = self.unescaped()?;
80-
visitor.visit_bytes(&v)
83+
Err(DeError::Unsupported(
84+
"binary data content is not supported by XML format",
85+
))
8186
}
8287

88+
/// Forwards deserialization to the [`Self::deserialize_bytes`]
8389
fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, Self::Error>
8490
where
8591
V: Visitor<'de>,

src/de/map.rs

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,10 @@ use crate::{
44
de::escape::EscapedDeserializer,
55
de::seq::{not_in, TagFilter},
66
de::simple_type::SimpleTypeDeserializer,
7-
de::{deserialize_bool, DeEvent, Deserializer, XmlRead, INNER_VALUE, UNFLATTEN_PREFIX},
7+
de::{str2bool, DeEvent, Deserializer, XmlRead, INNER_VALUE, UNFLATTEN_PREFIX},
88
errors::serialize::DeError,
99
events::attributes::IterState,
10-
events::{BytesCData, BytesStart},
11-
reader::Decoder,
10+
events::BytesStart,
1211
};
1312
use serde::de::{self, DeserializeSeed, IntoDeserializer, SeqAccess, Visitor};
1413
use serde::serde_if_integer128;
@@ -479,15 +478,9 @@ where
479478
{
480479
/// Returns a text event, used inside [`deserialize_primitives!()`]
481480
#[inline]
482-
fn next_text(&mut self, unescape: bool) -> Result<BytesCData<'de>, DeError> {
481+
fn next_text(&mut self, unescape: bool) -> Result<Cow<'de, str>, DeError> {
483482
self.map.de.next_text_impl(unescape, self.allow_start)
484483
}
485-
486-
/// Returns a decoder, used inside [`deserialize_primitives!()`]
487-
#[inline]
488-
fn decoder(&self) -> Decoder {
489-
self.map.de.reader.decoder()
490-
}
491484
}
492485

493486
impl<'de, 'a, 'm, R> de::Deserializer<'de> for MapValueDeserializer<'de, 'a, 'm, R>
@@ -649,15 +642,9 @@ where
649642
{
650643
/// Returns a text event, used inside [`deserialize_primitives!()`]
651644
#[inline]
652-
fn next_text(&mut self, unescape: bool) -> Result<BytesCData<'de>, DeError> {
645+
fn next_text(&mut self, unescape: bool) -> Result<Cow<'de, str>, DeError> {
653646
self.map.de.next_text_impl(unescape, true)
654647
}
655-
656-
/// Returns a decoder, used inside [`deserialize_primitives!()`]
657-
#[inline]
658-
fn decoder(&self) -> Decoder {
659-
self.map.de.reader.decoder()
660-
}
661648
}
662649

663650
impl<'de, 'a, 'm, R> de::Deserializer<'de> for SeqValueDeserializer<'de, 'a, 'm, R>

src/de/mod.rs

Lines changed: 17 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,7 @@ macro_rules! deserialize_type {
117117
{
118118
// No need to unescape because valid integer representations cannot be escaped
119119
let text = self.next_text(false)?;
120-
let string = text.decode(self.decoder())?;
121-
visitor.$visit(string.parse()?)
120+
visitor.$visit(text.parse()?)
122121
}
123122
};
124123
}
@@ -152,7 +151,7 @@ macro_rules! deserialize_primitives {
152151
// No need to unescape because valid boolean representations cannot be escaped
153152
let text = self.next_text(false)?;
154153

155-
deserialize_bool(text.as_ref(), self.decoder(), visitor)
154+
str2bool(&text, visitor)
156155
}
157156

158157
/// Representation of owned strings the same as [non-owned](#method.deserialize_str).
@@ -176,30 +175,26 @@ macro_rules! deserialize_primitives {
176175
V: Visitor<'de>,
177176
{
178177
let text = self.next_text(true)?;
179-
let string = text.decode(self.decoder())?;
180-
match string {
178+
match text {
181179
Cow::Borrowed(string) => visitor.visit_borrowed_str(string),
182180
Cow::Owned(string) => visitor.visit_string(string),
183181
}
184182
}
185183

186-
fn deserialize_bytes<V>($($mut)? self, visitor: V) -> Result<V::Value, DeError>
184+
/// Returns [`DeError::Unsupported`]
185+
fn deserialize_bytes<V>(self, _visitor: V) -> Result<V::Value, DeError>
187186
where
188187
V: Visitor<'de>,
189188
{
190-
// No need to unescape because bytes gives access to the raw XML input
191-
let text = self.next_text(false)?;
192-
visitor.visit_bytes(&text)
189+
Err(DeError::Unsupported("binary data content is not supported by XML format"))
193190
}
194191

195-
fn deserialize_byte_buf<V>($($mut)? self, visitor: V) -> Result<V::Value, DeError>
192+
/// Forwards deserialization to the [`deserialize_bytes`](#method.deserialize_bytes).
193+
fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, DeError>
196194
where
197195
V: Visitor<'de>,
198196
{
199-
// No need to unescape because bytes gives access to the raw XML input
200-
let text = self.next_text(false)?;
201-
let value = text.into_inner().into_owned();
202-
visitor.visit_byte_buf(value)
197+
self.deserialize_bytes(visitor)
203198
}
204199

205200
/// Identifiers represented as [strings](#method.deserialize_str).
@@ -576,7 +571,7 @@ where
576571
}
577572

578573
#[inline]
579-
fn next_text(&mut self, unescape: bool) -> Result<BytesCData<'de>, DeError> {
574+
fn next_text(&mut self, unescape: bool) -> Result<Cow<'de, str>, DeError> {
580575
self.next_text_impl(unescape, true)
581576
}
582577

@@ -616,25 +611,24 @@ where
616611
&mut self,
617612
unescape: bool,
618613
allow_start: bool,
619-
) -> Result<BytesCData<'de>, DeError> {
614+
) -> Result<Cow<'de, str>, DeError> {
615+
let decoder = self.reader.decoder();
620616
match self.next()? {
621-
DeEvent::Text(e) if unescape => e.unescape_as_cdata().map_err(Into::into),
622-
DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())),
623-
DeEvent::CData(e) => Ok(e),
617+
DeEvent::Text(e) => Ok(e.decode(decoder, unescape)?),
618+
DeEvent::CData(e) => Ok(e.decode(decoder)?),
624619
DeEvent::Start(e) if allow_start => {
625620
// allow one nested level
626621
let inner = self.next()?;
627622
let t = match inner {
628-
DeEvent::Text(t) if unescape => t.unescape_as_cdata()?,
629-
DeEvent::Text(t) => BytesCData::new(t.into_inner()),
630-
DeEvent::CData(t) => t,
623+
DeEvent::Text(t) => t.decode(decoder, unescape)?,
624+
DeEvent::CData(t) => t.decode(decoder)?,
631625
DeEvent::Start(s) => {
632626
return Err(DeError::UnexpectedStart(s.name().as_ref().to_owned()))
633627
}
634628
// We can get End event in case of `<tag></tag>` or `<tag/>` input
635629
// Return empty text in that case
636630
DeEvent::End(end) if end.name() == e.name() => {
637-
return Ok(BytesCData::new(&[] as &[u8]));
631+
return Ok("".into());
638632
}
639633
DeEvent::End(end) => {
640634
return Err(DeError::UnexpectedEnd(end.name().as_ref().to_owned()))
@@ -650,12 +644,6 @@ where
650644
}
651645
}
652646

653-
/// Returns a decoder, used inside `deserialize_primitives!()`
654-
#[inline]
655-
fn decoder(&self) -> Decoder {
656-
self.reader.decoder()
657-
}
658-
659647
/// Drops all events until event with [name](BytesEnd::name()) `name` won't be
660648
/// dropped. This method should be called after [`Self::next()`]
661649
#[cfg(feature = "overlapped-lists")]

0 commit comments

Comments
 (0)