Skip to content

Commit 068b36e

Browse files
authored
Merge pull request #423 from Mingun/escape
Enforce escape functions to operate over UTF-8
2 parents d34b779 + 1a1f4bf commit 068b36e

File tree

9 files changed

+84
-69
lines changed

9 files changed

+84
-69
lines changed

Changelog.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,10 @@
134134
scheme, for example, HEX or Base64
135135
- [#421]: All unescaping functions now accepts and returns strings instead of byte slices
136136

137+
- [#423]: All escaping functions now accepts and returns strings instead of byte slices
138+
- [#423]: Removed `BytesText::from_plain` because it internally did escaping of a byte array,
139+
but since now escaping works on strings. Use `BytesText::from_plain_str` instead
140+
137141
### New Tests
138142

139143
- [#9]: Added tests for incorrect nested tags in input
@@ -162,6 +166,7 @@
162166
[#416]: https://github.com/tafia/quick-xml/pull/416
163167
[#418]: https://github.com/tafia/quick-xml/pull/418
164168
[#421]: https://github.com/tafia/quick-xml/pull/421
169+
[#423]: https://github.com/tafia/quick-xml/pull/423
165170

166171
## 0.23.0 -- 2022-05-08
167172

benches/microbenches.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -293,26 +293,26 @@ fn escaping(c: &mut Criterion) {
293293

294294
group.bench_function("no_chars_to_escape_long", |b| {
295295
b.iter(|| {
296-
criterion::black_box(escape(LOREM_IPSUM_TEXT.as_bytes()));
296+
criterion::black_box(escape(LOREM_IPSUM_TEXT));
297297
})
298298
});
299299

300300
group.bench_function("no_chars_to_escape_short", |b| {
301301
b.iter(|| {
302-
criterion::black_box(escape(b"just bit of text"));
302+
criterion::black_box(escape("just bit of text"));
303303
})
304304
});
305305

306306
group.bench_function("escaped_chars_short", |b| {
307307
b.iter(|| {
308-
criterion::black_box(escape(b"age > 72 && age < 21"));
309-
criterion::black_box(escape(b"\"what's that?\""));
308+
criterion::black_box(escape("age > 72 && age < 21"));
309+
criterion::black_box(escape("\"what's that?\""));
310310
})
311311
});
312312

313313
group.bench_function("escaped_chars_long", |b| {
314314
let lorem_ipsum_with_escape_chars =
315-
b"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
315+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
316316
ut labore et dolore magna aliqua. & Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.
317317
Risus ultricies tristique nulla aliquet enim tortor at. Fermentum odio eu feugiat pretium nibh ipsum.
318318
Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu cursus

src/de/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1405,7 +1405,7 @@ mod tests {
14051405
br#"item name="hello" source="world.rs""#,
14061406
4
14071407
)),
1408-
Text(BytesText::from_escaped(b"Some text".as_ref())),
1408+
Text(BytesText::from_escaped_str("Some text")),
14091409
End(BytesEnd::borrowed(b"item")),
14101410
Start(BytesStart::borrowed(b"item2", 5)),
14111411
End(BytesEnd::borrowed(b"item2")),

src/escapei.rs

Lines changed: 41 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ impl std::fmt::Display for EscapeError {
5959

6060
impl std::error::Error for EscapeError {}
6161

62-
/// Escapes a `&[u8]` and replaces all xml special characters (`<`, `>`, `&`, `'`, `"`)
62+
/// Escapes an `&str` and replaces all xml special characters (`<`, `>`, `&`, `'`, `"`)
6363
/// with their corresponding xml escaped value.
6464
///
6565
/// This function performs following replacements:
@@ -71,11 +71,11 @@ impl std::error::Error for EscapeError {}
7171
/// | `&` | `&amp;`
7272
/// | `'` | `&apos;`
7373
/// | `"` | `&quot;`
74-
pub fn escape(raw: &[u8]) -> Cow<[u8]> {
74+
pub fn escape(raw: &str) -> Cow<str> {
7575
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
7676
}
7777

78-
/// Escapes a `&[u8]` and replaces xml special characters (`<`, `>`, `&`)
78+
/// Escapes an `&str` and replaces xml special characters (`<`, `>`, `&`)
7979
/// with their corresponding xml escaped value.
8080
///
8181
/// Should only be used for escaping text content. In XML text content, it is allowed
@@ -88,24 +88,25 @@ pub fn escape(raw: &[u8]) -> Cow<[u8]> {
8888
/// | `<` | `&lt;`
8989
/// | `>` | `&gt;`
9090
/// | `&` | `&amp;`
91-
pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
91+
pub fn partial_escape(raw: &str) -> Cow<str> {
9292
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
9393
}
9494

95-
/// Escapes a `&[u8]` and replaces a subset of xml special characters (`<`, `>`,
95+
/// Escapes an `&str` and replaces a subset of xml special characters (`<`, `>`,
9696
/// `&`, `'`, `"`) with their corresponding xml escaped value.
97-
fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
97+
fn _escape<F: Fn(u8) -> bool>(raw: &str, escape_chars: F) -> Cow<str> {
98+
let bytes = raw.as_bytes();
9899
let mut escaped = None;
99-
let mut bytes = raw.iter();
100+
let mut iter = bytes.iter();
100101
let mut pos = 0;
101-
while let Some(i) = bytes.position(|&b| escape_chars(b)) {
102+
while let Some(i) = iter.position(|&b| escape_chars(b)) {
102103
if escaped.is_none() {
103104
escaped = Some(Vec::with_capacity(raw.len()));
104105
}
105106
let escaped = escaped.as_mut().expect("initialized");
106107
let new_pos = pos + i;
107-
escaped.extend_from_slice(&raw[pos..new_pos]);
108-
match raw[new_pos] {
108+
escaped.extend_from_slice(&bytes[pos..new_pos]);
109+
match bytes[new_pos] {
109110
b'<' => escaped.extend_from_slice(b"&lt;"),
110111
b'>' => escaped.extend_from_slice(b"&gt;"),
111112
b'\'' => escaped.extend_from_slice(b"&apos;"),
@@ -117,10 +118,14 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
117118
}
118119

119120
if let Some(mut escaped) = escaped {
120-
if let Some(raw) = raw.get(pos..) {
121+
if let Some(raw) = bytes.get(pos..) {
121122
escaped.extend_from_slice(raw);
122123
}
123-
Cow::Owned(escaped)
124+
// SAFETY: we operate on UTF-8 input and search for an one byte chars only,
125+
// so all slices that was put to the `escaped` is a valid UTF-8 encoded strings
126+
// TODO: Can be replaced with `unsafe { String::from_utf8_unchecked() }`
127+
// if unsafe code will be allowed
128+
Cow::Owned(String::from_utf8(escaped).unwrap())
124129
} else {
125130
Cow::Borrowed(raw)
126131
}
@@ -1717,10 +1722,10 @@ fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
17171722

17181723
#[test]
17191724
fn test_unescape() {
1720-
assert_eq!(&*unescape("test").unwrap(), "test");
1721-
assert_eq!(&*unescape("&lt;test&gt;").unwrap(), "<test>");
1722-
assert_eq!(&*unescape("&#x30;").unwrap(), "0");
1723-
assert_eq!(&*unescape("&#48;").unwrap(), "0");
1725+
assert_eq!(unescape("test").unwrap(), Cow::Borrowed("test"));
1726+
assert_eq!(unescape("&lt;test&gt;").unwrap(), "<test>");
1727+
assert_eq!(unescape("&#x30;").unwrap(), "0");
1728+
assert_eq!(unescape("&#48;").unwrap(), "0");
17241729
assert!(unescape("&foo;").is_err());
17251730
}
17261731

@@ -1731,37 +1736,40 @@ fn test_unescape_with() {
17311736
_ => None,
17321737
};
17331738

1734-
assert_eq!(&*unescape_with("test", custom_entities).unwrap(), "test");
17351739
assert_eq!(
1736-
&*unescape_with("&lt;test&gt;", custom_entities).unwrap(),
1740+
unescape_with("test", custom_entities).unwrap(),
1741+
Cow::Borrowed("test")
1742+
);
1743+
assert_eq!(
1744+
unescape_with("&lt;test&gt;", custom_entities).unwrap(),
17371745
"<test>"
17381746
);
1739-
assert_eq!(&*unescape_with("&#x30;", custom_entities).unwrap(), "0");
1740-
assert_eq!(&*unescape_with("&#48;", custom_entities).unwrap(), "0");
1741-
assert_eq!(&*unescape_with("&foo;", custom_entities).unwrap(), "BAR");
1747+
assert_eq!(unescape_with("&#x30;", custom_entities).unwrap(), "0");
1748+
assert_eq!(unescape_with("&#48;", custom_entities).unwrap(), "0");
1749+
assert_eq!(unescape_with("&foo;", custom_entities).unwrap(), "BAR");
17421750
assert!(unescape_with("&fop;", custom_entities).is_err());
17431751
}
17441752

17451753
#[test]
17461754
fn test_escape() {
1747-
assert_eq!(&*escape(b"test"), b"test");
1748-
assert_eq!(&*escape(b"<test>"), b"&lt;test&gt;");
1749-
assert_eq!(&*escape(b"\"a\"bc"), b"&quot;a&quot;bc");
1750-
assert_eq!(&*escape(b"\"a\"b&c"), b"&quot;a&quot;b&amp;c");
1755+
assert_eq!(escape("test"), Cow::Borrowed("test"));
1756+
assert_eq!(escape("<test>"), "&lt;test&gt;");
1757+
assert_eq!(escape("\"a\"bc"), "&quot;a&quot;bc");
1758+
assert_eq!(escape("\"a\"b&c"), "&quot;a&quot;b&amp;c");
17511759
assert_eq!(
1752-
&*escape(b"prefix_\"a\"b&<>c"),
1753-
"prefix_&quot;a&quot;b&amp;&lt;&gt;c".as_bytes()
1760+
escape("prefix_\"a\"b&<>c"),
1761+
"prefix_&quot;a&quot;b&amp;&lt;&gt;c"
17541762
);
17551763
}
17561764

17571765
#[test]
17581766
fn test_partial_escape() {
1759-
assert_eq!(&*partial_escape(b"test"), b"test");
1760-
assert_eq!(&*partial_escape(b"<test>"), b"&lt;test&gt;");
1761-
assert_eq!(&*partial_escape(b"\"a\"bc"), b"\"a\"bc");
1762-
assert_eq!(&*partial_escape(b"\"a\"b&c"), b"\"a\"b&amp;c");
1767+
assert_eq!(partial_escape("test"), Cow::Borrowed("test"));
1768+
assert_eq!(partial_escape("<test>"), "&lt;test&gt;");
1769+
assert_eq!(partial_escape("\"a\"bc"), "\"a\"bc");
1770+
assert_eq!(partial_escape("\"a\"b&c"), "\"a\"b&amp;c");
17631771
assert_eq!(
1764-
&*partial_escape(b"prefix_\"a\"b&<>c"),
1765-
"prefix_\"a\"b&amp;&lt;&gt;c".as_bytes()
1772+
partial_escape("prefix_\"a\"b&<>c"),
1773+
"prefix_\"a\"b&amp;&lt;&gt;c"
17661774
);
17671775
}

src/events/attributes.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,10 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> {
143143
fn from(val: (&'a str, &'a str)) -> Attribute<'a> {
144144
Attribute {
145145
key: QName(val.0.as_bytes()),
146-
value: escape(val.1.as_bytes()),
146+
value: match escape(val.1) {
147+
Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
148+
Cow::Owned(s) => Cow::Owned(s.into_bytes()),
149+
},
147150
}
148151
}
149152
}

src/events/mod.rs

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -681,15 +681,6 @@ impl<'a> BytesText<'a> {
681681
}
682682
}
683683

684-
/// Creates a new `BytesText` from a byte sequence. The byte sequence is
685-
/// expected not to be escaped.
686-
#[inline]
687-
pub fn from_plain(content: &'a [u8]) -> Self {
688-
Self {
689-
content: escape(content),
690-
}
691-
}
692-
693684
/// Creates a new `BytesText` from an escaped string.
694685
#[inline]
695686
pub fn from_escaped_str<C: Into<Cow<'a, str>>>(content: C) -> Self {
@@ -703,7 +694,12 @@ impl<'a> BytesText<'a> {
703694
/// be escaped.
704695
#[inline]
705696
pub fn from_plain_str(content: &'a str) -> Self {
706-
Self::from_plain(content.as_bytes())
697+
Self {
698+
content: match escape(content) {
699+
Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
700+
Cow::Owned(s) => Cow::Owned(s.into_bytes()),
701+
},
702+
}
707703
}
708704

709705
/// Ensures that all data is owned to extend the object's lifetime if
@@ -901,11 +897,13 @@ impl<'a> BytesCData<'a> {
901897
/// | `&` | `&amp;`
902898
/// | `'` | `&apos;`
903899
/// | `"` | `&quot;`
904-
pub fn escape(self) -> BytesText<'a> {
905-
BytesText::from_escaped(match escape(&self.content) {
900+
pub fn escape(self, decoder: Decoder) -> Result<BytesText<'a>> {
901+
let decoded = self.decode(decoder)?;
902+
Ok(BytesText::from_escaped(match escape(&decoded) {
903+
// Because result is borrowed, no replacements was done and we can use original content
906904
Cow::Borrowed(_) => self.content,
907-
Cow::Owned(escaped) => Cow::Owned(escaped),
908-
})
905+
Cow::Owned(escaped) => Cow::Owned(escaped.into_bytes()),
906+
}))
909907
}
910908

911909
/// Converts this CDATA content to an escaped version, that can be written
@@ -921,15 +919,16 @@ impl<'a> BytesCData<'a> {
921919
/// | `<` | `&lt;`
922920
/// | `>` | `&gt;`
923921
/// | `&` | `&amp;`
924-
pub fn partial_escape(self) -> BytesText<'a> {
925-
BytesText::from_escaped(match partial_escape(&self.content) {
922+
pub fn partial_escape(self, decoder: Decoder) -> Result<BytesText<'a>> {
923+
let decoded = self.decode(decoder)?;
924+
Ok(BytesText::from_escaped(match partial_escape(&decoded) {
925+
// Because result is borrowed, no replacements was done and we can use original content
926926
Cow::Borrowed(_) => self.content,
927-
Cow::Owned(escaped) => Cow::Owned(escaped),
928-
})
927+
Cow::Owned(escaped) => Cow::Owned(escaped.into_bytes()),
928+
}))
929929
}
930930

931931
/// Gets content of this text buffer in the specified encoding
932-
#[cfg(feature = "serialize")]
933932
pub(crate) fn decode(&self, decoder: Decoder) -> Result<Cow<'a, str>> {
934933
Ok(match &self.content {
935934
Cow::Borrowed(bytes) => decoder.decode(bytes)?,

src/reader.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2563,7 +2563,7 @@ mod test {
25632563

25642564
assert_eq!(
25652565
reader.read_event_impl($buf).unwrap(),
2566-
Event::StartText(BytesText::from_escaped(b"bom".as_ref()).into())
2566+
Event::StartText(BytesText::from_escaped_str("bom").into())
25672567
);
25682568
}
25692569

@@ -2583,7 +2583,7 @@ mod test {
25832583

25842584
assert_eq!(
25852585
reader.read_event_impl($buf).unwrap(),
2586-
Event::DocType(BytesText::from_escaped(b"x".as_ref()))
2586+
Event::DocType(BytesText::from_escaped_str("x"))
25872587
);
25882588
}
25892589

@@ -2593,7 +2593,7 @@ mod test {
25932593

25942594
assert_eq!(
25952595
reader.read_event_impl($buf).unwrap(),
2596-
Event::PI(BytesText::from_escaped(b"xml-stylesheet".as_ref()))
2596+
Event::PI(BytesText::from_escaped_str("xml-stylesheet"))
25972597
);
25982598
}
25992599

@@ -2642,7 +2642,7 @@ mod test {
26422642

26432643
assert_eq!(
26442644
reader.read_event_impl($buf).unwrap(),
2645-
Event::Text(BytesText::from_escaped(b"text".as_ref()))
2645+
Event::Text(BytesText::from_escaped_str("text"))
26462646
);
26472647
}
26482648

@@ -2662,7 +2662,7 @@ mod test {
26622662

26632663
assert_eq!(
26642664
reader.read_event_impl($buf).unwrap(),
2665-
Event::Comment(BytesText::from_escaped(b"".as_ref()))
2665+
Event::Comment(BytesText::from_escaped_str(""))
26662666
);
26672667
}
26682668

src/se/mod.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,11 @@ impl<'r, W: Write> Serializer<'r, W> {
100100
value: P,
101101
escaped: bool,
102102
) -> Result<(), DeError> {
103-
let value = value.to_string().into_bytes();
103+
let value = value.to_string();
104104
let event = if escaped {
105-
BytesText::from_escaped(value)
105+
BytesText::from_escaped_str(&value)
106106
} else {
107-
BytesText::from_plain(&value)
107+
BytesText::from_plain_str(&value)
108108
};
109109
self.writer.write_event(Event::Text(event))?;
110110
Ok(())

src/writer.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ mod indentation {
422422
let start = BytesStart::borrowed_name(name)
423423
.with_attributes(vec![("attr1", "value1"), ("attr2", "value2")].into_iter());
424424
let end = BytesEnd::borrowed(name);
425-
let text = BytesText::from_plain(b"text");
425+
let text = BytesText::from_plain_str("text");
426426

427427
writer
428428
.write_event(Event::Start(start))
@@ -449,7 +449,7 @@ mod indentation {
449449
let start = BytesStart::borrowed_name(name)
450450
.with_attributes(vec![("attr1", "value1"), ("attr2", "value2")].into_iter());
451451
let end = BytesEnd::borrowed(name);
452-
let text = BytesText::from_plain(b"text");
452+
let text = BytesText::from_plain_str("text");
453453
let inner = BytesStart::borrowed_name(b"inner");
454454

455455
writer

0 commit comments

Comments
 (0)