Skip to content

Commit cc17a44

Browse files
committed
Provide some utilities for decoding entire buffers
1 parent f4bb0db commit cc17a44

File tree

2 files changed

+50
-23
lines changed

2 files changed

+50
-23
lines changed

Changelog.md

+2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
the XML declared encoding and always use UTF-8
2828
- [#416]: Add `borrow()` methods in all event structs which allow getting
2929
a borrowed version of any event
30+
- [#436]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()`
31+
under the `quick_xml::encoding` namespace.
3032

3133
### Bug Fixes
3234

src/encoding.rs

+48-23
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ impl Decoder {
4646
///
4747
/// If you instead want to use XML declared encoding, use the `encoding` feature
4848
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
49-
let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") {
49+
let bytes = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
5050
&bytes[3..]
5151
} else {
5252
bytes
@@ -72,13 +72,7 @@ impl Decoder {
7272
///
7373
/// Returns an error in case of malformed sequences in the `bytes`.
7474
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
75-
match self
76-
.encoding
77-
.decode_without_bom_handling_and_without_replacement(bytes)
78-
{
79-
None => Err(Error::NonDecodable(None)),
80-
Some(s) => Ok(s),
81-
}
75+
decode(bytes, self.encoding)
8276
}
8377

8478
/// Decodes a slice with BOM removal if it is present in the `bytes` using
@@ -91,25 +85,54 @@ impl Decoder {
9185
///
9286
/// Returns an error in case of malformed sequences in the `bytes`.
9387
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
94-
self.decode(self.remove_bom(bytes))
88+
self.decode(remove_bom(bytes, self.encoding))
9589
}
96-
/// Copied from [`Encoding::decode_with_bom_removal`]
97-
#[inline]
98-
fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] {
99-
if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
100-
return &bytes[3..];
101-
}
102-
if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") {
103-
return &bytes[2..];
104-
}
105-
if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") {
106-
return &bytes[2..];
107-
}
90+
}
91+
92+
/// Decodes the provided bytes using the specified encoding, without any BOM
93+
/// handling: a BOM, if it is present in the `bytes`, is not stripped.
94+
///
95+
/// Returns an error in case of malformed sequences in the `bytes`.
96+
#[cfg(feature = "encoding")]
97+
pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b, str>> {
98+
encoding
99+
.decode_without_bom_handling_and_without_replacement(bytes)
100+
.ok_or(Error::NonDecodable(None))
101+
}
108102

109-
bytes
103+
/// Decodes a slice with an unknown encoding, removing the BOM if it is present
104+
/// in the bytes.
105+
///
106+
/// Returns an error in case of malformed sequences in the `bytes`.
107+
#[cfg(feature = "encoding")]
108+
pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result<Cow<'b, str>> {
109+
if let Some(encoding) = detect_encoding(bytes) {
110+
let bytes = remove_bom(bytes, encoding);
111+
decode(bytes, encoding)
112+
} else {
113+
decode(bytes, UTF_8)
114+
}
115+
}
116+
117+
#[cfg(feature = "encoding")]
118+
fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) {
119+
if encoding == UTF_8 && bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
120+
bytes.split_at(3)
121+
} else if encoding == UTF_16LE && bytes.starts_with(&[0xFF, 0xFE]) {
122+
bytes.split_at(2)
123+
} else if encoding == UTF_16BE && bytes.starts_with(&[0xFE, 0xFF]) {
124+
bytes.split_at(2)
125+
} else {
126+
(&[], bytes)
110127
}
111128
}
112129

130+
#[cfg(feature = "encoding")]
131+
fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
132+
let (_, bytes) = split_at_bom(bytes, encoding);
133+
bytes
134+
}
135+
113136
/// This implementation is required for tests of other parts of the library
114137
#[cfg(test)]
115138
#[cfg(feature = "serialize")]
@@ -158,7 +181,7 @@ impl Decoder {
158181
///
159182
/// If encoding is detected, `Some` is returned, otherwise `None` is returned.
160183
#[cfg(feature = "encoding")]
161-
pub(crate) fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
184+
pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
162185
match bytes {
163186
// with BOM
164187
_ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE),
@@ -173,3 +196,5 @@ pub(crate) fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
173196
_ => None,
174197
}
175198
}
199+
200+
// TODO: add tests for these functions

0 commit comments

Comments
 (0)