Skip to content

Commit f4bb0db

Browse files
committed
Move everything related to actually decoding text to a new module
1 parent 7b27fd7 commit f4bb0db

File tree

9 files changed

+189
-178
lines changed

9 files changed

+189
-178
lines changed

src/de/escape.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
//! Serde `Deserializer` module
22
33
use crate::de::deserialize_bool;
4+
use crate::encoding::Decoder;
45
use crate::errors::serialize::DeError;
56
use crate::escape::unescape;
6-
use crate::reader::Decoder;
77
use serde::de::{DeserializeSeed, EnumAccess, VariantAccess, Visitor};
88
use serde::{self, forward_to_deserialize_any, serde_if_integer128};
99
use std::borrow::Cow;

src/de/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -215,10 +215,10 @@ mod var;
215215

216216
pub use crate::errors::serialize::DeError;
217217
use crate::{
218+
encoding::Decoder,
218219
errors::Error,
219220
events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
220221
name::QName,
221-
reader::Decoder,
222222
Reader,
223223
};
224224
use serde::de::{self, Deserialize, DeserializeOwned, Visitor};

src/de/seq.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use crate::de::{DeError, DeEvent, Deserializer, XmlRead};
2+
use crate::encoding::Decoder;
23
use crate::events::BytesStart;
3-
use crate::reader::Decoder;
44
use serde::de::{DeserializeSeed, SeqAccess};
55

66
/// Check if tag `start` is included in the `fields` list. `decoder` is used to

src/de/simple_type.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
//! [as defined]: https://www.w3.org/TR/xmlschema11-1/#Simple_Type_Definition
55
66
use crate::de::{deserialize_bool, str2bool};
7+
use crate::encoding::Decoder;
78
use crate::errors::serialize::DeError;
89
use crate::escape::unescape;
9-
use crate::reader::Decoder;
1010
use memchr::memchr;
1111
use serde::de::{DeserializeSeed, Deserializer, EnumAccess, SeqAccess, VariantAccess, Visitor};
1212
use serde::{self, serde_if_integer128};

src/encoding.rs

+175
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
//! A module for wrappers that encode / decode data.
2+
3+
use std::borrow::Cow;
4+
5+
#[cfg(feature = "encoding")]
6+
use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
7+
8+
use crate::{Error, Result};
9+
10+
/// Decoder of byte slices into strings. This is a lightweight object that can be copied.
11+
///
12+
/// If feature `encoding` is enabled, the encoding is taken from the `"encoding"`
13+
/// key of the XML declaration, or UTF-8 is assumed if the XML has no `<?xml ?>`
14+
/// declaration, the encoding key is not defined, or it contains an unknown encoding.
15+
///
16+
/// The library supports any UTF-8 compatible encoding that the `encoding_rs` crate
17+
/// supports. [*UTF-16 is not supported at present*][utf16].
18+
///
19+
/// If feature `encoding` is disabled, the decoder is always a UTF-8 decoder:
20+
/// any XML declarations are ignored.
21+
///
22+
/// [utf16]: https://github.com/tafia/quick-xml/issues/158
23+
#[derive(Clone, Copy, Debug)]
24+
pub struct Decoder {
25+
#[cfg(feature = "encoding")]
26+
pub(crate) encoding: &'static Encoding,
27+
}
28+
29+
#[cfg(not(feature = "encoding"))]
30+
impl Decoder {
31+
/// Decodes a UTF-8 slice regardless of the XML declaration and without removing
32+
/// the BOM if it is present in the `bytes`.
33+
///
34+
/// Returns an error in case of malformed sequences in the `bytes`.
35+
///
36+
/// If you instead want to use the XML declared encoding, use the `encoding` feature.
37+
#[inline]
38+
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
39+
Ok(Cow::Borrowed(std::str::from_utf8(bytes)?))
40+
}
41+
42+
/// Decodes a slice regardless of the XML declaration, removing the BOM if
43+
/// it is present in the `bytes`.
44+
///
45+
/// Returns an error in case of malformed sequences in the `bytes`.
46+
///
47+
/// If you instead want to use the XML declared encoding, use the `encoding` feature.
48+
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
49+
let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") {
50+
&bytes[3..]
51+
} else {
52+
bytes
53+
};
54+
self.decode(bytes)
55+
}
56+
}
57+
58+
#[cfg(feature = "encoding")]
59+
impl Decoder {
60+
/// Returns the `Reader`s encoding.
61+
///
62+
/// This encoding will be used by [`decode`].
63+
///
64+
/// [`decode`]: Self::decode
65+
pub fn encoding(&self) -> &'static Encoding {
66+
self.encoding
67+
}
68+
69+
/// Decodes the specified bytes using the encoding declared in the XML, if it was
70+
/// declared there, or UTF-8 otherwise, without removing the BOM if it is present
71+
/// in the `bytes`.
72+
///
73+
/// Returns an error in case of malformed sequences in the `bytes`.
74+
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
75+
match self
76+
.encoding
77+
.decode_without_bom_handling_and_without_replacement(bytes)
78+
{
79+
None => Err(Error::NonDecodable(None)),
80+
Some(s) => Ok(s),
81+
}
82+
}
83+
84+
/// Decodes a slice with BOM removal if it is present in the `bytes` using
85+
/// the reader encoding.
86+
///
87+
/// If this method is called after reading an XML declaration with the `"encoding"`
88+
/// key, then this encoding is used, otherwise UTF-8 is used.
89+
///
90+
/// If XML declaration is absent in the XML, UTF-8 is used.
91+
///
92+
/// Returns an error in case of malformed sequences in the `bytes`.
93+
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
94+
self.decode(self.remove_bom(bytes))
95+
}
96+
/// Copied from [`Encoding::decode_with_bom_removal`]
97+
#[inline]
98+
fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] {
99+
if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
100+
return &bytes[3..];
101+
}
102+
if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") {
103+
return &bytes[2..];
104+
}
105+
if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") {
106+
return &bytes[2..];
107+
}
108+
109+
bytes
110+
}
111+
}
112+
113+
/// This implementation is required for tests of other parts of the library
114+
#[cfg(test)]
115+
#[cfg(feature = "serialize")]
116+
impl Decoder {
117+
pub(crate) fn utf8() -> Self {
118+
Decoder {
119+
#[cfg(feature = "encoding")]
120+
encoding: UTF_8,
121+
}
122+
}
123+
124+
#[cfg(feature = "encoding")]
125+
pub(crate) fn utf16() -> Self {
126+
Decoder { encoding: UTF_16LE }
127+
}
128+
}
129+
130+
/// Automatic encoding detection of XML files based on the
131+
/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
132+
///
133+
/// The algorithm suggests examining up to the first 4 bytes to determine encoding
134+
/// according to the following table:
135+
///
136+
/// | Bytes |Detected encoding
137+
/// |-------------|------------------------------------------
138+
/// |`00 00 FE FF`|UCS-4, big-endian machine (1234 order)
139+
/// |`FF FE 00 00`|UCS-4, little-endian machine (4321 order)
140+
/// |`00 00 FF FE`|UCS-4, unusual octet order (2143)
141+
/// |`FE FF 00 00`|UCS-4, unusual octet order (3412)
142+
/// |`FE FF ## ##`|UTF-16, big-endian
143+
/// |`FF FE ## ##`|UTF-16, little-endian
144+
/// |`EF BB BF` |UTF-8
145+
/// |-------------|------------------------------------------
146+
/// |`00 00 00 3C`|UCS-4 or similar (use declared encoding to find the exact one), in big-endian (1234)
147+
/// |`3C 00 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in little-endian (4321)
148+
/// |`00 00 3C 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (2143)
149+
/// |`00 3C 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (3412)
150+
/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
151+
/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
152+
/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
153+
/// |`4C 6F A7 94`|EBCDIC (in some flavor; the full encoding declaration must be read to tell which code page is in use)
154+
/// |_Other_ |UTF-8 without an encoding declaration, or else the data stream is mislabeled (lacking a required encoding declaration), corrupt, fragmentary, or enclosed in a wrapper of some kind
155+
///
156+
/// Because the [`encoding_rs`] crate supports only a subset of those encodings, only
157+
/// that subset is detected: UTF-8, UTF-16 BE and UTF-16 LE.
158+
///
159+
/// If encoding is detected, `Some` is returned, otherwise `None` is returned.
160+
#[cfg(feature = "encoding")]
161+
pub(crate) fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
162+
match bytes {
163+
// with BOM
164+
_ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE),
165+
_ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE),
166+
_ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8),
167+
168+
// without BOM
169+
_ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
170+
_ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
171+
_ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible
172+
173+
_ => None,
174+
}
175+
}

src/events/mod.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,11 @@ use std::fmt::{self, Debug, Formatter};
4141
use std::ops::Deref;
4242
use std::str::from_utf8;
4343

44+
use crate::encoding::Decoder;
4445
use crate::errors::{Error, Result};
4546
use crate::escape::{escape, partial_escape, unescape_with};
4647
use crate::name::{LocalName, QName};
47-
use crate::reader::{Decoder, Reader};
48+
use crate::reader::Reader;
4849
use crate::utils::write_cow_string;
4950
use attributes::{Attribute, Attributes};
5051

src/lib.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444

4545
#[cfg(feature = "serialize")]
4646
pub mod de;
47+
pub mod encoding;
4748
mod errors;
4849
mod escapei;
4950
pub mod escape {
@@ -62,8 +63,9 @@ pub mod utils;
6263
mod writer;
6364

6465
// reexports
66+
pub use crate::encoding::Decoder;
6567
#[cfg(feature = "serialize")]
6668
pub use crate::errors::serialize::DeError;
6769
pub use crate::errors::{Error, Result};
68-
pub use crate::reader::{BufferedReader, Decoder, Reader, SliceReader};
70+
pub use crate::reader::{BufferedReader, Reader, SliceReader};
6971
pub use crate::writer::{ElementWriter, Writer};

src/reader/buffered_reader.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ use crate::name::{QName, ResolveResult};
1111
use crate::{Error, Result};
1212

1313
#[cfg(feature = "encoding")]
14-
use crate::reader::{detect_encoding, EncodingRef};
14+
use crate::encoding::detect_encoding;
15+
#[cfg(feature = "encoding")]
16+
use crate::reader::EncodingRef;
1517
use crate::reader::{is_whitespace, BangType, InnerReader, ReadElementState, Reader, TagState};
1618

1719
/// Private functions for a [`Reader`] based on an [`BufferedReader`].

0 commit comments

Comments
 (0)