Skip to content

Commit

Permalink
Fix tafia#324: lock encoding to UTF-8 when parse from &str
Browse files Browse the repository at this point in the history
  • Loading branch information
Mingun committed Jun 24, 2022
1 parent 181b0d1 commit c4845c3
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 24 deletions.
3 changes: 3 additions & 0 deletions Changelog.md
Expand Up @@ -36,6 +36,8 @@
returns `ResolveResult::Unknown` if prefix was not registered in namespace buffer
- [#393]: Fix breaking processing after encounter an attribute with a reserved name (started with "xmlns")
- [#363]: Do not generate empty `Event::Text` events
- [#324]: `Reader::from_str` / `Deserializer::from_str` / `from_str` now ignore
the XML declared encoding and always use UTF-8

### Misc Changes

Expand Down Expand Up @@ -106,6 +108,7 @@
[#9]: https://github.com/Mingun/fast-xml/pull/9
[#180]: https://github.com/tafia/quick-xml/issues/180
[#191]: https://github.com/tafia/quick-xml/issues/191
[#324]: https://github.com/tafia/quick-xml/issues/324
[#363]: https://github.com/tafia/quick-xml/issues/363
[#387]: https://github.com/tafia/quick-xml/pull/387
[#391]: https://github.com/tafia/quick-xml/pull/391
Expand Down
12 changes: 9 additions & 3 deletions src/de/mod.rs
Expand Up @@ -297,7 +297,8 @@ pub fn from_str<'de, T>(s: &'de str) -> Result<T, DeError>
where
T: Deserialize<'de>,
{
from_slice(s.as_bytes())
let mut de = Deserializer::from_str(s);
T::deserialize(&mut de)
}

/// Deserialize an instance of type `T` from bytes of XML text.
Expand Down Expand Up @@ -696,12 +697,17 @@ where
impl<'de> Deserializer<'de, SliceReader<'de>> {
/// Create new deserializer that will borrow data from the specified string
pub fn from_str(s: &'de str) -> Self {
Self::from_slice(s.as_bytes())
Self::from_borrowing_reader(Reader::from_str(s))
}

/// Create new deserializer that will borrow data from the specified byte array
pub fn from_slice(bytes: &'de [u8]) -> Self {
let mut reader = Reader::from_bytes(bytes);
Self::from_borrowing_reader(Reader::from_bytes(bytes))
}

/// Create new deserializer that will borrow data from the specified borrowing reader
#[inline]
fn from_borrowing_reader(mut reader: Reader<&'de [u8]>) -> Self {
reader
.expand_empty_elements(true)
.check_end_names(true)
Expand Down
147 changes: 126 additions & 21 deletions src/reader.rs
Expand Up @@ -56,6 +56,55 @@ enum TagState {
Exit,
}

/// A reference to an encoding together with an information of how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
/// Implicit -- from_str --> Explicit
/// Implicit -- BOM --> BomDetected
/// Implicit -- "encoding=..." --> XmlDetected
/// BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy)]
enum EncodingRef {
/// Encoding was implicitly assumed to have a specified value. It can be refined
/// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
Implicit(&'static Encoding),
/// Encoding was explicitly set to the desired value. It cannot be changed
/// nor by BOM, nor by parsing XML declaration (`<?xml encoding=... ?>`)
Explicit(&'static Encoding),
/// Encoding was detected from a byte order mark (BOM) or by the first bytes
/// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
BomDetected(&'static Encoding),
/// Encoding was detected using XML declaration event (`<?xml encoding=... ?>`).
/// It can no longer change
XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
#[inline]
fn encoding(&self) -> &'static Encoding {
match self {
Self::Implicit(e) => e,
Self::Explicit(e) => e,
Self::BomDetected(e) => e,
Self::XmlDetected(e) => e,
}
}
#[inline]
fn can_be_refined(&self) -> bool {
match self {
Self::Implicit(_) | Self::BomDetected(_) => true,
Self::Explicit(_) | Self::XmlDetected(_) => false,
}
}
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// A low level encoding-agnostic XML event reader.
///
/// Consumes a `BufRead` and streams XML `Event`s.
Expand Down Expand Up @@ -144,11 +193,8 @@ pub struct Reader<R: BufRead> {
pending_pop: bool,

#[cfg(feature = "encoding")]
/// the encoding specified in the xml, defaults to utf8
encoding: &'static Encoding,
#[cfg(feature = "encoding")]
/// check if quick-rs could find out the encoding
is_encoding_set: bool,
/// Reference to the encoding used to read an XML
encoding: EncodingRef,
}

/// Builder methods
Expand All @@ -172,9 +218,7 @@ impl<R: BufRead> Reader<R> {
pending_pop: false,

#[cfg(feature = "encoding")]
encoding: ::encoding_rs::UTF_8,
#[cfg(feature = "encoding")]
is_encoding_set: false,
encoding: EncodingRef::Implicit(UTF_8),
}
}

Expand Down Expand Up @@ -412,7 +456,7 @@ impl<R: BufRead> Reader<R> {
pub fn decoder(&self) -> Decoder {
Decoder {
#[cfg(feature = "encoding")]
encoding: self.encoding,
encoding: self.encoding.encoding(),
}
}
}
Expand Down Expand Up @@ -683,10 +727,9 @@ impl<R: BufRead> Reader<R> {
{
Ok(Some(bytes)) => {
#[cfg(feature = "encoding")]
if first {
if first && self.encoding.can_be_refined() {
if let Some(encoding) = detect_encoding(bytes) {
self.encoding = encoding;
self.is_encoding_set = true;
self.encoding = EncodingRef::BomDetected(encoding);
}
}

Expand Down Expand Up @@ -843,9 +886,10 @@ impl<R: BufRead> Reader<R> {

// Try getting encoding from the declaration event
#[cfg(feature = "encoding")]
if let Some(enc) = event.encoder() {
self.encoding = enc;
self.is_encoding_set = true;
if self.encoding.can_be_refined() {
if let Some(encoding) = event.encoder() {
self.encoding = EncodingRef::XmlDetected(encoding);
}
}

Ok(Event::Decl(event))
Expand Down Expand Up @@ -905,6 +949,15 @@ impl Reader<BufReader<File>> {
impl<'a> Reader<&'a [u8]> {
/// Creates an XML reader from a string slice.
pub fn from_str(s: &'a str) -> Self {
// Rust strings a guaranteed in UTF-8, so lock the encoding
#[cfg(feature = "encoding")]
{
let mut reader = Self::from_reader(s.as_bytes());
reader.encoding = EncodingRef::Explicit(UTF_8);
reader
}

#[cfg(not(feature = "encoding"))]
Self::from_reader(s.as_bytes())
}

Expand Down Expand Up @@ -1533,8 +1586,6 @@ impl Decoder {
/// Copied from [`Encoding::decode_with_bom_removal`]
#[inline]
fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] {
use encoding_rs::*;

if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
return &bytes[3..];
}
Expand All @@ -1556,15 +1607,13 @@ impl Decoder {
pub(crate) fn utf8() -> Self {
Decoder {
#[cfg(feature = "encoding")]
encoding: encoding_rs::UTF_8,
encoding: UTF_8,
}
}

#[cfg(feature = "encoding")]
pub(crate) fn utf16() -> Self {
Decoder {
encoding: encoding_rs::UTF_16LE,
}
Decoder { encoding: UTF_16LE }
}
}

Expand Down Expand Up @@ -2480,6 +2529,62 @@ mod test {
);
}
}

#[cfg(feature = "encoding")]
mod encoding {
use crate::events::Event;
use crate::reader::Reader;
use encoding_rs::{UTF_8, UTF_16LE, WINDOWS_1251};
use pretty_assertions::assert_eq;

mod bytes {
use super::*;
use pretty_assertions::assert_eq;

/// Checks that encoding is detected by BOM and changed after XML declaration
#[test]
fn bom_detected() {
let mut reader = Reader::from_bytes(b"\xFF\xFE<?xml encoding='windows-1251'?>");

assert_eq!(reader.decoder().encoding(), UTF_8);
reader.read_event_buffered($buf).unwrap();
assert_eq!(reader.decoder().encoding(), UTF_16LE);

reader.read_event_buffered($buf).unwrap();
assert_eq!(reader.decoder().encoding(), WINDOWS_1251);

assert_eq!(reader.read_event_buffered($buf).unwrap(), Event::Eof);
}

/// Checks that encoding is changed by XML declaration, but only once
#[test]
fn xml_declaration() {
let mut reader = Reader::from_bytes(b"<?xml encoding='UTF-16'?><?xml encoding='windows-1251'?>");

assert_eq!(reader.decoder().encoding(), UTF_8);
reader.read_event_buffered($buf).unwrap();
assert_eq!(reader.decoder().encoding(), UTF_16LE);

reader.read_event_buffered($buf).unwrap();
assert_eq!(reader.decoder().encoding(), UTF_16LE);

assert_eq!(reader.read_event_buffered($buf).unwrap(), Event::Eof);
}
}

/// Checks that XML declaration cannot change the encoding from UTF-8 if
/// a `Reader` was created using `from_str` method
#[test]
fn str_always_has_utf8() {
let mut reader = Reader::from_str("<?xml encoding='UTF-16'?>");

assert_eq!(reader.decoder().encoding(), UTF_8);
reader.read_event_buffered($buf).unwrap();
assert_eq!(reader.decoder().encoding(), UTF_8);

assert_eq!(reader.read_event_buffered($buf).unwrap(), Event::Eof);
}
}
};
}

Expand Down
22 changes: 22 additions & 0 deletions tests/serde-de.rs
Expand Up @@ -4765,3 +4765,25 @@ mod xml_schema_lists {
}
}
}

/// Test for https://github.com/tafia/quick-xml/issues/324
#[test]
fn from_str_should_ignore_encoding() {
let xml = r#"
<?xml version="1.0" encoding="windows-1252" ?>
<A a="€" />
"#;

#[derive(Debug, PartialEq, Deserialize)]
struct A {
a: String,
}

let a: A = from_str(xml).unwrap();
assert_eq!(
a,
A {
a: "€".to_string()
}
);
}

0 comments on commit c4845c3

Please sign in to comment.