diff --git a/Changelog.md b/Changelog.md index 225ef372..1e89efe7 100644 --- a/Changelog.md +++ b/Changelog.md @@ -23,6 +23,8 @@ - [#191]: New event variant `StartText` emitted for bytes before the XML declaration or a start comment or a tag. For streams with BOM this event will contain a BOM - [#395]: Add support for XML Schema `xs:list` +- [#324]: `Reader::from_str` / `Deserializer::from_str` / `from_str` now ignore + the XML declared encoding and always use UTF-8 ### Bug Fixes @@ -106,6 +108,7 @@ [#9]: https://github.com/Mingun/fast-xml/pull/9 [#180]: https://github.com/tafia/quick-xml/issues/180 [#191]: https://github.com/tafia/quick-xml/issues/191 +[#324]: https://github.com/tafia/quick-xml/issues/324 [#363]: https://github.com/tafia/quick-xml/issues/363 [#387]: https://github.com/tafia/quick-xml/pull/387 [#391]: https://github.com/tafia/quick-xml/pull/391 diff --git a/src/de/mod.rs b/src/de/mod.rs index c4c8f8c9..462ec3b1 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -297,7 +297,8 @@ pub fn from_str<'de, T>(s: &'de str) -> Result where T: Deserialize<'de>, { - from_slice(s.as_bytes()) + let mut de = Deserializer::from_str(s); + T::deserialize(&mut de) } /// Deserialize an instance of type `T` from bytes of XML text. @@ -696,12 +697,17 @@ where impl<'de> Deserializer<'de, SliceReader<'de>> { /// Create new deserializer that will borrow data from the specified string pub fn from_str(s: &'de str) -> Self { - Self::from_slice(s.as_bytes()) + Self::from_borrowing_reader(Reader::from_str(s)) } /// Create new deserializer that will borrow data from the specified byte array pub fn from_slice(bytes: &'de [u8]) -> Self { - let mut reader = Reader::from_bytes(bytes); + Self::from_borrowing_reader(Reader::from_bytes(bytes)) + } + + /// Create new deserializer that will borrow data from the specified borrowing reader + #[inline] + fn from_borrowing_reader(mut reader: Reader<&'de [u8]>) -> Self { reader .expand_empty_elements(true) .check_end_names(true) diff --git a/src/reader.rs b/src/reader.rs index c3d0bbf3..b7cdbbfe 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -56,6 +56,55 @@ enum TagState { Exit, } +/// A reference to an encoding together with an information of how it was retrieved. +/// +/// The state transition diagram: +/// +/// ```mermaid +/// flowchart LR +/// Implicit -- from_str --> Explicit +/// Implicit -- BOM --> BomDetected +/// Implicit -- "encoding=..." --> XmlDetected +/// BomDetected -- "encoding=..." --> XmlDetected +/// ``` +#[cfg(feature = "encoding")] +#[derive(Clone, Copy)] +enum EncodingRef { + /// Encoding was implicitly assumed to have a specified value. It can be refined + /// using BOM or by the XML declaration event (``) + Implicit(&'static Encoding), + /// Encoding was explicitly set to the desired value. It cannot be changed + /// nor by BOM, nor by parsing XML declaration (``) + Explicit(&'static Encoding), + /// Encoding was detected from a byte order mark (BOM) or by the first bytes + /// of the content. It can be refined by the XML declaration event (``) + BomDetected(&'static Encoding), + /// Encoding was detected using XML declaration event (``). + /// It can no longer change + XmlDetected(&'static Encoding), +} +#[cfg(feature = "encoding")] +impl EncodingRef { + #[inline] + fn encoding(&self) -> &'static Encoding { + match self { + Self::Implicit(e) => e, + Self::Explicit(e) => e, + Self::BomDetected(e) => e, + Self::XmlDetected(e) => e, + } + } + #[inline] + fn can_be_refined(&self) -> bool { + match self { + Self::Implicit(_) | Self::BomDetected(_) => true, + Self::Explicit(_) | Self::XmlDetected(_) => false, + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// A low level encoding-agnostic XML event reader. /// /// Consumes a `BufRead` and streams XML `Event`s. @@ -144,11 +193,8 @@ pub struct Reader { pending_pop: bool, #[cfg(feature = "encoding")] - /// the encoding specified in the xml, defaults to utf8 - encoding: &'static Encoding, - #[cfg(feature = "encoding")] - /// check if quick-rs could find out the encoding - is_encoding_set: bool, + /// Reference to the encoding used to read an XML + encoding: EncodingRef, } /// Builder methods @@ -172,9 +218,7 @@ impl Reader { pending_pop: false, #[cfg(feature = "encoding")] - encoding: ::encoding_rs::UTF_8, - #[cfg(feature = "encoding")] - is_encoding_set: false, + encoding: EncodingRef::Implicit(UTF_8), } } @@ -412,7 +456,7 @@ impl Reader { pub fn decoder(&self) -> Decoder { Decoder { #[cfg(feature = "encoding")] - encoding: self.encoding, + encoding: self.encoding.encoding(), } } } @@ -683,10 +727,9 @@ impl Reader { { Ok(Some(bytes)) => { #[cfg(feature = "encoding")] - if first { + if first && self.encoding.can_be_refined() { if let Some(encoding) = detect_encoding(bytes) { - self.encoding = encoding; - self.is_encoding_set = true; + self.encoding = EncodingRef::BomDetected(encoding); } } @@ -843,9 +886,10 @@ impl Reader { // Try getting encoding from the declaration event #[cfg(feature = "encoding")] - if let Some(enc) = event.encoder() { - self.encoding = enc; - self.is_encoding_set = true; + if self.encoding.can_be_refined() { + if let Some(encoding) = event.encoder() { + self.encoding = EncodingRef::XmlDetected(encoding); + } } Ok(Event::Decl(event)) @@ -905,6 +949,15 @@ impl Reader> { impl<'a> Reader<&'a [u8]> { /// Creates an XML reader from a string slice. pub fn from_str(s: &'a str) -> Self { + // Rust strings a guaranteed in UTF-8, so lock the encoding + #[cfg(feature = "encoding")] + { + let mut reader = Self::from_reader(s.as_bytes()); + reader.encoding = EncodingRef::Explicit(UTF_8); + reader + } + + #[cfg(not(feature = "encoding"))] Self::from_reader(s.as_bytes()) } @@ -1533,8 +1586,6 @@ impl Decoder { /// Copied from [`Encoding::decode_with_bom_removal`] #[inline] fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] { - use encoding_rs::*; - if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") { return &bytes[3..]; } @@ -1556,15 +1607,13 @@ impl Decoder { pub(crate) fn utf8() -> Self { Decoder { #[cfg(feature = "encoding")] - encoding: encoding_rs::UTF_8, + encoding: UTF_8, } } #[cfg(feature = "encoding")] pub(crate) fn utf16() -> Self { - Decoder { - encoding: encoding_rs::UTF_16LE, - } + Decoder { encoding: UTF_16LE } } } @@ -2480,6 +2529,62 @@ mod test { ); } } + + #[cfg(feature = "encoding")] + mod encoding { + use crate::events::Event; + use crate::reader::Reader; + use encoding_rs::{UTF_8, UTF_16LE, WINDOWS_1251}; + use pretty_assertions::assert_eq; + + mod bytes { + use super::*; + use pretty_assertions::assert_eq; + + /// Checks that encoding is detected by BOM and changed after XML declaration + #[test] + fn bom_detected() { + let mut reader = Reader::from_bytes(b"\xFF\xFE"); + + assert_eq!(reader.decoder().encoding(), UTF_8); + reader.read_event_buffered($buf).unwrap(); + assert_eq!(reader.decoder().encoding(), UTF_16LE); + + reader.read_event_buffered($buf).unwrap(); + assert_eq!(reader.decoder().encoding(), WINDOWS_1251); + + assert_eq!(reader.read_event_buffered($buf).unwrap(), Event::Eof); + } + + /// Checks that encoding is changed by XML declaration, but only once + #[test] + fn xml_declaration() { + let mut reader = Reader::from_bytes(b""); + + assert_eq!(reader.decoder().encoding(), UTF_8); + reader.read_event_buffered($buf).unwrap(); + assert_eq!(reader.decoder().encoding(), UTF_16LE); + + reader.read_event_buffered($buf).unwrap(); + assert_eq!(reader.decoder().encoding(), UTF_16LE); + + assert_eq!(reader.read_event_buffered($buf).unwrap(), Event::Eof); + } + } + + /// Checks that XML declaration cannot change the encoding from UTF-8 if + /// a `Reader` was created using `from_str` method + #[test] + fn str_always_has_utf8() { + let mut reader = Reader::from_str(""); + + assert_eq!(reader.decoder().encoding(), UTF_8); + reader.read_event_buffered($buf).unwrap(); + assert_eq!(reader.decoder().encoding(), UTF_8); + + assert_eq!(reader.read_event_buffered($buf).unwrap(), Event::Eof); + } + } }; } diff --git a/tests/serde-de.rs b/tests/serde-de.rs index be9e153e..07e4e274 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -4765,3 +4765,25 @@ mod xml_schema_lists { } } } + +/// Test for https://github.com/tafia/quick-xml/issues/324 +#[test] +fn from_str_should_ignore_encoding() { + let xml = r#" + + + "#; + + #[derive(Debug, PartialEq, Deserialize)] + struct A { + a: String, + } + + let a: A = from_str(xml).unwrap(); + assert_eq!( + a, + A { + a: "€".to_string() + } + ); +}