Merge pull request #399 from Mingun/bom

Rework BOM handling and encoding API
tafia · Jun 20, 2022 · 8fa6f1e · 8fa6f1e
2 parents 3b37c0e + d49a4b8
commit 8fa6f1e
Show file tree

Hide file tree

Showing 15 changed files with 1,002 additions and 945 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -17,6 +17,11 @@
 - [#393]: New module `name` with `QName`, `LocalName`, `Namespace`, `Prefix`
   and `PrefixDeclaration` wrappers around byte arrays and `ResolveResult` with
   the result of namespace resolution
+- [#180]: Make `Decoder` struct public. You already had access to it via the
+  `Reader::decoder()` method, but could not name it in the code. Now the preferred
+  way to access decoding functionality is via this struct
+- [#191]: New event variant `StartText` emitted for bytes before the XML declaration
+  or a start comment or a tag. For streams with BOM this event will contain a BOM
 
 ### Bug Fixes
 
@@ -56,6 +61,33 @@
 - [#393]: Now `BytesStart::name()` and `BytesEnd::name()` returns `QName`, and
   `BytesStart::local_name()` and `BytesEnd::local_name()` returns `LocalName`
 
+- [#191]: Remove unused `reader.decoder().decode_owned()`. If you ever used it,
+  use `String::from_utf8` instead (which that function did)
+- [#191]: Remove `*_without_bom` methods from the `Attributes` struct because they are useless.
+  Use the same-named methods without that suffix instead. Attribute values cannot contain BOM
+- [#191]: Remove `Reader::decode()` and `Reader::decode_without_bom()`, they are replaced by
+  `Decoder::decode()` and `Decoder::decode_with_bom_removal()`.
+  Use `reader.decoder().decode_*(...)` instead of `reader.decode_*(...)` for now.
+  `Reader::encoding()` is replaced by `Decoder::encoding()` as well
+- [#191]: Remove poorly designed `BytesText::unescape_and_decode_without_bom()` and
+  `BytesText::unescape_and_decode_without_bom_with_custom_entities()`. Although these methods worked
+  as expected, this was only due to good luck. They was replaced by the
+  `BytesStartText::decode_with_bom_removal()`:
+  - conceptually, you should decode BOM only for the first `Text` event from the
+    reader (since now `StartText` event is emitted instead for this)
+  - text before the first tag is not an XML content at all, so it is meaningless
+    to try to unescape something in it
+
+- [#180]: Eliminated the differences in the decoding API when feature `encoding` enabled and when it is
+  disabled. Signatures of functions are now the same regardless of whether or not the feature is
+  enabled, and an error will be returned instead of performing replacements for invalid characters
+  in both cases.
+
+  Previously, if the `encoding` feature was enabled, decoding functions would return `Result<Cow<&str>>`
+  while without this feature they would return `Result<&str>`. With this change, only `Result<Cow<&str>>`
+  is returned regardless of the status of the feature.
+- [#180]: Error variant `Error::Utf8` replaced by `Error::NonDecodable`
+
 ### New Tests
 
 - [#9]: Added tests for incorrect nested tags in input
@@ -66,6 +98,8 @@
 
 [#8]: https://github.com/Mingun/fast-xml/pull/8
 [#9]: https://github.com/Mingun/fast-xml/pull/9
+[#180]: https://github.com/tafia/quick-xml/issues/180
+[#191]: https://github.com/tafia/quick-xml/issues/191
 [#363]: https://github.com/tafia/quick-xml/issues/363
 [#387]: https://github.com/tafia/quick-xml/pull/387
 [#391]: https://github.com/tafia/quick-xml/pull/391

diff --git a/benches/bench.rs b/benches/bench.rs
@@ -1,6 +1,7 @@
 use criterion::{self, criterion_group, criterion_main, Criterion};
 use pretty_assertions::assert_eq;
 use quick_xml::events::Event;
+use quick_xml::name::QName;
 use quick_xml::Reader;
 
 static SAMPLE: &[u8] = include_bytes!("../tests/sample_rss.xml");
@@ -173,15 +174,15 @@ fn bytes_text_unescaped(c: &mut Criterion) {
 /// Benchmarks, how fast individual event parsed
 fn one_event(c: &mut Criterion) {
     let mut group = c.benchmark_group("One event");
-    group.bench_function("Text", |b| {
+    group.bench_function("StartText", |b| {
         let src = "Hello world!".repeat(512 / 12).into_bytes();
         let mut buf = Vec::with_capacity(1024);
         b.iter(|| {
             let mut r = Reader::from_reader(src.as_ref());
             let mut nbtxt = criterion::black_box(0);
             r.check_end_names(false).check_comments(false);
             match r.read_event(&mut buf) {
-                Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
+                Ok(Event::StartText(e)) => nbtxt += e.unescaped().unwrap().len(),
                 something_else => panic!("Did not expect {:?}", something_else),
             };
 
@@ -310,7 +311,7 @@ fn attributes(c: &mut Criterion) {
             let mut buf = Vec::new();
             loop {
                 match r.read_event(&mut buf) {
-                    Ok(Event::Empty(e)) if e.name() == b"player" => {
+                    Ok(Event::Empty(e)) if e.name() == QName(b"player") => {
                         for name in ["num", "status", "avg"] {
                             if let Some(_attr) = e.try_get_attribute(name).unwrap() {
                                 count += 1

diff --git a/src/de/escape.rs b/src/de/escape.rs
@@ -45,12 +45,8 @@ macro_rules! deserialize_num {
         where
             V: Visitor<'de>,
         {
-            #[cfg(not(feature = "encoding"))]
             let value = self.decoder.decode(self.escaped_value.as_ref())?.parse()?;
 
-            #[cfg(feature = "encoding")]
-            let value = self.decoder.decode(self.escaped_value.as_ref()).parse()?;
-
             visitor.$visit(value)
         }
     };
@@ -71,11 +67,8 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
         V: Visitor<'de>,
     {
         let unescaped = self.unescaped()?;
-        #[cfg(not(feature = "encoding"))]
         let value = self.decoder.decode(&unescaped)?;
 
-        #[cfg(feature = "encoding")]
-        let value = self.decoder.decode(&unescaped);
         visitor.visit_str(&value)
     }
 

diff --git a/src/de/mod.rs b/src/de/mod.rs
@@ -337,7 +337,7 @@ where
 {
     #[cfg(feature = "encoding")]
     {
-        let value = decoder.decode(value);
+        let value = decoder.decode(value)?;
         // No need to unescape because valid boolean representations cannot be escaped
         match value.as_ref() {
             "true" | "1" | "True" | "TRUE" | "t" | "Yes" | "YES" | "yes" | "y" => {
@@ -624,7 +624,7 @@ where
         allow_start: bool,
     ) -> Result<BytesCData<'de>, DeError> {
         match self.next()? {
-            DeEvent::Text(e) if unescape => e.unescape().map_err(|e| DeError::InvalidXml(e.into())),
+            DeEvent::Text(e) if unescape => e.unescape().map_err(Into::into),
             DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())),
             DeEvent::CData(e) => Ok(e),
             DeEvent::Start(e) if allow_start => {
@@ -952,6 +952,10 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
         let event = loop {
             let e = self.reader.read_event(&mut self.buf)?;
             match e {
+                //TODO: Probably not the best idea treat StartText as usual text
+                // Usually this event will represent a BOM
+                // Changing this requires review of the serde-de::top_level::one_element test
+                Event::StartText(e) => break Ok(DeEvent::Text(e.into_owned().into())),
                 Event::Start(e) => break Ok(DeEvent::Start(e.into_owned())),
                 Event::End(e) => break Ok(DeEvent::End(e.into_owned())),
                 Event::Text(e) => break Ok(DeEvent::Text(e.into_owned())),
@@ -992,6 +996,10 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
         loop {
             let e = self.reader.read_event_unbuffered()?;
             match e {
+                //TODO: Probably not the best idea treat StartText as usual text
+                // Usually this event will represent a BOM
+                // Changing this requires review of the serde-de::top_level::one_element test
+                Event::StartText(e) => break Ok(DeEvent::Text(e.into())),
                 Event::Start(e) => break Ok(DeEvent::Start(e)),
                 Event::End(e) => break Ok(DeEvent::End(e)),
                 Event::Text(e) => break Ok(DeEvent::Text(e)),

diff --git a/src/de/seq.rs b/src/de/seq.rs
@@ -2,8 +2,6 @@ use crate::de::{DeError, DeEvent, Deserializer, XmlRead};
 use crate::events::BytesStart;
 use crate::reader::Decoder;
 use serde::de::{DeserializeSeed, SeqAccess};
-#[cfg(not(feature = "encoding"))]
-use std::borrow::Cow;
 
 /// Check if tag `start` is included in the `fields` list. `decoder` is used to
 /// get a string representation of a tag.
@@ -14,11 +12,7 @@ pub fn not_in(
     start: &BytesStart,
     decoder: Decoder,
 ) -> Result<bool, DeError> {
-    #[cfg(not(feature = "encoding"))]
-    let tag = Cow::Borrowed(decoder.decode(start.name().into_inner())?);
-
-    #[cfg(feature = "encoding")]
-    let tag = decoder.decode(start.name().into_inner());
+    let tag = decoder.decode(start.name().into_inner())?;
 
     Ok(fields.iter().all(|&field| field != tag.as_ref()))
 }

diff --git a/src/errors.rs b/src/errors.rs
@@ -4,14 +4,16 @@ use crate::escape::EscapeError;
 use crate::events::attributes::AttrError;
 use crate::utils::write_byte_string;
 use std::str::Utf8Error;
+use std::string::FromUtf8Error;
 
 /// The error type used by this crate.
 #[derive(Debug)]
 pub enum Error {
     /// IO error
     Io(::std::io::Error),
-    /// Utf8 error
-    Utf8(Utf8Error),
+    /// Input decoding error. If `encoding` feature is disabled, contains `None`,
+    /// otherwise contains the UTF-8 decoding error
+    NonDecodable(Option<Utf8Error>),
     /// Unexpected End of File
     UnexpectedEof(String),
     /// End event mismatch
@@ -46,10 +48,18 @@ impl From<::std::io::Error> for Error {
 }
 
 impl From<Utf8Error> for Error {
-    /// Creates a new `Error::Utf8` from the given error
+    /// Creates a new `Error::NonDecodable` from the given error
     #[inline]
     fn from(error: Utf8Error) -> Error {
-        Error::Utf8(error)
+        Error::NonDecodable(Some(error))
+    }
+}
+
+impl From<FromUtf8Error> for Error {
+    /// Creates a new `Error::Utf8` from the given error
+    #[inline]
+    fn from(error: FromUtf8Error) -> Error {
+        error.utf8_error().into()
     }
 }
 
@@ -77,7 +87,8 @@ impl std::fmt::Display for Error {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
         match self {
             Error::Io(e) => write!(f, "I/O error: {}", e),
-            Error::Utf8(e) => write!(f, "UTF8 error: {}", e),
+            Error::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"),
+            Error::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e),
             Error::UnexpectedEof(e) => write!(f, "Unexpected EOF during reading {}", e),
             Error::EndEventMismatch { expected, found } => {
                 write!(f, "Expecting </{}> found </{}>", expected, found)
@@ -109,7 +120,7 @@ impl std::error::Error for Error {
     fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
         match self {
             Error::Io(e) => Some(e),
-            Error::Utf8(e) => Some(e),
+            Error::NonDecodable(Some(e)) => Some(e),
             Error::InvalidAttr(e) => Some(e),
             Error::EscapeError(e) => Some(e),
             _ => None,
@@ -227,6 +238,7 @@ pub mod serialize {
     }
 
     impl From<Error> for DeError {
+        #[inline]
         fn from(e: Error) -> Self {
             Self::InvalidXml(e)
         }
@@ -239,15 +251,17 @@ pub mod serialize {
         }
     }
 
-    impl From<ParseIntError> for DeError {
-        fn from(e: ParseIntError) -> Self {
-            Self::InvalidInt(e)
+    impl From<Utf8Error> for DeError {
+        #[inline]
+        fn from(e: Utf8Error) -> Self {
+            Self::InvalidXml(e.into())
         }
     }
 
-    impl From<ParseFloatError> for DeError {
-        fn from(e: ParseFloatError) -> Self {
-            Self::InvalidFloat(e)
+    impl From<FromUtf8Error> for DeError {
+        #[inline]
+        fn from(e: FromUtf8Error) -> Self {
+            Self::InvalidXml(e.into())
         }
     }
 
@@ -257,4 +271,18 @@ pub mod serialize {
             Self::InvalidXml(e.into())
         }
     }
+
+    impl From<ParseIntError> for DeError {
+        #[inline]
+        fn from(e: ParseIntError) -> Self {
+            Self::InvalidInt(e)
+        }
+    }
+
+    impl From<ParseFloatError> for DeError {
+        #[inline]
+        fn from(e: ParseFloatError) -> Self {
+            Self::InvalidFloat(e)
+        }
+    }
 }