Fix tafia#180: Define the same API for encoding whether feature `enco…

…ding` is defined or not
Mingun · Jun 19, 2022 · 61cd193 · 61cd193
1 parent 93d412f
commit 61cd193
Show file tree

Hide file tree

Showing 9 changed files with 64 additions and 84 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -78,6 +78,13 @@
   - text before the first tag is not an XML content at all, so it is meaningless
     to try to unescape something in it
 
+- [#180]: Define the same API for encoding whether feature `encoding` is defined or not.
+  Since then no more replacements does for invalid characters when `encoding` feature
+  is enabled. An error will be returned instead, as for the case when `encoding` is disabled.
+  If `encoding` feature is disabled, `Cow::Borrowed` is returned instead of `&str`.
+  If `encoding` feature is enabled, `Result<Cow>` is returned instead of `Cow`.
+- [#180]: Error variant `Error::Utf8` replaced by `Error::NonDecodable`
+
 ### New Tests
 
 - [#9]: Added tests for incorrect nested tags in input

diff --git a/src/de/escape.rs b/src/de/escape.rs
@@ -45,12 +45,8 @@ macro_rules! deserialize_num {
         where
             V: Visitor<'de>,
         {
-            #[cfg(not(feature = "encoding"))]
             let value = self.decoder.decode(self.escaped_value.as_ref())?.parse()?;
 
-            #[cfg(feature = "encoding")]
-            let value = self.decoder.decode(self.escaped_value.as_ref()).parse()?;
-
             visitor.$visit(value)
         }
     };
@@ -71,11 +67,8 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
         V: Visitor<'de>,
     {
         let unescaped = self.unescaped()?;
-        #[cfg(not(feature = "encoding"))]
         let value = self.decoder.decode(&unescaped)?;
 
-        #[cfg(feature = "encoding")]
-        let value = self.decoder.decode(&unescaped);
         visitor.visit_str(&value)
     }
 

diff --git a/src/de/mod.rs b/src/de/mod.rs
@@ -337,7 +337,7 @@ where
 {
     #[cfg(feature = "encoding")]
     {
-        let value = decoder.decode(value);
+        let value = decoder.decode(value)?;
         // No need to unescape because valid boolean representations cannot be escaped
         match value.as_ref() {
             "true" | "1" | "True" | "TRUE" | "t" | "Yes" | "YES" | "yes" | "y" => {

diff --git a/src/de/seq.rs b/src/de/seq.rs
@@ -14,11 +14,7 @@ pub fn not_in(
     start: &BytesStart,
     decoder: Decoder,
 ) -> Result<bool, DeError> {
-    #[cfg(not(feature = "encoding"))]
-    let tag = Cow::Borrowed(decoder.decode(start.name().into_inner())?);
-
-    #[cfg(feature = "encoding")]
-    let tag = decoder.decode(start.name().into_inner());
+    let tag = decoder.decode(start.name().into_inner())?;
 
     Ok(fields.iter().all(|&field| field != tag.as_ref()))
 }

diff --git a/src/errors.rs b/src/errors.rs
@@ -11,8 +11,9 @@ use std::string::FromUtf8Error;
 pub enum Error {
     /// IO error
     Io(::std::io::Error),
-    /// Utf8 error
-    Utf8(Utf8Error),
+    /// Input decoding error. If `encoding` feature is disabled, contains `None`,
+    /// otherwise contains the UTF-8 decoding error
+    NonDecodable(Option<Utf8Error>),
     /// Unexpected End of File
     UnexpectedEof(String),
     /// End event mismatch
@@ -47,10 +48,10 @@ impl From<::std::io::Error> for Error {
 }
 
 impl From<Utf8Error> for Error {
-    /// Creates a new `Error::Utf8` from the given error
+    /// Creates a new `Error::NonDecodable` from the given error
     #[inline]
     fn from(error: Utf8Error) -> Error {
-        Error::Utf8(error)
+        Error::NonDecodable(Some(error))
     }
 }
 
@@ -86,7 +87,8 @@ impl std::fmt::Display for Error {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
         match self {
             Error::Io(e) => write!(f, "I/O error: {}", e),
-            Error::Utf8(e) => write!(f, "UTF8 error: {}", e),
+            Error::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"),
+            Error::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e),
             Error::UnexpectedEof(e) => write!(f, "Unexpected EOF during reading {}", e),
             Error::EndEventMismatch { expected, found } => {
                 write!(f, "Expecting </{}> found </{}>", expected, found)
@@ -118,7 +120,7 @@ impl std::error::Error for Error {
     fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
         match self {
             Error::Io(e) => Some(e),
-            Error::Utf8(e) => Some(e),
+            Error::NonDecodable(Some(e)) => Some(e),
             Error::InvalidAttr(e) => Some(e),
             Error::EscapeError(e) => Some(e),
             _ => None,

diff --git a/src/events/attributes.rs b/src/events/attributes.rs
@@ -71,7 +71,7 @@ impl<'a> Attribute<'a> {
         do_unescape(&*self.value, custom_entities).map_err(Error::EscapeError)
     }
 
-    /// Decode then unescapes the value
+    /// NonDecodable then unescapes the value
     ///
     /// This allocates a `String` in all cases. For performance reasons it might be a better idea to
     /// instead use one of:
@@ -85,7 +85,7 @@ impl<'a> Attribute<'a> {
         self.do_unescape_and_decode_value(reader, None)
     }
 
-    /// Decode then unescapes the value with custom entities
+    /// NonDecodable then unescapes the value with custom entities
     ///
     /// This allocates a `String` in all cases. For performance reasons it might be a better idea to
     /// instead use one of:
@@ -113,10 +113,6 @@ impl<'a> Attribute<'a> {
         reader: &Reader<B>,
         custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
     ) -> XmlResult<String> {
-        #[cfg(feature = "encoding")]
-        let decoded = reader.decoder().decode(&*self.value);
-
-        #[cfg(not(feature = "encoding"))]
         let decoded = reader.decoder().decode(&*self.value)?;
 
         let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;

diff --git a/src/events/mod.rs b/src/events/mod.rs
@@ -85,10 +85,6 @@ impl<'a> BytesStartText<'a> {
     /// appeared in the BOM or in the text before the first tag.
     pub fn decode_with_bom_removal(&self, decoder: Decoder) -> Result<String> {
         //TODO: Fix lifetime issue - it should be possible to borrow string
-        #[cfg(feature = "encoding")]
-        let decoded = decoder.decode_with_bom_removal(&*self);
-
-        #[cfg(not(feature = "encoding"))]
         let decoded = decoder.decode_with_bom_removal(&*self)?;
 
         Ok(decoded.to_string())
@@ -317,10 +313,6 @@ impl<'a> BytesStart<'a> {
         reader: &Reader<B>,
         custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
     ) -> Result<String> {
-        #[cfg(feature = "encoding")]
-        let decoded = reader.decoder().decode(&*self);
-
-        #[cfg(not(feature = "encoding"))]
         let decoded = reader.decoder().decode(&*self)?;
 
         let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
@@ -880,10 +872,6 @@ impl<'a> BytesText<'a> {
         reader: &Reader<B>,
         custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
     ) -> Result<String> {
-        #[cfg(feature = "encoding")]
-        let decoded = reader.decoder().decode(&*self);
-
-        #[cfg(not(feature = "encoding"))]
         let decoded = reader.decoder().decode(&*self)?;
 
         let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
@@ -1000,21 +988,8 @@ impl<'a> BytesCData<'a> {
     #[cfg(feature = "serialize")]
     pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result<Cow<'a, str>> {
         Ok(match &self.content {
-            Cow::Borrowed(bytes) => {
-                #[cfg(feature = "encoding")]
-                {
-                    decoder.decode(bytes)
-                }
-                #[cfg(not(feature = "encoding"))]
-                {
-                    decoder.decode(bytes)?.into()
-                }
-            }
+            Cow::Borrowed(bytes) => decoder.decode(bytes)?,
             Cow::Owned(bytes) => {
-                #[cfg(feature = "encoding")]
-                let decoded = decoder.decode(bytes).into_owned();
-
-                #[cfg(not(feature = "encoding"))]
                 let decoded = decoder.decode(bytes)?.to_string();
 
                 decoded.into()

diff --git a/src/reader.rs b/src/reader.rs
@@ -1,6 +1,5 @@
 //! A module to handle `Reader`
 
-#[cfg(feature = "encoding")]
 use std::borrow::Cow;
 use std::io::{self, BufRead, BufReader};
 use std::{fs::File, path::Path, str::from_utf8};
@@ -1443,8 +1442,9 @@ impl Decoder {
     /// Returns an error in case of malformed sequences in the `bytes`.
     ///
     /// If you instead want to use XML declared encoding, use the `encoding` feature
-    pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> {
-        Ok(from_utf8(bytes)?)
+    #[inline]
+    pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
+        Ok(Cow::Borrowed(from_utf8(bytes)?))
     }
 
     /// Decodes a slice regardless of XML declaration with BOM removal if
@@ -1453,7 +1453,7 @@ impl Decoder {
     /// Returns an error in case of malformed sequences in the `bytes`.
     ///
     /// If you instead want to use XML declared encoding, use the `encoding` feature
-    pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<&'b str> {
+    pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
         let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") {
             &bytes[3..]
         } else {
@@ -1475,12 +1475,18 @@ impl Decoder {
     }
 
     /// Decodes specified bytes using encoding, declared in the XML, if it was
-    /// declared there, or UTF-8 otherwise
+    /// declared there, or UTF-8 otherwise, and ignoring BOM if it is present
+    /// in the `bytes`.
     ///
-    /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the
-    /// `U+FFFD REPLACEMENT CHARACTER`.
-    pub fn decode<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> {
-        self.encoding.decode(bytes).0
+    /// Returns an error in case of malformed sequences in the `bytes`.
+    pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
+        match self
+            .encoding
+            .decode_without_bom_handling_and_without_replacement(bytes)
+        {
+            None => Err(Error::NonDecodable(None)),
+            Some(s) => Ok(s),
+        }
     }
 
     /// Decodes a slice with BOM removal if it is present in the `bytes` using
@@ -1491,9 +1497,26 @@ impl Decoder {
     ///
     /// If XML declaration is absent in the XML, UTF-8 is used.
     ///
-    /// Any malformed sequences replaced with the `U+FFFD REPLACEMENT CHARACTER`.
-    pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> {
-        self.encoding.decode_with_bom_removal(bytes).0
+    /// Returns an error in case of malformed sequences in the `bytes`.
+    pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
+        self.decode(self.remove_bom(bytes))
+    }
+    /// Copied from [`Encoding::decode_with_bom_removal`]
+    #[inline]
+    fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] {
+        use encoding_rs::*;
+
+        if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
+            return &bytes[3..];
+        }
+        if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") {
+            return &bytes[2..];
+        }
+        if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") {
+            return &bytes[2..];
+        }
+
+        bytes
     }
 }
 

diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs
@@ -1,7 +1,6 @@
 use quick_xml::events::{BytesStart, Event};
 use quick_xml::name::{QName, ResolveResult};
-use quick_xml::{Reader, Result};
-use std::borrow::Cow;
+use quick_xml::{Decoder, Reader, Result};
 use std::str::from_utf8;
 
 #[test]
@@ -381,7 +380,7 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) {
     loop {
         buf.clear();
         let event = reader.read_namespaced_event(&mut buf, &mut ns_buffer);
-        let line = xmlrs_display(event, &reader);
+        let line = xmlrs_display(event, reader.decoder());
         if let Some((n, spec)) = spec_lines.next() {
             if spec.trim() == "EndDocument" {
                 break;
@@ -420,8 +419,8 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) {
     }
 }
 
-fn namespace_name(n: ResolveResult, name: QName, reader: &Reader<&[u8]>) -> String {
-    let name = decode(name.as_ref(), reader);
+fn namespace_name(n: ResolveResult, name: QName, decoder: Decoder) -> String {
+    let name = decoder.decode(name.as_ref()).unwrap();
     match n {
         // Produces string '{namespace}prefixed_name'
         ResolveResult::Bound(n) => format!("{{{}}}{}", from_utf8(n.as_ref()).unwrap(), name),
@@ -448,44 +447,33 @@ fn make_attrs(e: &BytesStart) -> ::std::result::Result<String, String> {
     Ok(atts.join(", "))
 }
 
-// FIXME: The public API differs based on the "encoding" feature
-fn decode<'a>(text: &'a [u8], reader: &Reader<&[u8]>) -> Cow<'a, str> {
-    #[cfg(feature = "encoding")]
-    let decoded = reader.decoder().decode(text);
-
-    #[cfg(not(feature = "encoding"))]
-    let decoded = Cow::Borrowed(reader.decoder().decode(text).unwrap());
-
-    decoded
-}
-
-fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8]>) -> String {
+fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, decoder: Decoder) -> String {
     match opt_event {
         Ok((_, Event::StartText(_))) => "StartText".to_string(),
         Ok((n, Event::Start(ref e))) => {
-            let name = namespace_name(n, e.name(), reader);
+            let name = namespace_name(n, e.name(), decoder);
             match make_attrs(e) {
                 Ok(ref attrs) if attrs.is_empty() => format!("StartElement({})", &name),
                 Ok(ref attrs) => format!("StartElement({} [{}])", &name, &attrs),
                 Err(e) => format!("StartElement({}, attr-error: {})", &name, &e),
             }
         }
         Ok((n, Event::Empty(ref e))) => {
-            let name = namespace_name(n, e.name(), reader);
+            let name = namespace_name(n, e.name(), decoder);
             match make_attrs(e) {
                 Ok(ref attrs) if attrs.is_empty() => format!("EmptyElement({})", &name),
                 Ok(ref attrs) => format!("EmptyElement({} [{}])", &name, &attrs),
                 Err(e) => format!("EmptyElement({}, attr-error: {})", &name, &e),
             }
         }
         Ok((n, Event::End(ref e))) => {
-            let name = namespace_name(n, e.name(), reader);
+            let name = namespace_name(n, e.name(), decoder);
             format!("EndElement({})", name)
         }
         Ok((_, Event::Comment(ref e))) => format!("Comment({})", from_utf8(e).unwrap()),
         Ok((_, Event::CData(ref e))) => format!("CData({})", from_utf8(e).unwrap()),
         Ok((_, Event::Text(ref e))) => match e.unescaped() {
-            Ok(c) => format!("Characters({})", decode(c.as_ref(), reader).as_ref()),
+            Ok(c) => format!("Characters({})", decoder.decode(c.as_ref()).unwrap()),
             Err(ref err) => format!("FailedUnescape({:?}; {})", e.escaped(), err),
         },
         Ok((_, Event::Decl(ref e))) => {