From 61cd193615b32829f6a452e93bfa30d2c2c4ec0c Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 19 Jun 2022 02:31:25 +0500 Subject: [PATCH] Fix #180: Define the same API for encoding whether feature `encoding` is defined or not --- Changelog.md | 7 ++++++ src/de/escape.rs | 7 ------ src/de/mod.rs | 2 +- src/de/seq.rs | 6 +---- src/errors.rs | 14 ++++++----- src/events/attributes.rs | 8 ++----- src/events/mod.rs | 27 +-------------------- src/reader.rs | 47 +++++++++++++++++++++++++++---------- tests/xmlrs_reader_tests.rs | 30 +++++++---------------- 9 files changed, 64 insertions(+), 84 deletions(-) diff --git a/Changelog.md b/Changelog.md index f83cd351..6c271492 100644 --- a/Changelog.md +++ b/Changelog.md @@ -78,6 +78,13 @@ - text before the first tag is not an XML content at all, so it is meaningless to try to unescape something in it +- [#180]: Define the same API for encoding whether feature `encoding` is defined or not. + Since then no more replacements does for invalid characters when `encoding` feature + is enabled. An error will be returned instead, as for the case when `encoding` is disabled. + If `encoding` feature is disabled, `Cow::Borrowed` is returned instead of `&str`. + If `encoding` feature is enabled, `Result` is returned instead of `Cow`. +- [#180]: Error variant `Error::Utf8` replaced by `Error::NonDecodable` + ### New Tests - [#9]: Added tests for incorrect nested tags in input diff --git a/src/de/escape.rs b/src/de/escape.rs index 28f99364..e47109e0 100644 --- a/src/de/escape.rs +++ b/src/de/escape.rs @@ -45,12 +45,8 @@ macro_rules! deserialize_num { where V: Visitor<'de>, { - #[cfg(not(feature = "encoding"))] let value = self.decoder.decode(self.escaped_value.as_ref())?.parse()?; - #[cfg(feature = "encoding")] - let value = self.decoder.decode(self.escaped_value.as_ref()).parse()?; - visitor.$visit(value) } }; @@ -71,11 +67,8 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> { V: Visitor<'de>, { let unescaped = self.unescaped()?; - #[cfg(not(feature = "encoding"))] let value = self.decoder.decode(&unescaped)?; - #[cfg(feature = "encoding")] - let value = self.decoder.decode(&unescaped); visitor.visit_str(&value) } diff --git a/src/de/mod.rs b/src/de/mod.rs index 53c5ffd0..e252ada7 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -337,7 +337,7 @@ where { #[cfg(feature = "encoding")] { - let value = decoder.decode(value); + let value = decoder.decode(value)?; // No need to unescape because valid boolean representations cannot be escaped match value.as_ref() { "true" | "1" | "True" | "TRUE" | "t" | "Yes" | "YES" | "yes" | "y" => { diff --git a/src/de/seq.rs b/src/de/seq.rs index 958f172d..0e19c909 100644 --- a/src/de/seq.rs +++ b/src/de/seq.rs @@ -14,11 +14,7 @@ pub fn not_in( start: &BytesStart, decoder: Decoder, ) -> Result { - #[cfg(not(feature = "encoding"))] - let tag = Cow::Borrowed(decoder.decode(start.name().into_inner())?); - - #[cfg(feature = "encoding")] - let tag = decoder.decode(start.name().into_inner()); + let tag = decoder.decode(start.name().into_inner())?; Ok(fields.iter().all(|&field| field != tag.as_ref())) } diff --git a/src/errors.rs b/src/errors.rs index 01c960c0..c1af08af 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -11,8 +11,9 @@ use std::string::FromUtf8Error; pub enum Error { /// IO error Io(::std::io::Error), - /// Utf8 error - Utf8(Utf8Error), + /// Input decoding error. If `encoding` feature is disabled, contains `None`, + /// otherwise contains the UTF-8 decoding error + NonDecodable(Option), /// Unexpected End of File UnexpectedEof(String), /// End event mismatch @@ -47,10 +48,10 @@ impl From<::std::io::Error> for Error { } impl From for Error { - /// Creates a new `Error::Utf8` from the given error + /// Creates a new `Error::NonDecodable` from the given error #[inline] fn from(error: Utf8Error) -> Error { - Error::Utf8(error) + Error::NonDecodable(Some(error)) } } @@ -86,7 +87,8 @@ impl std::fmt::Display for Error { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { Error::Io(e) => write!(f, "I/O error: {}", e), - Error::Utf8(e) => write!(f, "UTF8 error: {}", e), + Error::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"), + Error::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e), Error::UnexpectedEof(e) => write!(f, "Unexpected EOF during reading {}", e), Error::EndEventMismatch { expected, found } => { write!(f, "Expecting found ", expected, found) @@ -118,7 +120,7 @@ impl std::error::Error for Error { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { Error::Io(e) => Some(e), - Error::Utf8(e) => Some(e), + Error::NonDecodable(Some(e)) => Some(e), Error::InvalidAttr(e) => Some(e), Error::EscapeError(e) => Some(e), _ => None, diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 1e059d5e..61082745 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -71,7 +71,7 @@ impl<'a> Attribute<'a> { do_unescape(&*self.value, custom_entities).map_err(Error::EscapeError) } - /// Decode then unescapes the value + /// NonDecodable then unescapes the value /// /// This allocates a `String` in all cases. For performance reasons it might be a better idea to /// instead use one of: @@ -85,7 +85,7 @@ impl<'a> Attribute<'a> { self.do_unescape_and_decode_value(reader, None) } - /// Decode then unescapes the value with custom entities + /// NonDecodable then unescapes the value with custom entities /// /// This allocates a `String` in all cases. For performance reasons it might be a better idea to /// instead use one of: @@ -113,10 +113,6 @@ impl<'a> Attribute<'a> { reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> XmlResult { - #[cfg(feature = "encoding")] - let decoded = reader.decoder().decode(&*self.value); - - #[cfg(not(feature = "encoding"))] let decoded = reader.decoder().decode(&*self.value)?; let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; diff --git a/src/events/mod.rs b/src/events/mod.rs index e4865c3a..0d799e7a 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -85,10 +85,6 @@ impl<'a> BytesStartText<'a> { /// appeared in the BOM or in the text before the first tag. pub fn decode_with_bom_removal(&self, decoder: Decoder) -> Result { //TODO: Fix lifetime issue - it should be possible to borrow string - #[cfg(feature = "encoding")] - let decoded = decoder.decode_with_bom_removal(&*self); - - #[cfg(not(feature = "encoding"))] let decoded = decoder.decode_with_bom_removal(&*self)?; Ok(decoded.to_string()) @@ -317,10 +313,6 @@ impl<'a> BytesStart<'a> { reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> Result { - #[cfg(feature = "encoding")] - let decoded = reader.decoder().decode(&*self); - - #[cfg(not(feature = "encoding"))] let decoded = reader.decoder().decode(&*self)?; let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; @@ -880,10 +872,6 @@ impl<'a> BytesText<'a> { reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> Result { - #[cfg(feature = "encoding")] - let decoded = reader.decoder().decode(&*self); - - #[cfg(not(feature = "encoding"))] let decoded = reader.decoder().decode(&*self)?; let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; @@ -1000,21 +988,8 @@ impl<'a> BytesCData<'a> { #[cfg(feature = "serialize")] pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result> { Ok(match &self.content { - Cow::Borrowed(bytes) => { - #[cfg(feature = "encoding")] - { - decoder.decode(bytes) - } - #[cfg(not(feature = "encoding"))] - { - decoder.decode(bytes)?.into() - } - } + Cow::Borrowed(bytes) => decoder.decode(bytes)?, Cow::Owned(bytes) => { - #[cfg(feature = "encoding")] - let decoded = decoder.decode(bytes).into_owned(); - - #[cfg(not(feature = "encoding"))] let decoded = decoder.decode(bytes)?.to_string(); decoded.into() diff --git a/src/reader.rs b/src/reader.rs index a5b4c600..8dcce2ab 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1,6 +1,5 @@ //! A module to handle `Reader` -#[cfg(feature = "encoding")] use std::borrow::Cow; use std::io::{self, BufRead, BufReader}; use std::{fs::File, path::Path, str::from_utf8}; @@ -1443,8 +1442,9 @@ impl Decoder { /// Returns an error in case of malformed sequences in the `bytes`. /// /// If you instead want to use XML declared encoding, use the `encoding` feature - pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { - Ok(from_utf8(bytes)?) + #[inline] + pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> { + Ok(Cow::Borrowed(from_utf8(bytes)?)) } /// Decodes a slice regardless of XML declaration with BOM removal if @@ -1453,7 +1453,7 @@ impl Decoder { /// Returns an error in case of malformed sequences in the `bytes`. /// /// If you instead want to use XML declared encoding, use the `encoding` feature - pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<&'b str> { + pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> { let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") { &bytes[3..] } else { @@ -1475,12 +1475,18 @@ impl Decoder { } /// Decodes specified bytes using encoding, declared in the XML, if it was - /// declared there, or UTF-8 otherwise + /// declared there, or UTF-8 otherwise, and ignoring BOM if it is present + /// in the `bytes`. /// - /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. - pub fn decode<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> { - self.encoding.decode(bytes).0 + /// Returns an error in case of malformed sequences in the `bytes`. + pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> { + match self + .encoding + .decode_without_bom_handling_and_without_replacement(bytes) + { + None => Err(Error::NonDecodable(None)), + Some(s) => Ok(s), + } } /// Decodes a slice with BOM removal if it is present in the `bytes` using @@ -1491,9 +1497,26 @@ impl Decoder { /// /// If XML declaration is absent in the XML, UTF-8 is used. /// - /// Any malformed sequences replaced with the `U+FFFD REPLACEMENT CHARACTER`. - pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> { - self.encoding.decode_with_bom_removal(bytes).0 + /// Returns an error in case of malformed sequences in the `bytes`. + pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> { + self.decode(self.remove_bom(bytes)) + } + /// Copied from [`Encoding::decode_with_bom_removal`] + #[inline] + fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] { + use encoding_rs::*; + + if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") { + return &bytes[3..]; + } + if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") { + return &bytes[2..]; + } + if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") { + return &bytes[2..]; + } + + bytes } } diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index a3f66bcb..a23bb03c 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -1,7 +1,6 @@ use quick_xml::events::{BytesStart, Event}; use quick_xml::name::{QName, ResolveResult}; -use quick_xml::{Reader, Result}; -use std::borrow::Cow; +use quick_xml::{Decoder, Reader, Result}; use std::str::from_utf8; #[test] @@ -381,7 +380,7 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) { loop { buf.clear(); let event = reader.read_namespaced_event(&mut buf, &mut ns_buffer); - let line = xmlrs_display(event, &reader); + let line = xmlrs_display(event, reader.decoder()); if let Some((n, spec)) = spec_lines.next() { if spec.trim() == "EndDocument" { break; @@ -420,8 +419,8 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) { } } -fn namespace_name(n: ResolveResult, name: QName, reader: &Reader<&[u8]>) -> String { - let name = decode(name.as_ref(), reader); +fn namespace_name(n: ResolveResult, name: QName, decoder: Decoder) -> String { + let name = decoder.decode(name.as_ref()).unwrap(); match n { // Produces string '{namespace}prefixed_name' ResolveResult::Bound(n) => format!("{{{}}}{}", from_utf8(n.as_ref()).unwrap(), name), @@ -448,22 +447,11 @@ fn make_attrs(e: &BytesStart) -> ::std::result::Result { Ok(atts.join(", ")) } -// FIXME: The public API differs based on the "encoding" feature -fn decode<'a>(text: &'a [u8], reader: &Reader<&[u8]>) -> Cow<'a, str> { - #[cfg(feature = "encoding")] - let decoded = reader.decoder().decode(text); - - #[cfg(not(feature = "encoding"))] - let decoded = Cow::Borrowed(reader.decoder().decode(text).unwrap()); - - decoded -} - -fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8]>) -> String { +fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, decoder: Decoder) -> String { match opt_event { Ok((_, Event::StartText(_))) => "StartText".to_string(), Ok((n, Event::Start(ref e))) => { - let name = namespace_name(n, e.name(), reader); + let name = namespace_name(n, e.name(), decoder); match make_attrs(e) { Ok(ref attrs) if attrs.is_empty() => format!("StartElement({})", &name), Ok(ref attrs) => format!("StartElement({} [{}])", &name, &attrs), @@ -471,7 +459,7 @@ fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8 } } Ok((n, Event::Empty(ref e))) => { - let name = namespace_name(n, e.name(), reader); + let name = namespace_name(n, e.name(), decoder); match make_attrs(e) { Ok(ref attrs) if attrs.is_empty() => format!("EmptyElement({})", &name), Ok(ref attrs) => format!("EmptyElement({} [{}])", &name, &attrs), @@ -479,13 +467,13 @@ fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8 } } Ok((n, Event::End(ref e))) => { - let name = namespace_name(n, e.name(), reader); + let name = namespace_name(n, e.name(), decoder); format!("EndElement({})", name) } Ok((_, Event::Comment(ref e))) => format!("Comment({})", from_utf8(e).unwrap()), Ok((_, Event::CData(ref e))) => format!("CData({})", from_utf8(e).unwrap()), Ok((_, Event::Text(ref e))) => match e.unescaped() { - Ok(c) => format!("Characters({})", decode(c.as_ref(), reader).as_ref()), + Ok(c) => format!("Characters({})", decoder.decode(c.as_ref()).unwrap()), Err(ref err) => format!("FailedUnescape({:?}; {})", e.escaped(), err), }, Ok((_, Event::Decl(ref e))) => {