Skip to content

Commit

Permalink
Fix tafia#180: Eliminated the differences in the decoding API when fe…
Browse files Browse the repository at this point in the history
…ature `encoding` enabled and when it is disabled

Co-authored-by: Daniel Alley <dalley@redhat.com>
  • Loading branch information
Mingun and dralley committed Jun 20, 2022
1 parent be9ec0c commit d49a4b8
Show file tree
Hide file tree
Showing 9 changed files with 65 additions and 84 deletions.
10 changes: 10 additions & 0 deletions Changelog.md
Expand Up @@ -78,6 +78,16 @@
- text before the first tag is not an XML content at all, so it is meaningless
to try to unescape something in it

- [#180]: Eliminated the differences in the decoding API when feature `encoding` enabled and when it is
disabled. Signatures of functions are now the same regardless of whether or not the feature is
enabled, and an error will be returned instead of performing replacements for invalid characters
in both cases.

Previously, if the `encoding` feature was enabled, decoding functions would return `Result<Cow<&str>>`
while without this feature they would return `Result<&str>`. With this change, only `Result<Cow<&str>>`
is returned regardless of the status of the feature.
- [#180]: Error variant `Error::Utf8` replaced by `Error::NonDecodable`

### New Tests

- [#9]: Added tests for incorrect nested tags in input
Expand Down
7 changes: 0 additions & 7 deletions src/de/escape.rs
Expand Up @@ -45,12 +45,8 @@ macro_rules! deserialize_num {
where
V: Visitor<'de>,
{
#[cfg(not(feature = "encoding"))]
let value = self.decoder.decode(self.escaped_value.as_ref())?.parse()?;

#[cfg(feature = "encoding")]
let value = self.decoder.decode(self.escaped_value.as_ref()).parse()?;

visitor.$visit(value)
}
};
Expand All @@ -71,11 +67,8 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
V: Visitor<'de>,
{
let unescaped = self.unescaped()?;
#[cfg(not(feature = "encoding"))]
let value = self.decoder.decode(&unescaped)?;

#[cfg(feature = "encoding")]
let value = self.decoder.decode(&unescaped);
visitor.visit_str(&value)
}

Expand Down
2 changes: 1 addition & 1 deletion src/de/mod.rs
Expand Up @@ -337,7 +337,7 @@ where
{
#[cfg(feature = "encoding")]
{
let value = decoder.decode(value);
let value = decoder.decode(value)?;
// No need to unescape because valid boolean representations cannot be escaped
match value.as_ref() {
"true" | "1" | "True" | "TRUE" | "t" | "Yes" | "YES" | "yes" | "y" => {
Expand Down
8 changes: 1 addition & 7 deletions src/de/seq.rs
Expand Up @@ -2,8 +2,6 @@ use crate::de::{DeError, DeEvent, Deserializer, XmlRead};
use crate::events::BytesStart;
use crate::reader::Decoder;
use serde::de::{DeserializeSeed, SeqAccess};
#[cfg(not(feature = "encoding"))]
use std::borrow::Cow;

/// Check if tag `start` is included in the `fields` list. `decoder` is used to
/// get a string representation of a tag.
Expand All @@ -14,11 +12,7 @@ pub fn not_in(
start: &BytesStart,
decoder: Decoder,
) -> Result<bool, DeError> {
#[cfg(not(feature = "encoding"))]
let tag = Cow::Borrowed(decoder.decode(start.name().into_inner())?);

#[cfg(feature = "encoding")]
let tag = decoder.decode(start.name().into_inner());
let tag = decoder.decode(start.name().into_inner())?;

Ok(fields.iter().all(|&field| field != tag.as_ref()))
}
Expand Down
14 changes: 8 additions & 6 deletions src/errors.rs
Expand Up @@ -11,8 +11,9 @@ use std::string::FromUtf8Error;
pub enum Error {
/// IO error
Io(::std::io::Error),
/// Utf8 error
Utf8(Utf8Error),
/// Input decoding error. If `encoding` feature is disabled, contains `None`,
/// otherwise contains the UTF-8 decoding error
NonDecodable(Option<Utf8Error>),
/// Unexpected End of File
UnexpectedEof(String),
/// End event mismatch
Expand Down Expand Up @@ -47,10 +48,10 @@ impl From<::std::io::Error> for Error {
}

impl From<Utf8Error> for Error {
/// Creates a new `Error::Utf8` from the given error
/// Creates a new `Error::NonDecodable` from the given error
#[inline]
fn from(error: Utf8Error) -> Error {
Error::Utf8(error)
Error::NonDecodable(Some(error))
}
}

Expand Down Expand Up @@ -86,7 +87,8 @@ impl std::fmt::Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Error::Io(e) => write!(f, "I/O error: {}", e),
Error::Utf8(e) => write!(f, "UTF8 error: {}", e),
Error::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"),
Error::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e),
Error::UnexpectedEof(e) => write!(f, "Unexpected EOF during reading {}", e),
Error::EndEventMismatch { expected, found } => {
write!(f, "Expecting </{}> found </{}>", expected, found)
Expand Down Expand Up @@ -118,7 +120,7 @@ impl std::error::Error for Error {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Error::Io(e) => Some(e),
Error::Utf8(e) => Some(e),
Error::NonDecodable(Some(e)) => Some(e),
Error::InvalidAttr(e) => Some(e),
Error::EscapeError(e) => Some(e),
_ => None,
Expand Down
4 changes: 0 additions & 4 deletions src/events/attributes.rs
Expand Up @@ -113,10 +113,6 @@ impl<'a> Attribute<'a> {
reader: &Reader<B>,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> XmlResult<String> {
#[cfg(feature = "encoding")]
let decoded = reader.decoder().decode(&*self.value);

#[cfg(not(feature = "encoding"))]
let decoded = reader.decoder().decode(&*self.value)?;

let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
Expand Down
27 changes: 1 addition & 26 deletions src/events/mod.rs
Expand Up @@ -85,10 +85,6 @@ impl<'a> BytesStartText<'a> {
/// appeared in the BOM or in the text before the first tag.
pub fn decode_with_bom_removal(&self, decoder: Decoder) -> Result<String> {
//TODO: Fix lifetime issue - it should be possible to borrow string
#[cfg(feature = "encoding")]
let decoded = decoder.decode_with_bom_removal(&*self);

#[cfg(not(feature = "encoding"))]
let decoded = decoder.decode_with_bom_removal(&*self)?;

Ok(decoded.to_string())
Expand Down Expand Up @@ -317,10 +313,6 @@ impl<'a> BytesStart<'a> {
reader: &Reader<B>,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> Result<String> {
#[cfg(feature = "encoding")]
let decoded = reader.decoder().decode(&*self);

#[cfg(not(feature = "encoding"))]
let decoded = reader.decoder().decode(&*self)?;

let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
Expand Down Expand Up @@ -880,10 +872,6 @@ impl<'a> BytesText<'a> {
reader: &Reader<B>,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> Result<String> {
#[cfg(feature = "encoding")]
let decoded = reader.decoder().decode(&*self);

#[cfg(not(feature = "encoding"))]
let decoded = reader.decoder().decode(&*self)?;

let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
Expand Down Expand Up @@ -1000,21 +988,8 @@ impl<'a> BytesCData<'a> {
#[cfg(feature = "serialize")]
pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result<Cow<'a, str>> {
Ok(match &self.content {
Cow::Borrowed(bytes) => {
#[cfg(feature = "encoding")]
{
decoder.decode(bytes)
}
#[cfg(not(feature = "encoding"))]
{
decoder.decode(bytes)?.into()
}
}
Cow::Borrowed(bytes) => decoder.decode(bytes)?,
Cow::Owned(bytes) => {
#[cfg(feature = "encoding")]
let decoded = decoder.decode(bytes).into_owned();

#[cfg(not(feature = "encoding"))]
let decoded = decoder.decode(bytes)?.to_string();

decoded.into()
Expand Down
47 changes: 35 additions & 12 deletions src/reader.rs
@@ -1,6 +1,5 @@
//! A module to handle `Reader`

#[cfg(feature = "encoding")]
use std::borrow::Cow;
use std::io::{self, BufRead, BufReader};
use std::{fs::File, path::Path, str::from_utf8};
Expand Down Expand Up @@ -1472,8 +1471,9 @@ impl Decoder {
/// Returns an error in case of malformed sequences in the `bytes`.
///
/// If you instead want to use XML declared encoding, use the `encoding` feature
pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> {
Ok(from_utf8(bytes)?)
#[inline]
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
Ok(Cow::Borrowed(from_utf8(bytes)?))
}

/// Decodes a slice regardless of XML declaration with BOM removal if
Expand All @@ -1482,7 +1482,7 @@ impl Decoder {
/// Returns an error in case of malformed sequences in the `bytes`.
///
/// If you instead want to use XML declared encoding, use the `encoding` feature
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<&'b str> {
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") {
&bytes[3..]
} else {
Expand All @@ -1504,12 +1504,18 @@ impl Decoder {
}

/// Decodes specified bytes using encoding, declared in the XML, if it was
/// declared there, or UTF-8 otherwise
/// declared there, or UTF-8 otherwise, and ignoring BOM if it is present
/// in the `bytes`.
///
/// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the
/// `U+FFFD REPLACEMENT CHARACTER`.
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> {
self.encoding.decode(bytes).0
/// Returns an error in case of malformed sequences in the `bytes`.
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
match self
.encoding
.decode_without_bom_handling_and_without_replacement(bytes)
{
None => Err(Error::NonDecodable(None)),
Some(s) => Ok(s),
}
}

/// Decodes a slice with BOM removal if it is present in the `bytes` using
Expand All @@ -1520,9 +1526,26 @@ impl Decoder {
///
/// If XML declaration is absent in the XML, UTF-8 is used.
///
/// Any malformed sequences replaced with the `U+FFFD REPLACEMENT CHARACTER`.
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> {
self.encoding.decode_with_bom_removal(bytes).0
/// Returns an error in case of malformed sequences in the `bytes`.
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
self.decode(self.remove_bom(bytes))
}
/// Copied from [`Encoding::decode_with_bom_removal`]
#[inline]
fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] {
use encoding_rs::*;

if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
return &bytes[3..];
}
if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") {
return &bytes[2..];
}
if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") {
return &bytes[2..];
}

bytes
}
}

Expand Down
30 changes: 9 additions & 21 deletions tests/xmlrs_reader_tests.rs
@@ -1,7 +1,6 @@
use quick_xml::events::{BytesStart, Event};
use quick_xml::name::{QName, ResolveResult};
use quick_xml::{Reader, Result};
use std::borrow::Cow;
use quick_xml::{Decoder, Reader, Result};
use std::str::from_utf8;

#[test]
Expand Down Expand Up @@ -381,7 +380,7 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) {
loop {
buf.clear();
let event = reader.read_namespaced_event(&mut buf, &mut ns_buffer);
let line = xmlrs_display(event, &reader);
let line = xmlrs_display(event, reader.decoder());
if let Some((n, spec)) = spec_lines.next() {
if spec.trim() == "EndDocument" {
break;
Expand Down Expand Up @@ -420,8 +419,8 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) {
}
}

fn namespace_name(n: ResolveResult, name: QName, reader: &Reader<&[u8]>) -> String {
let name = decode(name.as_ref(), reader);
fn namespace_name(n: ResolveResult, name: QName, decoder: Decoder) -> String {
let name = decoder.decode(name.as_ref()).unwrap();
match n {
// Produces string '{namespace}prefixed_name'
ResolveResult::Bound(n) => format!("{{{}}}{}", from_utf8(n.as_ref()).unwrap(), name),
Expand All @@ -448,44 +447,33 @@ fn make_attrs(e: &BytesStart) -> ::std::result::Result<String, String> {
Ok(atts.join(", "))
}

// FIXME: The public API differs based on the "encoding" feature
fn decode<'a>(text: &'a [u8], reader: &Reader<&[u8]>) -> Cow<'a, str> {
#[cfg(feature = "encoding")]
let decoded = reader.decoder().decode(text);

#[cfg(not(feature = "encoding"))]
let decoded = Cow::Borrowed(reader.decoder().decode(text).unwrap());

decoded
}

fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8]>) -> String {
fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, decoder: Decoder) -> String {
match opt_event {
Ok((_, Event::StartText(_))) => "StartText".to_string(),
Ok((n, Event::Start(ref e))) => {
let name = namespace_name(n, e.name(), reader);
let name = namespace_name(n, e.name(), decoder);
match make_attrs(e) {
Ok(ref attrs) if attrs.is_empty() => format!("StartElement({})", &name),
Ok(ref attrs) => format!("StartElement({} [{}])", &name, &attrs),
Err(e) => format!("StartElement({}, attr-error: {})", &name, &e),
}
}
Ok((n, Event::Empty(ref e))) => {
let name = namespace_name(n, e.name(), reader);
let name = namespace_name(n, e.name(), decoder);
match make_attrs(e) {
Ok(ref attrs) if attrs.is_empty() => format!("EmptyElement({})", &name),
Ok(ref attrs) => format!("EmptyElement({} [{}])", &name, &attrs),
Err(e) => format!("EmptyElement({}, attr-error: {})", &name, &e),
}
}
Ok((n, Event::End(ref e))) => {
let name = namespace_name(n, e.name(), reader);
let name = namespace_name(n, e.name(), decoder);
format!("EndElement({})", name)
}
Ok((_, Event::Comment(ref e))) => format!("Comment({})", from_utf8(e).unwrap()),
Ok((_, Event::CData(ref e))) => format!("CData({})", from_utf8(e).unwrap()),
Ok((_, Event::Text(ref e))) => match e.unescaped() {
Ok(c) => format!("Characters({})", decode(c.as_ref(), reader).as_ref()),
Ok(c) => format!("Characters({})", decoder.decode(c.as_ref()).unwrap()),
Err(ref err) => format!("FailedUnescape({:?}; {})", e.escaped(), err),
},
Ok((_, Event::Decl(ref e))) => {
Expand Down

0 comments on commit d49a4b8

Please sign in to comment.