Skip to content

Commit

Permalink
Fix tafia#180: Define the same API for encoding whether feature `enco…
Browse files Browse the repository at this point in the history
…ding` is defined or not
  • Loading branch information
Mingun committed Jun 19, 2022
1 parent 93d412f commit 61cd193
Show file tree
Hide file tree
Showing 9 changed files with 64 additions and 84 deletions.
7 changes: 7 additions & 0 deletions Changelog.md
Expand Up @@ -78,6 +78,13 @@
- text before the first tag is not an XML content at all, so it is meaningless
to try to unescape something in it

- [#180]: Define the same API for encoding whether feature `encoding` is defined or not.
Since then no more replacements does for invalid characters when `encoding` feature
is enabled. An error will be returned instead, as for the case when `encoding` is disabled.
If `encoding` feature is disabled, `Cow::Borrowed` is returned instead of `&str`.
If `encoding` feature is enabled, `Result<Cow>` is returned instead of `Cow`.
- [#180]: Error variant `Error::Utf8` replaced by `Error::NonDecodable`

### New Tests

- [#9]: Added tests for incorrect nested tags in input
Expand Down
7 changes: 0 additions & 7 deletions src/de/escape.rs
Expand Up @@ -45,12 +45,8 @@ macro_rules! deserialize_num {
where
V: Visitor<'de>,
{
#[cfg(not(feature = "encoding"))]
let value = self.decoder.decode(self.escaped_value.as_ref())?.parse()?;

#[cfg(feature = "encoding")]
let value = self.decoder.decode(self.escaped_value.as_ref()).parse()?;

visitor.$visit(value)
}
};
Expand All @@ -71,11 +67,8 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
V: Visitor<'de>,
{
let unescaped = self.unescaped()?;
#[cfg(not(feature = "encoding"))]
let value = self.decoder.decode(&unescaped)?;

#[cfg(feature = "encoding")]
let value = self.decoder.decode(&unescaped);
visitor.visit_str(&value)
}

Expand Down
2 changes: 1 addition & 1 deletion src/de/mod.rs
Expand Up @@ -337,7 +337,7 @@ where
{
#[cfg(feature = "encoding")]
{
let value = decoder.decode(value);
let value = decoder.decode(value)?;
// No need to unescape because valid boolean representations cannot be escaped
match value.as_ref() {
"true" | "1" | "True" | "TRUE" | "t" | "Yes" | "YES" | "yes" | "y" => {
Expand Down
6 changes: 1 addition & 5 deletions src/de/seq.rs
Expand Up @@ -14,11 +14,7 @@ pub fn not_in(
start: &BytesStart,
decoder: Decoder,
) -> Result<bool, DeError> {
#[cfg(not(feature = "encoding"))]
let tag = Cow::Borrowed(decoder.decode(start.name().into_inner())?);

#[cfg(feature = "encoding")]
let tag = decoder.decode(start.name().into_inner());
let tag = decoder.decode(start.name().into_inner())?;

Ok(fields.iter().all(|&field| field != tag.as_ref()))
}
Expand Down
14 changes: 8 additions & 6 deletions src/errors.rs
Expand Up @@ -11,8 +11,9 @@ use std::string::FromUtf8Error;
pub enum Error {
/// IO error
Io(::std::io::Error),
/// Utf8 error
Utf8(Utf8Error),
/// Input decoding error. If `encoding` feature is disabled, contains `None`,
/// otherwise contains the UTF-8 decoding error
NonDecodable(Option<Utf8Error>),
/// Unexpected End of File
UnexpectedEof(String),
/// End event mismatch
Expand Down Expand Up @@ -47,10 +48,10 @@ impl From<::std::io::Error> for Error {
}

impl From<Utf8Error> for Error {
/// Creates a new `Error::Utf8` from the given error
/// Creates a new `Error::NonDecodable` from the given error
#[inline]
fn from(error: Utf8Error) -> Error {
Error::Utf8(error)
Error::NonDecodable(Some(error))
}
}

Expand Down Expand Up @@ -86,7 +87,8 @@ impl std::fmt::Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Error::Io(e) => write!(f, "I/O error: {}", e),
Error::Utf8(e) => write!(f, "UTF8 error: {}", e),
Error::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"),
Error::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e),
Error::UnexpectedEof(e) => write!(f, "Unexpected EOF during reading {}", e),
Error::EndEventMismatch { expected, found } => {
write!(f, "Expecting </{}> found </{}>", expected, found)
Expand Down Expand Up @@ -118,7 +120,7 @@ impl std::error::Error for Error {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Error::Io(e) => Some(e),
Error::Utf8(e) => Some(e),
Error::NonDecodable(Some(e)) => Some(e),
Error::InvalidAttr(e) => Some(e),
Error::EscapeError(e) => Some(e),
_ => None,
Expand Down
8 changes: 2 additions & 6 deletions src/events/attributes.rs
Expand Up @@ -71,7 +71,7 @@ impl<'a> Attribute<'a> {
do_unescape(&*self.value, custom_entities).map_err(Error::EscapeError)
}

/// Decode then unescapes the value
/// NonDecodable then unescapes the value
///
/// This allocates a `String` in all cases. For performance reasons it might be a better idea to
/// instead use one of:
Expand All @@ -85,7 +85,7 @@ impl<'a> Attribute<'a> {
self.do_unescape_and_decode_value(reader, None)
}

/// Decode then unescapes the value with custom entities
/// NonDecodable then unescapes the value with custom entities
///
/// This allocates a `String` in all cases. For performance reasons it might be a better idea to
/// instead use one of:
Expand Down Expand Up @@ -113,10 +113,6 @@ impl<'a> Attribute<'a> {
reader: &Reader<B>,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> XmlResult<String> {
#[cfg(feature = "encoding")]
let decoded = reader.decoder().decode(&*self.value);

#[cfg(not(feature = "encoding"))]
let decoded = reader.decoder().decode(&*self.value)?;

let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
Expand Down
27 changes: 1 addition & 26 deletions src/events/mod.rs
Expand Up @@ -85,10 +85,6 @@ impl<'a> BytesStartText<'a> {
/// appeared in the BOM or in the text before the first tag.
pub fn decode_with_bom_removal(&self, decoder: Decoder) -> Result<String> {
//TODO: Fix lifetime issue - it should be possible to borrow string
#[cfg(feature = "encoding")]
let decoded = decoder.decode_with_bom_removal(&*self);

#[cfg(not(feature = "encoding"))]
let decoded = decoder.decode_with_bom_removal(&*self)?;

Ok(decoded.to_string())
Expand Down Expand Up @@ -317,10 +313,6 @@ impl<'a> BytesStart<'a> {
reader: &Reader<B>,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> Result<String> {
#[cfg(feature = "encoding")]
let decoded = reader.decoder().decode(&*self);

#[cfg(not(feature = "encoding"))]
let decoded = reader.decoder().decode(&*self)?;

let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
Expand Down Expand Up @@ -880,10 +872,6 @@ impl<'a> BytesText<'a> {
reader: &Reader<B>,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> Result<String> {
#[cfg(feature = "encoding")]
let decoded = reader.decoder().decode(&*self);

#[cfg(not(feature = "encoding"))]
let decoded = reader.decoder().decode(&*self)?;

let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
Expand Down Expand Up @@ -1000,21 +988,8 @@ impl<'a> BytesCData<'a> {
#[cfg(feature = "serialize")]
pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result<Cow<'a, str>> {
Ok(match &self.content {
Cow::Borrowed(bytes) => {
#[cfg(feature = "encoding")]
{
decoder.decode(bytes)
}
#[cfg(not(feature = "encoding"))]
{
decoder.decode(bytes)?.into()
}
}
Cow::Borrowed(bytes) => decoder.decode(bytes)?,
Cow::Owned(bytes) => {
#[cfg(feature = "encoding")]
let decoded = decoder.decode(bytes).into_owned();

#[cfg(not(feature = "encoding"))]
let decoded = decoder.decode(bytes)?.to_string();

decoded.into()
Expand Down
47 changes: 35 additions & 12 deletions src/reader.rs
@@ -1,6 +1,5 @@
//! A module to handle `Reader`

#[cfg(feature = "encoding")]
use std::borrow::Cow;
use std::io::{self, BufRead, BufReader};
use std::{fs::File, path::Path, str::from_utf8};
Expand Down Expand Up @@ -1443,8 +1442,9 @@ impl Decoder {
/// Returns an error in case of malformed sequences in the `bytes`.
///
/// If you instead want to use XML declared encoding, use the `encoding` feature
pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> {
Ok(from_utf8(bytes)?)
#[inline]
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
Ok(Cow::Borrowed(from_utf8(bytes)?))
}

/// Decodes a slice regardless of XML declaration with BOM removal if
Expand All @@ -1453,7 +1453,7 @@ impl Decoder {
/// Returns an error in case of malformed sequences in the `bytes`.
///
/// If you instead want to use XML declared encoding, use the `encoding` feature
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<&'b str> {
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") {
&bytes[3..]
} else {
Expand All @@ -1475,12 +1475,18 @@ impl Decoder {
}

/// Decodes specified bytes using encoding, declared in the XML, if it was
/// declared there, or UTF-8 otherwise
/// declared there, or UTF-8 otherwise, and ignoring BOM if it is present
/// in the `bytes`.
///
/// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the
/// `U+FFFD REPLACEMENT CHARACTER`.
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> {
self.encoding.decode(bytes).0
/// Returns an error in case of malformed sequences in the `bytes`.
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
match self
.encoding
.decode_without_bom_handling_and_without_replacement(bytes)
{
None => Err(Error::NonDecodable(None)),
Some(s) => Ok(s),
}
}

/// Decodes a slice with BOM removal if it is present in the `bytes` using
Expand All @@ -1491,9 +1497,26 @@ impl Decoder {
///
/// If XML declaration is absent in the XML, UTF-8 is used.
///
/// Any malformed sequences replaced with the `U+FFFD REPLACEMENT CHARACTER`.
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> {
self.encoding.decode_with_bom_removal(bytes).0
/// Returns an error in case of malformed sequences in the `bytes`.
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
self.decode(self.remove_bom(bytes))
}
/// Copied from [`Encoding::decode_with_bom_removal`]
#[inline]
fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] {
use encoding_rs::*;

if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
return &bytes[3..];
}
if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") {
return &bytes[2..];
}
if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") {
return &bytes[2..];
}

bytes
}
}

Expand Down
30 changes: 9 additions & 21 deletions tests/xmlrs_reader_tests.rs
@@ -1,7 +1,6 @@
use quick_xml::events::{BytesStart, Event};
use quick_xml::name::{QName, ResolveResult};
use quick_xml::{Reader, Result};
use std::borrow::Cow;
use quick_xml::{Decoder, Reader, Result};
use std::str::from_utf8;

#[test]
Expand Down Expand Up @@ -381,7 +380,7 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) {
loop {
buf.clear();
let event = reader.read_namespaced_event(&mut buf, &mut ns_buffer);
let line = xmlrs_display(event, &reader);
let line = xmlrs_display(event, reader.decoder());
if let Some((n, spec)) = spec_lines.next() {
if spec.trim() == "EndDocument" {
break;
Expand Down Expand Up @@ -420,8 +419,8 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) {
}
}

fn namespace_name(n: ResolveResult, name: QName, reader: &Reader<&[u8]>) -> String {
let name = decode(name.as_ref(), reader);
fn namespace_name(n: ResolveResult, name: QName, decoder: Decoder) -> String {
let name = decoder.decode(name.as_ref()).unwrap();
match n {
// Produces string '{namespace}prefixed_name'
ResolveResult::Bound(n) => format!("{{{}}}{}", from_utf8(n.as_ref()).unwrap(), name),
Expand All @@ -448,44 +447,33 @@ fn make_attrs(e: &BytesStart) -> ::std::result::Result<String, String> {
Ok(atts.join(", "))
}

// FIXME: The public API differs based on the "encoding" feature
fn decode<'a>(text: &'a [u8], reader: &Reader<&[u8]>) -> Cow<'a, str> {
#[cfg(feature = "encoding")]
let decoded = reader.decoder().decode(text);

#[cfg(not(feature = "encoding"))]
let decoded = Cow::Borrowed(reader.decoder().decode(text).unwrap());

decoded
}

fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8]>) -> String {
fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, decoder: Decoder) -> String {
match opt_event {
Ok((_, Event::StartText(_))) => "StartText".to_string(),
Ok((n, Event::Start(ref e))) => {
let name = namespace_name(n, e.name(), reader);
let name = namespace_name(n, e.name(), decoder);
match make_attrs(e) {
Ok(ref attrs) if attrs.is_empty() => format!("StartElement({})", &name),
Ok(ref attrs) => format!("StartElement({} [{}])", &name, &attrs),
Err(e) => format!("StartElement({}, attr-error: {})", &name, &e),
}
}
Ok((n, Event::Empty(ref e))) => {
let name = namespace_name(n, e.name(), reader);
let name = namespace_name(n, e.name(), decoder);
match make_attrs(e) {
Ok(ref attrs) if attrs.is_empty() => format!("EmptyElement({})", &name),
Ok(ref attrs) => format!("EmptyElement({} [{}])", &name, &attrs),
Err(e) => format!("EmptyElement({}, attr-error: {})", &name, &e),
}
}
Ok((n, Event::End(ref e))) => {
let name = namespace_name(n, e.name(), reader);
let name = namespace_name(n, e.name(), decoder);
format!("EndElement({})", name)
}
Ok((_, Event::Comment(ref e))) => format!("Comment({})", from_utf8(e).unwrap()),
Ok((_, Event::CData(ref e))) => format!("CData({})", from_utf8(e).unwrap()),
Ok((_, Event::Text(ref e))) => match e.unescaped() {
Ok(c) => format!("Characters({})", decode(c.as_ref(), reader).as_ref()),
Ok(c) => format!("Characters({})", decoder.decode(c.as_ref()).unwrap()),
Err(ref err) => format!("FailedUnescape({:?}; {})", e.escaped(), err),
},
Ok((_, Event::Decl(ref e))) => {
Expand Down

0 comments on commit 61cd193

Please sign in to comment.