Skip to content

Commit

Permalink
Fix tafia#180: Define the same API for encoding whether feature `enco…
Browse files Browse the repository at this point in the history
…ding` is defined or not
  • Loading branch information
Mingun committed Jun 20, 2022
1 parent 6990f9b commit 3368f58
Show file tree
Hide file tree
Showing 9 changed files with 66 additions and 82 deletions.
11 changes: 11 additions & 0 deletions Changelog.md
Expand Up @@ -78,6 +78,17 @@
- text before the first tag is not an XML content at all, so it is meaningless
to try to unescape something in it

- [#180]: Eliminate differences in the decoding API when feature `encoding` enabled and when it is
disabled. Since then signatures of functions are the same regardless of the feature enabled and
no more replacements does for invalid characters when feature is enabled. An error will be returned
instead, as for the case when `encoding` is disabled.
Namely, if `encoding` feature is disabled, `Result<Cow<&str>>` is returned instead of `Result<&str>`.
The `Cow` variant is always `Cow::Borrowed` in that case.
If `encoding` feature is enabled, `Result<Cow<&str>>` is returned instead of `Cow<&str>`. Error is
returned in cases where decoding impossible (previously, the symbol is replaced with
`U+FFFD REPLACEMENT CHARACTER`)
- [#180]: Error variant `Error::Utf8` replaced by `Error::NonDecodable`

### New Tests

- [#9]: Added tests for incorrect nested tags in input
Expand Down
7 changes: 0 additions & 7 deletions src/de/escape.rs
Expand Up @@ -45,12 +45,8 @@ macro_rules! deserialize_num {
where
V: Visitor<'de>,
{
#[cfg(not(feature = "encoding"))]
let value = self.decoder.decode(self.escaped_value.as_ref())?.parse()?;

#[cfg(feature = "encoding")]
let value = self.decoder.decode(self.escaped_value.as_ref()).parse()?;

visitor.$visit(value)
}
};
Expand All @@ -71,11 +67,8 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
V: Visitor<'de>,
{
let unescaped = self.unescaped()?;
#[cfg(not(feature = "encoding"))]
let value = self.decoder.decode(&unescaped)?;

#[cfg(feature = "encoding")]
let value = self.decoder.decode(&unescaped);
visitor.visit_str(&value)
}

Expand Down
2 changes: 1 addition & 1 deletion src/de/mod.rs
Expand Up @@ -337,7 +337,7 @@ where
{
#[cfg(feature = "encoding")]
{
let value = decoder.decode(value);
let value = decoder.decode(value)?;
// No need to unescape because valid boolean representations cannot be escaped
match value.as_ref() {
"true" | "1" | "True" | "TRUE" | "t" | "Yes" | "YES" | "yes" | "y" => {
Expand Down
6 changes: 1 addition & 5 deletions src/de/seq.rs
Expand Up @@ -14,11 +14,7 @@ pub fn not_in(
start: &BytesStart,
decoder: Decoder,
) -> Result<bool, DeError> {
#[cfg(not(feature = "encoding"))]
let tag = Cow::Borrowed(decoder.decode(start.name().into_inner())?);

#[cfg(feature = "encoding")]
let tag = decoder.decode(start.name().into_inner());
let tag = decoder.decode(start.name().into_inner())?;

Ok(fields.iter().all(|&field| field != tag.as_ref()))
}
Expand Down
14 changes: 8 additions & 6 deletions src/errors.rs
Expand Up @@ -11,8 +11,9 @@ use std::string::FromUtf8Error;
pub enum Error {
/// IO error
Io(::std::io::Error),
/// Utf8 error
Utf8(Utf8Error),
/// Input decoding error. If `encoding` feature is disabled, contains `None`,
/// otherwise contains the UTF-8 decoding error
NonDecodable(Option<Utf8Error>),
/// Unexpected End of File
UnexpectedEof(String),
/// End event mismatch
Expand Down Expand Up @@ -47,10 +48,10 @@ impl From<::std::io::Error> for Error {
}

impl From<Utf8Error> for Error {
/// Creates a new `Error::Utf8` from the given error
/// Creates a new `Error::NonDecodable` from the given error
#[inline]
fn from(error: Utf8Error) -> Error {
Error::Utf8(error)
Error::NonDecodable(Some(error))
}
}

Expand Down Expand Up @@ -86,7 +87,8 @@ impl std::fmt::Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Error::Io(e) => write!(f, "I/O error: {}", e),
Error::Utf8(e) => write!(f, "UTF8 error: {}", e),
Error::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"),
Error::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e),
Error::UnexpectedEof(e) => write!(f, "Unexpected EOF during reading {}", e),
Error::EndEventMismatch { expected, found } => {
write!(f, "Expecting </{}> found </{}>", expected, found)
Expand Down Expand Up @@ -118,7 +120,7 @@ impl std::error::Error for Error {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Error::Io(e) => Some(e),
Error::Utf8(e) => Some(e),
Error::NonDecodable(Some(e)) => Some(e),
Error::InvalidAttr(e) => Some(e),
Error::EscapeError(e) => Some(e),
_ => None,
Expand Down
4 changes: 0 additions & 4 deletions src/events/attributes.rs
Expand Up @@ -113,10 +113,6 @@ impl<'a> Attribute<'a> {
reader: &Reader<B>,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> XmlResult<String> {
#[cfg(feature = "encoding")]
let decoded = reader.decoder().decode(&*self.value);

#[cfg(not(feature = "encoding"))]
let decoded = reader.decoder().decode(&*self.value)?;

let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
Expand Down
27 changes: 1 addition & 26 deletions src/events/mod.rs
Expand Up @@ -85,10 +85,6 @@ impl<'a> BytesStartText<'a> {
/// appeared in the BOM or in the text before the first tag.
pub fn decode_with_bom_removal(&self, decoder: Decoder) -> Result<String> {
//TODO: Fix lifetime issue - it should be possible to borrow string
#[cfg(feature = "encoding")]
let decoded = decoder.decode_with_bom_removal(&*self);

#[cfg(not(feature = "encoding"))]
let decoded = decoder.decode_with_bom_removal(&*self)?;

Ok(decoded.to_string())
Expand Down Expand Up @@ -317,10 +313,6 @@ impl<'a> BytesStart<'a> {
reader: &Reader<B>,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> Result<String> {
#[cfg(feature = "encoding")]
let decoded = reader.decoder().decode(&*self);

#[cfg(not(feature = "encoding"))]
let decoded = reader.decoder().decode(&*self)?;

let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
Expand Down Expand Up @@ -880,10 +872,6 @@ impl<'a> BytesText<'a> {
reader: &Reader<B>,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> Result<String> {
#[cfg(feature = "encoding")]
let decoded = reader.decoder().decode(&*self);

#[cfg(not(feature = "encoding"))]
let decoded = reader.decoder().decode(&*self)?;

let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
Expand Down Expand Up @@ -1000,21 +988,8 @@ impl<'a> BytesCData<'a> {
#[cfg(feature = "serialize")]
pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result<Cow<'a, str>> {
Ok(match &self.content {
Cow::Borrowed(bytes) => {
#[cfg(feature = "encoding")]
{
decoder.decode(bytes)
}
#[cfg(not(feature = "encoding"))]
{
decoder.decode(bytes)?.into()
}
}
Cow::Borrowed(bytes) => decoder.decode(bytes)?,
Cow::Owned(bytes) => {
#[cfg(feature = "encoding")]
let decoded = decoder.decode(bytes).into_owned();

#[cfg(not(feature = "encoding"))]
let decoded = decoder.decode(bytes)?.to_string();

decoded.into()
Expand Down
47 changes: 35 additions & 12 deletions src/reader.rs
@@ -1,6 +1,5 @@
//! A module to handle `Reader`

#[cfg(feature = "encoding")]
use std::borrow::Cow;
use std::io::{self, BufRead, BufReader};
use std::{fs::File, path::Path, str::from_utf8};
Expand Down Expand Up @@ -1472,8 +1471,9 @@ impl Decoder {
/// Returns an error in case of malformed sequences in the `bytes`.
///
/// If you instead want to use XML declared encoding, use the `encoding` feature
pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> {
Ok(from_utf8(bytes)?)
#[inline]
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
Ok(Cow::Borrowed(from_utf8(bytes)?))
}

/// Decodes a slice regardless of XML declaration with BOM removal if
Expand All @@ -1482,7 +1482,7 @@ impl Decoder {
/// Returns an error in case of malformed sequences in the `bytes`.
///
/// If you instead want to use XML declared encoding, use the `encoding` feature
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<&'b str> {
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") {
&bytes[3..]
} else {
Expand All @@ -1504,12 +1504,18 @@ impl Decoder {
}

/// Decodes specified bytes using encoding, declared in the XML, if it was
/// declared there, or UTF-8 otherwise
/// declared there, or UTF-8 otherwise, and ignoring BOM if it is present
/// in the `bytes`.
///
/// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the
/// `U+FFFD REPLACEMENT CHARACTER`.
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> {
self.encoding.decode(bytes).0
/// Returns an error in case of malformed sequences in the `bytes`.
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
match self
.encoding
.decode_without_bom_handling_and_without_replacement(bytes)
{
None => Err(Error::NonDecodable(None)),
Some(s) => Ok(s),
}
}

/// Decodes a slice with BOM removal if it is present in the `bytes` using
Expand All @@ -1520,9 +1526,26 @@ impl Decoder {
///
/// If XML declaration is absent in the XML, UTF-8 is used.
///
/// Any malformed sequences replaced with the `U+FFFD REPLACEMENT CHARACTER`.
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> {
self.encoding.decode_with_bom_removal(bytes).0
/// Returns an error in case of malformed sequences in the `bytes`.
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
self.decode(self.remove_bom(bytes))
}
/// Copied from [`Encoding::decode_with_bom_removal`]
#[inline]
fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] {
use encoding_rs::*;

if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
return &bytes[3..];
}
if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") {
return &bytes[2..];
}
if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") {
return &bytes[2..];
}

bytes
}
}

Expand Down
30 changes: 9 additions & 21 deletions tests/xmlrs_reader_tests.rs
@@ -1,7 +1,6 @@
use quick_xml::events::{BytesStart, Event};
use quick_xml::name::{QName, ResolveResult};
use quick_xml::{Reader, Result};
use std::borrow::Cow;
use quick_xml::{Decoder, Reader, Result};
use std::str::from_utf8;

#[test]
Expand Down Expand Up @@ -381,7 +380,7 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) {
loop {
buf.clear();
let event = reader.read_namespaced_event(&mut buf, &mut ns_buffer);
let line = xmlrs_display(event, &reader);
let line = xmlrs_display(event, reader.decoder());
if let Some((n, spec)) = spec_lines.next() {
if spec.trim() == "EndDocument" {
break;
Expand Down Expand Up @@ -420,8 +419,8 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) {
}
}

fn namespace_name(n: ResolveResult, name: QName, reader: &Reader<&[u8]>) -> String {
let name = decode(name.as_ref(), reader);
fn namespace_name(n: ResolveResult, name: QName, decoder: Decoder) -> String {
let name = decoder.decode(name.as_ref()).unwrap();
match n {
// Produces string '{namespace}prefixed_name'
ResolveResult::Bound(n) => format!("{{{}}}{}", from_utf8(n.as_ref()).unwrap(), name),
Expand All @@ -448,44 +447,33 @@ fn make_attrs(e: &BytesStart) -> ::std::result::Result<String, String> {
Ok(atts.join(", "))
}

// FIXME: The public API differs based on the "encoding" feature
fn decode<'a>(text: &'a [u8], reader: &Reader<&[u8]>) -> Cow<'a, str> {
#[cfg(feature = "encoding")]
let decoded = reader.decoder().decode(text);

#[cfg(not(feature = "encoding"))]
let decoded = Cow::Borrowed(reader.decoder().decode(text).unwrap());

decoded
}

fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8]>) -> String {
fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, decoder: Decoder) -> String {
match opt_event {
Ok((_, Event::StartText(_))) => "StartText".to_string(),
Ok((n, Event::Start(ref e))) => {
let name = namespace_name(n, e.name(), reader);
let name = namespace_name(n, e.name(), decoder);
match make_attrs(e) {
Ok(ref attrs) if attrs.is_empty() => format!("StartElement({})", &name),
Ok(ref attrs) => format!("StartElement({} [{}])", &name, &attrs),
Err(e) => format!("StartElement({}, attr-error: {})", &name, &e),
}
}
Ok((n, Event::Empty(ref e))) => {
let name = namespace_name(n, e.name(), reader);
let name = namespace_name(n, e.name(), decoder);
match make_attrs(e) {
Ok(ref attrs) if attrs.is_empty() => format!("EmptyElement({})", &name),
Ok(ref attrs) => format!("EmptyElement({} [{}])", &name, &attrs),
Err(e) => format!("EmptyElement({}, attr-error: {})", &name, &e),
}
}
Ok((n, Event::End(ref e))) => {
let name = namespace_name(n, e.name(), reader);
let name = namespace_name(n, e.name(), decoder);
format!("EndElement({})", name)
}
Ok((_, Event::Comment(ref e))) => format!("Comment({})", from_utf8(e).unwrap()),
Ok((_, Event::CData(ref e))) => format!("CData({})", from_utf8(e).unwrap()),
Ok((_, Event::Text(ref e))) => match e.unescaped() {
Ok(c) => format!("Characters({})", decode(c.as_ref(), reader).as_ref()),
Ok(c) => format!("Characters({})", decoder.decode(c.as_ref()).unwrap()),
Err(ref err) => format!("FailedUnescape({:?}; {})", e.escaped(), err),
},
Ok((_, Event::Decl(ref e))) => {
Expand Down

0 comments on commit 3368f58

Please sign in to comment.