Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix CDATA handling #374

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 9 additions & 0 deletions Changelog.md
Expand Up @@ -19,6 +19,15 @@
from the attribute and element names and attribute values
- fix: allow to deserialize `unit`s from text and CDATA content.
`DeError::InvalidUnit` variant is removed, because after fix it is no longer used
- test: add tests for trivial documents (empty / only comment / `<root>...</root>` -- one tag with content)
- fix: CDATA was not handled in many cases where it should
- fix: do not unescape CDATA content because it never escaped by design
([#311](https://github.com/tafia/quick-xml/issues/311)).

NOTE: now text content when deserialized into bytes (`Vec<u8>` / `&[u8]`), also unescaped.
It is impossible to get a raw XML data in bytes buffer. Actually, deserializing of bytes
should be prohibited, because XML cannot store raw byte data. You should store binary
data in a string hex- or base64- or any-other-schema-encoded.

## 0.23.0-alpha3

Expand Down
2 changes: 1 addition & 1 deletion benches/bench.rs
Expand Up @@ -225,7 +225,7 @@ fn bench_quick_xml_one_cdata_event_trimmed(b: &mut Bencher) {
.check_comments(false)
.trim_text(true);
match r.read_event(&mut buf) {
Ok(Event::CData(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::CData(ref e)) => nbtxt += e.len(),
something_else => panic!("Did not expect {:?}", something_else),
};

Expand Down
17 changes: 15 additions & 2 deletions src/de/byte_buf.rs
@@ -1,11 +1,12 @@
//! Helper types for tests

use crate::utils::write_byte_string;
use serde::de::{self, Deserialize, Deserializer, Error};
use std::fmt;

/// Wrapper around `Vec<u8>` that deserialized using `deserialize_byte_buf`
/// instead of vector's generic `deserialize_seq`
#[derive(Debug, PartialEq)]
#[derive(PartialEq)]
pub struct ByteBuf(pub Vec<u8>);

impl<'de> Deserialize<'de> for ByteBuf {
Expand Down Expand Up @@ -35,9 +36,15 @@ impl<'de> Deserialize<'de> for ByteBuf {
}
}

impl fmt::Debug for ByteBuf {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write_byte_string(f, &self.0)
}
}

/// Wrapper around `&[u8]` that deserialized using `deserialize_bytes`
/// instead of vector's generic `deserialize_seq`
#[derive(Debug, PartialEq)]
#[derive(PartialEq)]
pub struct Bytes<'de>(pub &'de [u8]);

impl<'de> Deserialize<'de> for Bytes<'de> {
Expand All @@ -62,3 +69,9 @@ impl<'de> Deserialize<'de> for Bytes<'de> {
Ok(d.deserialize_bytes(Visitor)?)
}
}

impl<'de> fmt::Debug for Bytes<'de> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write_byte_string(f, self.0)
}
}
2 changes: 1 addition & 1 deletion src/de/map.rs
Expand Up @@ -100,7 +100,7 @@ impl<'de, 'a, R: BorrowingReader<'de>> de::MapAccess<'de> for MapAccess<'de, 'a,
} else {
// try getting from events (<key>value</key>)
match self.de.peek()? {
DeEvent::Text(_) => {
DeEvent::Text(_) | DeEvent::CData(_) => {
self.state = State::InnerValue;
// Deserialize `key` from special attribute name which means
// that value should be taken from the text content of the
Expand Down
198 changes: 184 additions & 14 deletions src/de/mod.rs
Expand Up @@ -117,7 +117,7 @@ mod var;
pub use crate::errors::serialize::DeError;
use crate::{
errors::Error,
events::{BytesEnd, BytesStart, BytesText, Event},
events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
reader::Decoder,
Reader,
};
Expand All @@ -141,7 +141,7 @@ pub enum DeEvent<'a> {
Text(BytesText<'a>),
/// Unescaped character data between `Start` and `End` element,
/// stored in `<![CDATA[...]]>`.
CData(BytesText<'a>),
CData(BytesCData<'a>),
/// End of XML document.
Eof,
}
Expand Down Expand Up @@ -300,18 +300,20 @@ where
/// |`<tag ...>text</tag>`|`text` |Complete tag consumed |
/// |`<tag/>` |empty slice|Virtual end tag not consumed|
/// |`</tag>` |empty slice|Not consumed |
fn next_text(&mut self) -> Result<BytesText<'de>, DeError> {
fn next_text(&mut self) -> Result<BytesCData<'de>, DeError> {
match self.next()? {
DeEvent::Text(e) | DeEvent::CData(e) => Ok(e),
DeEvent::Text(e) => e.unescape().map_err(|e| DeError::Xml(e.into())),
DeEvent::CData(e) => Ok(e),
DeEvent::Eof => Err(DeError::Eof),
DeEvent::Start(e) => {
// allow one nested level
let inner = self.next()?;
let t = match inner {
DeEvent::Text(t) | DeEvent::CData(t) => t,
DeEvent::Text(t) => t.unescape().map_err(|e| DeError::Xml(e.into()))?,
DeEvent::CData(t) => t,
DeEvent::Start(_) => return Err(DeError::Start),
DeEvent::End(end) if end.name() == e.name() => {
return Ok(BytesText::from_escaped(&[] as &[u8]));
return Ok(BytesCData::new(&[] as &[u8]));
}
DeEvent::End(_) => return Err(DeError::End),
DeEvent::Eof => return Err(DeError::Eof),
Expand All @@ -321,7 +323,7 @@ where
}
DeEvent::End(e) => {
self.peek = Some(DeEvent::End(e));
Ok(BytesText::from_escaped(&[] as &[u8]))
Ok(BytesCData::new(&[] as &[u8]))
}
}
}
Expand Down Expand Up @@ -406,13 +408,12 @@ where
deserialize_bool(txt.as_ref(), self.reader.decoder(), visitor)
}

/// Representation of owned strings the same as [non-owned](#method.deserialize_str).
fn deserialize_string<V>(self, visitor: V) -> Result<V::Value, DeError>
where
V: Visitor<'de>,
{
let text = self.next_text()?;
let string = text.decode_and_escape(self.reader.decoder())?;
visitor.visit_string(string.into_owned())
self.deserialize_str(visitor)
}

fn deserialize_char<V>(self, visitor: V) -> Result<V::Value, DeError>
Expand All @@ -427,7 +428,7 @@ where
V: Visitor<'de>,
{
let text = self.next_text()?;
let string = text.decode_and_escape(self.reader.decoder())?;
let string = text.decode(self.reader.decoder())?;
match string {
Cow::Borrowed(string) => visitor.visit_borrowed_str(string),
Cow::Owned(string) => visitor.visit_string(string),
Expand All @@ -439,8 +440,7 @@ where
V: Visitor<'de>,
{
let text = self.next_text()?;
let value = text.escaped();
visitor.visit_bytes(value)
visitor.visit_bytes(&text)
}

fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, DeError>
Expand Down Expand Up @@ -562,6 +562,7 @@ where
{
match self.peek()? {
DeEvent::Text(t) if t.is_empty() => visitor.visit_none(),
DeEvent::CData(t) if t.is_empty() => visitor.visit_none(),
DeEvent::Eof => visitor.visit_none(),
_ => visitor.visit_some(self),
}
Expand Down Expand Up @@ -723,6 +724,8 @@ mod tests {
where
T: Deserialize<'de>,
{
// Log XM that we try to deserialize to see it in the failed tests output
dbg!(s);
let mut de = Deserializer::from_str(s);
let result = T::deserialize(&mut de);

Expand Down Expand Up @@ -771,7 +774,7 @@ mod tests {
);
assert_eq!(
de.next().unwrap(),
CData(BytesText::from_plain_str("cdata content"))
CData(BytesCData::from_str("cdata content"))
);
assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"tag")));

Expand Down Expand Up @@ -907,6 +910,173 @@ mod tests {
source: String,
}

/// Tests for trivial XML documents: empty or contains only primitive type
/// on a top level; all of them should be considered invalid
mod trivial {
use super::*;

#[rustfmt::skip] // excess spaces used for readability
macro_rules! eof {
($name:ident: $type:ty = $value:expr) => {
#[test]
fn $name() {
let item = from_str::<$type>($value).unwrap_err();

match item {
DeError::Eof => (),
_ => panic!("Expected `Eof`, found {:?}", item),
}
}
};
($value:expr) => {
eof!(i8_: i8 = $value);
eof!(i16_: i16 = $value);
eof!(i32_: i32 = $value);
eof!(i64_: i64 = $value);
eof!(isize_: isize = $value);

eof!(u8_: u8 = $value);
eof!(u16_: u16 = $value);
eof!(u32_: u32 = $value);
eof!(u64_: u64 = $value);
eof!(usize_: usize = $value);

serde_if_integer128! {
eof!(u128_: u128 = $value);
eof!(i128_: i128 = $value);
}

eof!(f32_: f32 = $value);
eof!(f64_: f64 = $value);

eof!(false_: bool = $value);
eof!(true_: bool = $value);
eof!(char_: char = $value);

eof!(string: String = $value);
eof!(byte_buf: ByteBuf = $value);

#[test]
fn unit() {
let item = from_str::<()>($value).unwrap_err();

match item {
DeError::Eof => (),
_ => panic!("Expected `Eof`, found {:?}", item),
}
}
};
}

/// Empty document should considered invalid no matter which type we try to deserialize
mod empty_doc {
use super::*;
eof!("");
}

/// Document that contains only comment should be handles as if it is empty
mod only_comment {
use super::*;
eof!("<!--comment-->");
}

/// Tests deserialization from top-level tag content: `<root>...content...</root>`
mod struct_ {
use super::*;

/// Well-formed XML must have a single tag at the root level.
/// Any XML tag can be modeled as a struct, and content of this tag are modeled as
/// fields of this struct.
///
/// Because we want to get access to unnamed content of the tag (usually, this internal
/// XML node called `#text`) we use a rename to a special name `$value`
#[derive(Debug, Deserialize, PartialEq)]
struct Trivial<T> {
#[serde(rename = "$value")]
value: T,
}

macro_rules! in_struct {
($name:ident: $type:ty = $value:expr, $expected:expr) => {
#[test]
fn $name() {
let item: Trivial<$type> = from_str($value).unwrap();

assert_eq!(item, Trivial { value: $expected });
}
};
}

/// Tests deserialization from text content in a tag
#[rustfmt::skip] // tests formatted in a table
mod text {
use super::*;

in_struct!(i8_: i8 = "<root>-42</root>", -42i8);
in_struct!(i16_: i16 = "<root>-4200</root>", -4200i16);
in_struct!(i32_: i32 = "<root>-42000000</root>", -42000000i32);
in_struct!(i64_: i64 = "<root>-42000000000000</root>", -42000000000000i64);
in_struct!(isize_: isize = "<root>-42000000000000</root>", -42000000000000isize);

in_struct!(u8_: u8 = "<root>42</root>", 42u8);
in_struct!(u16_: u16 = "<root>4200</root>", 4200u16);
in_struct!(u32_: u32 = "<root>42000000</root>", 42000000u32);
in_struct!(u64_: u64 = "<root>42000000000000</root>", 42000000000000u64);
in_struct!(usize_: usize = "<root>42000000000000</root>", 42000000000000usize);

serde_if_integer128! {
in_struct!(u128_: u128 = "<root>420000000000000000000000000000</root>", 420000000000000000000000000000u128);
in_struct!(i128_: i128 = "<root>-420000000000000000000000000000</root>", -420000000000000000000000000000i128);
}

in_struct!(f32_: f32 = "<root>4.2</root>", 4.2f32);
in_struct!(f64_: f64 = "<root>4.2</root>", 4.2f64);

in_struct!(false_: bool = "<root>false</root>", false);
in_struct!(true_: bool = "<root>true</root>", true);
in_struct!(char_: char = "<root>r</root>", 'r');

in_struct!(string: String = "<root>escaped&#x20;string</root>", "escaped string".into());
in_struct!(byte_buf: ByteBuf = "<root>escaped&#x20;byte_buf</root>", ByteBuf(r"escaped byte_buf".into()));
}

/// Tests deserialization from CDATA content in a tag.
/// CDATA handling similar to text handling except that strings does not unescapes
#[rustfmt::skip] // tests formatted in a table
mod cdata {
use super::*;

in_struct!(i8_: i8 = "<root><![CDATA[-42]]></root>", -42i8);
in_struct!(i16_: i16 = "<root><![CDATA[-4200]]></root>", -4200i16);
in_struct!(i32_: i32 = "<root><![CDATA[-42000000]]></root>", -42000000i32);
in_struct!(i64_: i64 = "<root><![CDATA[-42000000000000]]></root>", -42000000000000i64);
in_struct!(isize_: isize = "<root><![CDATA[-42000000000000]]></root>", -42000000000000isize);

in_struct!(u8_: u8 = "<root><![CDATA[42]]></root>", 42u8);
in_struct!(u16_: u16 = "<root><![CDATA[4200]]></root>", 4200u16);
in_struct!(u32_: u32 = "<root><![CDATA[42000000]]></root>", 42000000u32);
in_struct!(u64_: u64 = "<root><![CDATA[42000000000000]]></root>", 42000000000000u64);
in_struct!(usize_: usize = "<root><![CDATA[42000000000000]]></root>", 42000000000000usize);

serde_if_integer128! {
in_struct!(u128_: u128 = "<root><![CDATA[420000000000000000000000000000]]></root>", 420000000000000000000000000000u128);
in_struct!(i128_: i128 = "<root><![CDATA[-420000000000000000000000000000]]></root>", -420000000000000000000000000000i128);
}

in_struct!(f32_: f32 = "<root><![CDATA[4.2]]></root>", 4.2f32);
in_struct!(f64_: f64 = "<root><![CDATA[4.2]]></root>", 4.2f64);

in_struct!(false_: bool = "<root><![CDATA[false]]></root>", false);
in_struct!(true_: bool = "<root><![CDATA[true]]></root>", true);
in_struct!(char_: char = "<root><![CDATA[r]]></root>", 'r');

// Escape sequences does not processed inside CDATA section
in_struct!(string: String = "<root><![CDATA[escaped&#x20;string]]></root>", "escaped&#x20;string".into());
in_struct!(byte_buf: ByteBuf = "<root><![CDATA[escaped&#x20;byte_buf]]></root>", ByteBuf(r"escaped&#x20;byte_buf".into()));
}
}
}

#[test]
fn multiple_roots_attributes() {
let s = r##"
Expand Down
4 changes: 3 additions & 1 deletion src/de/var.rs
Expand Up @@ -36,6 +36,8 @@ where
let decoder = self.de.reader.decoder();
let de = match self.de.peek()? {
DeEvent::Text(t) => EscapedDeserializer::new(Cow::Borrowed(t), decoder, true),
// Escape sequences does not processed inside CDATA section
DeEvent::CData(t) => EscapedDeserializer::new(Cow::Borrowed(t), decoder, false),
DeEvent::Start(e) => EscapedDeserializer::new(Cow::Borrowed(e.name()), decoder, false),
_ => {
return Err(DeError::Unsupported(
Expand Down Expand Up @@ -64,7 +66,7 @@ where
fn unit_variant(self) -> Result<(), DeError> {
match self.de.next()? {
DeEvent::Start(e) => self.de.read_to_end(e.name()),
DeEvent::Text(_) => Ok(()),
DeEvent::Text(_) | DeEvent::CData(_) => Ok(()),
_ => unreachable!(),
}
}
Expand Down