tafia · Mingun · Aug 24, 2021 · Mar 20, 2022 · Aug 7, 2021 · Mar 13, 2022
diff --git a/Changelog.md b/Changelog.md
@@ -19,6 +19,15 @@
   from the attribute and element names and attribute values
 - fix: allow to deserialize `unit`s from text and CDATA content.
   `DeError::InvalidUnit` variant is removed, because after fix it is no longer used
+- test: add tests for trivial documents (empty / only comment / `<root>...</root>` -- one tag with content)
+- fix: CDATA was not handled in many cases where it should
+- fix: do not unescape CDATA content because it never escaped by design
+  ([#311](https://github.com/tafia/quick-xml/issues/311)).
+
+  NOTE: now text content when deserialized into bytes (`Vec<u8>` / `&[u8]`), also unescaped.
+  It is impossible to get a raw XML data in bytes buffer. Actually, deserializing of bytes
+  should be prohibited, because XML cannot store raw byte data. You should store binary
+  data in a string hex- or base64- or any-other-schema-encoded.
 
 ## 0.23.0-alpha3
 

diff --git a/benches/bench.rs b/benches/bench.rs
@@ -225,7 +225,7 @@ fn bench_quick_xml_one_cdata_event_trimmed(b: &mut Bencher) {
             .check_comments(false)
             .trim_text(true);
         match r.read_event(&mut buf) {
-            Ok(Event::CData(ref e)) => nbtxt += e.unescaped().unwrap().len(),
+            Ok(Event::CData(ref e)) => nbtxt += e.len(),
             something_else => panic!("Did not expect {:?}", something_else),
         };
 

diff --git a/src/de/byte_buf.rs b/src/de/byte_buf.rs
@@ -1,11 +1,12 @@
 //! Helper types for tests
 
+use crate::utils::write_byte_string;
 use serde::de::{self, Deserialize, Deserializer, Error};
 use std::fmt;
 
 /// Wrapper around `Vec<u8>` that deserialized using `deserialize_byte_buf`
 /// instead of vector's generic `deserialize_seq`
-#[derive(Debug, PartialEq)]
+#[derive(PartialEq)]
 pub struct ByteBuf(pub Vec<u8>);
 
 impl<'de> Deserialize<'de> for ByteBuf {
@@ -35,9 +36,15 @@ impl<'de> Deserialize<'de> for ByteBuf {
     }
 }
 
+impl fmt::Debug for ByteBuf {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write_byte_string(f, &self.0)
+    }
+}
+
 /// Wrapper around `&[u8]` that deserialized using `deserialize_bytes`
 /// instead of vector's generic `deserialize_seq`
-#[derive(Debug, PartialEq)]
+#[derive(PartialEq)]
 pub struct Bytes<'de>(pub &'de [u8]);
 
 impl<'de> Deserialize<'de> for Bytes<'de> {
@@ -62,3 +69,9 @@ impl<'de> Deserialize<'de> for Bytes<'de> {
         Ok(d.deserialize_bytes(Visitor)?)
     }
 }
+
+impl<'de> fmt::Debug for Bytes<'de> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write_byte_string(f, self.0)
+    }
+}
diff --git a/src/de/map.rs b/src/de/map.rs
@@ -100,7 +100,7 @@ impl<'de, 'a, R: BorrowingReader<'de>> de::MapAccess<'de> for MapAccess<'de, 'a,
         } else {
             // try getting from events (<key>value</key>)
             match self.de.peek()? {
-                DeEvent::Text(_) => {
+                DeEvent::Text(_) | DeEvent::CData(_) => {
                     self.state = State::InnerValue;
                     // Deserialize `key` from special attribute name which means
                     // that value should be taken from the text content of the

diff --git a/src/de/mod.rs b/src/de/mod.rs
@@ -117,7 +117,7 @@ mod var;
 pub use crate::errors::serialize::DeError;
 use crate::{
     errors::Error,
-    events::{BytesEnd, BytesStart, BytesText, Event},
+    events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
     reader::Decoder,
     Reader,
 };
@@ -141,7 +141,7 @@ pub enum DeEvent<'a> {
     Text(BytesText<'a>),
     /// Unescaped character data between `Start` and `End` element,
     /// stored in `<![CDATA[...]]>`.
-    CData(BytesText<'a>),
+    CData(BytesCData<'a>),
     /// End of XML document.
     Eof,
 }
@@ -300,18 +300,20 @@ where
     /// |`<tag ...>text</tag>`|`text`     |Complete tag consumed       |
     /// |`<tag/>`             |empty slice|Virtual end tag not consumed|
     /// |`</tag>`             |empty slice|Not consumed                |
-    fn next_text(&mut self) -> Result<BytesText<'de>, DeError> {
+    fn next_text(&mut self) -> Result<BytesCData<'de>, DeError> {
         match self.next()? {
-            DeEvent::Text(e) | DeEvent::CData(e) => Ok(e),
+            DeEvent::Text(e) => e.unescape().map_err(|e| DeError::Xml(e.into())),
+            DeEvent::CData(e) => Ok(e),
             DeEvent::Eof => Err(DeError::Eof),
             DeEvent::Start(e) => {
                 // allow one nested level
                 let inner = self.next()?;
                 let t = match inner {
-                    DeEvent::Text(t) | DeEvent::CData(t) => t,
+                    DeEvent::Text(t) => t.unescape().map_err(|e| DeError::Xml(e.into()))?,
+                    DeEvent::CData(t) => t,
                     DeEvent::Start(_) => return Err(DeError::Start),
                     DeEvent::End(end) if end.name() == e.name() => {
-                        return Ok(BytesText::from_escaped(&[] as &[u8]));
+                        return Ok(BytesCData::new(&[] as &[u8]));
                     }
                     DeEvent::End(_) => return Err(DeError::End),
                     DeEvent::Eof => return Err(DeError::Eof),
@@ -321,7 +323,7 @@ where
             }
             DeEvent::End(e) => {
                 self.peek = Some(DeEvent::End(e));
-                Ok(BytesText::from_escaped(&[] as &[u8]))
+                Ok(BytesCData::new(&[] as &[u8]))
             }
         }
     }
@@ -406,13 +408,12 @@ where
         deserialize_bool(txt.as_ref(), self.reader.decoder(), visitor)
     }
 
+    /// Representation of owned strings the same as [non-owned](#method.deserialize_str).
     fn deserialize_string<V>(self, visitor: V) -> Result<V::Value, DeError>
     where
         V: Visitor<'de>,
     {
-        let text = self.next_text()?;
-        let string = text.decode_and_escape(self.reader.decoder())?;
-        visitor.visit_string(string.into_owned())
+        self.deserialize_str(visitor)
     }
 
     fn deserialize_char<V>(self, visitor: V) -> Result<V::Value, DeError>
@@ -427,7 +428,7 @@ where
         V: Visitor<'de>,
     {
         let text = self.next_text()?;
-        let string = text.decode_and_escape(self.reader.decoder())?;
+        let string = text.decode(self.reader.decoder())?;
         match string {
             Cow::Borrowed(string) => visitor.visit_borrowed_str(string),
             Cow::Owned(string) => visitor.visit_string(string),
@@ -439,8 +440,7 @@ where
         V: Visitor<'de>,
     {
         let text = self.next_text()?;
-        let value = text.escaped();
-        visitor.visit_bytes(value)
+        visitor.visit_bytes(&text)
     }
 
     fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, DeError>
@@ -562,6 +562,7 @@ where
     {
         match self.peek()? {
             DeEvent::Text(t) if t.is_empty() => visitor.visit_none(),
+            DeEvent::CData(t) if t.is_empty() => visitor.visit_none(),
             DeEvent::Eof => visitor.visit_none(),
             _ => visitor.visit_some(self),
         }
@@ -723,6 +724,8 @@ mod tests {
     where
         T: Deserialize<'de>,
     {
+        // Log XM that we try to deserialize to see it in the failed tests output
+        dbg!(s);
         let mut de = Deserializer::from_str(s);
         let result = T::deserialize(&mut de);
 
@@ -771,7 +774,7 @@ mod tests {
         );
         assert_eq!(
             de.next().unwrap(),
-            CData(BytesText::from_plain_str("cdata content"))
+            CData(BytesCData::from_str("cdata content"))
         );
         assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"tag")));
 
@@ -907,6 +910,173 @@ mod tests {
         source: String,
     }
 
+    /// Tests for trivial XML documents: empty or contains only primitive type
+    /// on a top level; all of them should be considered invalid
+    mod trivial {
+        use super::*;
+
+        #[rustfmt::skip] // excess spaces used for readability
+        macro_rules! eof {
+            ($name:ident: $type:ty = $value:expr) => {
+                #[test]
+                fn $name() {
+                    let item = from_str::<$type>($value).unwrap_err();
+
+                    match item {
+                        DeError::Eof => (),
+                        _ => panic!("Expected `Eof`, found {:?}", item),
+                    }
+                }
+            };
+            ($value:expr) => {
+                eof!(i8_:    i8    = $value);
+                eof!(i16_:   i16   = $value);
+                eof!(i32_:   i32   = $value);
+                eof!(i64_:   i64   = $value);
+                eof!(isize_: isize = $value);
+
+                eof!(u8_:    u8    = $value);
+                eof!(u16_:   u16   = $value);
+                eof!(u32_:   u32   = $value);
+                eof!(u64_:   u64   = $value);
+                eof!(usize_: usize = $value);
+
+                serde_if_integer128! {
+                    eof!(u128_: u128 = $value);
+                    eof!(i128_: i128 = $value);
+                }
+
+                eof!(f32_: f32 = $value);
+                eof!(f64_: f64 = $value);
+
+                eof!(false_: bool = $value);
+                eof!(true_: bool = $value);
+                eof!(char_: char = $value);
+
+                eof!(string: String = $value);
+                eof!(byte_buf: ByteBuf = $value);
+
+                #[test]
+                fn unit() {
+                    let item = from_str::<()>($value).unwrap_err();
+
+                    match item {
+                        DeError::Eof => (),
+                        _ => panic!("Expected `Eof`, found {:?}", item),
+                    }
+                }
+            };
+        }
+
+        /// Empty document should considered invalid no matter which type we try to deserialize
+        mod empty_doc {
+            use super::*;
+            eof!("");
+        }
+
+        /// Document that contains only comment should be handles as if it is empty
+        mod only_comment {
+            use super::*;
+            eof!("<!--comment-->");
+        }
+
+        /// Tests deserialization from top-level tag content: `<root>...content...</root>`
+        mod struct_ {
+            use super::*;
+
+            /// Well-formed XML must have a single tag at the root level.
+            /// Any XML tag can be modeled as a struct, and content of this tag are modeled as
+            /// fields of this struct.
+            ///
+            /// Because we want to get access to unnamed content of the tag (usually, this internal
+            /// XML node called `#text`) we use a rename to a special name `$value`
+            #[derive(Debug, Deserialize, PartialEq)]
+            struct Trivial<T> {
+                #[serde(rename = "$value")]
+                value: T,
+            }
+
+            macro_rules! in_struct {
+                ($name:ident: $type:ty = $value:expr, $expected:expr) => {
+                    #[test]
+                    fn $name() {
+                        let item: Trivial<$type> = from_str($value).unwrap();
+
+                        assert_eq!(item, Trivial { value: $expected });
+                    }
+                };
+            }
+
+            /// Tests deserialization from text content in a tag
+            #[rustfmt::skip] // tests formatted in a table
+            mod text {
+                use super::*;
+
+                in_struct!(i8_:  i8  = "<root>-42</root>", -42i8);
+                in_struct!(i16_: i16 = "<root>-4200</root>", -4200i16);
+                in_struct!(i32_: i32 = "<root>-42000000</root>", -42000000i32);
+                in_struct!(i64_: i64 = "<root>-42000000000000</root>", -42000000000000i64);
+                in_struct!(isize_: isize = "<root>-42000000000000</root>", -42000000000000isize);
+
+                in_struct!(u8_:  u8  = "<root>42</root>", 42u8);
+                in_struct!(u16_: u16 = "<root>4200</root>", 4200u16);
+                in_struct!(u32_: u32 = "<root>42000000</root>", 42000000u32);
+                in_struct!(u64_: u64 = "<root>42000000000000</root>", 42000000000000u64);
+                in_struct!(usize_: usize = "<root>42000000000000</root>", 42000000000000usize);
+
+                serde_if_integer128! {
+                    in_struct!(u128_: u128 = "<root>420000000000000000000000000000</root>", 420000000000000000000000000000u128);
+                    in_struct!(i128_: i128 = "<root>-420000000000000000000000000000</root>", -420000000000000000000000000000i128);
+                }
+
+                in_struct!(f32_: f32 = "<root>4.2</root>", 4.2f32);
+                in_struct!(f64_: f64 = "<root>4.2</root>", 4.2f64);
+
+                in_struct!(false_: bool = "<root>false</root>", false);
+                in_struct!(true_: bool = "<root>true</root>", true);
+                in_struct!(char_: char = "<root>r</root>", 'r');
+
+                in_struct!(string:   String  = "<root>escaped&#x20;string</root>", "escaped string".into());
+                in_struct!(byte_buf: ByteBuf = "<root>escaped&#x20;byte_buf</root>", ByteBuf(r"escaped byte_buf".into()));
+            }
+
+            /// Tests deserialization from CDATA content in a tag.
+            /// CDATA handling similar to text handling except that strings does not unescapes
+            #[rustfmt::skip] // tests formatted in a table
+            mod cdata {
+                use super::*;
+
+                in_struct!(i8_:  i8  = "<root><![CDATA[-42]]></root>", -42i8);
+                in_struct!(i16_: i16 = "<root><![CDATA[-4200]]></root>", -4200i16);
+                in_struct!(i32_: i32 = "<root><![CDATA[-42000000]]></root>", -42000000i32);
+                in_struct!(i64_: i64 = "<root><![CDATA[-42000000000000]]></root>", -42000000000000i64);
+                in_struct!(isize_: isize = "<root><![CDATA[-42000000000000]]></root>", -42000000000000isize);
+
+                in_struct!(u8_:  u8  = "<root><![CDATA[42]]></root>", 42u8);
+                in_struct!(u16_: u16 = "<root><![CDATA[4200]]></root>", 4200u16);
+                in_struct!(u32_: u32 = "<root><![CDATA[42000000]]></root>", 42000000u32);
+                in_struct!(u64_: u64 = "<root><![CDATA[42000000000000]]></root>", 42000000000000u64);
+                in_struct!(usize_: usize = "<root><![CDATA[42000000000000]]></root>", 42000000000000usize);
+
+                serde_if_integer128! {
+                    in_struct!(u128_: u128 = "<root><![CDATA[420000000000000000000000000000]]></root>", 420000000000000000000000000000u128);
+                    in_struct!(i128_: i128 = "<root><![CDATA[-420000000000000000000000000000]]></root>", -420000000000000000000000000000i128);
+                }
+
+                in_struct!(f32_: f32 = "<root><![CDATA[4.2]]></root>", 4.2f32);
+                in_struct!(f64_: f64 = "<root><![CDATA[4.2]]></root>", 4.2f64);
+
+                in_struct!(false_: bool = "<root><![CDATA[false]]></root>", false);
+                in_struct!(true_: bool = "<root><![CDATA[true]]></root>", true);
+                in_struct!(char_: char = "<root><![CDATA[r]]></root>", 'r');
+
+                // Escape sequences does not processed inside CDATA section
+                in_struct!(string:   String  = "<root><![CDATA[escaped&#x20;string]]></root>", "escaped&#x20;string".into());
+                in_struct!(byte_buf: ByteBuf = "<root><![CDATA[escaped&#x20;byte_buf]]></root>", ByteBuf(r"escaped&#x20;byte_buf".into()));
+            }
+        }
+    }
+
     #[test]
     fn multiple_roots_attributes() {
         let s = r##"

diff --git a/src/de/var.rs b/src/de/var.rs
@@ -36,6 +36,8 @@ where
         let decoder = self.de.reader.decoder();
         let de = match self.de.peek()? {
             DeEvent::Text(t) => EscapedDeserializer::new(Cow::Borrowed(t), decoder, true),
+            // Escape sequences does not processed inside CDATA section
+            DeEvent::CData(t) => EscapedDeserializer::new(Cow::Borrowed(t), decoder, false),
             DeEvent::Start(e) => EscapedDeserializer::new(Cow::Borrowed(e.name()), decoder, false),
             _ => {
                 return Err(DeError::Unsupported(
@@ -64,7 +66,7 @@ where
     fn unit_variant(self) -> Result<(), DeError> {
         match self.de.next()? {
             DeEvent::Start(e) => self.de.read_to_end(e.name()),
-            DeEvent::Text(_) => Ok(()),
+            DeEvent::Text(_) | DeEvent::CData(_) => Ok(()),
             _ => unreachable!(),
         }
     }