Fix #311: Introduce BytesCData type for CData events and use it ins…

…tead of `BytesText` This commit revert changes from 85f9f68
Mingun · Mar 20, 2022 · 9433830 · 9433830
1 parent a1ad40d
commit 9433830
Show file tree

Hide file tree

Showing 6 changed files with 150 additions and 78 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -21,6 +21,8 @@
   `DeError::InvalidUnit` variant is removed, because after fix it is no longer used
 - test: add tests for trivial documents (empty / only comment / `<root>...</root>` -- one tag with content)
 - fix: CDATA was not handled in many cases where it should
+- fix: do not unescape CDATA content because it never escaped by design
+  ([#311](https://github.com/tafia/quick-xml/issues/311))
 
 ## 0.23.0-alpha3
 

diff --git a/src/de/mod.rs b/src/de/mod.rs
@@ -117,7 +117,7 @@ mod var;
 pub use crate::errors::serialize::DeError;
 use crate::{
     errors::Error,
-    events::{BytesEnd, BytesStart, BytesText, Event},
+    events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
     reader::Decoder,
     Reader,
 };
@@ -141,7 +141,7 @@ pub enum DeEvent<'a> {
     Text(BytesText<'a>),
     /// Unescaped character data between `Start` and `End` element,
     /// stored in `<![CDATA[...]]>`.
-    CData(BytesText<'a>),
+    CData(BytesCData<'a>),
     /// End of XML document.
     Eof,
 }
@@ -300,18 +300,20 @@ where
     /// |`<tag ...>text</tag>`|`text`     |Complete tag consumed       |
     /// |`<tag/>`             |empty slice|Virtual end tag not consumed|
     /// |`</tag>`             |empty slice|Not consumed                |
-    fn next_text(&mut self) -> Result<BytesText<'de>, DeError> {
+    fn next_text(&mut self) -> Result<BytesCData<'de>, DeError> {
         match self.next()? {
-            DeEvent::Text(e) | DeEvent::CData(e) => Ok(e),
+            DeEvent::Text(e) => e.unescape().map_err(|e| DeError::Xml(e.into())),
+            DeEvent::CData(e) => Ok(e),
             DeEvent::Eof => Err(DeError::Eof),
             DeEvent::Start(e) => {
                 // allow one nested level
                 let inner = self.next()?;
                 let t = match inner {
-                    DeEvent::Text(t) | DeEvent::CData(t) => t,
+                    DeEvent::Text(t) => t.unescape().map_err(|e| DeError::Xml(e.into()))?,
+                    DeEvent::CData(t) => t,
                     DeEvent::Start(_) => return Err(DeError::Start),
                     DeEvent::End(end) if end.name() == e.name() => {
-                        return Ok(BytesText::from_escaped(&[] as &[u8]));
+                        return Ok(BytesCData::new(&[] as &[u8]));
                     }
                     DeEvent::End(_) => return Err(DeError::End),
                     DeEvent::Eof => return Err(DeError::Eof),
@@ -321,7 +323,7 @@ where
             }
             DeEvent::End(e) => {
                 self.peek = Some(DeEvent::End(e));
-                Ok(BytesText::from_escaped(&[] as &[u8]))
+                Ok(BytesCData::new(&[] as &[u8]))
             }
         }
     }
@@ -426,7 +428,7 @@ where
         V: Visitor<'de>,
     {
         let text = self.next_text()?;
-        let string = text.decode_and_escape(self.reader.decoder())?;
+        let string = text.decode(self.reader.decoder())?;
         match string {
             Cow::Borrowed(string) => visitor.visit_borrowed_str(string),
             Cow::Owned(string) => visitor.visit_string(string),
@@ -438,8 +440,7 @@ where
         V: Visitor<'de>,
     {
         let text = self.next_text()?;
-        let value = text.escaped();
-        visitor.visit_bytes(value)
+        visitor.visit_bytes(&text)
     }
 
     fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, DeError>
@@ -560,7 +561,8 @@ where
         V: Visitor<'de>,
     {
         match self.peek()? {
-            DeEvent::Text(t) | DeEvent::CData(t) if t.is_empty() => visitor.visit_none(),
+            DeEvent::Text(t) if t.is_empty() => visitor.visit_none(),
+            DeEvent::CData(t) if t.is_empty() => visitor.visit_none(),
             DeEvent::Eof => visitor.visit_none(),
             _ => visitor.visit_some(self),
         }
@@ -772,7 +774,7 @@ mod tests {
         );
         assert_eq!(
             de.next().unwrap(),
-            CData(BytesText::from_plain_str("cdata content"))
+            CData(BytesCData::from_str("cdata content"))
         );
         assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"tag")));
 

diff --git a/src/events/mod.rs b/src/events/mod.rs
@@ -40,7 +40,7 @@ pub mod attributes;
 use encoding_rs::Encoding;
 use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Deref, str::from_utf8};
 
-use crate::escape::{do_unescape, escape};
+use crate::escape::{do_unescape, escape, partial_escape, EscapeError};
 use crate::utils::write_cow_string;
 use crate::{errors::Error, errors::Result, reader::Reader};
 use attributes::{Attribute, Attributes};
@@ -604,11 +604,13 @@ impl<'a> BytesText<'a> {
         }
     }
 
-    /// Extracts the inner `Cow` from the `BytesText` event container.
-    #[cfg(feature = "serialize")]
-    #[inline]
-    pub(crate) fn into_inner(self) -> Cow<'a, [u8]> {
-        self.content
+    /// Returns unescaped version of the text content, that can be written
+    /// as CDATA in XML
+    pub fn unescape(self) -> std::result::Result<BytesCData<'a>, EscapeError> {
+        Ok(BytesCData::new(match do_unescape(&self.content, None)? {
+            Cow::Borrowed(_) => self.content,
+            Cow::Owned(unescaped) => Cow::Owned(unescaped),
+        }))
     }
 
     /// gets escaped content
@@ -646,60 +648,6 @@ impl<'a> BytesText<'a> {
         do_unescape(self, custom_entities).map_err(Error::EscapeError)
     }
 
-    /// Gets content of this text buffer in the specified encoding
-    #[cfg(feature = "serialize")]
-    pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result<Cow<'a, str>> {
-        Ok(match &self.content {
-            Cow::Borrowed(bytes) => {
-                #[cfg(feature = "encoding")]
-                {
-                    decoder.decode(bytes)
-                }
-                #[cfg(not(feature = "encoding"))]
-                {
-                    decoder.decode(bytes)?.into()
-                }
-            }
-            Cow::Owned(bytes) => {
-                #[cfg(feature = "encoding")]
-                let decoded = decoder.decode(bytes).into_owned();
-
-                #[cfg(not(feature = "encoding"))]
-                let decoded = decoder.decode(bytes)?.to_string();
-
-                decoded.into()
-            }
-        })
-    }
-
-    #[cfg(feature = "serialize")]
-    pub(crate) fn decode_and_escape(
-        &self,
-        decoder: crate::reader::Decoder,
-    ) -> Result<Cow<'a, str>> {
-        match self.decode(decoder)? {
-            Cow::Borrowed(decoded) => {
-                let unescaped =
-                    do_unescape(decoded.as_bytes(), None).map_err(Error::EscapeError)?;
-                match unescaped {
-                    Cow::Borrowed(unescaped) => {
-                        from_utf8(unescaped).map(|s| s.into()).map_err(Error::Utf8)
-                    }
-                    Cow::Owned(unescaped) => String::from_utf8(unescaped)
-                        .map(|s| s.into())
-                        .map_err(|e| Error::Utf8(e.utf8_error())),
-                }
-            }
-            Cow::Owned(decoded) => {
-                let unescaped =
-                    do_unescape(decoded.as_bytes(), None).map_err(Error::EscapeError)?;
-                String::from_utf8(unescaped.into_owned())
-                    .map(|s| s.into())
-                    .map_err(|e| Error::Utf8(e.utf8_error()))
-            }
-        }
-    }
-
     /// helper method to unescape then decode self using the reader encoding
     /// but without BOM (Byte order mark)
     ///
@@ -860,6 +808,117 @@ impl<'a> std::fmt::Debug for BytesText<'a> {
     }
 }
 
+/// CDATA content contains unescaped data from the reader. If you want to write them as a text,
+/// [convert](Self::escape) it to [`BytesText`]
+#[derive(Clone, Eq, PartialEq)]
+pub struct BytesCData<'a> {
+    content: Cow<'a, [u8]>,
+}
+
+impl<'a> BytesCData<'a> {
+    /// Creates a new `BytesCData` from a byte sequence.
+    #[inline]
+    pub fn new<C: Into<Cow<'a, [u8]>>>(content: C) -> Self {
+        Self {
+            content: content.into(),
+        }
+    }
+
+    /// Creates a new `BytesCData` from a string
+    #[inline]
+    pub fn from_str(content: &'a str) -> Self {
+        Self::new(content.as_bytes())
+    }
+
+    /// Extracts the inner `Cow` from the `BytesCData` event container.
+    #[inline]
+    pub fn into_inner(self) -> Cow<'a, [u8]> {
+        self.content
+    }
+
+    /// Ensures that all data is owned to extend the object's lifetime if
+    /// necessary.
+    #[inline]
+    pub fn into_owned(self) -> BytesCData<'static> {
+        BytesCData {
+            content: self.content.into_owned().into(),
+        }
+    }
+
+    /// Converts this CDATA content to an escaped version, that can be written
+    /// as an usual text in XML.
+    ///
+    /// This function performs following replacements:
+    ///
+    /// | Character | Replacement
+    /// |-----------|------------
+    /// | `<`       | `&lt;`
+    /// | `>`       | `&gt;`
+    /// | `&`       | `&amp;`
+    /// | `'`       | `&apos;`
+    /// | `"`       | `&quot;`
+    pub fn escape(self) -> BytesText<'a> {
+        BytesText::from_escaped(match escape(&self.content) {
+            Cow::Borrowed(_) => self.content,
+            Cow::Owned(escaped) => Cow::Owned(escaped),
+        })
+    }
+
+    /// Converts this CDATA content to an escaped version, that can be written
+    /// as an usual text in XML.
+    ///
+    /// In XML text content, it is allowed (though not recommended) to leave
+    /// the quote special characters `"` and `'` unescaped.
+    ///
+    /// This function performs following replacements:
+    ///
+    /// | Character | Replacement
+    /// |-----------|------------
+    /// | `<`       | `&lt;`
+    /// | `>`       | `&gt;`
+    /// | `&`       | `&amp;`
+    pub fn partial_escape(self) -> BytesText<'a> {
+        BytesText::from_escaped(match partial_escape(&self.content) {
+            Cow::Borrowed(_) => self.content,
+            Cow::Owned(escaped) => Cow::Owned(escaped),
+        })
+    }
+
+    /// Gets content of this text buffer in the specified encoding
+    #[cfg(feature = "serialize")]
+    pub fn decode(&self, decoder: crate::reader::Decoder) -> Result<Cow<'a, str>> {
+        Ok(match &self.content {
+            Cow::Borrowed(bytes) => {
+                #[cfg(feature = "encoding")]
+                {
+                    decoder.decode(bytes)
+                }
+                #[cfg(not(feature = "encoding"))]
+                {
+                    decoder.decode(bytes)?.into()
+                }
+            }
+            Cow::Owned(bytes) => {
+                #[cfg(feature = "encoding")]
+                let decoded = decoder.decode(bytes).into_owned();
+
+                #[cfg(not(feature = "encoding"))]
+                let decoded = decoder.decode(bytes)?.to_string();
+
+                decoded.into()
+            }
+        })
+    }
+}
+
+impl<'a> std::fmt::Debug for BytesCData<'a> {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "BytesCData {{ content: ")?;
+        write_cow_string(f, &self.content)?;
+        write!(f, " }}")
+    }
+}
+
 /// Event emitted by [`Reader::read_event`].
 ///
 /// [`Reader::read_event`]: ../reader/struct.Reader.html#method.read_event
@@ -876,7 +935,7 @@ pub enum Event<'a> {
     /// Comment `<!-- ... -->`.
     Comment(BytesText<'a>),
     /// CData `<![CDATA[...]]>`.
-    CData(BytesText<'a>),
+    CData(BytesCData<'a>),
     /// XML declaration `<?xml ...?>`.
     Decl(BytesDecl<'a>),
     /// Processing instruction `<?...?>`.
@@ -934,6 +993,14 @@ impl<'a> Deref for BytesText<'a> {
     }
 }
 
+impl<'a> Deref for BytesCData<'a> {
+    type Target = [u8];
+
+    fn deref(&self) -> &[u8] {
+        &*self.content
+    }
+}
+
 impl<'a> Deref for Event<'a> {
     type Target = [u8];
     fn deref(&self) -> &[u8] {

diff --git a/src/reader.rs b/src/reader.rs
@@ -9,7 +9,8 @@ use std::{fs::File, path::Path, str::from_utf8};
 use encoding_rs::{Encoding, UTF_16BE, UTF_16LE};
 
 use crate::errors::{Error, Result};
-use crate::events::{attributes::Attribute, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
+use crate::events::attributes::Attribute;
+use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
 
 use memchr;
 
@@ -368,7 +369,7 @@ impl<R: BufRead> Reader<R> {
             Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2])))
         } else if uncased_starts_with(buf, b"![CDATA[") {
             debug_assert!(len >= 10, "Minimum length guaranteed by read_bang_elem");
-            Ok(Event::CData(BytesText::from_plain(&buf[8..buf.len() - 2])))
+            Ok(Event::CData(BytesCData::new(&buf[8..buf.len() - 2])))
         } else if uncased_starts_with(buf, b"!DOCTYPE") {
             debug_assert!(len >= 8, "Minimum length guaranteed by read_bang_elem");
             let start = buf[8..]

diff --git a/src/writer.rs b/src/writer.rs
@@ -1,7 +1,7 @@
 //! A module to handle `Writer`
 
 use crate::errors::{Error, Result};
-use crate::events::{attributes::Attribute, BytesStart, BytesText, Event};
+use crate::events::{attributes::Attribute, BytesCData, BytesStart, BytesText, Event};
 use std::io::Write;
 
 /// XML writer.
@@ -261,7 +261,7 @@ impl<'a, W: Write> ElementWriter<'a, W> {
     }
 
     /// Write a CData event `<![CDATA[...]]>` inside the current element.
-    pub fn write_cdata_content(self, text: BytesText) -> Result<&'a mut Writer<W>> {
+    pub fn write_cdata_content(self, text: BytesCData) -> Result<&'a mut Writer<W>> {
         self.writer
             .write_event(Event::Start(self.start_tag.to_borrowed()))?;
         self.writer.write_event(Event::CData(text))?;

diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs
@@ -180,7 +180,7 @@ fn test_cdata() {
 fn test_cdata_open_close() {
     let mut r = Reader::from_str("<![CDATA[test <> test]]>");
     r.trim_text(true);
-    next_eq!(r, CData, b"test &lt;&gt; test");
+    next_eq!(r, CData, b"test <> test");
 }
 
 #[test]