diff --git a/Changelog.md b/Changelog.md
index d406911d..13a813e8 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -33,6 +33,9 @@
([#344](https://github.com/tafia/quick-xml/issues/344))
- test: add tests for trivial documents (empty / only comment / `...` -- one tag with content)
- fix: CDATA was not handled in many cases where it should
+- fix: do not unescape CDATA content because it never escaped by design.
+ CDATA event data now represented by its own `BytesCData` type
+ ([quick-xml#311](https://github.com/tafia/quick-xml/issues/311))
## 0.23.0-alpha3
diff --git a/benches/bench.rs b/benches/bench.rs
index 4dc76bbe..81d040e3 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -223,7 +223,7 @@ fn bench_fast_xml_one_cdata_event_trimmed(b: &mut Bencher) {
.check_comments(false)
.trim_text(true);
match r.read_event(&mut buf) {
- Ok(Event::CData(ref e)) => nbtxt += e.unescaped().unwrap().len(),
+ Ok(Event::CData(ref e)) => nbtxt += e.len(),
something_else => panic!("Did not expect {:?}", something_else),
};
diff --git a/src/de/mod.rs b/src/de/mod.rs
index 8426dca7..00935738 100644
--- a/src/de/mod.rs
+++ b/src/de/mod.rs
@@ -113,7 +113,7 @@ mod var;
pub use crate::errors::serialize::DeError;
use crate::{
errors::Error,
- events::{BytesEnd, BytesStart, BytesText, Event},
+ events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
reader::Decoder,
Reader,
};
@@ -137,7 +137,7 @@ pub enum DeEvent<'a> {
Text(BytesText<'a>),
/// Unescaped character data between `Start` and `End` element,
/// stored in ``.
- CData(BytesText<'a>),
+ CData(BytesCData<'a>),
/// End of XML document.
Eof,
}
@@ -316,19 +316,23 @@ where
/// |[`DeEvent::Text`] |`text content` |Unescapes `text content` and returns it, consumes events up to ``
/// |[`DeEvent::CData`]|``|Returns `cdata content` unchanged, consumes events up to ``
/// |[`DeEvent::Eof`] | |Emits [`DeError::Eof`]
- fn next_text(&mut self) -> Result, DeError> {
+ fn next_text(&mut self, unescape: bool) -> Result, DeError> {
match self.next()? {
- DeEvent::Text(e) | DeEvent::CData(e) => Ok(e),
+ DeEvent::Text(e) if unescape => e.unescape().map_err(|e| DeError::Xml(e.into())),
+ DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())),
+ DeEvent::CData(e) => Ok(e),
DeEvent::Start(e) => {
// allow one nested level
let inner = self.next()?;
let t = match inner {
- DeEvent::Text(t) | DeEvent::CData(t) => t,
+ DeEvent::Text(t) if unescape => t.unescape()?,
+ DeEvent::Text(t) => BytesCData::new(t.into_inner()),
+ DeEvent::CData(t) => t,
DeEvent::Start(_) => return Err(DeError::Start),
// We can get End event in case of `` or `` input
// Return empty text in that case
DeEvent::End(end) if end.name() == e.name() => {
- return Ok(BytesText::from_escaped(&[] as &[u8]));
+ return Ok(BytesCData::new(&[] as &[u8]));
}
DeEvent::End(_) => return Err(DeError::End),
DeEvent::Eof => return Err(DeError::Eof),
@@ -397,9 +401,9 @@ macro_rules! deserialize_type {
where
V: Visitor<'de>,
{
- let txt = self.next_text()?;
// No need to unescape because valid integer representations cannot be escaped
- let string = txt.decode(self.reader.decoder())?;
+ let text = self.next_text(false)?;
+ let string = text.decode(self.reader.decoder())?;
visitor.$visit(string.parse()?)
}
};
@@ -438,9 +442,10 @@ where
where
V: Visitor<'de>,
{
- let txt = self.next_text()?;
+ // No need to unescape because valid integer representations cannot be escaped
+ let text = self.next_text(false)?;
- deserialize_bool(txt.as_ref(), self.reader.decoder(), visitor)
+ deserialize_bool(text.as_ref(), self.reader.decoder(), visitor)
}
/// Representation of owned strings the same as [non-owned](#method.deserialize_str).
@@ -462,8 +467,8 @@ where
where
V: Visitor<'de>,
{
- let text = self.next_text()?;
- let string = text.decode_and_escape(self.reader.decoder())?;
+ let text = self.next_text(true)?;
+ let string = text.decode(self.reader.decoder())?;
match string {
Cow::Borrowed(string) => visitor.visit_borrowed_str(string),
Cow::Owned(string) => visitor.visit_string(string),
@@ -474,16 +479,17 @@ where
where
V: Visitor<'de>,
{
- let text = self.next_text()?;
- let value = text.escaped();
- visitor.visit_bytes(value)
+ // No need to unescape because bytes gives access to the raw XML input
+ let text = self.next_text(false)?;
+ visitor.visit_bytes(&text)
}
fn deserialize_byte_buf(self, visitor: V) -> Result
where
V: Visitor<'de>,
{
- let text = self.next_text()?;
+ // No need to unescape because bytes gives access to the raw XML input
+ let text = self.next_text(false)?;
let value = text.into_inner().into_owned();
visitor.visit_byte_buf(value)
}
@@ -597,7 +603,8 @@ where
V: Visitor<'de>,
{
match self.peek()? {
- DeEvent::Text(t) | DeEvent::CData(t) if t.is_empty() => visitor.visit_none(),
+ DeEvent::Text(t) if t.is_empty() => visitor.visit_none(),
+ DeEvent::CData(t) if t.is_empty() => visitor.visit_none(),
DeEvent::Eof => visitor.visit_none(),
_ => visitor.visit_some(self),
}
@@ -790,7 +797,7 @@ mod tests {
);
assert_eq!(
de.next().unwrap(),
- CData(BytesText::from_plain_str("cdata content"))
+ CData(BytesCData::from_str("cdata content"))
);
assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"tag")));
diff --git a/src/errors.rs b/src/errors.rs
index ec105e61..2860c49b 100644
--- a/src/errors.rs
+++ b/src/errors.rs
@@ -133,7 +133,7 @@ impl std::error::Error for Error {
pub mod serialize {
//! A module to handle serde (de)serialization errors
- use super::Error;
+ use super::*;
use std::fmt;
use std::num::{ParseFloatError, ParseIntError};
@@ -170,7 +170,7 @@ pub mod serialize {
}
impl fmt::Display for DeError {
- fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DeError::Custom(s) => write!(f, "{}", s),
DeError::Xml(e) => write!(f, "{}", e),
@@ -221,6 +221,13 @@ pub mod serialize {
}
}
+ impl From for DeError {
+ #[inline]
+ fn from(e: EscapeError) -> Self {
+ Self::Xml(e.into())
+ }
+ }
+
impl From for DeError {
fn from(e: ParseIntError) -> Self {
DeError::Int(e)
diff --git a/src/events/mod.rs b/src/events/mod.rs
index 7bdfaf47..96eaa292 100644
--- a/src/events/mod.rs
+++ b/src/events/mod.rs
@@ -40,11 +40,14 @@ pub mod attributes;
use encoding_rs::Encoding;
use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Deref, str::from_utf8};
-use crate::escape::{do_unescape, escape};
+use crate::escape::{do_unescape, escape, partial_escape};
use crate::utils::write_cow_string;
use crate::{errors::Error, errors::Result, reader::Reader};
use attributes::{Attribute, Attributes};
+#[cfg(feature = "serialize")]
+use crate::escape::EscapeError;
+
use memchr;
/// Opening tag data (`Event::Start`), with optional attributes.
@@ -591,12 +594,24 @@ impl<'a> BytesText<'a> {
}
/// Extracts the inner `Cow` from the `BytesText` event container.
- #[cfg(feature = "serialize")]
#[inline]
- pub(crate) fn into_inner(self) -> Cow<'a, [u8]> {
+ pub fn into_inner(self) -> Cow<'a, [u8]> {
self.content
}
+ /// Returns unescaped version of the text content, that can be written
+ /// as CDATA in XML
+ #[cfg(feature = "serialize")]
+ pub(crate) fn unescape(self) -> std::result::Result, EscapeError> {
+ //TODO: need to think about better API instead of dozens similar functions
+ // Maybe use builder pattern. After that expose function as public API
+ //FIXME: need to take into account entities defined in the document
+ Ok(BytesCData::new(match do_unescape(&self.content, None)? {
+ Cow::Borrowed(_) => self.content,
+ Cow::Owned(unescaped) => Cow::Owned(unescaped),
+ }))
+ }
+
/// gets escaped content
///
/// Searches for '&' into content and try to escape the coded character if possible
@@ -632,60 +647,6 @@ impl<'a> BytesText<'a> {
do_unescape(self, custom_entities).map_err(Error::EscapeError)
}
- /// Gets content of this text buffer in the specified encoding
- #[cfg(feature = "serialize")]
- pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result> {
- Ok(match &self.content {
- Cow::Borrowed(bytes) => {
- #[cfg(feature = "encoding")]
- {
- decoder.decode(bytes)
- }
- #[cfg(not(feature = "encoding"))]
- {
- decoder.decode(bytes)?.into()
- }
- }
- Cow::Owned(bytes) => {
- #[cfg(feature = "encoding")]
- let decoded = decoder.decode(bytes).into_owned();
-
- #[cfg(not(feature = "encoding"))]
- let decoded = decoder.decode(bytes)?.to_string();
-
- decoded.into()
- }
- })
- }
-
- #[cfg(feature = "serialize")]
- pub(crate) fn decode_and_escape(
- &self,
- decoder: crate::reader::Decoder,
- ) -> Result> {
- match self.decode(decoder)? {
- Cow::Borrowed(decoded) => {
- let unescaped =
- do_unescape(decoded.as_bytes(), None).map_err(Error::EscapeError)?;
- match unescaped {
- Cow::Borrowed(unescaped) => {
- from_utf8(unescaped).map(|s| s.into()).map_err(Error::Utf8)
- }
- Cow::Owned(unescaped) => String::from_utf8(unescaped)
- .map(|s| s.into())
- .map_err(|e| Error::Utf8(e.utf8_error())),
- }
- }
- Cow::Owned(decoded) => {
- let unescaped =
- do_unescape(decoded.as_bytes(), None).map_err(Error::EscapeError)?;
- String::from_utf8(unescaped.into_owned())
- .map(|s| s.into())
- .map_err(|e| Error::Utf8(e.utf8_error()))
- }
- }
- }
-
/// helper method to unescape then decode self using the reader encoding
/// but without BOM (Byte order mark)
///
@@ -846,6 +807,117 @@ impl<'a> std::fmt::Debug for BytesText<'a> {
}
}
+/// CDATA content contains unescaped data from the reader. If you want to write them as a text,
+/// [convert](Self::escape) it to [`BytesText`]
+#[derive(Clone, Eq, PartialEq)]
+pub struct BytesCData<'a> {
+ content: Cow<'a, [u8]>,
+}
+
+impl<'a> BytesCData<'a> {
+ /// Creates a new `BytesCData` from a byte sequence.
+ #[inline]
+ pub fn new>>(content: C) -> Self {
+ Self {
+ content: content.into(),
+ }
+ }
+
+ /// Creates a new `BytesCData` from a string
+ #[inline]
+ pub fn from_str(content: &'a str) -> Self {
+ Self::new(content.as_bytes())
+ }
+
+ /// Ensures that all data is owned to extend the object's lifetime if
+ /// necessary.
+ #[inline]
+ pub fn into_owned(self) -> BytesCData<'static> {
+ BytesCData {
+ content: self.content.into_owned().into(),
+ }
+ }
+
+ /// Extracts the inner `Cow` from the `BytesCData` event container.
+ #[inline]
+ pub fn into_inner(self) -> Cow<'a, [u8]> {
+ self.content
+ }
+
+ /// Converts this CDATA content to an escaped version, that can be written
+ /// as an usual text in XML.
+ ///
+ /// This function performs following replacements:
+ ///
+ /// | Character | Replacement
+ /// |-----------|------------
+ /// | `<` | `<`
+ /// | `>` | `>`
+ /// | `&` | `&`
+ /// | `'` | `'`
+ /// | `"` | `"`
+ pub fn escape(self) -> BytesText<'a> {
+ BytesText::from_escaped(match escape(&self.content) {
+ Cow::Borrowed(_) => self.content,
+ Cow::Owned(escaped) => Cow::Owned(escaped),
+ })
+ }
+
+ /// Converts this CDATA content to an escaped version, that can be written
+ /// as an usual text in XML.
+ ///
+ /// In XML text content, it is allowed (though not recommended) to leave
+ /// the quote special characters `"` and `'` unescaped.
+ ///
+ /// This function performs following replacements:
+ ///
+ /// | Character | Replacement
+ /// |-----------|------------
+ /// | `<` | `<`
+ /// | `>` | `>`
+ /// | `&` | `&`
+ pub fn partial_escape(self) -> BytesText<'a> {
+ BytesText::from_escaped(match partial_escape(&self.content) {
+ Cow::Borrowed(_) => self.content,
+ Cow::Owned(escaped) => Cow::Owned(escaped),
+ })
+ }
+
+ /// Gets content of this text buffer in the specified encoding
+ #[cfg(feature = "serialize")]
+ pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result> {
+ Ok(match &self.content {
+ Cow::Borrowed(bytes) => {
+ #[cfg(feature = "encoding")]
+ {
+ decoder.decode(bytes)
+ }
+ #[cfg(not(feature = "encoding"))]
+ {
+ decoder.decode(bytes)?.into()
+ }
+ }
+ Cow::Owned(bytes) => {
+ #[cfg(feature = "encoding")]
+ let decoded = decoder.decode(bytes).into_owned();
+
+ #[cfg(not(feature = "encoding"))]
+ let decoded = decoder.decode(bytes)?.to_string();
+
+ decoded.into()
+ }
+ })
+ }
+}
+
+impl<'a> std::fmt::Debug for BytesCData<'a> {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ write!(f, "BytesCData {{ content: ")?;
+ write_cow_string(f, &self.content)?;
+ write!(f, " }}")
+ }
+}
+
/// Event emitted by [`Reader::read_event`].
///
/// [`Reader::read_event`]: ../reader/struct.Reader.html#method.read_event
@@ -862,7 +934,7 @@ pub enum Event<'a> {
/// Comment ``.
Comment(BytesText<'a>),
/// CData ``.
- CData(BytesText<'a>),
+ CData(BytesCData<'a>),
/// XML declaration ``.
Decl(BytesDecl<'a>),
/// Processing instruction `...?>`.
@@ -920,6 +992,14 @@ impl<'a> Deref for BytesText<'a> {
}
}
+impl<'a> Deref for BytesCData<'a> {
+ type Target = [u8];
+
+ fn deref(&self) -> &[u8] {
+ &*self.content
+ }
+}
+
impl<'a> Deref for Event<'a> {
type Target = [u8];
fn deref(&self) -> &[u8] {
diff --git a/src/reader.rs b/src/reader.rs
index 7e7becfe..fab6a41a 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -9,7 +9,8 @@ use std::{fs::File, path::Path, str::from_utf8};
use encoding_rs::{Encoding, UTF_16BE, UTF_16LE};
use crate::errors::{Error, Result};
-use crate::events::{attributes::Attribute, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
+use crate::events::attributes::Attribute;
+use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
use memchr;
@@ -367,7 +368,7 @@ impl Reader {
Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2])))
}
BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
- Ok(Event::CData(BytesText::from_plain(&buf[8..])))
+ Ok(Event::CData(BytesCData::new(&buf[8..])))
}
BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
let start = buf[8..]
diff --git a/src/writer.rs b/src/writer.rs
index ce711594..768c244d 100644
--- a/src/writer.rs
+++ b/src/writer.rs
@@ -1,7 +1,7 @@
//! A module to handle `Writer`
use crate::errors::{Error, Result};
-use crate::events::{attributes::Attribute, BytesStart, BytesText, Event};
+use crate::events::{attributes::Attribute, BytesCData, BytesStart, BytesText, Event};
use std::io::Write;
/// XML writer.
@@ -262,7 +262,7 @@ impl<'a, W: Write> ElementWriter<'a, W> {
}
/// Write a CData event `` inside the current element.
- pub fn write_cdata_content(self, text: BytesText) -> Result<&'a mut Writer> {
+ pub fn write_cdata_content(self, text: BytesCData) -> Result<&'a mut Writer> {
self.writer
.write_event(Event::Start(self.start_tag.to_borrowed()))?;
self.writer.write_event(Event::CData(text))?;
diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs
index 5e58ad94..4ec13c8c 100644
--- a/tests/unit_tests.rs
+++ b/tests/unit_tests.rs
@@ -182,7 +182,7 @@ fn test_cdata() {
fn test_cdata_open_close() {
let mut r = Reader::from_str(" test]]>");
r.trim_text(true);
- next_eq!(r, CData, b"test <> test");
+ next_eq!(r, CData, b"test <> test");
}
#[test]