From 738ddd7a17f9be0ddd16a0d16605a65121e9c1f4 Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 21 Mar 2022 00:02:04 +0500 Subject: [PATCH] Introduce a `BytesCData` type for CData events and use it instead of `BytesText` This fixes tafia/quick-xml#311. This commit revert changes from 85f9f685e971b29e42fa829b45fa47c165383b98 --- Changelog.md | 3 + benches/bench.rs | 2 +- src/de/mod.rs | 43 ++++++---- src/errors.rs | 11 ++- src/events/mod.rs | 196 +++++++++++++++++++++++++++++++------------- src/reader.rs | 5 +- src/writer.rs | 4 +- tests/unit_tests.rs | 2 +- 8 files changed, 182 insertions(+), 84 deletions(-) diff --git a/Changelog.md b/Changelog.md index d406911d..13a813e8 100644 --- a/Changelog.md +++ b/Changelog.md @@ -33,6 +33,9 @@ ([#344](https://github.com/tafia/quick-xml/issues/344)) - test: add tests for trivial documents (empty / only comment / `...` -- one tag with content) - fix: CDATA was not handled in many cases where it should +- fix: do not unescape CDATA content because it never escaped by design. + CDATA event data now represented by its own `BytesCData` type + ([quick-xml#311](https://github.com/tafia/quick-xml/issues/311)) ## 0.23.0-alpha3 diff --git a/benches/bench.rs b/benches/bench.rs index 4dc76bbe..81d040e3 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -223,7 +223,7 @@ fn bench_fast_xml_one_cdata_event_trimmed(b: &mut Bencher) { .check_comments(false) .trim_text(true); match r.read_event(&mut buf) { - Ok(Event::CData(ref e)) => nbtxt += e.unescaped().unwrap().len(), + Ok(Event::CData(ref e)) => nbtxt += e.len(), something_else => panic!("Did not expect {:?}", something_else), }; diff --git a/src/de/mod.rs b/src/de/mod.rs index 8426dca7..00935738 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -113,7 +113,7 @@ mod var; pub use crate::errors::serialize::DeError; use crate::{ errors::Error, - events::{BytesEnd, BytesStart, BytesText, Event}, + events::{BytesCData, BytesEnd, BytesStart, BytesText, Event}, reader::Decoder, Reader, }; @@ -137,7 +137,7 @@ pub enum DeEvent<'a> { Text(BytesText<'a>), /// Unescaped character data between `Start` and `End` element, /// stored in ``. - CData(BytesText<'a>), + CData(BytesCData<'a>), /// End of XML document. Eof, } @@ -316,19 +316,23 @@ where /// |[`DeEvent::Text`] |`text content` |Unescapes `text content` and returns it, consumes events up to `` /// |[`DeEvent::CData`]|``|Returns `cdata content` unchanged, consumes events up to `` /// |[`DeEvent::Eof`] | |Emits [`DeError::Eof`] - fn next_text(&mut self) -> Result, DeError> { + fn next_text(&mut self, unescape: bool) -> Result, DeError> { match self.next()? { - DeEvent::Text(e) | DeEvent::CData(e) => Ok(e), + DeEvent::Text(e) if unescape => e.unescape().map_err(|e| DeError::Xml(e.into())), + DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())), + DeEvent::CData(e) => Ok(e), DeEvent::Start(e) => { // allow one nested level let inner = self.next()?; let t = match inner { - DeEvent::Text(t) | DeEvent::CData(t) => t, + DeEvent::Text(t) if unescape => t.unescape()?, + DeEvent::Text(t) => BytesCData::new(t.into_inner()), + DeEvent::CData(t) => t, DeEvent::Start(_) => return Err(DeError::Start), // We can get End event in case of `` or `` input // Return empty text in that case DeEvent::End(end) if end.name() == e.name() => { - return Ok(BytesText::from_escaped(&[] as &[u8])); + return Ok(BytesCData::new(&[] as &[u8])); } DeEvent::End(_) => return Err(DeError::End), DeEvent::Eof => return Err(DeError::Eof), @@ -397,9 +401,9 @@ macro_rules! deserialize_type { where V: Visitor<'de>, { - let txt = self.next_text()?; // No need to unescape because valid integer representations cannot be escaped - let string = txt.decode(self.reader.decoder())?; + let text = self.next_text(false)?; + let string = text.decode(self.reader.decoder())?; visitor.$visit(string.parse()?) } }; @@ -438,9 +442,10 @@ where where V: Visitor<'de>, { - let txt = self.next_text()?; + // No need to unescape because valid integer representations cannot be escaped + let text = self.next_text(false)?; - deserialize_bool(txt.as_ref(), self.reader.decoder(), visitor) + deserialize_bool(text.as_ref(), self.reader.decoder(), visitor) } /// Representation of owned strings the same as [non-owned](#method.deserialize_str). @@ -462,8 +467,8 @@ where where V: Visitor<'de>, { - let text = self.next_text()?; - let string = text.decode_and_escape(self.reader.decoder())?; + let text = self.next_text(true)?; + let string = text.decode(self.reader.decoder())?; match string { Cow::Borrowed(string) => visitor.visit_borrowed_str(string), Cow::Owned(string) => visitor.visit_string(string), @@ -474,16 +479,17 @@ where where V: Visitor<'de>, { - let text = self.next_text()?; - let value = text.escaped(); - visitor.visit_bytes(value) + // No need to unescape because bytes gives access to the raw XML input + let text = self.next_text(false)?; + visitor.visit_bytes(&text) } fn deserialize_byte_buf(self, visitor: V) -> Result where V: Visitor<'de>, { - let text = self.next_text()?; + // No need to unescape because bytes gives access to the raw XML input + let text = self.next_text(false)?; let value = text.into_inner().into_owned(); visitor.visit_byte_buf(value) } @@ -597,7 +603,8 @@ where V: Visitor<'de>, { match self.peek()? { - DeEvent::Text(t) | DeEvent::CData(t) if t.is_empty() => visitor.visit_none(), + DeEvent::Text(t) if t.is_empty() => visitor.visit_none(), + DeEvent::CData(t) if t.is_empty() => visitor.visit_none(), DeEvent::Eof => visitor.visit_none(), _ => visitor.visit_some(self), } @@ -790,7 +797,7 @@ mod tests { ); assert_eq!( de.next().unwrap(), - CData(BytesText::from_plain_str("cdata content")) + CData(BytesCData::from_str("cdata content")) ); assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"tag"))); diff --git a/src/errors.rs b/src/errors.rs index ec105e61..2860c49b 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -133,7 +133,7 @@ impl std::error::Error for Error { pub mod serialize { //! A module to handle serde (de)serialization errors - use super::Error; + use super::*; use std::fmt; use std::num::{ParseFloatError, ParseIntError}; @@ -170,7 +170,7 @@ pub mod serialize { } impl fmt::Display for DeError { - fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { DeError::Custom(s) => write!(f, "{}", s), DeError::Xml(e) => write!(f, "{}", e), @@ -221,6 +221,13 @@ pub mod serialize { } } + impl From for DeError { + #[inline] + fn from(e: EscapeError) -> Self { + Self::Xml(e.into()) + } + } + impl From for DeError { fn from(e: ParseIntError) -> Self { DeError::Int(e) diff --git a/src/events/mod.rs b/src/events/mod.rs index 7bdfaf47..96eaa292 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -40,11 +40,14 @@ pub mod attributes; use encoding_rs::Encoding; use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Deref, str::from_utf8}; -use crate::escape::{do_unescape, escape}; +use crate::escape::{do_unescape, escape, partial_escape}; use crate::utils::write_cow_string; use crate::{errors::Error, errors::Result, reader::Reader}; use attributes::{Attribute, Attributes}; +#[cfg(feature = "serialize")] +use crate::escape::EscapeError; + use memchr; /// Opening tag data (`Event::Start`), with optional attributes. @@ -591,12 +594,24 @@ impl<'a> BytesText<'a> { } /// Extracts the inner `Cow` from the `BytesText` event container. - #[cfg(feature = "serialize")] #[inline] - pub(crate) fn into_inner(self) -> Cow<'a, [u8]> { + pub fn into_inner(self) -> Cow<'a, [u8]> { self.content } + /// Returns unescaped version of the text content, that can be written + /// as CDATA in XML + #[cfg(feature = "serialize")] + pub(crate) fn unescape(self) -> std::result::Result, EscapeError> { + //TODO: need to think about better API instead of dozens similar functions + // Maybe use builder pattern. After that expose function as public API + //FIXME: need to take into account entities defined in the document + Ok(BytesCData::new(match do_unescape(&self.content, None)? { + Cow::Borrowed(_) => self.content, + Cow::Owned(unescaped) => Cow::Owned(unescaped), + })) + } + /// gets escaped content /// /// Searches for '&' into content and try to escape the coded character if possible @@ -632,60 +647,6 @@ impl<'a> BytesText<'a> { do_unescape(self, custom_entities).map_err(Error::EscapeError) } - /// Gets content of this text buffer in the specified encoding - #[cfg(feature = "serialize")] - pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result> { - Ok(match &self.content { - Cow::Borrowed(bytes) => { - #[cfg(feature = "encoding")] - { - decoder.decode(bytes) - } - #[cfg(not(feature = "encoding"))] - { - decoder.decode(bytes)?.into() - } - } - Cow::Owned(bytes) => { - #[cfg(feature = "encoding")] - let decoded = decoder.decode(bytes).into_owned(); - - #[cfg(not(feature = "encoding"))] - let decoded = decoder.decode(bytes)?.to_string(); - - decoded.into() - } - }) - } - - #[cfg(feature = "serialize")] - pub(crate) fn decode_and_escape( - &self, - decoder: crate::reader::Decoder, - ) -> Result> { - match self.decode(decoder)? { - Cow::Borrowed(decoded) => { - let unescaped = - do_unescape(decoded.as_bytes(), None).map_err(Error::EscapeError)?; - match unescaped { - Cow::Borrowed(unescaped) => { - from_utf8(unescaped).map(|s| s.into()).map_err(Error::Utf8) - } - Cow::Owned(unescaped) => String::from_utf8(unescaped) - .map(|s| s.into()) - .map_err(|e| Error::Utf8(e.utf8_error())), - } - } - Cow::Owned(decoded) => { - let unescaped = - do_unescape(decoded.as_bytes(), None).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()) - .map(|s| s.into()) - .map_err(|e| Error::Utf8(e.utf8_error())) - } - } - } - /// helper method to unescape then decode self using the reader encoding /// but without BOM (Byte order mark) /// @@ -846,6 +807,117 @@ impl<'a> std::fmt::Debug for BytesText<'a> { } } +/// CDATA content contains unescaped data from the reader. If you want to write them as a text, +/// [convert](Self::escape) it to [`BytesText`] +#[derive(Clone, Eq, PartialEq)] +pub struct BytesCData<'a> { + content: Cow<'a, [u8]>, +} + +impl<'a> BytesCData<'a> { + /// Creates a new `BytesCData` from a byte sequence. + #[inline] + pub fn new>>(content: C) -> Self { + Self { + content: content.into(), + } + } + + /// Creates a new `BytesCData` from a string + #[inline] + pub fn from_str(content: &'a str) -> Self { + Self::new(content.as_bytes()) + } + + /// Ensures that all data is owned to extend the object's lifetime if + /// necessary. + #[inline] + pub fn into_owned(self) -> BytesCData<'static> { + BytesCData { + content: self.content.into_owned().into(), + } + } + + /// Extracts the inner `Cow` from the `BytesCData` event container. + #[inline] + pub fn into_inner(self) -> Cow<'a, [u8]> { + self.content + } + + /// Converts this CDATA content to an escaped version, that can be written + /// as an usual text in XML. + /// + /// This function performs following replacements: + /// + /// | Character | Replacement + /// |-----------|------------ + /// | `<` | `<` + /// | `>` | `>` + /// | `&` | `&` + /// | `'` | `'` + /// | `"` | `"` + pub fn escape(self) -> BytesText<'a> { + BytesText::from_escaped(match escape(&self.content) { + Cow::Borrowed(_) => self.content, + Cow::Owned(escaped) => Cow::Owned(escaped), + }) + } + + /// Converts this CDATA content to an escaped version, that can be written + /// as an usual text in XML. + /// + /// In XML text content, it is allowed (though not recommended) to leave + /// the quote special characters `"` and `'` unescaped. + /// + /// This function performs following replacements: + /// + /// | Character | Replacement + /// |-----------|------------ + /// | `<` | `<` + /// | `>` | `>` + /// | `&` | `&` + pub fn partial_escape(self) -> BytesText<'a> { + BytesText::from_escaped(match partial_escape(&self.content) { + Cow::Borrowed(_) => self.content, + Cow::Owned(escaped) => Cow::Owned(escaped), + }) + } + + /// Gets content of this text buffer in the specified encoding + #[cfg(feature = "serialize")] + pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result> { + Ok(match &self.content { + Cow::Borrowed(bytes) => { + #[cfg(feature = "encoding")] + { + decoder.decode(bytes) + } + #[cfg(not(feature = "encoding"))] + { + decoder.decode(bytes)?.into() + } + } + Cow::Owned(bytes) => { + #[cfg(feature = "encoding")] + let decoded = decoder.decode(bytes).into_owned(); + + #[cfg(not(feature = "encoding"))] + let decoded = decoder.decode(bytes)?.to_string(); + + decoded.into() + } + }) + } +} + +impl<'a> std::fmt::Debug for BytesCData<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "BytesCData {{ content: ")?; + write_cow_string(f, &self.content)?; + write!(f, " }}") + } +} + /// Event emitted by [`Reader::read_event`]. /// /// [`Reader::read_event`]: ../reader/struct.Reader.html#method.read_event @@ -862,7 +934,7 @@ pub enum Event<'a> { /// Comment ``. Comment(BytesText<'a>), /// CData ``. - CData(BytesText<'a>), + CData(BytesCData<'a>), /// XML declaration ``. Decl(BytesDecl<'a>), /// Processing instruction ``. @@ -920,6 +992,14 @@ impl<'a> Deref for BytesText<'a> { } } +impl<'a> Deref for BytesCData<'a> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &*self.content + } +} + impl<'a> Deref for Event<'a> { type Target = [u8]; fn deref(&self) -> &[u8] { diff --git a/src/reader.rs b/src/reader.rs index 7e7becfe..fab6a41a 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -9,7 +9,8 @@ use std::{fs::File, path::Path, str::from_utf8}; use encoding_rs::{Encoding, UTF_16BE, UTF_16LE}; use crate::errors::{Error, Result}; -use crate::events::{attributes::Attribute, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; +use crate::events::attributes::Attribute; +use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; use memchr; @@ -367,7 +368,7 @@ impl Reader { Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2]))) } BangType::CData if uncased_starts_with(buf, b"![CDATA[") => { - Ok(Event::CData(BytesText::from_plain(&buf[8..]))) + Ok(Event::CData(BytesCData::new(&buf[8..]))) } BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { let start = buf[8..] diff --git a/src/writer.rs b/src/writer.rs index ce711594..768c244d 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -1,7 +1,7 @@ //! A module to handle `Writer` use crate::errors::{Error, Result}; -use crate::events::{attributes::Attribute, BytesStart, BytesText, Event}; +use crate::events::{attributes::Attribute, BytesCData, BytesStart, BytesText, Event}; use std::io::Write; /// XML writer. @@ -262,7 +262,7 @@ impl<'a, W: Write> ElementWriter<'a, W> { } /// Write a CData event `` inside the current element. - pub fn write_cdata_content(self, text: BytesText) -> Result<&'a mut Writer> { + pub fn write_cdata_content(self, text: BytesCData) -> Result<&'a mut Writer> { self.writer .write_event(Event::Start(self.start_tag.to_borrowed()))?; self.writer.write_event(Event::CData(text))?; diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index 5e58ad94..4ec13c8c 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -182,7 +182,7 @@ fn test_cdata() { fn test_cdata_open_close() { let mut r = Reader::from_str(" test]]>"); r.trim_text(true); - next_eq!(r, CData, b"test <> test"); + next_eq!(r, CData, b"test <> test"); } #[test]