Skip to content
This repository has been archived by the owner on May 30, 2022. It is now read-only.

Commit

Permalink
Fix #311: Introduce BytesCData type for CData events and use it ins…
Browse files Browse the repository at this point in the history
…tead of `BytesText`

This commit revert changes from 85f9f68
  • Loading branch information
Mingun committed Mar 20, 2022
1 parent a1ad40d commit 9433830
Show file tree
Hide file tree
Showing 6 changed files with 150 additions and 78 deletions.
2 changes: 2 additions & 0 deletions Changelog.md
Expand Up @@ -21,6 +21,8 @@
`DeError::InvalidUnit` variant is removed, because after fix it is no longer used
- test: add tests for trivial documents (empty / only comment / `<root>...</root>` -- one tag with content)
- fix: CDATA was not handled in many cases where it should
- fix: do not unescape CDATA content because it never escaped by design
([#311](https://github.com/tafia/quick-xml/issues/311))

## 0.23.0-alpha3

Expand Down
26 changes: 14 additions & 12 deletions src/de/mod.rs
Expand Up @@ -117,7 +117,7 @@ mod var;
pub use crate::errors::serialize::DeError;
use crate::{
errors::Error,
events::{BytesEnd, BytesStart, BytesText, Event},
events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
reader::Decoder,
Reader,
};
Expand All @@ -141,7 +141,7 @@ pub enum DeEvent<'a> {
Text(BytesText<'a>),
/// Unescaped character data between `Start` and `End` element,
/// stored in `<![CDATA[...]]>`.
CData(BytesText<'a>),
CData(BytesCData<'a>),
/// End of XML document.
Eof,
}
Expand Down Expand Up @@ -300,18 +300,20 @@ where
/// |`<tag ...>text</tag>`|`text` |Complete tag consumed |
/// |`<tag/>` |empty slice|Virtual end tag not consumed|
/// |`</tag>` |empty slice|Not consumed |
fn next_text(&mut self) -> Result<BytesText<'de>, DeError> {
fn next_text(&mut self) -> Result<BytesCData<'de>, DeError> {
match self.next()? {
DeEvent::Text(e) | DeEvent::CData(e) => Ok(e),
DeEvent::Text(e) => e.unescape().map_err(|e| DeError::Xml(e.into())),
DeEvent::CData(e) => Ok(e),
DeEvent::Eof => Err(DeError::Eof),
DeEvent::Start(e) => {
// allow one nested level
let inner = self.next()?;
let t = match inner {
DeEvent::Text(t) | DeEvent::CData(t) => t,
DeEvent::Text(t) => t.unescape().map_err(|e| DeError::Xml(e.into()))?,
DeEvent::CData(t) => t,
DeEvent::Start(_) => return Err(DeError::Start),
DeEvent::End(end) if end.name() == e.name() => {
return Ok(BytesText::from_escaped(&[] as &[u8]));
return Ok(BytesCData::new(&[] as &[u8]));
}
DeEvent::End(_) => return Err(DeError::End),
DeEvent::Eof => return Err(DeError::Eof),
Expand All @@ -321,7 +323,7 @@ where
}
DeEvent::End(e) => {
self.peek = Some(DeEvent::End(e));
Ok(BytesText::from_escaped(&[] as &[u8]))
Ok(BytesCData::new(&[] as &[u8]))
}
}
}
Expand Down Expand Up @@ -426,7 +428,7 @@ where
V: Visitor<'de>,
{
let text = self.next_text()?;
let string = text.decode_and_escape(self.reader.decoder())?;
let string = text.decode(self.reader.decoder())?;
match string {
Cow::Borrowed(string) => visitor.visit_borrowed_str(string),
Cow::Owned(string) => visitor.visit_string(string),
Expand All @@ -438,8 +440,7 @@ where
V: Visitor<'de>,
{
let text = self.next_text()?;
let value = text.escaped();
visitor.visit_bytes(value)
visitor.visit_bytes(&text)
}

fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, DeError>
Expand Down Expand Up @@ -560,7 +561,8 @@ where
V: Visitor<'de>,
{
match self.peek()? {
DeEvent::Text(t) | DeEvent::CData(t) if t.is_empty() => visitor.visit_none(),
DeEvent::Text(t) if t.is_empty() => visitor.visit_none(),
DeEvent::CData(t) if t.is_empty() => visitor.visit_none(),
DeEvent::Eof => visitor.visit_none(),
_ => visitor.visit_some(self),
}
Expand Down Expand Up @@ -772,7 +774,7 @@ mod tests {
);
assert_eq!(
de.next().unwrap(),
CData(BytesText::from_plain_str("cdata content"))
CData(BytesCData::from_str("cdata content"))
);
assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"tag")));

Expand Down
189 changes: 128 additions & 61 deletions src/events/mod.rs
Expand Up @@ -40,7 +40,7 @@ pub mod attributes;
use encoding_rs::Encoding;
use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Deref, str::from_utf8};

use crate::escape::{do_unescape, escape};
use crate::escape::{do_unescape, escape, partial_escape, EscapeError};
use crate::utils::write_cow_string;
use crate::{errors::Error, errors::Result, reader::Reader};
use attributes::{Attribute, Attributes};
Expand Down Expand Up @@ -604,11 +604,13 @@ impl<'a> BytesText<'a> {
}
}

/// Extracts the inner `Cow` from the `BytesText` event container.
#[cfg(feature = "serialize")]
#[inline]
pub(crate) fn into_inner(self) -> Cow<'a, [u8]> {
self.content
/// Returns unescaped version of the text content, that can be written
/// as CDATA in XML
pub fn unescape(self) -> std::result::Result<BytesCData<'a>, EscapeError> {
Ok(BytesCData::new(match do_unescape(&self.content, None)? {
Cow::Borrowed(_) => self.content,
Cow::Owned(unescaped) => Cow::Owned(unescaped),
}))
}

/// gets escaped content
Expand Down Expand Up @@ -646,60 +648,6 @@ impl<'a> BytesText<'a> {
do_unescape(self, custom_entities).map_err(Error::EscapeError)
}

/// Gets content of this text buffer in the specified encoding
#[cfg(feature = "serialize")]
pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result<Cow<'a, str>> {
Ok(match &self.content {
Cow::Borrowed(bytes) => {
#[cfg(feature = "encoding")]
{
decoder.decode(bytes)
}
#[cfg(not(feature = "encoding"))]
{
decoder.decode(bytes)?.into()
}
}
Cow::Owned(bytes) => {
#[cfg(feature = "encoding")]
let decoded = decoder.decode(bytes).into_owned();

#[cfg(not(feature = "encoding"))]
let decoded = decoder.decode(bytes)?.to_string();

decoded.into()
}
})
}

#[cfg(feature = "serialize")]
pub(crate) fn decode_and_escape(
&self,
decoder: crate::reader::Decoder,
) -> Result<Cow<'a, str>> {
match self.decode(decoder)? {
Cow::Borrowed(decoded) => {
let unescaped =
do_unescape(decoded.as_bytes(), None).map_err(Error::EscapeError)?;
match unescaped {
Cow::Borrowed(unescaped) => {
from_utf8(unescaped).map(|s| s.into()).map_err(Error::Utf8)
}
Cow::Owned(unescaped) => String::from_utf8(unescaped)
.map(|s| s.into())
.map_err(|e| Error::Utf8(e.utf8_error())),
}
}
Cow::Owned(decoded) => {
let unescaped =
do_unescape(decoded.as_bytes(), None).map_err(Error::EscapeError)?;
String::from_utf8(unescaped.into_owned())
.map(|s| s.into())
.map_err(|e| Error::Utf8(e.utf8_error()))
}
}
}

/// helper method to unescape then decode self using the reader encoding
/// but without BOM (Byte order mark)
///
Expand Down Expand Up @@ -860,6 +808,117 @@ impl<'a> std::fmt::Debug for BytesText<'a> {
}
}

/// CDATA content contains unescaped data from the reader. If you want to write them as a text,
/// [convert](Self::escape) it to [`BytesText`]
#[derive(Clone, Eq, PartialEq)]
pub struct BytesCData<'a> {
content: Cow<'a, [u8]>,
}

impl<'a> BytesCData<'a> {
/// Creates a new `BytesCData` from a byte sequence.
#[inline]
pub fn new<C: Into<Cow<'a, [u8]>>>(content: C) -> Self {
Self {
content: content.into(),
}
}

/// Creates a new `BytesCData` from a string
#[inline]
pub fn from_str(content: &'a str) -> Self {
Self::new(content.as_bytes())
}

/// Extracts the inner `Cow` from the `BytesCData` event container.
#[inline]
pub fn into_inner(self) -> Cow<'a, [u8]> {
self.content
}

/// Ensures that all data is owned to extend the object's lifetime if
/// necessary.
#[inline]
pub fn into_owned(self) -> BytesCData<'static> {
BytesCData {
content: self.content.into_owned().into(),
}
}

/// Converts this CDATA content to an escaped version, that can be written
/// as an usual text in XML.
///
/// This function performs following replacements:
///
/// | Character | Replacement
/// |-----------|------------
/// | `<` | `&lt;`
/// | `>` | `&gt;`
/// | `&` | `&amp;`
/// | `'` | `&apos;`
/// | `"` | `&quot;`
pub fn escape(self) -> BytesText<'a> {
BytesText::from_escaped(match escape(&self.content) {
Cow::Borrowed(_) => self.content,
Cow::Owned(escaped) => Cow::Owned(escaped),
})
}

/// Converts this CDATA content to an escaped version, that can be written
/// as an usual text in XML.
///
/// In XML text content, it is allowed (though not recommended) to leave
/// the quote special characters `"` and `'` unescaped.
///
/// This function performs following replacements:
///
/// | Character | Replacement
/// |-----------|------------
/// | `<` | `&lt;`
/// | `>` | `&gt;`
/// | `&` | `&amp;`
pub fn partial_escape(self) -> BytesText<'a> {
BytesText::from_escaped(match partial_escape(&self.content) {
Cow::Borrowed(_) => self.content,
Cow::Owned(escaped) => Cow::Owned(escaped),
})
}

/// Gets content of this text buffer in the specified encoding
#[cfg(feature = "serialize")]
pub fn decode(&self, decoder: crate::reader::Decoder) -> Result<Cow<'a, str>> {
Ok(match &self.content {
Cow::Borrowed(bytes) => {
#[cfg(feature = "encoding")]
{
decoder.decode(bytes)
}
#[cfg(not(feature = "encoding"))]
{
decoder.decode(bytes)?.into()
}
}
Cow::Owned(bytes) => {
#[cfg(feature = "encoding")]
let decoded = decoder.decode(bytes).into_owned();

#[cfg(not(feature = "encoding"))]
let decoded = decoder.decode(bytes)?.to_string();

decoded.into()
}
})
}
}

impl<'a> std::fmt::Debug for BytesCData<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "BytesCData {{ content: ")?;
write_cow_string(f, &self.content)?;
write!(f, " }}")
}
}

/// Event emitted by [`Reader::read_event`].
///
/// [`Reader::read_event`]: ../reader/struct.Reader.html#method.read_event
Expand All @@ -876,7 +935,7 @@ pub enum Event<'a> {
/// Comment `<!-- ... -->`.
Comment(BytesText<'a>),
/// CData `<![CDATA[...]]>`.
CData(BytesText<'a>),
CData(BytesCData<'a>),
/// XML declaration `<?xml ...?>`.
Decl(BytesDecl<'a>),
/// Processing instruction `<?...?>`.
Expand Down Expand Up @@ -934,6 +993,14 @@ impl<'a> Deref for BytesText<'a> {
}
}

impl<'a> Deref for BytesCData<'a> {
type Target = [u8];

fn deref(&self) -> &[u8] {
&*self.content
}
}

impl<'a> Deref for Event<'a> {
type Target = [u8];
fn deref(&self) -> &[u8] {
Expand Down
5 changes: 3 additions & 2 deletions src/reader.rs
Expand Up @@ -9,7 +9,8 @@ use std::{fs::File, path::Path, str::from_utf8};
use encoding_rs::{Encoding, UTF_16BE, UTF_16LE};

use crate::errors::{Error, Result};
use crate::events::{attributes::Attribute, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
use crate::events::attributes::Attribute;
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};

use memchr;

Expand Down Expand Up @@ -368,7 +369,7 @@ impl<R: BufRead> Reader<R> {
Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2])))
} else if uncased_starts_with(buf, b"![CDATA[") {
debug_assert!(len >= 10, "Minimum length guaranteed by read_bang_elem");
Ok(Event::CData(BytesText::from_plain(&buf[8..buf.len() - 2])))
Ok(Event::CData(BytesCData::new(&buf[8..buf.len() - 2])))
} else if uncased_starts_with(buf, b"!DOCTYPE") {
debug_assert!(len >= 8, "Minimum length guaranteed by read_bang_elem");
let start = buf[8..]
Expand Down
4 changes: 2 additions & 2 deletions src/writer.rs
@@ -1,7 +1,7 @@
//! A module to handle `Writer`

use crate::errors::{Error, Result};
use crate::events::{attributes::Attribute, BytesStart, BytesText, Event};
use crate::events::{attributes::Attribute, BytesCData, BytesStart, BytesText, Event};
use std::io::Write;

/// XML writer.
Expand Down Expand Up @@ -261,7 +261,7 @@ impl<'a, W: Write> ElementWriter<'a, W> {
}

/// Write a CData event `<![CDATA[...]]>` inside the current element.
pub fn write_cdata_content(self, text: BytesText) -> Result<&'a mut Writer<W>> {
pub fn write_cdata_content(self, text: BytesCData) -> Result<&'a mut Writer<W>> {
self.writer
.write_event(Event::Start(self.start_tag.to_borrowed()))?;
self.writer.write_event(Event::CData(text))?;
Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests.rs
Expand Up @@ -180,7 +180,7 @@ fn test_cdata() {
fn test_cdata_open_close() {
let mut r = Reader::from_str("<![CDATA[test <> test]]>");
r.trim_text(true);
next_eq!(r, CData, b"test &lt;&gt; test");
next_eq!(r, CData, b"test <> test");
}

#[test]
Expand Down

0 comments on commit 9433830

Please sign in to comment.