From eca40d4d0d148831534020de6b78669e3897b4b5 Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 25 Aug 2021 01:06:05 +0500 Subject: [PATCH 1/6] Unify string handling --- src/de/mod.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/de/mod.rs b/src/de/mod.rs index a6b7adef..317987f1 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -406,13 +406,12 @@ where deserialize_bool(txt.as_ref(), self.reader.decoder(), visitor) } + /// Representation of owned strings the same as [non-owned](#method.deserialize_str). fn deserialize_string(self, visitor: V) -> Result where V: Visitor<'de>, { - let text = self.next_text()?; - let string = text.decode_and_escape(self.reader.decoder())?; - visitor.visit_string(string.into_owned()) + self.deserialize_str(visitor) } fn deserialize_char(self, visitor: V) -> Result From cf77d1ce4b0e8519149bdbb09b86397371b2db03 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 20 Mar 2022 21:06:12 +0500 Subject: [PATCH 2/6] Manually implement Debug for debug helper types Now debug representation is readable --- src/de/byte_buf.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/de/byte_buf.rs b/src/de/byte_buf.rs index 67e07008..e5c3dda1 100644 --- a/src/de/byte_buf.rs +++ b/src/de/byte_buf.rs @@ -1,11 +1,12 @@ //! 
Helper types for tests +use crate::utils::write_byte_string; use serde::de::{self, Deserialize, Deserializer, Error}; use std::fmt; /// Wrapper around `Vec` that deserialized using `deserialize_byte_buf` /// instead of vector's generic `deserialize_seq` -#[derive(Debug, PartialEq)] +#[derive(PartialEq)] pub struct ByteBuf(pub Vec); impl<'de> Deserialize<'de> for ByteBuf { @@ -35,9 +36,15 @@ impl<'de> Deserialize<'de> for ByteBuf { } } +impl fmt::Debug for ByteBuf { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write_byte_string(f, &self.0) + } +} + /// Wrapper around `&[u8]` that deserialized using `deserialize_bytes` /// instead of vector's generic `deserialize_seq` -#[derive(Debug, PartialEq)] +#[derive(PartialEq)] pub struct Bytes<'de>(pub &'de [u8]); impl<'de> Deserialize<'de> for Bytes<'de> { @@ -62,3 +69,9 @@ impl<'de> Deserialize<'de> for Bytes<'de> { Ok(d.deserialize_bytes(Visitor)?) } } + +impl<'de> fmt::Debug for Bytes<'de> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write_byte_string(f, self.0) + } +} From 128f93a54341096a57c1d6385a7c95cd2e24b93a Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 7 Aug 2021 20:48:59 +0500 Subject: [PATCH 3/6] Add tests for trivial XML documents failures: de::tests::trivial::struct_::cdata::byte_buf de::tests::trivial::struct_::cdata::char_ de::tests::trivial::struct_::cdata::f32_ de::tests::trivial::struct_::cdata::f64_ de::tests::trivial::struct_::cdata::false_ de::tests::trivial::struct_::cdata::i128_ de::tests::trivial::struct_::cdata::i16_ de::tests::trivial::struct_::cdata::i32_ de::tests::trivial::struct_::cdata::i64_ de::tests::trivial::struct_::cdata::i8_ de::tests::trivial::struct_::cdata::isize_ de::tests::trivial::struct_::cdata::string de::tests::trivial::struct_::cdata::true_ de::tests::trivial::struct_::cdata::u128_ de::tests::trivial::struct_::cdata::u16_ de::tests::trivial::struct_::cdata::u32_ de::tests::trivial::struct_::cdata::u64_ 
de::tests::trivial::struct_::cdata::u8_ de::tests::trivial::struct_::cdata::usize_ de::tests::trivial::struct_::text::byte_buf --- Changelog.md | 1 + src/de/mod.rs | 169 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) diff --git a/Changelog.md b/Changelog.md index 92b51511..71b2c694 100644 --- a/Changelog.md +++ b/Changelog.md @@ -19,6 +19,7 @@ from the attribute and element names and attribute values - fix: allow to deserialize `unit`s from text and CDATA content. `DeError::InvalidUnit` variant is removed, because after fix it is no longer used +- test: add tests for trivial documents (empty / only comment / `...` -- one tag with content) ## 0.23.0-alpha3 diff --git a/src/de/mod.rs b/src/de/mod.rs index 317987f1..fae21c1b 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -722,6 +722,8 @@ mod tests { where T: Deserialize<'de>, { + // Log XM that we try to deserialize to see it in the failed tests output + dbg!(s); let mut de = Deserializer::from_str(s); let result = T::deserialize(&mut de); @@ -906,6 +908,173 @@ mod tests { source: String, } + /// Tests for trivial XML documents: empty or contains only primitive type + /// on a top level; all of them should be considered invalid + mod trivial { + use super::*; + + #[rustfmt::skip] // excess spaces used for readability + macro_rules! eof { + ($name:ident: $type:ty = $value:expr) => { + #[test] + fn $name() { + let item = from_str::<$type>($value).unwrap_err(); + + match item { + DeError::Eof => (), + _ => panic!("Expected `Eof`, found {:?}", item), + } + } + }; + ($value:expr) => { + eof!(i8_: i8 = $value); + eof!(i16_: i16 = $value); + eof!(i32_: i32 = $value); + eof!(i64_: i64 = $value); + eof!(isize_: isize = $value); + + eof!(u8_: u8 = $value); + eof!(u16_: u16 = $value); + eof!(u32_: u32 = $value); + eof!(u64_: u64 = $value); + eof!(usize_: usize = $value); + + serde_if_integer128! 
{ + eof!(u128_: u128 = $value); + eof!(i128_: i128 = $value); + } + + eof!(f32_: f32 = $value); + eof!(f64_: f64 = $value); + + eof!(false_: bool = $value); + eof!(true_: bool = $value); + eof!(char_: char = $value); + + eof!(string: String = $value); + eof!(byte_buf: ByteBuf = $value); + + #[test] + fn unit() { + let item = from_str::<()>($value).unwrap_err(); + + match item { + DeError::Eof => (), + _ => panic!("Expected `Eof`, found {:?}", item), + } + } + }; + } + + /// Empty document should considered invalid no matter which type we try to deserialize + mod empty_doc { + use super::*; + eof!(""); + } + + /// Document that contains only comment should be handles as if it is empty + mod only_comment { + use super::*; + eof!(""); + } + + /// Tests deserialization from top-level tag content: `...content...` + mod struct_ { + use super::*; + + /// Well-formed XML must have a single tag at the root level. + /// Any XML tag can be modeled as a struct, and content of this tag are modeled as + /// fields of this struct. + /// + /// Because we want to get access to unnamed content of the tag (usually, this internal + /// XML node called `#text`) we use a rename to a special name `$value` + #[derive(Debug, Deserialize, PartialEq)] + struct Trivial { + #[serde(rename = "$value")] + value: T, + } + + macro_rules! 
in_struct { + ($name:ident: $type:ty = $value:expr, $expected:expr) => { + #[test] + fn $name() { + let item: Trivial<$type> = from_str($value).unwrap(); + + assert_eq!(item, Trivial { value: $expected }); + } + }; + } + + /// Tests deserialization from text content in a tag + #[rustfmt::skip] // tests formatted in a table + mod text { + use super::*; + + in_struct!(i8_: i8 = "-42", -42i8); + in_struct!(i16_: i16 = "-4200", -4200i16); + in_struct!(i32_: i32 = "-42000000", -42000000i32); + in_struct!(i64_: i64 = "-42000000000000", -42000000000000i64); + in_struct!(isize_: isize = "-42000000000000", -42000000000000isize); + + in_struct!(u8_: u8 = "42", 42u8); + in_struct!(u16_: u16 = "4200", 4200u16); + in_struct!(u32_: u32 = "42000000", 42000000u32); + in_struct!(u64_: u64 = "42000000000000", 42000000000000u64); + in_struct!(usize_: usize = "42000000000000", 42000000000000usize); + + serde_if_integer128! { + in_struct!(u128_: u128 = "420000000000000000000000000000", 420000000000000000000000000000u128); + in_struct!(i128_: i128 = "-420000000000000000000000000000", -420000000000000000000000000000i128); + } + + in_struct!(f32_: f32 = "4.2", 4.2f32); + in_struct!(f64_: f64 = "4.2", 4.2f64); + + in_struct!(false_: bool = "false", false); + in_struct!(true_: bool = "true", true); + in_struct!(char_: char = "r", 'r'); + + in_struct!(string: String = "escaped string", "escaped string".into()); + in_struct!(byte_buf: ByteBuf = "escaped byte_buf", ByteBuf(r"escaped byte_buf".into())); + } + + /// Tests deserialization from CDATA content in a tag. 
+ /// CDATA handling similar to text handling except that strings does not unescapes + #[rustfmt::skip] // tests formatted in a table + mod cdata { + use super::*; + + in_struct!(i8_: i8 = "", -42i8); + in_struct!(i16_: i16 = "", -4200i16); + in_struct!(i32_: i32 = "", -42000000i32); + in_struct!(i64_: i64 = "", -42000000000000i64); + in_struct!(isize_: isize = "", -42000000000000isize); + + in_struct!(u8_: u8 = "", 42u8); + in_struct!(u16_: u16 = "", 4200u16); + in_struct!(u32_: u32 = "", 42000000u32); + in_struct!(u64_: u64 = "", 42000000000000u64); + in_struct!(usize_: usize = "", 42000000000000usize); + + serde_if_integer128! { + in_struct!(u128_: u128 = "", 420000000000000000000000000000u128); + in_struct!(i128_: i128 = "", -420000000000000000000000000000i128); + } + + in_struct!(f32_: f32 = "", 4.2f32); + in_struct!(f64_: f64 = "", 4.2f64); + + in_struct!(false_: bool = "", false); + in_struct!(true_: bool = "", true); + in_struct!(char_: char = "", 'r'); + + // Escape sequences does not processed inside CDATA section + in_struct!(string: String = "", "escaped string".into()); + in_struct!(byte_buf: ByteBuf = "", ByteBuf(r"escaped byte_buf".into())); + } + } + } + #[test] fn multiple_roots_attributes() { let s = r##" From fd04e5f945b4d0db75457d500c98b9956edcb0ee Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 13 Mar 2022 12:31:34 +0500 Subject: [PATCH 4/6] Handle CDATA events in the deserializer - this fixes almost all CDATA trivial tests failures: de::tests::trivial::struct_::text::byte_buf de::tests::trivial::struct_::cdata::byte_buf --- Changelog.md | 1 + src/de/map.rs | 2 +- src/de/mod.rs | 2 +- src/de/var.rs | 4 +++- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Changelog.md b/Changelog.md index 71b2c694..4d5c4be8 100644 --- a/Changelog.md +++ b/Changelog.md @@ -20,6 +20,7 @@ - fix: allow to deserialize `unit`s from text and CDATA content. 
`DeError::InvalidUnit` variant is removed, because after fix it is no longer used - test: add tests for trivial documents (empty / only comment / `...` -- one tag with content) +- fix: CDATA was not handled in many cases where it should ## 0.23.0-alpha3 diff --git a/src/de/map.rs b/src/de/map.rs index 466f04c3..aa311bb8 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -100,7 +100,7 @@ impl<'de, 'a, R: BorrowingReader<'de>> de::MapAccess<'de> for MapAccess<'de, 'a, } else { // try getting from events (value) match self.de.peek()? { - DeEvent::Text(_) => { + DeEvent::Text(_) | DeEvent::CData(_) => { self.state = State::InnerValue; // Deserialize `key` from special attribute name which means // that value should be taken from the text content of the diff --git a/src/de/mod.rs b/src/de/mod.rs index fae21c1b..8f5e180d 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -560,7 +560,7 @@ where V: Visitor<'de>, { match self.peek()? { - DeEvent::Text(t) if t.is_empty() => visitor.visit_none(), + DeEvent::Text(t) | DeEvent::CData(t) if t.is_empty() => visitor.visit_none(), DeEvent::Eof => visitor.visit_none(), _ => visitor.visit_some(self), } diff --git a/src/de/var.rs b/src/de/var.rs index 63d5a0e2..53638687 100644 --- a/src/de/var.rs +++ b/src/de/var.rs @@ -36,6 +36,8 @@ where let decoder = self.de.reader.decoder(); let de = match self.de.peek()? { DeEvent::Text(t) => EscapedDeserializer::new(Cow::Borrowed(t), decoder, true), + // Escape sequences does not processed inside CDATA section + DeEvent::CData(t) => EscapedDeserializer::new(Cow::Borrowed(t), decoder, false), DeEvent::Start(e) => EscapedDeserializer::new(Cow::Borrowed(e.name()), decoder, false), _ => { return Err(DeError::Unsupported( @@ -64,7 +66,7 @@ where fn unit_variant(self) -> Result<(), DeError> { match self.de.next()? 
{ DeEvent::Start(e) => self.de.read_to_end(e.name()), - DeEvent::Text(_) => Ok(()), + DeEvent::Text(_) | DeEvent::CData(_) => Ok(()), _ => unreachable!(), } } From a1ad40dcbfb96d5e2305f9129bd5d8e3bcfe3f64 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 20 Mar 2022 23:11:20 +0500 Subject: [PATCH 5/6] Move imports up --- src/events/mod.rs | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/events/mod.rs b/src/events/mod.rs index c0b9a924..609ada03 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -41,6 +41,7 @@ use encoding_rs::Encoding; use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Deref, str::from_utf8}; use crate::escape::{do_unescape, escape}; +use crate::utils::write_cow_string; use crate::{errors::Error, errors::Result, reader::Reader}; use attributes::{Attribute, Attributes}; @@ -365,8 +366,6 @@ impl<'a> BytesStart<'a> { impl<'a> std::fmt::Debug for BytesStart<'a> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - use crate::utils::write_cow_string; - write!(f, "BytesStart {{ buf: ")?; write_cow_string(f, &self.buf)?; write!(f, ", name_len: {} }}", self.name_len) @@ -548,15 +547,14 @@ impl<'a> BytesEnd<'a> { impl<'a> std::fmt::Debug for BytesEnd<'a> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - use crate::utils::write_cow_string; - write!(f, "BytesEnd {{ name: ")?; write_cow_string(f, &self.name)?; write!(f, " }}") } } -/// Data from various events (most notably, `Event::Text`). +/// Data from various events (most notably, `Event::Text`) that stored in XML +/// in escaped form. Internally data is stored in escaped form #[derive(Clone, Eq, PartialEq)] pub struct BytesText<'a> { // Invariant: The content is always escaped. @@ -566,8 +564,8 @@ pub struct BytesText<'a> { impl<'a> BytesText<'a> { /// Creates a new `BytesText` from an escaped byte sequence. 
#[inline] - pub fn from_escaped>>(content: C) -> BytesText<'a> { - BytesText { + pub fn from_escaped>>(content: C) -> Self { + Self { content: content.into(), } } @@ -575,15 +573,15 @@ impl<'a> BytesText<'a> { /// Creates a new `BytesText` from a byte sequence. The byte sequence is /// expected not to be escaped. #[inline] - pub fn from_plain(content: &'a [u8]) -> BytesText<'a> { - BytesText { + pub fn from_plain(content: &'a [u8]) -> Self { + Self { content: escape(content), } } /// Creates a new `BytesText` from an escaped string. #[inline] - pub fn from_escaped_str>>(content: C) -> BytesText<'a> { + pub fn from_escaped_str>>(content: C) -> Self { Self::from_escaped(match content.into() { Cow::Owned(o) => Cow::Owned(o.into_bytes()), Cow::Borrowed(b) => Cow::Borrowed(b.as_bytes()), @@ -593,7 +591,7 @@ impl<'a> BytesText<'a> { /// Creates a new `BytesText` from a string. The string is expected not to /// be escaped. #[inline] - pub fn from_plain_str(content: &'a str) -> BytesText<'a> { + pub fn from_plain_str(content: &'a str) -> Self { Self::from_plain(content.as_bytes()) } @@ -856,8 +854,6 @@ impl<'a> BytesText<'a> { impl<'a> std::fmt::Debug for BytesText<'a> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - use crate::utils::write_cow_string; - write!(f, "BytesText {{ content: ")?; write_cow_string(f, &self.content)?; write!(f, " }}") From 572b2c237b5436924d1d0b0de3bc4dba2ac65447 Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 21 Mar 2022 00:02:04 +0500 Subject: [PATCH 6/6] Fix #311: Introduce `BytesCData` type for CData events and use it instead of `BytesText` This commit revert changes from 85f9f685e971b29e42fa829b45fa47c165383b98 --- Changelog.md | 7 ++ benches/bench.rs | 2 +- src/de/mod.rs | 26 +++--- src/events/mod.rs | 194 ++++++++++++++++++++++++++++++-------------- src/reader.rs | 5 +- src/writer.rs | 4 +- tests/unit_tests.rs | 2 +- 7 files changed, 162 insertions(+), 78 deletions(-) diff --git a/Changelog.md b/Changelog.md index 
4d5c4be8..520ffb2f 100644 --- a/Changelog.md +++ b/Changelog.md @@ -21,6 +21,13 @@ `DeError::InvalidUnit` variant is removed, because after fix it is no longer used - test: add tests for trivial documents (empty / only comment / `...` -- one tag with content) - fix: CDATA was not handled in many cases where it should +- fix: do not unescape CDATA content because it never escaped by design + ([#311](https://github.com/tafia/quick-xml/issues/311)). + + NOTE: now text content when deserialized into bytes (`Vec` / `&[u8]`), also unescaped. + It is impossible to get a raw XML data in bytes buffer. Actually, deserializing of bytes + should be prohibited, because XML cannot store raw byte data. You should store binary + data in a string hex- or base64- or any-other-schema-encoded. ## 0.23.0-alpha3 diff --git a/benches/bench.rs b/benches/bench.rs index adea6840..42409aac 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -225,7 +225,7 @@ fn bench_quick_xml_one_cdata_event_trimmed(b: &mut Bencher) { .check_comments(false) .trim_text(true); match r.read_event(&mut buf) { - Ok(Event::CData(ref e)) => nbtxt += e.unescaped().unwrap().len(), + Ok(Event::CData(ref e)) => nbtxt += e.len(), something_else => panic!("Did not expect {:?}", something_else), }; diff --git a/src/de/mod.rs b/src/de/mod.rs index 8f5e180d..e51d0a1a 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -117,7 +117,7 @@ mod var; pub use crate::errors::serialize::DeError; use crate::{ errors::Error, - events::{BytesEnd, BytesStart, BytesText, Event}, + events::{BytesCData, BytesEnd, BytesStart, BytesText, Event}, reader::Decoder, Reader, }; @@ -141,7 +141,7 @@ pub enum DeEvent<'a> { Text(BytesText<'a>), /// Unescaped character data between `Start` and `End` element, /// stored in ``. - CData(BytesText<'a>), + CData(BytesCData<'a>), /// End of XML document. 
Eof, } @@ -300,18 +300,20 @@ where /// |`text`|`text` |Complete tag consumed | /// |`` |empty slice|Virtual end tag not consumed| /// |`` |empty slice|Not consumed | - fn next_text(&mut self) -> Result, DeError> { + fn next_text(&mut self) -> Result, DeError> { match self.next()? { - DeEvent::Text(e) | DeEvent::CData(e) => Ok(e), + DeEvent::Text(e) => e.unescape().map_err(|e| DeError::Xml(e.into())), + DeEvent::CData(e) => Ok(e), DeEvent::Eof => Err(DeError::Eof), DeEvent::Start(e) => { // allow one nested level let inner = self.next()?; let t = match inner { - DeEvent::Text(t) | DeEvent::CData(t) => t, + DeEvent::Text(t) => t.unescape().map_err(|e| DeError::Xml(e.into()))?, + DeEvent::CData(t) => t, DeEvent::Start(_) => return Err(DeError::Start), DeEvent::End(end) if end.name() == e.name() => { - return Ok(BytesText::from_escaped(&[] as &[u8])); + return Ok(BytesCData::new(&[] as &[u8])); } DeEvent::End(_) => return Err(DeError::End), DeEvent::Eof => return Err(DeError::Eof), @@ -321,7 +323,7 @@ where } DeEvent::End(e) => { self.peek = Some(DeEvent::End(e)); - Ok(BytesText::from_escaped(&[] as &[u8])) + Ok(BytesCData::new(&[] as &[u8])) } } } @@ -426,7 +428,7 @@ where V: Visitor<'de>, { let text = self.next_text()?; - let string = text.decode_and_escape(self.reader.decoder())?; + let string = text.decode(self.reader.decoder())?; match string { Cow::Borrowed(string) => visitor.visit_borrowed_str(string), Cow::Owned(string) => visitor.visit_string(string), @@ -438,8 +440,7 @@ where V: Visitor<'de>, { let text = self.next_text()?; - let value = text.escaped(); - visitor.visit_bytes(value) + visitor.visit_bytes(&text) } fn deserialize_byte_buf(self, visitor: V) -> Result @@ -560,7 +561,8 @@ where V: Visitor<'de>, { match self.peek()? 
{ - DeEvent::Text(t) | DeEvent::CData(t) if t.is_empty() => visitor.visit_none(), + DeEvent::Text(t) if t.is_empty() => visitor.visit_none(), + DeEvent::CData(t) if t.is_empty() => visitor.visit_none(), DeEvent::Eof => visitor.visit_none(), _ => visitor.visit_some(self), } @@ -772,7 +774,7 @@ mod tests { ); assert_eq!( de.next().unwrap(), - CData(BytesText::from_plain_str("cdata content")) + CData(BytesCData::from_str("cdata content")) ); assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"tag"))); diff --git a/src/events/mod.rs b/src/events/mod.rs index 609ada03..b6bb2c50 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -40,11 +40,14 @@ pub mod attributes; use encoding_rs::Encoding; use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Deref, str::from_utf8}; -use crate::escape::{do_unescape, escape}; +use crate::escape::{do_unescape, escape, partial_escape}; use crate::utils::write_cow_string; use crate::{errors::Error, errors::Result, reader::Reader}; use attributes::{Attribute, Attributes}; +#[cfg(feature = "serialize")] +use crate::escape::EscapeError; + use memchr; /// Opening tag data (`Event::Start`), with optional attributes. @@ -604,11 +607,17 @@ impl<'a> BytesText<'a> { } } - /// Extracts the inner `Cow` from the `BytesText` event container. + /// Returns unescaped version of the text content, that can be written + /// as CDATA in XML #[cfg(feature = "serialize")] - #[inline] - pub(crate) fn into_inner(self) -> Cow<'a, [u8]> { - self.content + pub(crate) fn unescape(self) -> std::result::Result, EscapeError> { + //TODO: need to think about better API instead of dozens similar functions + // Maybe use builder pattern. After that expose function as public API + //FIXME: need to take into account entities defined in the document + Ok(BytesCData::new(match do_unescape(&self.content, None)? 
{ + Cow::Borrowed(_) => self.content, + Cow::Owned(unescaped) => Cow::Owned(unescaped), + })) } /// gets escaped content @@ -646,60 +655,6 @@ impl<'a> BytesText<'a> { do_unescape(self, custom_entities).map_err(Error::EscapeError) } - /// Gets content of this text buffer in the specified encoding - #[cfg(feature = "serialize")] - pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result> { - Ok(match &self.content { - Cow::Borrowed(bytes) => { - #[cfg(feature = "encoding")] - { - decoder.decode(bytes) - } - #[cfg(not(feature = "encoding"))] - { - decoder.decode(bytes)?.into() - } - } - Cow::Owned(bytes) => { - #[cfg(feature = "encoding")] - let decoded = decoder.decode(bytes).into_owned(); - - #[cfg(not(feature = "encoding"))] - let decoded = decoder.decode(bytes)?.to_string(); - - decoded.into() - } - }) - } - - #[cfg(feature = "serialize")] - pub(crate) fn decode_and_escape( - &self, - decoder: crate::reader::Decoder, - ) -> Result> { - match self.decode(decoder)? { - Cow::Borrowed(decoded) => { - let unescaped = - do_unescape(decoded.as_bytes(), None).map_err(Error::EscapeError)?; - match unescaped { - Cow::Borrowed(unescaped) => { - from_utf8(unescaped).map(|s| s.into()).map_err(Error::Utf8) - } - Cow::Owned(unescaped) => String::from_utf8(unescaped) - .map(|s| s.into()) - .map_err(|e| Error::Utf8(e.utf8_error())), - } - } - Cow::Owned(decoded) => { - let unescaped = - do_unescape(decoded.as_bytes(), None).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()) - .map(|s| s.into()) - .map_err(|e| Error::Utf8(e.utf8_error())) - } - } - } - /// helper method to unescape then decode self using the reader encoding /// but without BOM (Byte order mark) /// @@ -860,6 +815,117 @@ impl<'a> std::fmt::Debug for BytesText<'a> { } } +/// CDATA content contains unescaped data from the reader. 
If you want to write them as a text, +/// [convert](Self::escape) it to [`BytesText`] +#[derive(Clone, Eq, PartialEq)] +pub struct BytesCData<'a> { + content: Cow<'a, [u8]>, +} + +impl<'a> BytesCData<'a> { + /// Creates a new `BytesCData` from a byte sequence. + #[inline] + pub fn new<C: Into<Cow<'a, [u8]>>>(content: C) -> Self { + Self { + content: content.into(), + } + } + + /// Creates a new `BytesCData` from a string + #[inline] + pub fn from_str(content: &'a str) -> Self { + Self::new(content.as_bytes()) + } + + /// Extracts the inner `Cow` from the `BytesCData` event container. + #[inline] + pub fn into_inner(self) -> Cow<'a, [u8]> { + self.content + } + + /// Ensures that all data is owned to extend the object's lifetime if + /// necessary. + #[inline] + pub fn into_owned(self) -> BytesCData<'static> { + BytesCData { + content: self.content.into_owned().into(), + } + } + + /// Converts this CDATA content to an escaped version, that can be written + /// as an usual text in XML. + /// + /// This function performs following replacements: + /// + /// | Character | Replacement + /// |-----------|------------ + /// | `<` | `&lt;` + /// | `>` | `&gt;` + /// | `&` | `&amp;` + /// | `'` | `&apos;` + /// | `"` | `&quot;` + pub fn escape(self) -> BytesText<'a> { + BytesText::from_escaped(match escape(&self.content) { + Cow::Borrowed(_) => self.content, + Cow::Owned(escaped) => Cow::Owned(escaped), + }) + } + + /// Converts this CDATA content to an escaped version, that can be written + /// as an usual text in XML. + /// + /// In XML text content, it is allowed (though not recommended) to leave + /// the quote special characters `"` and `'` unescaped. 
+ /// + /// This function performs following replacements: + /// + /// | Character | Replacement + /// |-----------|------------ + /// | `<` | `&lt;` + /// | `>` | `&gt;` + /// | `&` | `&amp;` + pub fn partial_escape(self) -> BytesText<'a> { + BytesText::from_escaped(match partial_escape(&self.content) { + Cow::Borrowed(_) => self.content, + Cow::Owned(escaped) => Cow::Owned(escaped), + }) + } + + /// Gets content of this text buffer in the specified encoding + #[cfg(feature = "serialize")] + pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result<Cow<'a, str>> { + Ok(match &self.content { + Cow::Borrowed(bytes) => { + #[cfg(feature = "encoding")] + { + decoder.decode(bytes) + } + #[cfg(not(feature = "encoding"))] + { + decoder.decode(bytes)?.into() + } + } + Cow::Owned(bytes) => { + #[cfg(feature = "encoding")] + let decoded = decoder.decode(bytes).into_owned(); + + #[cfg(not(feature = "encoding"))] + let decoded = decoder.decode(bytes)?.to_string(); + + decoded.into() + } + }) + } +} + +impl<'a> std::fmt::Debug for BytesCData<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "BytesCData {{ content: ")?; + write_cow_string(f, &self.content)?; + write!(f, " }}") + } +} + /// Event emitted by [`Reader::read_event`]. /// /// [`Reader::read_event`]: ../reader/struct.Reader.html#method.read_event @@ -876,7 +942,7 @@ pub enum Event<'a> { /// Comment `<!-- ... -->`. Comment(BytesText<'a>), /// CData `<![CDATA[...]]>`. - CData(BytesText<'a>), + CData(BytesCData<'a>), /// XML declaration `<?xml ...?>`. Decl(BytesDecl<'a>), /// Processing instruction `<?...?>`. 
@@ -934,6 +1000,14 @@ impl<'a> Deref for BytesText<'a> { } } +impl<'a> Deref for BytesCData<'a> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &*self.content + } +} + impl<'a> Deref for Event<'a> { type Target = [u8]; fn deref(&self) -> &[u8] { diff --git a/src/reader.rs b/src/reader.rs index cf050133..3da17af9 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -9,7 +9,8 @@ use std::{fs::File, path::Path, str::from_utf8}; use encoding_rs::{Encoding, UTF_16BE, UTF_16LE}; use crate::errors::{Error, Result}; -use crate::events::{attributes::Attribute, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; +use crate::events::attributes::Attribute; +use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; use memchr; @@ -368,7 +369,7 @@ impl Reader { Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2]))) } else if uncased_starts_with(buf, b"![CDATA[") { debug_assert!(len >= 10, "Minimum length guaranteed by read_bang_elem"); - Ok(Event::CData(BytesText::from_plain(&buf[8..buf.len() - 2]))) + Ok(Event::CData(BytesCData::new(&buf[8..buf.len() - 2]))) } else if uncased_starts_with(buf, b"!DOCTYPE") { debug_assert!(len >= 8, "Minimum length guaranteed by read_bang_elem"); let start = buf[8..] diff --git a/src/writer.rs b/src/writer.rs index 73a7dc2e..3870fd7e 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -1,7 +1,7 @@ //! A module to handle `Writer` use crate::errors::{Error, Result}; -use crate::events::{attributes::Attribute, BytesStart, BytesText, Event}; +use crate::events::{attributes::Attribute, BytesCData, BytesStart, BytesText, Event}; use std::io::Write; /// XML writer. @@ -261,7 +261,7 @@ impl<'a, W: Write> ElementWriter<'a, W> { } /// Write a CData event `` inside the current element. 
- pub fn write_cdata_content(self, text: BytesText) -> Result<&'a mut Writer<W>> { + pub fn write_cdata_content(self, text: BytesCData) -> Result<&'a mut Writer<W>> { self.writer .write_event(Event::Start(self.start_tag.to_borrowed()))?; self.writer.write_event(Event::CData(text))?; diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index 3515666a..1d9f74e4 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -180,7 +180,7 @@ fn test_cdata() { fn test_cdata_open_close() { let mut r = Reader::from_str("<![CDATA[test <> test]]>"); r.trim_text(true); - next_eq!(r, CData, b"test &lt;&gt; test"); + next_eq!(r, CData, b"test <> test"); } #[test]