diff --git a/Changelog.md b/Changelog.md
index 17d77c3e..e1d458d3 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -33,6 +33,13 @@
- [#514]: Fix wrong reporting `Error::EndEventMismatch` after disabling and enabling
`.check_end_names`
- [#517]: Fix swapped codes for `\r` and `\n` characters when escaping them
+- [#523]: Fix incorrect skipping text and CDATA content before any map-like structures
+ in serde deserializer, like
+ ```xml
+ unwanted text...
+ ```
+- [#523]: Fix incorrect handling of `xs:list`s with encoded spaces: they still
+ act as delimiters, which is confirmed also by mature XmlBeans Java library
### Misc Changes
@@ -64,6 +71,7 @@
[#514]: https://github.com/tafia/quick-xml/issues/514
[#517]: https://github.com/tafia/quick-xml/issues/517
[#521]: https://github.com/tafia/quick-xml/pull/521
+[#523]: https://github.com/tafia/quick-xml/pull/523
[XML name]: https://www.w3.org/TR/xml11/#NT-Name
[documentation]: https://docs.rs/quick-xml/0.27.0/quick_xml/de/index.html#difference-between-text-and-value-special-names
diff --git a/src/de/map.rs b/src/de/map.rs
index b5c27f78..75f7dbe4 100644
--- a/src/de/map.rs
+++ b/src/de/map.rs
@@ -311,15 +311,13 @@ where
// of that events)
// This case are checked by "xml_schema_lists::element" tests in tests/serde-de.rs
ValueSource::Text => match self.de.next()? {
- DeEvent::Text(e) => seed.deserialize(SimpleTypeDeserializer::from_cow(
- e.into_inner(),
- true,
- self.de.reader.decoder(),
+ DeEvent::Text(e) => seed.deserialize(SimpleTypeDeserializer::from_text_content(
+ // Comment to prevent auto-formatting
+ e.decode(true)?,
)),
- DeEvent::CData(e) => seed.deserialize(SimpleTypeDeserializer::from_cow(
- e.into_inner(),
- false,
- self.de.reader.decoder(),
+ DeEvent::CData(e) => seed.deserialize(SimpleTypeDeserializer::from_text_content(
+ // Comment to prevent auto-formatting
+ e.decode()?,
)),
// SAFETY: We set `Text` only when we seen `Text` or `CData`
_ => unreachable!(),
@@ -381,7 +379,7 @@ where
/// Access to the map that created this deserializer. Gives access to the
/// context, such as list of fields, that current map known about.
map: &'m mut MapAccess<'de, 'a, R>,
- /// Determines, should [`Deserializer::next_text_impl()`] expand the second
+ /// Determines, should [`Deserializer::read_string_impl()`] expand the second
/// level of tags or not.
///
/// If this field is `true`, we process the following XML shape:
@@ -463,10 +461,14 @@ impl<'de, 'a, 'm, R> MapValueDeserializer<'de, 'a, 'm, R>
where
R: XmlRead<'de>,
{
- /// Returns a text event, used inside [`deserialize_primitives!()`]
+ /// Returns a next string as concatenated content of consequent [`Text`] and
+ /// [`CData`] events, used inside [`deserialize_primitives!()`].
+ ///
+ /// [`Text`]: DeEvent::Text
+ /// [`CData`]: DeEvent::CData
#[inline]
- fn next_text(&mut self, unescape: bool) -> Result, DeError> {
- self.map.de.next_text_impl(unescape, self.allow_start)
+ fn read_string(&mut self, unescape: bool) -> Result, DeError> {
+ self.map.de.read_string_impl(unescape, self.allow_start)
}
}
@@ -518,6 +520,16 @@ where
self.deserialize_tuple(len, visitor)
}
+ /// Deserializes each `` in
+ /// ```xml
+ ///
+ /// ...
+ /// ...
+ /// ...
+ ///
+ /// ```
+ /// as a sequence item, where `` represents a Map in a [`Self::map`],
+ /// and a `` is a sequential field of that map.
fn deserialize_seq(self, visitor: V) -> Result
where
V: Visitor<'de>,
@@ -619,7 +631,25 @@ impl<'de> TagFilter<'de> {
/// An accessor to sequence elements forming a value for struct field.
/// Technically, this sequence is flattened out into structure and sequence
-/// elements are overlapped with other fields of a structure
+/// elements are overlapped with other fields of a structure. Each call to
+/// [`Self::next_element_seed`] consumes a next sub-tree or consequent list
+/// of [`Text`] and [`CData`] events.
+///
+/// ```xml
+/// <>
+/// ...
+/// The is the one item
+/// This is item it splitted by comments
+/// ...and that is the third!
+/// ...
+/// >
+/// ```
+///
+/// Depending on [`Self::filter`], only some of that possible constructs would be
+/// an element.
+///
+/// [`Text`]: DeEvent::Text
+/// [`CData`]: DeEvent::CData
struct MapValueSeqAccess<'de, 'a, 'm, R>
where
R: XmlRead<'de>,
@@ -686,7 +716,7 @@ where
// Start(tag), Text, CData
_ => seed
- .deserialize(SeqValueDeserializer { map: self.map })
+ .deserialize(SeqItemDeserializer { map: self.map })
.map(Some),
};
}
@@ -695,8 +725,8 @@ where
////////////////////////////////////////////////////////////////////////////////////////////////////
-/// A deserializer for a value of sequence.
-struct SeqValueDeserializer<'de, 'a, 'm, R>
+/// A deserializer for a single item of a sequence.
+struct SeqItemDeserializer<'de, 'a, 'm, R>
where
R: XmlRead<'de>,
{
@@ -705,18 +735,22 @@ where
map: &'m mut MapAccess<'de, 'a, R>,
}
-impl<'de, 'a, 'm, R> SeqValueDeserializer<'de, 'a, 'm, R>
+impl<'de, 'a, 'm, R> SeqItemDeserializer<'de, 'a, 'm, R>
where
R: XmlRead<'de>,
{
- /// Returns a text event, used inside [`deserialize_primitives!()`]
+ /// Returns a next string as concatenated content of consequent [`Text`] and
+ /// [`CData`] events, used inside [`deserialize_primitives!()`].
+ ///
+ /// [`Text`]: DeEvent::Text
+ /// [`CData`]: DeEvent::CData
#[inline]
- fn next_text(&mut self, unescape: bool) -> Result, DeError> {
- self.map.de.next_text_impl(unescape, true)
+ fn read_string(&mut self, unescape: bool) -> Result, DeError> {
+ self.map.de.read_string_impl(unescape, true)
}
}
-impl<'de, 'a, 'm, R> de::Deserializer<'de> for SeqValueDeserializer<'de, 'a, 'm, R>
+impl<'de, 'a, 'm, R> de::Deserializer<'de> for SeqItemDeserializer<'de, 'a, 'm, R>
where
R: XmlRead<'de>,
{
@@ -764,22 +798,31 @@ where
self.deserialize_tuple(len, visitor)
}
+ /// This method deserializes a sequence inside of element that itself is a
+ /// sequence element:
+ ///
+ /// ```xml
+ /// <>
+ /// ...
+ /// inner sequence
+ /// inner sequence
+ /// inner sequence
+ /// ...
+ /// >
+ /// ```
fn deserialize_seq(self, visitor: V) -> Result
where
V: Visitor<'de>,
{
match self.map.de.next()? {
- DeEvent::Text(e) => SimpleTypeDeserializer::from_cow(
- // Comment to prevent auto-formatting and keep Text and Cdata similar
- e.into_inner(),
- true,
- self.map.de.reader.decoder(),
+ DeEvent::Text(e) => SimpleTypeDeserializer::from_text_content(
+ // Comment to prevent auto-formatting
+ e.decode(true)?,
)
.deserialize_seq(visitor),
- DeEvent::CData(e) => SimpleTypeDeserializer::from_cow(
- e.into_inner(),
- false,
- self.map.de.reader.decoder(),
+ DeEvent::CData(e) => SimpleTypeDeserializer::from_text_content(
+ // Comment to prevent auto-formatting
+ e.decode()?,
)
.deserialize_seq(visitor),
// This is a sequence element. We cannot treat it as another flatten
@@ -787,16 +830,14 @@ where
// it to `xs:simpleType` implementation
DeEvent::Start(e) => {
let value = match self.map.de.next()? {
- DeEvent::Text(e) => SimpleTypeDeserializer::from_cow(
- e.into_inner(),
- true,
- self.map.de.reader.decoder(),
+ DeEvent::Text(e) => SimpleTypeDeserializer::from_text_content(
+ // Comment to prevent auto-formatting
+ e.decode(true)?,
)
.deserialize_seq(visitor),
- DeEvent::CData(e) => SimpleTypeDeserializer::from_cow(
- e.into_inner(),
- false,
- self.map.de.reader.decoder(),
+ DeEvent::CData(e) => SimpleTypeDeserializer::from_text_content(
+ // Comment to prevent auto-formatting
+ e.decode()?,
)
.deserialize_seq(visitor),
e => Err(DeError::Unsupported(
diff --git a/src/de/mod.rs b/src/de/mod.rs
index e4efcba1..8fc70d84 100644
--- a/src/de/mod.rs
+++ b/src/de/mod.rs
@@ -239,7 +239,7 @@ macro_rules! deserialize_type {
V: Visitor<'de>,
{
// No need to unescape because valid integer representations cannot be escaped
- let text = self.next_text(false)?;
+ let text = self.read_string(false)?;
visitor.$visit(text.parse()?)
}
};
@@ -272,7 +272,7 @@ macro_rules! deserialize_primitives {
V: Visitor<'de>,
{
// No need to unescape because valid boolean representations cannot be escaped
- let text = self.next_text(false)?;
+ let text = self.read_string(false)?;
str2bool(&text, visitor)
}
@@ -297,7 +297,7 @@ macro_rules! deserialize_primitives {
where
V: Visitor<'de>,
{
- let text = self.next_text(true)?;
+ let text = self.read_string(true)?;
match text {
Cow::Borrowed(string) => visitor.visit_borrowed_str(string),
Cow::Owned(string) => visitor.visit_string(string),
@@ -684,23 +684,9 @@ where
}
}
- fn next_start(&mut self) -> Result`
/// |[`DeEvent::CData`]|``|Returns `cdata content` unchanged, consumes events up to ``
/// |[`DeEvent::Eof`] | |Emits [`UnexpectedEof`](DeError::UnexpectedEof)
- fn next_text_impl(
+ fn read_string_impl(
&mut self,
unescape: bool,
allow_start: bool,
@@ -860,15 +846,17 @@ where
where
V: Visitor<'de>,
{
- // Try to go to the next `...` or ``
- if let Some(e) = self.next_start()? {
- let name = e.name().as_ref().to_vec();
- let map = map::MapAccess::new(self, e, fields)?;
- let value = visitor.visit_map(map)?;
- self.read_to_end(QName(&name))?;
- Ok(value)
- } else {
- Err(DeError::ExpectedStart)
+ match self.next()? {
+ DeEvent::Start(e) => {
+ let name = e.name().as_ref().to_vec();
+ let map = map::MapAccess::new(self, e, fields)?;
+ let value = visitor.visit_map(map)?;
+ self.read_to_end(QName(&name))?;
+ Ok(value)
+ }
+ DeEvent::End(e) => Err(DeError::UnexpectedEnd(e.name().as_ref().to_owned())),
+ DeEvent::Text(_) | DeEvent::CData(_) => Err(DeError::ExpectedStart),
+ DeEvent::Eof => Err(DeError::UnexpectedEof),
}
}
@@ -1744,10 +1732,10 @@ mod tests {
assert_eq!(reader.next().unwrap(), DeEvent::Eof);
}
- /// Ensures, that [`Deserializer::next_text()`] never can get an `End` event,
+ /// Ensures, that [`Deserializer::read_string()`] never can get an `End` event,
/// because parser reports error early
#[test]
- fn next_text() {
+ fn read_string() {
match from_str::(r#""#) {
Err(DeError::InvalidXml(Error::EndEventMismatch { expected, found })) => {
assert_eq!(expected, "");
diff --git a/src/de/simple_type.rs b/src/de/simple_type.rs
index d7e4ca3a..24f47672 100644
--- a/src/de/simple_type.rs
+++ b/src/de/simple_type.rs
@@ -522,12 +522,12 @@ pub struct SimpleTypeDeserializer<'de, 'a> {
impl<'de, 'a> SimpleTypeDeserializer<'de, 'a> {
/// Creates a deserializer from a value, that possible borrowed from input
- pub fn from_cow(value: Cow<'de, [u8]>, escaped: bool, decoder: Decoder) -> Self {
+ pub fn from_text_content(value: Cow<'de, str>) -> Self {
let content = match value {
- Cow::Borrowed(slice) => CowRef::Input(slice),
- Cow::Owned(content) => CowRef::Owned(content),
+ Cow::Borrowed(slice) => CowRef::Input(slice.as_bytes()),
+ Cow::Owned(content) => CowRef::Owned(content.into_bytes()),
};
- Self::new(content, escaped, decoder)
+ Self::new(content, false, Decoder::utf8())
}
/// Creates a deserializer from a part of value at specified range
diff --git a/tests/serde-de.rs b/tests/serde-de.rs
index fb1432d2..050a6dd9 100644
--- a/tests/serde-de.rs
+++ b/tests/serde-de.rs
@@ -4844,6 +4844,86 @@ mod struct_ {
);
}
+ /// Checks that excess data before the struct correctly handled.
+ /// Any data not allowed before the struct
+ mod excess_data_before {
+ use super::*;
+ use pretty_assertions::assert_eq;
+
+ /// Space-only text events does not treated as data
+ #[test]
+ fn text_spaces_only() {
+ let data: Elements = from_str(
+ // Comment for prevent unnecessary formatting - we use the same style in all tests
+ " \t\n\r42answer",
+ )
+ .unwrap();
+ assert_eq!(
+ data,
+ Elements {
+ float: 42.0,
+ string: "answer".into()
+ }
+ );
+ }
+
+ /// Text events with non-space characters are not allowed
+ #[test]
+ fn text_non_spaces() {
+ match from_str::(
+ "\nexcess text\t42answer",
+ ) {
+ Err(DeError::ExpectedStart) => (),
+ x => panic!("Expected Err(ExpectedStart), but got {:?}", x),
+ };
+ }
+
+ /// CDATA events are not allowed
+ #[test]
+ fn cdata() {
+ match from_str::(
+ "42answer",
+ ) {
+ Err(DeError::ExpectedStart) => (),
+ x => panic!("Expected Err(ExpectedStart), but got {:?}", x),
+ };
+ }
+
+ /// Comments are ignored, so they are allowed
+ #[test]
+ fn comment() {
+ let data: Elements = from_str(
+ // Comment for prevent unnecessary formatting - we use the same style in all tests
+ "42answer",
+ )
+ .unwrap();
+ assert_eq!(
+ data,
+ Elements {
+ float: 42.0,
+ string: "answer".into()
+ }
+ );
+ }
+
+ /// Processing instructions are ignored, so they are allowed
+ #[test]
+ fn pi() {
+ let data: Elements = from_str(
+ // Comment for prevent unnecessary formatting - we use the same style in all tests
+ "42answer",
+ )
+ .unwrap();
+ assert_eq!(
+ data,
+ Elements {
+ float: 42.0,
+ string: "answer".into()
+ }
+ );
+ }
+ }
+
maplike_errors!(Attributes, Mixed);
}
@@ -5980,10 +6060,13 @@ mod xml_schema_lists {
list!(bool_: bool = "true false true" => vec![true, false, true]);
list!(char_: char = "4 2 j" => vec!['4', '2', 'j']);
+ // Expanding of entity references happens before list parsing
+ // This is confirmed by XmlBeans (mature Java library) as well
list!(string: String = "first second third 3" => vec![
"first".to_string(),
"second".to_string(),
- "third 3".to_string(),
+ "third".to_string(),
+ "3".to_string(),
]);
err!(byte_buf: ByteBuf = "first second third 3"
=> Unsupported("byte arrays are not supported as `xs:list` items"));