diff --git a/Changelog.md b/Changelog.md index 17d77c3e..e1d458d3 100644 --- a/Changelog.md +++ b/Changelog.md @@ -33,6 +33,13 @@ - [#514]: Fix wrong reporting `Error::EndEventMismatch` after disabling and enabling `.check_end_names` - [#517]: Fix swapped codes for `\r` and `\n` characters when escaping them +- [#523]: Fix incorrect skipping text and CDATA content before any map-like structures + in serde deserializer, like + ```xml + unwanted text... + ``` +- [#523]: Fix incorrect handling of `xs:list`s with encoded spaces: they still + act as delimiters, which is confirmed also by mature XmlBeans Java library ### Misc Changes @@ -64,6 +71,7 @@ [#514]: https://github.com/tafia/quick-xml/issues/514 [#517]: https://github.com/tafia/quick-xml/issues/517 [#521]: https://github.com/tafia/quick-xml/pull/521 +[#523]: https://github.com/tafia/quick-xml/pull/523 [XML name]: https://www.w3.org/TR/xml11/#NT-Name [documentation]: https://docs.rs/quick-xml/0.27.0/quick_xml/de/index.html#difference-between-text-and-value-special-names diff --git a/src/de/map.rs b/src/de/map.rs index b5c27f78..75f7dbe4 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -311,15 +311,13 @@ where // of that events) // This case are checked by "xml_schema_lists::element" tests in tests/serde-de.rs ValueSource::Text => match self.de.next()? { - DeEvent::Text(e) => seed.deserialize(SimpleTypeDeserializer::from_cow( - e.into_inner(), - true, - self.de.reader.decoder(), + DeEvent::Text(e) => seed.deserialize(SimpleTypeDeserializer::from_text_content( + // Comment to prevent auto-formatting + e.decode(true)?, )), - DeEvent::CData(e) => seed.deserialize(SimpleTypeDeserializer::from_cow( - e.into_inner(), - false, - self.de.reader.decoder(), + DeEvent::CData(e) => seed.deserialize(SimpleTypeDeserializer::from_text_content( + // Comment to prevent auto-formatting + e.decode()?, )), // SAFETY: We set `Text` only when we seen `Text` or `CData` _ => unreachable!(), @@ -381,7 +379,7 @@ where /// Access to the map that created this deserializer. Gives access to the /// context, such as list of fields, that current map known about. map: &'m mut MapAccess<'de, 'a, R>, - /// Determines, should [`Deserializer::next_text_impl()`] expand the second + /// Determines, should [`Deserializer::read_string_impl()`] expand the second /// level of tags or not. /// /// If this field is `true`, we process the following XML shape: @@ -463,10 +461,14 @@ impl<'de, 'a, 'm, R> MapValueDeserializer<'de, 'a, 'm, R> where R: XmlRead<'de>, { - /// Returns a text event, used inside [`deserialize_primitives!()`] + /// Returns a next string as concatenated content of consequent [`Text`] and + /// [`CData`] events, used inside [`deserialize_primitives!()`]. + /// + /// [`Text`]: DeEvent::Text + /// [`CData`]: DeEvent::CData #[inline] - fn next_text(&mut self, unescape: bool) -> Result, DeError> { - self.map.de.next_text_impl(unescape, self.allow_start) + fn read_string(&mut self, unescape: bool) -> Result, DeError> { + self.map.de.read_string_impl(unescape, self.allow_start) } } @@ -518,6 +520,16 @@ where self.deserialize_tuple(len, visitor) } + /// Deserializes each `` in + /// ```xml + /// + /// ... + /// ... + /// ... + /// + /// ``` + /// as a sequence item, where `` represents a Map in a [`Self::map`], + /// and a `` is a sequential field of that map. fn deserialize_seq(self, visitor: V) -> Result where V: Visitor<'de>, @@ -619,7 +631,25 @@ impl<'de> TagFilter<'de> { /// An accessor to sequence elements forming a value for struct field. /// Technically, this sequence is flattened out into structure and sequence -/// elements are overlapped with other fields of a structure +/// elements are overlapped with other fields of a structure. Each call to +/// [`Self::next_element_seed`] consumes a next sub-tree or consequent list +/// of [`Text`] and [`CData`] events. +/// +/// ```xml +/// <> +/// ... +/// The is the one item +/// This is item it splitted by comments +/// ...and that is the third! +/// ... +/// +/// ``` +/// +/// Depending on [`Self::filter`], only some of that possible constructs would be +/// an element. +/// +/// [`Text`]: DeEvent::Text +/// [`CData`]: DeEvent::CData struct MapValueSeqAccess<'de, 'a, 'm, R> where R: XmlRead<'de>, @@ -686,7 +716,7 @@ where // Start(tag), Text, CData _ => seed - .deserialize(SeqValueDeserializer { map: self.map }) + .deserialize(SeqItemDeserializer { map: self.map }) .map(Some), }; } @@ -695,8 +725,8 @@ where //////////////////////////////////////////////////////////////////////////////////////////////////// -/// A deserializer for a value of sequence. -struct SeqValueDeserializer<'de, 'a, 'm, R> +/// A deserializer for a single item of a sequence. +struct SeqItemDeserializer<'de, 'a, 'm, R> where R: XmlRead<'de>, { @@ -705,18 +735,22 @@ where map: &'m mut MapAccess<'de, 'a, R>, } -impl<'de, 'a, 'm, R> SeqValueDeserializer<'de, 'a, 'm, R> +impl<'de, 'a, 'm, R> SeqItemDeserializer<'de, 'a, 'm, R> where R: XmlRead<'de>, { - /// Returns a text event, used inside [`deserialize_primitives!()`] + /// Returns a next string as concatenated content of consequent [`Text`] and + /// [`CData`] events, used inside [`deserialize_primitives!()`]. + /// + /// [`Text`]: DeEvent::Text + /// [`CData`]: DeEvent::CData #[inline] - fn next_text(&mut self, unescape: bool) -> Result, DeError> { - self.map.de.next_text_impl(unescape, true) + fn read_string(&mut self, unescape: bool) -> Result, DeError> { + self.map.de.read_string_impl(unescape, true) } } -impl<'de, 'a, 'm, R> de::Deserializer<'de> for SeqValueDeserializer<'de, 'a, 'm, R> +impl<'de, 'a, 'm, R> de::Deserializer<'de> for SeqItemDeserializer<'de, 'a, 'm, R> where R: XmlRead<'de>, { @@ -764,22 +798,31 @@ where self.deserialize_tuple(len, visitor) } + /// This method deserializes a sequence inside of element that itself is a + /// sequence element: + /// + /// ```xml + /// <> + /// ... + /// inner sequence + /// inner sequence + /// inner sequence + /// ... + /// + /// ``` fn deserialize_seq(self, visitor: V) -> Result where V: Visitor<'de>, { match self.map.de.next()? { - DeEvent::Text(e) => SimpleTypeDeserializer::from_cow( - // Comment to prevent auto-formatting and keep Text and Cdata similar - e.into_inner(), - true, - self.map.de.reader.decoder(), + DeEvent::Text(e) => SimpleTypeDeserializer::from_text_content( + // Comment to prevent auto-formatting + e.decode(true)?, ) .deserialize_seq(visitor), - DeEvent::CData(e) => SimpleTypeDeserializer::from_cow( - e.into_inner(), - false, - self.map.de.reader.decoder(), + DeEvent::CData(e) => SimpleTypeDeserializer::from_text_content( + // Comment to prevent auto-formatting + e.decode()?, ) .deserialize_seq(visitor), // This is a sequence element. We cannot treat it as another flatten @@ -787,16 +830,14 @@ where // it to `xs:simpleType` implementation DeEvent::Start(e) => { let value = match self.map.de.next()? { - DeEvent::Text(e) => SimpleTypeDeserializer::from_cow( - e.into_inner(), - true, - self.map.de.reader.decoder(), + DeEvent::Text(e) => SimpleTypeDeserializer::from_text_content( + // Comment to prevent auto-formatting + e.decode(true)?, ) .deserialize_seq(visitor), - DeEvent::CData(e) => SimpleTypeDeserializer::from_cow( - e.into_inner(), - false, - self.map.de.reader.decoder(), + DeEvent::CData(e) => SimpleTypeDeserializer::from_text_content( + // Comment to prevent auto-formatting + e.decode()?, ) .deserialize_seq(visitor), e => Err(DeError::Unsupported( diff --git a/src/de/mod.rs b/src/de/mod.rs index e4efcba1..8fc70d84 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -239,7 +239,7 @@ macro_rules! deserialize_type { V: Visitor<'de>, { // No need to unescape because valid integer representations cannot be escaped - let text = self.next_text(false)?; + let text = self.read_string(false)?; visitor.$visit(text.parse()?) } }; @@ -272,7 +272,7 @@ macro_rules! deserialize_primitives { V: Visitor<'de>, { // No need to unescape because valid boolean representations cannot be escaped - let text = self.next_text(false)?; + let text = self.read_string(false)?; str2bool(&text, visitor) } @@ -297,7 +297,7 @@ macro_rules! deserialize_primitives { where V: Visitor<'de>, { - let text = self.next_text(true)?; + let text = self.read_string(true)?; match text { Cow::Borrowed(string) => visitor.visit_borrowed_str(string), Cow::Owned(string) => visitor.visit_string(string), @@ -684,23 +684,9 @@ where } } - fn next_start(&mut self) -> Result>, DeError> { - loop { - let e = self.next()?; - match e { - DeEvent::Start(e) => return Ok(Some(e)), - DeEvent::End(e) => { - return Err(DeError::UnexpectedEnd(e.name().as_ref().to_owned())) - } - DeEvent::Eof => return Ok(None), - _ => (), // ignore texts - } - } - } - #[inline] - fn next_text(&mut self, unescape: bool) -> Result, DeError> { - self.next_text_impl(unescape, true) + fn read_string(&mut self, unescape: bool) -> Result, DeError> { + self.read_string_impl(unescape, true) } /// Consumes a one XML element or an XML tree, returns associated text or @@ -735,7 +721,7 @@ where /// |[`DeEvent::Text`] |`text content` |Unescapes `text content` and returns it, consumes events up to `` /// |[`DeEvent::CData`]|``|Returns `cdata content` unchanged, consumes events up to `` /// |[`DeEvent::Eof`] | |Emits [`UnexpectedEof`](DeError::UnexpectedEof) - fn next_text_impl( + fn read_string_impl( &mut self, unescape: bool, allow_start: bool, @@ -860,15 +846,17 @@ where where V: Visitor<'de>, { - // Try to go to the next `...` or `` - if let Some(e) = self.next_start()? { - let name = e.name().as_ref().to_vec(); - let map = map::MapAccess::new(self, e, fields)?; - let value = visitor.visit_map(map)?; - self.read_to_end(QName(&name))?; - Ok(value) - } else { - Err(DeError::ExpectedStart) + match self.next()? { + DeEvent::Start(e) => { + let name = e.name().as_ref().to_vec(); + let map = map::MapAccess::new(self, e, fields)?; + let value = visitor.visit_map(map)?; + self.read_to_end(QName(&name))?; + Ok(value) + } + DeEvent::End(e) => Err(DeError::UnexpectedEnd(e.name().as_ref().to_owned())), + DeEvent::Text(_) | DeEvent::CData(_) => Err(DeError::ExpectedStart), + DeEvent::Eof => Err(DeError::UnexpectedEof), } } @@ -1744,10 +1732,10 @@ mod tests { assert_eq!(reader.next().unwrap(), DeEvent::Eof); } - /// Ensures, that [`Deserializer::next_text()`] never can get an `End` event, + /// Ensures, that [`Deserializer::read_string()`] never can get an `End` event, /// because parser reports error early #[test] - fn next_text() { + fn read_string() { match from_str::(r#""#) { Err(DeError::InvalidXml(Error::EndEventMismatch { expected, found })) => { assert_eq!(expected, ""); diff --git a/src/de/simple_type.rs b/src/de/simple_type.rs index d7e4ca3a..24f47672 100644 --- a/src/de/simple_type.rs +++ b/src/de/simple_type.rs @@ -522,12 +522,12 @@ pub struct SimpleTypeDeserializer<'de, 'a> { impl<'de, 'a> SimpleTypeDeserializer<'de, 'a> { /// Creates a deserializer from a value, that possible borrowed from input - pub fn from_cow(value: Cow<'de, [u8]>, escaped: bool, decoder: Decoder) -> Self { + pub fn from_text_content(value: Cow<'de, str>) -> Self { let content = match value { - Cow::Borrowed(slice) => CowRef::Input(slice), - Cow::Owned(content) => CowRef::Owned(content), + Cow::Borrowed(slice) => CowRef::Input(slice.as_bytes()), + Cow::Owned(content) => CowRef::Owned(content.into_bytes()), }; - Self::new(content, escaped, decoder) + Self::new(content, false, Decoder::utf8()) } /// Creates a deserializer from a part of value at specified range diff --git a/tests/serde-de.rs b/tests/serde-de.rs index fb1432d2..050a6dd9 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -4844,6 +4844,86 @@ mod struct_ { ); } + /// Checks that excess data before the struct correctly handled. + /// Any data not allowed before the struct + mod excess_data_before { + use super::*; + use pretty_assertions::assert_eq; + + /// Space-only text events does not treated as data + #[test] + fn text_spaces_only() { + let data: Elements = from_str( + // Comment for prevent unnecessary formatting - we use the same style in all tests + " \t\n\r42answer", + ) + .unwrap(); + assert_eq!( + data, + Elements { + float: 42.0, + string: "answer".into() + } + ); + } + + /// Text events with non-space characters are not allowed + #[test] + fn text_non_spaces() { + match from_str::( + "\nexcess text\t42answer", + ) { + Err(DeError::ExpectedStart) => (), + x => panic!("Expected Err(ExpectedStart), but got {:?}", x), + }; + } + + /// CDATA events are not allowed + #[test] + fn cdata() { + match from_str::( + "42answer", + ) { + Err(DeError::ExpectedStart) => (), + x => panic!("Expected Err(ExpectedStart), but got {:?}", x), + }; + } + + /// Comments are ignored, so they are allowed + #[test] + fn comment() { + let data: Elements = from_str( + // Comment for prevent unnecessary formatting - we use the same style in all tests + "42answer", + ) + .unwrap(); + assert_eq!( + data, + Elements { + float: 42.0, + string: "answer".into() + } + ); + } + + /// Processing instructions are ignored, so they are allowed + #[test] + fn pi() { + let data: Elements = from_str( + // Comment for prevent unnecessary formatting - we use the same style in all tests + "42answer", + ) + .unwrap(); + assert_eq!( + data, + Elements { + float: 42.0, + string: "answer".into() + } + ); + } + } + maplike_errors!(Attributes, Mixed); } @@ -5980,10 +6060,13 @@ mod xml_schema_lists { list!(bool_: bool = "true false true" => vec![true, false, true]); list!(char_: char = "4 2 j" => vec!['4', '2', 'j']); + // Expanding of entity references happens before list parsing + // This is confirmed by XmlBeans (mature Java library) as well list!(string: String = "first second third 3" => vec![ "first".to_string(), "second".to_string(), - "third 3".to_string(), + "third".to_string(), + "3".to_string(), ]); err!(byte_buf: ByteBuf = "first second third 3" => Unsupported("byte arrays are not supported as `xs:list` items"));