From 54fc1dbd9cff2358fa1ec65de0f33ab42e2c9b9c Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 11 Dec 2022 18:57:58 +0500 Subject: [PATCH 1/5] Add tests for excess elements before the struct failures (2): struct_::excess_data_before::cdata struct_::excess_data_before::text_non_spaces --- tests/serde-de.rs | 80 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tests/serde-de.rs b/tests/serde-de.rs index fb1432d2..01c82f5a 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -4844,6 +4844,86 @@ mod struct_ { ); } + /// Checks that excess data before the struct correctly handled. + /// Any data not allowed before the struct + mod excess_data_before { + use super::*; + use pretty_assertions::assert_eq; + + /// Space-only text events does not treated as data + #[test] + fn text_spaces_only() { + let data: Elements = from_str( + // Comment for prevent unnecessary formatting - we use the same style in all tests + " \t\n\r42answer", + ) + .unwrap(); + assert_eq!( + data, + Elements { + float: 42.0, + string: "answer".into() + } + ); + } + + /// Text events with non-space characters are not allowed + #[test] + fn text_non_spaces() { + match from_str::( + "\nexcess text\t42answer", + ) { + Err(DeError::ExpectedStart) => (), + x => panic!("Expected Err(ExpectedStart), but got {:?}", x), + }; + } + + /// CDATA events are not allowed + #[test] + fn cdata() { + match from_str::( + "42answer", + ) { + Err(DeError::ExpectedStart) => (), + x => panic!("Expected Err(ExpectedStart), but got {:?}", x), + }; + } + + /// Comments are ignored, so they are allowed + #[test] + fn comment() { + let data: Elements = from_str( + // Comment for prevent unnecessary formatting - we use the same style in all tests + "42answer", + ) + .unwrap(); + assert_eq!( + data, + Elements { + float: 42.0, + string: "answer".into() + } + ); + } + + /// Processing instructions are ignored, so they are allowed + #[test] + fn pi() { + let data: Elements = from_str( + // Comment for prevent unnecessary formatting - we use the same style in all tests + "42answer", + ) + .unwrap(); + assert_eq!( + data, + Elements { + float: 42.0, + string: "answer".into() + } + ); + } + } + maplike_errors!(Attributes, Mixed); } From d1601d9d827d187c55db538b04b41ddb6319a8e8 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 11 Dec 2022 19:24:37 +0500 Subject: [PATCH 2/5] Fix incorrect skipping text and CDATA content before any map-like structures in serde deserializer, like ```xml unwanted text... ``` --- Changelog.md | 6 ++++++ src/de/mod.rs | 34 +++++++++++----------------------- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/Changelog.md b/Changelog.md index 17d77c3e..5dd6d394 100644 --- a/Changelog.md +++ b/Changelog.md @@ -33,6 +33,11 @@ - [#514]: Fix wrong reporting `Error::EndEventMismatch` after disabling and enabling `.check_end_names` - [#517]: Fix swapped codes for `\r` and `\n` characters when escaping them +- [#523]: Fix incorrect skipping text and CDATA content before any map-like structures + in serde deserializer, like + ```xml + unwanted text... + ``` ### Misc Changes @@ -64,6 +69,7 @@ [#514]: https://github.com/tafia/quick-xml/issues/514 [#517]: https://github.com/tafia/quick-xml/issues/517 [#521]: https://github.com/tafia/quick-xml/pull/521 +[#523]: https://github.com/tafia/quick-xml/pull/523 [XML name]: https://www.w3.org/TR/xml11/#NT-Name [documentation]: https://docs.rs/quick-xml/0.27.0/quick_xml/de/index.html#difference-between-text-and-value-special-names diff --git a/src/de/mod.rs b/src/de/mod.rs index e4efcba1..dd1026e3 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -684,20 +684,6 @@ where } } - fn next_start(&mut self) -> Result>, DeError> { - loop { - let e = self.next()?; - match e { - DeEvent::Start(e) => return Ok(Some(e)), - DeEvent::End(e) => { - return Err(DeError::UnexpectedEnd(e.name().as_ref().to_owned())) - } - DeEvent::Eof => return Ok(None), - _ => (), // ignore texts - } - } - } - #[inline] fn next_text(&mut self, unescape: bool) -> Result, DeError> { self.next_text_impl(unescape, true) @@ -860,15 +846,17 @@ where where V: Visitor<'de>, { - // Try to go to the next `...` or `` - if let Some(e) = self.next_start()? { - let name = e.name().as_ref().to_vec(); - let map = map::MapAccess::new(self, e, fields)?; - let value = visitor.visit_map(map)?; - self.read_to_end(QName(&name))?; - Ok(value) - } else { - Err(DeError::ExpectedStart) + match self.next()? { + DeEvent::Start(e) => { + let name = e.name().as_ref().to_vec(); + let map = map::MapAccess::new(self, e, fields)?; + let value = visitor.visit_map(map)?; + self.read_to_end(QName(&name))?; + Ok(value) + } + DeEvent::End(e) => Err(DeError::UnexpectedEnd(e.name().as_ref().to_owned())), + DeEvent::Text(_) | DeEvent::CData(_) => Err(DeError::ExpectedStart), + DeEvent::Eof => Err(DeError::UnexpectedEof), } } From 1373eb12a94646d1fd87362d80b5af5430bdece3 Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 14 Nov 2022 00:55:55 +0500 Subject: [PATCH 3/5] Rename `next_text` to `read_string` because new name are better describes the purpose of method --- src/de/map.rs | 22 +++++++++++++++------- src/de/mod.rs | 16 ++++++++-------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/de/map.rs b/src/de/map.rs index b5c27f78..00aa64ab 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -381,7 +381,7 @@ where /// Access to the map that created this deserializer. Gives access to the /// context, such as list of fields, that current map known about. map: &'m mut MapAccess<'de, 'a, R>, - /// Determines, should [`Deserializer::next_text_impl()`] expand the second + /// Determines, should [`Deserializer::read_string_impl()`] expand the second /// level of tags or not. /// /// If this field is `true`, we process the following XML shape: @@ -463,10 +463,14 @@ impl<'de, 'a, 'm, R> MapValueDeserializer<'de, 'a, 'm, R> where R: XmlRead<'de>, { - /// Returns a text event, used inside [`deserialize_primitives!()`] + /// Returns a next string as concatenated content of consequent [`Text`] and + /// [`CData`] events, used inside [`deserialize_primitives!()`]. + /// + /// [`Text`]: DeEvent::Text + /// [`CData`]: DeEvent::CData #[inline] - fn next_text(&mut self, unescape: bool) -> Result, DeError> { - self.map.de.next_text_impl(unescape, self.allow_start) + fn read_string(&mut self, unescape: bool) -> Result, DeError> { + self.map.de.read_string_impl(unescape, self.allow_start) } } @@ -709,10 +713,14 @@ impl<'de, 'a, 'm, R> SeqValueDeserializer<'de, 'a, 'm, R> where R: XmlRead<'de>, { - /// Returns a text event, used inside [`deserialize_primitives!()`] + /// Returns a next string as concatenated content of consequent [`Text`] and + /// [`CData`] events, used inside [`deserialize_primitives!()`]. + /// + /// [`Text`]: DeEvent::Text + /// [`CData`]: DeEvent::CData #[inline] - fn next_text(&mut self, unescape: bool) -> Result, DeError> { - self.map.de.next_text_impl(unescape, true) + fn read_string(&mut self, unescape: bool) -> Result, DeError> { + self.map.de.read_string_impl(unescape, true) } } diff --git a/src/de/mod.rs b/src/de/mod.rs index dd1026e3..8fc70d84 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -239,7 +239,7 @@ macro_rules! deserialize_type { V: Visitor<'de>, { // No need to unescape because valid integer representations cannot be escaped - let text = self.next_text(false)?; + let text = self.read_string(false)?; visitor.$visit(text.parse()?) } }; @@ -272,7 +272,7 @@ macro_rules! deserialize_primitives { V: Visitor<'de>, { // No need to unescape because valid boolean representations cannot be escaped - let text = self.next_text(false)?; + let text = self.read_string(false)?; str2bool(&text, visitor) } @@ -297,7 +297,7 @@ macro_rules! deserialize_primitives { where V: Visitor<'de>, { - let text = self.next_text(true)?; + let text = self.read_string(true)?; match text { Cow::Borrowed(string) => visitor.visit_borrowed_str(string), Cow::Owned(string) => visitor.visit_string(string), @@ -685,8 +685,8 @@ where } #[inline] - fn next_text(&mut self, unescape: bool) -> Result, DeError> { - self.next_text_impl(unescape, true) + fn read_string(&mut self, unescape: bool) -> Result, DeError> { + self.read_string_impl(unescape, true) } /// Consumes a one XML element or an XML tree, returns associated text or @@ -721,7 +721,7 @@ where /// |[`DeEvent::Text`] |`text content` |Unescapes `text content` and returns it, consumes events up to `` /// |[`DeEvent::CData`]|``|Returns `cdata content` unchanged, consumes events up to `` /// |[`DeEvent::Eof`] | |Emits [`UnexpectedEof`](DeError::UnexpectedEof) - fn next_text_impl( + fn read_string_impl( &mut self, unescape: bool, allow_start: bool, @@ -1732,10 +1732,10 @@ mod tests { assert_eq!(reader.next().unwrap(), DeEvent::Eof); } - /// Ensures, that [`Deserializer::next_text()`] never can get an `End` event, + /// Ensures, that [`Deserializer::read_string()`] never can get an `End` event, /// because parser reports error early #[test] - fn next_text() { + fn read_string() { match from_str::(r#""#) { Err(DeError::InvalidXml(Error::EndEventMismatch { expected, found })) => { assert_eq!(expected, ""); From daa652626b1cf6750055d73fa7e5cd3bcb83d810 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 6 Dec 2022 22:06:21 +0500 Subject: [PATCH 4/5] Fix incorrect handling of `xs:list`s with encoded spaces: they still act as delimiters Co-authored-by: Daniel Alley --- Changelog.md | 2 ++ src/de/map.rs | 43 ++++++++++++++++++------------------------- src/de/simple_type.rs | 8 ++++---- tests/serde-de.rs | 5 ++++- 4 files changed, 28 insertions(+), 30 deletions(-) diff --git a/Changelog.md b/Changelog.md index 5dd6d394..e1d458d3 100644 --- a/Changelog.md +++ b/Changelog.md @@ -38,6 +38,8 @@ ```xml unwanted text... ``` +- [#523]: Fix incorrect handling of `xs:list`s with encoded spaces: they still + act as delimiters, which is confirmed also by mature XmlBeans Java library ### Misc Changes diff --git a/src/de/map.rs b/src/de/map.rs index 00aa64ab..f92ce4d3 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -311,15 +311,13 @@ where // of that events) // This case are checked by "xml_schema_lists::element" tests in tests/serde-de.rs ValueSource::Text => match self.de.next()? { - DeEvent::Text(e) => seed.deserialize(SimpleTypeDeserializer::from_cow( - e.into_inner(), - true, - self.de.reader.decoder(), + DeEvent::Text(e) => seed.deserialize(SimpleTypeDeserializer::from_text_content( + // Comment to prevent auto-formatting + e.decode(true)?, )), - DeEvent::CData(e) => seed.deserialize(SimpleTypeDeserializer::from_cow( - e.into_inner(), - false, - self.de.reader.decoder(), + DeEvent::CData(e) => seed.deserialize(SimpleTypeDeserializer::from_text_content( + // Comment to prevent auto-formatting + e.decode()?, )), // SAFETY: We set `Text` only when we seen `Text` or `CData` _ => unreachable!(), @@ -777,17 +775,14 @@ where V: Visitor<'de>, { match self.map.de.next()? { - DeEvent::Text(e) => SimpleTypeDeserializer::from_cow( - // Comment to prevent auto-formatting and keep Text and Cdata similar - e.into_inner(), - true, - self.map.de.reader.decoder(), + DeEvent::Text(e) => SimpleTypeDeserializer::from_text_content( + // Comment to prevent auto-formatting + e.decode(true)?, ) .deserialize_seq(visitor), - DeEvent::CData(e) => SimpleTypeDeserializer::from_cow( - e.into_inner(), - false, - self.map.de.reader.decoder(), + DeEvent::CData(e) => SimpleTypeDeserializer::from_text_content( + // Comment to prevent auto-formatting + e.decode()?, ) .deserialize_seq(visitor), // This is a sequence element. We cannot treat it as another flatten @@ -795,16 +790,14 @@ where // it to `xs:simpleType` implementation DeEvent::Start(e) => { let value = match self.map.de.next()? { - DeEvent::Text(e) => SimpleTypeDeserializer::from_cow( - e.into_inner(), - true, - self.map.de.reader.decoder(), + DeEvent::Text(e) => SimpleTypeDeserializer::from_text_content( + // Comment to prevent auto-formatting + e.decode(true)?, ) .deserialize_seq(visitor), - DeEvent::CData(e) => SimpleTypeDeserializer::from_cow( - e.into_inner(), - false, - self.map.de.reader.decoder(), + DeEvent::CData(e) => SimpleTypeDeserializer::from_text_content( + // Comment to prevent auto-formatting + e.decode()?, ) .deserialize_seq(visitor), e => Err(DeError::Unsupported( diff --git a/src/de/simple_type.rs b/src/de/simple_type.rs index d7e4ca3a..24f47672 100644 --- a/src/de/simple_type.rs +++ b/src/de/simple_type.rs @@ -522,12 +522,12 @@ pub struct SimpleTypeDeserializer<'de, 'a> { impl<'de, 'a> SimpleTypeDeserializer<'de, 'a> { /// Creates a deserializer from a value, that possible borrowed from input - pub fn from_cow(value: Cow<'de, [u8]>, escaped: bool, decoder: Decoder) -> Self { + pub fn from_text_content(value: Cow<'de, str>) -> Self { let content = match value { - Cow::Borrowed(slice) => CowRef::Input(slice), - Cow::Owned(content) => CowRef::Owned(content), + Cow::Borrowed(slice) => CowRef::Input(slice.as_bytes()), + Cow::Owned(content) => CowRef::Owned(content.into_bytes()), }; - Self::new(content, escaped, decoder) + Self::new(content, false, Decoder::utf8()) } /// Creates a deserializer from a part of value at specified range diff --git a/tests/serde-de.rs b/tests/serde-de.rs index 01c82f5a..050a6dd9 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -6060,10 +6060,13 @@ mod xml_schema_lists { list!(bool_: bool = "true false true" => vec![true, false, true]); list!(char_: char = "4 2 j" => vec!['4', '2', 'j']); + // Expanding of entity references happens before list parsing + // This is confirmed by XmlBeans (mature Java library) as well list!(string: String = "first second third 3" => vec![ "first".to_string(), "second".to_string(), - "third 3".to_string(), + "third".to_string(), + "3".to_string(), ]); err!(byte_buf: ByteBuf = "first second third 3" => Unsupported("byte arrays are not supported as `xs:list` items")); From 8283121f631bb43038549cf96a80ddb7d21068c6 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 6 Dec 2022 22:10:56 +0500 Subject: [PATCH 5/5] Add some comments about deserializing sequences and give a more descriptive name to a struct --- src/de/map.rs | 52 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/src/de/map.rs b/src/de/map.rs index f92ce4d3..75f7dbe4 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -520,6 +520,16 @@ where self.deserialize_tuple(len, visitor) } + /// Deserializes each `` in + /// ```xml + /// + /// ... + /// ... + /// ... + /// + /// ``` + /// as a sequence item, where `` represents a Map in a [`Self::map`], + /// and a `` is a sequential field of that map. fn deserialize_seq(self, visitor: V) -> Result where V: Visitor<'de>, @@ -621,7 +631,25 @@ impl<'de> TagFilter<'de> { /// An accessor to sequence elements forming a value for struct field. /// Technically, this sequence is flattened out into structure and sequence -/// elements are overlapped with other fields of a structure +/// elements are overlapped with other fields of a structure. Each call to +/// [`Self::next_element_seed`] consumes a next sub-tree or consequent list +/// of [`Text`] and [`CData`] events. +/// +/// ```xml +/// <> +/// ... +/// The is the one item +/// This is item it splitted by comments +/// ...and that is the third! +/// ... +/// +/// ``` +/// +/// Depending on [`Self::filter`], only some of that possible constructs would be +/// an element. +/// +/// [`Text`]: DeEvent::Text +/// [`CData`]: DeEvent::CData struct MapValueSeqAccess<'de, 'a, 'm, R> where R: XmlRead<'de>, @@ -688,7 +716,7 @@ where // Start(tag), Text, CData _ => seed - .deserialize(SeqValueDeserializer { map: self.map }) + .deserialize(SeqItemDeserializer { map: self.map }) .map(Some), }; } @@ -697,8 +725,8 @@ where //////////////////////////////////////////////////////////////////////////////////////////////////// -/// A deserializer for a value of sequence. -struct SeqValueDeserializer<'de, 'a, 'm, R> +/// A deserializer for a single item of a sequence. +struct SeqItemDeserializer<'de, 'a, 'm, R> where R: XmlRead<'de>, { @@ -707,7 +735,7 @@ where map: &'m mut MapAccess<'de, 'a, R>, } -impl<'de, 'a, 'm, R> SeqValueDeserializer<'de, 'a, 'm, R> +impl<'de, 'a, 'm, R> SeqItemDeserializer<'de, 'a, 'm, R> where R: XmlRead<'de>, { @@ -722,7 +750,7 @@ where } } -impl<'de, 'a, 'm, R> de::Deserializer<'de> for SeqValueDeserializer<'de, 'a, 'm, R> +impl<'de, 'a, 'm, R> de::Deserializer<'de> for SeqItemDeserializer<'de, 'a, 'm, R> where R: XmlRead<'de>, { @@ -770,6 +798,18 @@ where self.deserialize_tuple(len, visitor) } + /// This method deserializes a sequence inside of element that itself is a + /// sequence element: + /// + /// ```xml + /// <> + /// ... + /// inner sequence + /// inner sequence + /// inner sequence + /// ... + /// + /// ``` fn deserialize_seq(self, visitor: V) -> Result where V: Visitor<'de>,