From d2ad730f88147e3db9b64074090cbd41f033142b Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 24 Aug 2021 19:14:05 +0500 Subject: [PATCH 1/6] seq: Add tests for sequences deserialization failures (36): seq::fixed_name::fixed_size::field_after_list::overlapped seq::fixed_name::fixed_size::field_before_list::overlapped seq::fixed_name::fixed_size::two_lists::overlapped seq::fixed_name::fixed_size::unknown_items::overlapped seq::fixed_name::variable_size::field_after_list::overlapped seq::fixed_name::variable_size::field_before_list::overlapped seq::fixed_name::variable_size::two_lists::overlapped seq::fixed_name::variable_size::unknown_items::overlapped seq::variable_name::fixed_size::field_after_list::after seq::variable_name::fixed_size::field_after_list::before seq::variable_name::fixed_size::field_after_list::overlapped seq::variable_name::fixed_size::field_before_list::after seq::variable_name::fixed_size::field_before_list::before seq::variable_name::fixed_size::field_before_list::overlapped seq::variable_name::fixed_size::two_lists::choice_and_fixed::fixed_after seq::variable_name::fixed_size::two_lists::choice_and_fixed::fixed_before seq::variable_name::fixed_size::two_lists::choice_and_fixed::overlapped_fixed_after seq::variable_name::fixed_size::two_lists::choice_and_fixed::overlapped_fixed_before seq::variable_name::fixed_size::two_lists::fixed_and_choice::fixed_after seq::variable_name::fixed_size::two_lists::fixed_and_choice::fixed_before seq::variable_name::fixed_size::two_lists::fixed_and_choice::overlapped_fixed_after seq::variable_name::fixed_size::two_lists::fixed_and_choice::overlapped_fixed_before seq::variable_name::variable_size::field_after_list::after seq::variable_name::variable_size::field_after_list::before seq::variable_name::variable_size::field_after_list::overlapped seq::variable_name::variable_size::field_before_list::after seq::variable_name::variable_size::field_before_list::before seq::variable_name::variable_size::field_before_list::overlapped seq::variable_name::variable_size::two_lists::choice_and_fixed::fixed_after seq::variable_name::variable_size::two_lists::choice_and_fixed::fixed_before seq::variable_name::variable_size::two_lists::choice_and_fixed::overlapped_fixed_after seq::variable_name::variable_size::two_lists::choice_and_fixed::overlapped_fixed_before seq::variable_name::variable_size::two_lists::fixed_and_choice::fixed_after seq::variable_name::variable_size::two_lists::fixed_and_choice::fixed_before seq::variable_name::variable_size::two_lists::fixed_and_choice::overlapped_fixed_after seq::variable_name::variable_size::two_lists::fixed_and_choice::overlapped_fixed_before --- Changelog.md | 2 + tests/serde-de.rs | 2282 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 2284 insertions(+) diff --git a/Changelog.md b/Changelog.md index 0c8a0499..126e2fbe 100644 --- a/Changelog.md +++ b/Changelog.md @@ -36,9 +36,11 @@ ### New Tests - [#9]: Added tests for incorrect nested tags in input +- [#387]: Added a bunch of tests for sequences deserialization [#8]: https://github.com/Mingun/fast-xml/pull/8 [#9]: https://github.com/Mingun/fast-xml/pull/9 +[#387]: https://github.com/tafia/quick-xml/pull/387 [#391]: https://github.com/tafia/quick-xml/pull/391 ## 0.23.0 -- 2022-05-08 diff --git a/tests/serde-de.rs b/tests/serde-de.rs index 28f6dba5..278f3580 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -512,6 +512,2288 @@ mod tuple_struct { } } +mod seq { + use super::*; + + /// Check that top-level sequences can be deserialized from the multi-root XML documents + mod top_level { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn simple() { + from_str::<[(); 3]>("42answer").unwrap(); + + let data: Vec<()> = from_str("42answer").unwrap(); + assert_eq!(data, vec![(), (), ()]); + } + + /// Special case: empty sequence + #[test] + fn empty() { + from_str::<[(); 0]>("").unwrap(); + + let data: Vec<()> = from_str("").unwrap(); + assert_eq!(data, vec![]); + } + + /// Special case: one-element sequence + #[test] + fn one_element() { + from_str::<[(); 1]>("").unwrap(); + from_str::<[(); 1]>("42").unwrap(); + from_str::<[(); 1]>("text").unwrap(); + from_str::<[(); 1]>("").unwrap(); + + let data: Vec<()> = from_str("").unwrap(); + assert_eq!(data, vec![()]); + + let data: Vec<()> = from_str("42").unwrap(); + assert_eq!(data, vec![()]); + + let data: Vec<()> = from_str("text").unwrap(); + assert_eq!(data, vec![()]); + + let data: Vec<()> = from_str("").unwrap(); + assert_eq!(data, vec![()]); + } + + #[test] + fn excess_attribute() { + from_str::<[(); 3]>(r#"42answer"#) + .unwrap(); + + let data: Vec<()> = + from_str(r#"42answer"#) + .unwrap(); + assert_eq!(data, vec![(), (), ()]); + } + + #[test] + fn mixed_content() { + from_str::<[(); 3]>( + r#" + + text + + "#, + ) + .unwrap(); + + let data: Vec<()> = from_str( + r#" + + text + + "#, + ) + .unwrap(); + assert_eq!(data, vec![(), (), ()]); + } + } + + /// Tests where each sequence item have an identical name in an XML. + /// That explicitly means that `enum`s as list elements are not supported + /// in that case, because enum requires different tags. + /// + /// (by `enums` we mean [externally tagged enums] is serde terminology) + /// + /// [externally tagged enums]: https://serde.rs/enum-representations.html#externally-tagged + mod fixed_name { + use super::*; + + /// This module contains tests where size of the list have a compile-time size + mod fixed_size { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct List { + item: [(); 3], + } + + /// Simple case: count of elements matches expected size of sequence, + /// each element has the same name. Successful deserialization expected + #[test] + fn simple() { + from_str::( + r#" + + + + + + "#, + ) + .unwrap(); + } + + /// Special case: empty sequence + #[test] + #[ignore = "it is impossible to distinguish between missed field and empty list: use `Option<>` or #[serde(default)]"] + fn empty() { + #[derive(Debug, PartialEq, Deserialize)] + struct List { + item: [(); 0], + } + + from_str::(r#""#).unwrap(); + from_str::(r#""#).unwrap(); + } + + /// Special case: one-element sequence + #[test] + fn one_element() { + #[derive(Debug, PartialEq, Deserialize)] + struct List { + item: [(); 1], + } + + from_str::( + r#" + + + + "#, + ) + .unwrap(); + } + + /// Fever elements than expected size of sequence, each element has + /// the same name. Failure expected + #[test] + fn fever_elements() { + let data = from_str::( + r#" + + + + + "#, + ); + + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "invalid length 2, expected an array of length 3") + } + e => panic!( + r#"Expected `Err(Custom("invalid length 2, expected an array of length 3"))`, but found {:?}"#, + e + ), + } + } + + /// More elements than expected size of sequence, each element has + /// the same name. Failure expected. If you wish to ignore excess + /// elements, use the special type, that consume as much elements + /// as possible, but ignores excess elements + #[test] + fn excess_elements() { + let data = from_str::( + r#" + + + + + + + "#, + ); + + match data { + Err(DeError::Custom(e)) => assert_eq!(e, "duplicate field `item`"), + e => panic!( + r#"Expected `Err(Custom("duplicate field `item`"))`, but found {:?}"#, + e + ), + } + } + + /// Mixed content assumes, that some elements will have an internal + /// name `$value`, so, unless field named the same, it is expected + /// to fail + #[test] + fn mixed_content() { + let data = from_str::( + r#" + + + text + + + "#, + ); + + match data { + Err(DeError::Custom(e)) => assert_eq!(e, "missing field `item`"), + e => panic!( + r#"Expected `Err(Custom("missing field `item`"))`, but found {:?}"#, + e + ), + } + } + + /// In those tests sequence should be deserialized from an XML + /// with additional elements that is not defined in the struct. + /// That fields should be skipped during deserialization + mod unknown_items { + use super::*; + + #[test] + fn before() { + from_str::( + r#" + + + + + + + "#, + ) + .unwrap(); + } + + #[test] + fn after() { + from_str::( + r#" + + + + + + + "#, + ) + .unwrap(); + } + + #[test] + fn overlapped() { + from_str::( + r#" + + + + + + + "#, + ) + .unwrap(); + } + } + + /// In those tests non-sequential field is defined in the struct + /// before sequential, so it will be deserialized before the list. + /// That struct should be deserialized from an XML where these + /// fields comes in an arbitrary order + mod field_before_list { + use super::*; + + #[derive(Debug, PartialEq, Deserialize)] + struct Root { + node: (), + item: [(); 3], + } + + #[test] + fn before() { + from_str::( + r#" + + + + + + + "#, + ) + .unwrap(); + } + + #[test] + fn after() { + from_str::( + r#" + + + + + + + "#, + ) + .unwrap(); + } + + #[test] + fn overlapped() { + from_str::( + r#" + + + + + + + "#, + ) + .unwrap(); + } + } + + /// In those tests non-sequential field is defined in the struct + /// after sequential, so it will be deserialized after the list. + /// That struct should be deserialized from an XML where these + /// fields comes in an arbitrary order + mod field_after_list { + use super::*; + + #[derive(Debug, PartialEq, Deserialize)] + struct Root { + item: [(); 3], + node: (), + } + + #[test] + fn before() { + from_str::( + r#" + + + + + + + "#, + ) + .unwrap(); + } + + #[test] + fn after() { + from_str::( + r#" + + + + + + + "#, + ) + .unwrap(); + } + + #[test] + fn overlapped() { + from_str::( + r#" + + + + + + + "#, + ) + .unwrap(); + } + } + + /// In those tests two lists are deserialized simultaneously. + /// Lists should be deserialized even when them overlaps + mod two_lists { + use super::*; + + #[derive(Debug, PartialEq, Deserialize)] + struct Pair { + item: [(); 3], + element: [(); 2], + } + + #[test] + fn splitted() { + from_str::( + r#" + + + + + + + + "#, + ) + .unwrap(); + } + + #[test] + fn overlapped() { + from_str::( + r#" + + + + + + + + "#, + ) + .unwrap(); + } + } + + /// Deserialization of primitives slightly differs from deserialization + /// of complex types, so need to check this separately + #[test] + fn primitives() { + #[derive(Debug, PartialEq, Deserialize)] + struct List { + item: [usize; 3], + } + + let data: List = from_str( + r#" + + 41 + 42 + 43 + + "#, + ) + .unwrap(); + assert_eq!(data, List { item: [41, 42, 43] }); + + from_str::( + r#" + + 41 + 42 + 43 + + "#, + ) + .unwrap_err(); + } + } + + /// This module contains tests where size of the list have an unspecified size + mod variable_size { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct List { + item: Vec<()>, + } + + /// Simple case: count of elements matches expected size of sequence, + /// each element has the same name. Successful deserialization expected + #[test] + fn simple() { + let data: List = from_str( + r#" + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + List { + item: vec![(), (), ()], + } + ); + } + + /// Special case: empty sequence + #[test] + #[ignore = "it is impossible to distinguish between missed field and empty list: use `Option<>` or #[serde(default)]"] + fn empty() { + let data: List = from_str(r#""#).unwrap(); + assert_eq!(data, List { item: vec![] }); + + let data: List = from_str(r#""#).unwrap(); + assert_eq!(data, List { item: vec![] }); + } + + /// Special case: one-element sequence + #[test] + fn one_element() { + let data: List = from_str( + r#" + + + + "#, + ) + .unwrap(); + + assert_eq!(data, List { item: vec![()] }); + } + + /// Mixed content assumes, that some elements will have an internal + /// name `$value`, so, unless field named the same, it is expected + /// to fail + #[test] + fn mixed_content() { + let data = from_str::( + r#" + + + text + + + "#, + ); + + match data { + Err(DeError::Custom(e)) => assert_eq!(e, "missing field `item`"), + e => panic!( + r#"Expected `Err(Custom("missing field `item`"))`, but found {:?}"#, + e + ), + } + } + + /// In those tests sequence should be deserialized from the XML + /// with additional elements that is not defined in the struct. + /// That fields should be skipped during deserialization + mod unknown_items { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn before() { + let data: List = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + List { + item: vec![(), (), ()], + } + ); + } + + #[test] + fn after() { + let data: List = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + List { + item: vec![(), (), ()], + } + ); + } + + #[test] + fn overlapped() { + let data: List = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + List { + item: vec![(), (), ()], + } + ); + } + } + + /// In those tests non-sequential field is defined in the struct + /// before sequential, so it will be deserialized before the list. + /// That struct should be deserialized from the XML where these + /// fields comes in an arbitrary order + mod field_before_list { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Default, Deserialize)] + struct Root { + node: (), + item: Vec<()>, + } + + #[test] + fn before() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + node: (), + item: vec![(), (), ()], + } + ); + } + + #[test] + fn after() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + node: (), + item: vec![(), (), ()], + } + ); + } + + #[test] + fn overlapped() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + node: (), + item: vec![(), (), ()], + } + ); + } + } + + /// In those tests non-sequential field is defined in the struct + /// after sequential, so it will be deserialized after the list. + /// That struct should be deserialized from the XML where these + /// fields comes in an arbitrary order + mod field_after_list { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Default, Deserialize)] + struct Root { + item: Vec<()>, + node: (), + } + + #[test] + fn before() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + item: vec![(), (), ()], + node: (), + } + ); + } + + #[test] + fn after() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + item: vec![(), (), ()], + node: (), + } + ); + } + + #[test] + fn overlapped() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + item: vec![(), (), ()], + node: (), + } + ); + } + } + + /// In those tests two lists are deserialized simultaneously. + /// Lists should be deserialized even when them overlaps + mod two_lists { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct Pair { + item: Vec<()>, + element: Vec<()>, + } + + #[test] + fn splitted() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: vec![(), (), ()], + element: vec![(), ()], + } + ); + } + + #[test] + fn overlapped() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: vec![(), (), ()], + element: vec![(), ()], + } + ); + } + } + + /// Deserialization of primitives slightly differs from deserialization + /// of complex types, so need to check this separately + #[test] + fn primitives() { + #[derive(Debug, PartialEq, Deserialize)] + struct List { + item: Vec, + } + + let data: List = from_str( + r#" + + 41 + 42 + 43 + + "#, + ) + .unwrap(); + + assert_eq!( + data, + List { + item: vec![41, 42, 43], + } + ); + + from_str::( + r#" + + 41 + 42 + 43 + + "#, + ) + .unwrap_err(); + } + } + } + + /// Check that sequences inside element can be deserialized. + /// In terms of serde this is a sequence flatten into the struct: + /// + /// ```ignore + /// struct Root { + /// #[serde(flatten)] + /// items: Vec, + /// } + /// ``` + /// except that fact that this is not supported nowadays + /// (https://github.com/serde-rs/serde/issues/1905) + /// + /// Because this is very frequently used pattern in the XML, quick-xml + /// have a workaround for this. If a field will have a special name `$value` + /// then any `xs:element`s in the `xs:sequence` / `xs:all`, except that + /// which name matches the struct name, will be associated with this field: + /// + /// ```ignore + /// struct Root { + /// field: U, + /// #[serde(rename = "$value")] + /// items: Vec, + /// } + /// ``` + /// In this example `` tag will be associated with a `field` field, + /// but all other tags will be associated with an `items` field. Disadvantages + /// of this approach that you can have only one field, but usually you don't + /// want more + mod variable_name { + use super::*; + use serde::de::{Deserializer, EnumAccess, VariantAccess, Visitor}; + use std::fmt::{self, Formatter}; + + // NOTE: Derive could be possible once https://github.com/serde-rs/serde/issues/2126 is resolved + macro_rules! impl_deserialize_choice { + ($name:ident : $(($field:ident, $field_name:literal)),*) => { + impl<'de> Deserialize<'de> for $name { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + #[derive(Deserialize)] + #[serde(field_identifier)] + #[serde(rename_all = "kebab-case")] + enum Tag { + $($field,)* + Other(String), + } + + struct EnumVisitor; + impl<'de> Visitor<'de> for EnumVisitor { + type Value = $name; + + fn expecting(&self, f: &mut Formatter) -> fmt::Result { + f.write_str("enum ")?; + f.write_str(stringify!($name)) + } + + fn visit_enum(self, data: A) -> Result + where + A: EnumAccess<'de>, + { + match data.variant()? { + $( + (Tag::$field, variant) => variant.unit_variant().map(|_| $name::$field), + )* + (Tag::Other(t), v) => v.unit_variant().map(|_| $name::Other(t)), + } + } + } + + const VARIANTS: &'static [&'static str] = &[ + $($field_name,)* + "" + ]; + deserializer.deserialize_enum(stringify!($name), VARIANTS, EnumVisitor) + } + } + }; + } + + /// Type that can be deserialized from ``, ``, or any other element + #[derive(Debug, PartialEq)] + enum Choice { + One, + Two, + /// Any other tag name except `One` or `Two`, name of tag stored inside variant + Other(String), + } + impl_deserialize_choice!(Choice: (One, "one"), (Two, "two")); + + /// Type that can be deserialized from ``, ``, or any other element + #[derive(Debug, PartialEq)] + enum Choice2 { + First, + Second, + /// Any other tag name except `First` or `Second`, name of tag stored inside variant + Other(String), + } + impl_deserialize_choice!(Choice2: (First, "first"), (Second, "second")); + + /// Type that can be deserialized from ``, ``, or any other element. + /// Used for `primitives` tests + #[derive(Debug, PartialEq, Deserialize)] + #[serde(rename_all = "kebab-case")] + enum Choice3 { + One(usize), + Two(String), + #[serde(other)] + Other, + } + + /// This module contains tests where size of the list have a compile-time size + mod fixed_size { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct List { + #[serde(rename = "$value")] + item: [Choice; 3], + } + + /// Simple case: count of elements matches expected size of sequence, + /// each element has the same name. Successful deserialization expected + #[test] + fn simple() { + let data: List = from_str( + r#" + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + List { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + } + ); + } + + /// Special case: empty sequence + #[test] + #[ignore = "it is impossible to distinguish between missed field and empty list: use `Option<>` or #[serde(default)]"] + fn empty() { + #[derive(Debug, PartialEq, Deserialize)] + struct List { + #[serde(rename = "$value")] + item: [Choice; 0], + } + + from_str::(r#""#).unwrap(); + from_str::(r#""#).unwrap(); + } + + /// Special case: one-element sequence + #[test] + fn one_element() { + #[derive(Debug, PartialEq, Deserialize)] + struct List { + #[serde(rename = "$value")] + item: [Choice; 1], + } + + let data: List = from_str( + r#" + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + List { + item: [Choice::One], + } + ); + } + + /// Fever elements than expected size of sequence, each element has + /// the same name. Failure expected + #[test] + fn fever_elements() { + from_str::( + r#" + + + + + "#, + ) + .unwrap_err(); + } + + /// More elements than expected size of sequence, each element has + /// the same name. Failure expected. If you wish to ignore excess + /// elements, use the special type, that consume as much elements + /// as possible, but ignores excess elements + #[test] + fn excess_elements() { + from_str::( + r#" + + + + + + + "#, + ) + .unwrap_err(); + } + + #[test] + fn mixed_content() { + #[derive(Debug, PartialEq, Deserialize)] + struct List { + #[serde(rename = "$value")] + item: [(); 3], + } + + from_str::( + r#" + + + text + + + "#, + ) + .unwrap(); + } + + // There cannot be unknown items, because any tag name is accepted + + /// In those tests non-sequential field is defined in the struct + /// before sequential, so it will be deserialized before the list. + /// That struct should be deserialized from the XML where these + /// fields comes in an arbitrary order + mod field_before_list { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct Root { + node: (), + #[serde(rename = "$value")] + item: [Choice; 3], + } + + #[test] + fn before() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + node: (), + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + } + ); + } + + #[test] + fn after() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + node: (), + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + } + ); + } + + #[test] + fn overlapped() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + node: (), + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + } + ); + } + } + + /// In those tests non-sequential field is defined in the struct + /// after sequential, so it will be deserialized after the list. + /// That struct should be deserialized from the XML where these + /// fields comes in an arbitrary order + mod field_after_list { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct Root { + #[serde(rename = "$value")] + item: [Choice; 3], + node: (), + } + + #[test] + fn before() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + node: (), + } + ); + } + + #[test] + fn after() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + node: (), + } + ); + } + + #[test] + fn overlapped() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + node: (), + } + ); + } + } + + /// In those tests two lists are deserialized simultaneously. + /// Lists should be deserialized even when them overlaps + mod two_lists { + use super::*; + + /// A field with a variable-name items defined before a field with a fixed-name + /// items + mod choice_and_fixed { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct Pair { + #[serde(rename = "$value")] + item: [Choice; 3], + element: [(); 2], + } + + /// A list with fixed-name elements located before a list with variable-name + /// elements in an XML + #[test] + fn fixed_before() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + element: [(), ()], + } + ); + } + + /// A list with fixed-name elements located after a list with variable-name + /// elements in an XML + #[test] + fn fixed_after() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + element: [(), ()], + } + ); + } + + /// A list with fixed-name elements are mixed with a list with variable-name + /// elements in an XML, and the first element is a fixed-name one + #[test] + fn overlapped_fixed_before() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + element: [(), ()], + } + ); + } + + /// A list with fixed-name elements are mixed with a list with variable-name + /// elements in an XML, and the first element is a variable-name one + #[test] + fn overlapped_fixed_after() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + element: [(), ()], + } + ); + } + } + + /// A field with a variable-name items defined after a field with a fixed-name + /// items + mod fixed_and_choice { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct Pair { + element: [(); 2], + #[serde(rename = "$value")] + item: [Choice; 3], + } + + /// A list with fixed-name elements located before a list with variable-name + /// elements in an XML + #[test] + fn fixed_before() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + element: [(), ()], + } + ); + } + + /// A list with fixed-name elements located after a list with variable-name + /// elements in an XML + #[test] + fn fixed_after() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + element: [(), ()], + } + ); + } + + /// A list with fixed-name elements are mixed with a list with variable-name + /// elements in an XML, and the first element is a fixed-name one + #[test] + fn overlapped_fixed_before() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + element: [(), ()], + } + ); + } + + /// A list with fixed-name elements are mixed with a list with variable-name + /// elements in an XML, and the first element is a variable-name one + #[test] + fn overlapped_fixed_after() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + element: [(), ()], + } + ); + } + } + + /// Tests are ignored, but exists to show a problem. + /// May be it will be solved in the future + mod choice_and_choice { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct Pair { + #[serde(rename = "$value")] + item: [Choice; 3], + // Actually, we cannot rename both fields to `$value`, which is now + // required to indicate, that field accepts elements with any name + #[serde(rename = "$value")] + element: [Choice2; 2], + } + + #[test] + #[ignore = "There is no way to associate XML elements with `item` or `element` without extra knowledge from type"] + fn splitted() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + element: [Choice2::First, Choice2::Second], + } + ); + } + + #[test] + #[ignore = "There is no way to associate XML elements with `item` or `element` without extra knowledge from type"] + fn overlapped() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: [Choice::One, Choice::Two, Choice::Other("three".into())], + element: [Choice2::First, Choice2::Second], + } + ); + } + } + } + + /// Deserialization of primitives slightly differs from deserialization + /// of complex types, so need to check this separately + #[test] + fn primitives() { + #[derive(Debug, PartialEq, Deserialize)] + struct List { + #[serde(rename = "$value")] + item: [Choice3; 3], + } + + let data: List = from_str( + r#" + + 41 + 42 + 43 + + "#, + ) + .unwrap(); + + assert_eq!( + data, + List { + item: [ + Choice3::One(41), + Choice3::Two("42".to_string()), + Choice3::Other, + ], + } + ); + + from_str::( + r#" + + 41 + 42 + 43 + + "#, + ) + .unwrap_err(); + } + } + + /// This module contains tests where size of the list have an unspecified size + mod variable_size { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct List { + #[serde(rename = "$value")] + item: Vec, + } + + /// Simple case: count of elements matches expected size of sequence, + /// each element has the same name. Successful deserialization expected + #[test] + fn simple() { + let data: List = from_str( + r#" + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + List { + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + } + ); + } + + /// Special case: empty sequence + #[test] + #[ignore = "it is impossible to distinguish between missed field and empty list: use `Option<>` or #[serde(default)]"] + fn empty() { + let data = from_str::(r#""#).unwrap(); + assert_eq!(data, List { item: vec![] }); + + let data = from_str::(r#""#).unwrap(); + assert_eq!(data, List { item: vec![] }); + } + + /// Special case: one-element sequence + #[test] + fn one_element() { + let data: List = from_str( + r#" + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + List { + item: vec![Choice::One], + } + ); + } + + #[test] + fn mixed_content() { + #[derive(Debug, PartialEq, Deserialize)] + struct List { + #[serde(rename = "$value")] + item: Vec<()>, + } + + let data: List = from_str( + r#" + + + text + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + List { + item: vec![(), (), ()], + } + ); + } + + // There cannot be unknown items, because any tag name is accepted + + /// In those tests non-sequential field is defined in the struct + /// before sequential, so it will be deserialized before the list. + /// That struct should be deserialized from the XML where these + /// fields comes in an arbitrary order + mod field_before_list { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct Root { + node: (), + #[serde(rename = "$value")] + item: Vec, + } + + #[test] + fn before() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + node: (), + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + } + ); + } + + #[test] + fn after() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + node: (), + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + } + ); + } + + #[test] + fn overlapped() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + node: (), + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + } + ); + } + } + + /// In those tests non-sequential field is defined in the struct + /// after sequential, so it will be deserialized after the list. + /// That struct should be deserialized from the XML where these + /// fields comes in an arbitrary order + mod field_after_list { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct Root { + #[serde(rename = "$value")] + item: Vec, + node: (), + } + + #[test] + fn before() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + node: (), + } + ); + } + + #[test] + fn after() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + node: (), + } + ); + } + + #[test] + fn overlapped() { + let data: Root = from_str( + r#" + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Root { + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + node: (), + } + ); + } + } + + /// In those tests two lists are deserialized simultaneously. + /// Lists should be deserialized even when them overlaps + mod two_lists { + use super::*; + + /// A field with a variable-name items defined before a field with a fixed-name + /// items + mod choice_and_fixed { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct Pair { + #[serde(rename = "$value")] + item: Vec, + element: Vec<()>, + } + + /// A list with fixed-name elements located before a list with variable-name + /// elements in an XML + #[test] + fn fixed_before() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + element: vec![(), ()], + } + ); + } + + /// A list with fixed-name elements located after a list with variable-name + /// elements in an XML + #[test] + fn fixed_after() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + element: vec![(), ()], + } + ); + } + + /// A list with fixed-name elements are mixed with a list with variable-name + /// elements in an XML, and the first element is a fixed-name one + #[test] + fn overlapped_fixed_before() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + element: vec![(), ()], + } + ); + } + + /// A list with fixed-name elements are mixed with a list with variable-name + /// elements in an XML, and the first element is a variable-name one + #[test] + fn overlapped_fixed_after() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + element: vec![(), ()], + } + ); + } + } + + /// A field with a variable-name items defined after a field with a fixed-name + /// items + mod fixed_and_choice { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct Pair { + element: Vec<()>, + #[serde(rename = "$value")] + item: Vec, + } + + /// A list with fixed-name elements located before a list with variable-name + /// elements in an XML + #[test] + fn fixed_before() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + element: vec![(), ()], + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + } + ); + } + + /// A list with fixed-name elements located after a list with variable-name + /// elements in an XML + #[test] + fn fixed_after() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + element: vec![(), ()], + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + } + ); + } + + /// A list with fixed-name elements are mixed with a list with variable-name + /// elements in an XML, and the first element is a fixed-name one + #[test] + fn overlapped_fixed_before() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + element: vec![(), ()], + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + } + ); + } + + /// A list with fixed-name elements are mixed with a list with variable-name + /// elements in an XML, and the first element is a variable-name one + #[test] + fn overlapped_fixed_after() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + element: vec![(), ()], + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + } + ); + } + } + + /// Tests are ignored, but exists to show a problem. + /// May be it will be solved in the future + mod choice_and_choice { + use super::*; + use pretty_assertions::assert_eq; + + #[derive(Debug, PartialEq, Deserialize)] + struct Pair { + #[serde(rename = "$value")] + item: Vec, + // Actually, we cannot rename both fields to `$value`, which is now + // required to indicate, that field accepts elements with any name + #[serde(rename = "$value")] + element: Vec, + } + + #[test] + #[ignore = "There is no way to associate XML elements with `item` or `element` without extra knowledge from type"] + fn splitted() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + element: vec![Choice2::First, Choice2::Second], + } + ); + } + + #[test] + #[ignore = "There is no way to associate XML elements with `item` or `element` without extra knowledge from type"] + fn overlapped() { + let data: Pair = from_str( + r#" + + + + + + + + "#, + ) + .unwrap(); + + assert_eq!( + data, + Pair { + item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], + element: vec![Choice2::First, Choice2::Second], + } + ); + } + } + } + + /// Deserialization of primitives slightly differs from deserialization + /// of complex types, so need to check this separately + #[test] + fn primitives() { + #[derive(Debug, PartialEq, Deserialize)] + struct List { + #[serde(rename = "$value")] + item: Vec, + } + + let data: List = from_str( + r#" + + 41 + 42 + 43 + + "#, + ) + .unwrap(); + + assert_eq!( + data, + List { + item: vec![ + Choice3::One(41), + Choice3::Two("42".to_string()), + Choice3::Other, + ], + } + ); + + from_str::( + r#" + + 41 + 42 + 43 + + "#, + ) + .unwrap_err(); + } + } + } +} + macro_rules! maplike_errors { ($type:ty) => { mod non_closed { From 55f541b155454826ba45fad5b5dfe781ade003c3 Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 30 Mar 2022 23:45:11 +0500 Subject: [PATCH 2/6] seq: Allow to have an ordinary elements together with a `$value` field Example of a struct, that should be a valid definition: enum Choice { one, two, three } struct Root { #[serde(rename = "$value")] item: [Choice; 3], node: (), } and this struct should be able to deserialized from The following tests were fixed (10): seq::variable_name::fixed_size::field_after_list::after seq::variable_name::fixed_size::field_after_list::before seq::variable_name::fixed_size::field_before_list::after seq::variable_name::fixed_size::field_before_list::before seq::variable_name::fixed_size::two_lists::choice_and_fixed::fixed_after seq::variable_name::fixed_size::two_lists::choice_and_fixed::fixed_before seq::variable_name::fixed_size::two_lists::fixed_and_choice::fixed_after seq::variable_name::fixed_size::two_lists::fixed_and_choice::fixed_before seq::variable_name::variable_size::field_after_list::before seq::variable_name::variable_size::field_before_list::before failures (26): seq::fixed_name::fixed_size::field_after_list::overlapped seq::fixed_name::fixed_size::field_before_list::overlapped seq::fixed_name::fixed_size::two_lists::overlapped seq::fixed_name::fixed_size::unknown_items::overlapped seq::fixed_name::variable_size::field_after_list::overlapped seq::fixed_name::variable_size::field_before_list::overlapped seq::fixed_name::variable_size::two_lists::overlapped seq::fixed_name::variable_size::unknown_items::overlapped seq::variable_name::fixed_size::field_after_list::overlapped seq::variable_name::fixed_size::field_before_list::overlapped seq::variable_name::fixed_size::two_lists::choice_and_fixed::overlapped_fixed_after seq::variable_name::fixed_size::two_lists::choice_and_fixed::overlapped_fixed_before seq::variable_name::fixed_size::two_lists::fixed_and_choice::overlapped_fixed_after seq::variable_name::fixed_size::two_lists::fixed_and_choice::overlapped_fixed_before seq::variable_name::variable_size::field_after_list::after seq::variable_name::variable_size::field_after_list::overlapped seq::variable_name::variable_size::field_before_list::after seq::variable_name::variable_size::field_before_list::overlapped seq::variable_name::variable_size::two_lists::choice_and_fixed::fixed_after seq::variable_name::variable_size::two_lists::choice_and_fixed::fixed_before seq::variable_name::variable_size::two_lists::choice_and_fixed::overlapped_fixed_after seq::variable_name::variable_size::two_lists::choice_and_fixed::overlapped_fixed_before seq::variable_name::variable_size::two_lists::fixed_and_choice::fixed_after seq::variable_name::variable_size::two_lists::fixed_and_choice::fixed_before seq::variable_name::variable_size::two_lists::fixed_and_choice::overlapped_fixed_after seq::variable_name::variable_size::two_lists::fixed_and_choice::overlapped_fixed_before Co-authored-by: Daniel Alley --- Changelog.md | 1 + src/de/map.rs | 41 ++++++++++++++++---- src/de/seq.rs | 103 +++++++++++++++++++++++++++++++++++++++++++------- src/reader.rs | 17 +++++++++ 4 files changed, 140 insertions(+), 22 deletions(-) diff --git a/Changelog.md b/Changelog.md index 126e2fbe..4ac17708 100644 --- a/Changelog.md +++ b/Changelog.md @@ -14,6 +14,7 @@ - [#9]: Deserialization erroneously was successful in some cases where error is expected. This broke deserialization of untagged enums which rely on error if variant cannot be parsed +- [#387]: Allow to have an ordinary elements together with a `$value` field ### Misc Changes diff --git a/src/de/map.rs b/src/de/map.rs index 38346027..eb62e82b 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -2,6 +2,7 @@ use crate::{ de::escape::EscapedDeserializer, + de::seq::not_in, de::{deserialize_bool, DeEvent, Deserializer, XmlRead, INNER_VALUE, UNFLATTEN_PREFIX}, errors::serialize::DeError, events::attributes::IterState, @@ -43,12 +44,14 @@ enum ValueSource { /// /// ``` Text, - /// Next value should be deserialized from an element with an any name. - /// Corresponding tag name will always be associated with a field with name - /// [`INNER_VALUE`]. + /// Next value should be deserialized from an element with an any name, except + /// elements with a name matching one of the struct fields. Corresponding tag + /// name will always be associated with a field with name [`INNER_VALUE`]. /// - /// That state is set when call to [`peek()`] returns a [`Start`] event - /// _and_ struct has a field with a special name [`INNER_VALUE`]. + /// That state is set when call to [`peek()`] returns a [`Start`] event, which + /// [`name()`] is not listed in the [list of known fields] (which for a struct + /// is a list of field names, and for a map that is an empty list), _and_ + /// struct has a field with a special name [`INNER_VALUE`]. /// /// When in this state, next event, returned by [`next()`], will be a [`Start`], /// which represents both a key, and a value. Value would be deserialized from @@ -97,7 +100,9 @@ enum ValueSource { /// [`Start`]: DeEvent::Start /// [`peek()`]: Deserializer::peek() /// [`next()`]: Deserializer::next() + /// [`name()`]: BytesStart::name() /// [`Text`]: Self::Text + /// [list of known fields]: MapAccess::fields Content, /// Next value should be deserialized from an element with a dedicated name. /// @@ -135,7 +140,24 @@ enum ValueSource { Nested, } -/// A deserializer for `Attributes` +/// A deserializer that extracts map-like structures from an XML. This deserializer +/// represents a one XML tag: +/// +/// ```xml +/// ... +/// ``` +/// +/// Name of this tag is stored in a [`Self::start`] property. +/// +/// # Lifetimes +/// +/// - `'de` lifetime represents a buffer, from which deserialized values can +/// borrow their data. Depending on the underlying reader, there can be an +/// internal buffer of deserializer (i.e. deserializer itself) or an input +/// (in that case it is possible to approach zero-copy deserialization). +/// +/// - `'a` lifetime represents a parent deserializer, which could own the data +/// buffer. pub(crate) struct MapAccess<'de, 'a, R> where R: XmlRead<'de>, @@ -149,6 +171,8 @@ where /// Current state of the accessor that determines what next call to API /// methods should return. source: ValueSource, + /// List of field names of the struct. It is empty for maps + fields: &'static [&'static str], /// list of fields yet to unflatten (defined as starting with $unflatten=) unflatten_fields: Vec<&'static [u8]>, } @@ -161,13 +185,14 @@ where pub fn new( de: &'a mut Deserializer<'de, R>, start: BytesStart<'de>, - fields: &[&'static str], + fields: &'static [&'static str], ) -> Result { Ok(MapAccess { de, start, iter: IterState::new(0, false), source: ValueSource::Unknown, + fields, unflatten_fields: fields .iter() .filter(|f| f.starts_with(UNFLATTEN_PREFIX)) @@ -230,7 +255,7 @@ where // } // TODO: This should be handled by #[serde(flatten)] // See https://github.com/serde-rs/serde/issues/1905 - DeEvent::Start(_) if has_value_field => { + DeEvent::Start(e) if has_value_field && not_in(self.fields, e, decoder)? => { self.source = ValueSource::Content; seed.deserialize(INNER_VALUE.into_deserializer()).map(Some) } diff --git a/src/de/seq.rs b/src/de/seq.rs index daeac2de..8e01a479 100644 --- a/src/de/seq.rs +++ b/src/de/seq.rs @@ -1,18 +1,74 @@ use crate::de::{DeError, DeEvent, Deserializer, XmlRead}; use crate::events::BytesStart; +use crate::reader::Decoder; use serde::de::{self, DeserializeSeed}; +#[cfg(not(feature = "encoding"))] +use std::borrow::Cow; +/// Check if tag `start` is included in the `fields` list. `decoder` is used to +/// get a string representation of a tag. +/// +/// Returns `true`, if `start` is not in the `fields` list and `false` otherwise. +pub fn not_in( + fields: &'static [&'static str], + start: &BytesStart, + decoder: Decoder, +) -> Result { + #[cfg(not(feature = "encoding"))] + let tag = Cow::Borrowed(decoder.decode(start.name())?); + + #[cfg(feature = "encoding")] + let tag = decoder.decode(start.name()); + + Ok(fields.iter().all(|&field| field != tag.as_ref())) +} + +/// A filter that determines, what tags should form a sequence. +/// +/// There are two types of sequences: +/// - sequence where each element represented by tags with the same name +/// - sequence where each element can have a different tag +/// +/// The first variant could represent a collection of structs, the second -- +/// a collection of enum variants. +/// +/// In the second case we don't know what tag name should be expected as a +/// sequence element, so we accept any element. Since the sequence are flattened +/// into maps, we skip elements which have dedicated fields in a struct by using an +/// `Exclude` filter that filters out elements with names matching field names +/// from the struct. +/// +/// # Lifetimes +/// +/// `'de` represents a lifetime of the XML input, when filter stores the +/// dedicated tag name #[derive(Debug)] -enum Names { - Unknown, - Peek(Vec), +enum TagFilter<'de> { + /// A `SeqAccess` interested only in tags with specified name to deserialize + /// an XML like this: + /// + /// ```xml + /// <...> + /// + /// + /// + /// ... + /// + /// ``` + /// + /// The tag name is stored inside (`b"tag"` for that example) + Include(BytesStart<'de>), //TODO: Need to store only name instead of a whole tag + /// A `SeqAccess` interested in tags with any name, except explicitly listed. + /// Excluded tags are used as struct field names and therefore should not + /// fall into a `$value` category + Exclude(&'static [&'static str]), } -impl Names { - fn is_valid(&self, start: &BytesStart) -> bool { +impl<'de> TagFilter<'de> { + fn is_suitable(&self, start: &BytesStart, decoder: Decoder) -> Result { match self { - Names::Unknown => true, - Names::Peek(n) => n == start.name(), + Self::Include(n) => Ok(n.name() == start.name()), + Self::Exclude(fields) => not_in(fields, start, decoder), } } } @@ -22,8 +78,10 @@ pub struct SeqAccess<'de, 'a, R> where R: XmlRead<'de>, { + /// Deserializer used to deserialize sequence items de: &'a mut Deserializer<'de, R>, - names: Names, + /// Filter that determines whether a tag is a part of this sequence. + filter: TagFilter<'de>, } impl<'a, 'de, R> SeqAccess<'de, 'a, R> @@ -32,16 +90,17 @@ where { /// Get a new SeqAccess pub fn new(de: &'a mut Deserializer<'de, R>) -> Result { - let names = if de.has_value_field { - Names::Unknown + let filter = if de.has_value_field { + TagFilter::Exclude(&[]) } else { if let DeEvent::Start(e) = de.peek()? { - Names::Peek(e.name().to_vec()) + // Clone is cheap if event borrows from the input + TagFilter::Include(e.clone()) } else { - Names::Unknown + TagFilter::Exclude(&[]) } }; - Ok(SeqAccess { de, names }) + Ok(Self { de, filter }) } } @@ -55,10 +114,26 @@ where where T: DeserializeSeed<'de>, { + let decoder = self.de.reader.decoder(); match self.de.peek()? { DeEvent::Eof | DeEvent::End(_) => Ok(None), - DeEvent::Start(e) if !self.names.is_valid(e) => Ok(None), + DeEvent::Start(e) if !self.filter.is_suitable(e, decoder)? => Ok(None), _ => seed.deserialize(&mut *self.de).map(Some), } } } + +#[test] +fn test_not_in() { + let tag = BytesStart::borrowed_name(b"tag"); + + assert_eq!(not_in(&[], &tag, Decoder::utf8()).unwrap(), true); + assert_eq!( + not_in(&["no", "such", "tags"], &tag, Decoder::utf8()).unwrap(), + true + ); + assert_eq!( + not_in(&["some", "tag", "included"], &tag, Decoder::utf8()).unwrap(), + false + ); +} diff --git a/src/reader.rs b/src/reader.rs index 17b4f344..271c0741 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1669,6 +1669,23 @@ impl Decoder { } } +/// This implementation is required for tests of other parts of the library +#[cfg(test)] +#[cfg(feature = "serialize")] +impl Decoder { + #[cfg(not(feature = "encoding"))] + pub(crate) fn utf8() -> Self { + Decoder + } + + #[cfg(feature = "encoding")] + pub(crate) fn utf8() -> Self { + Decoder { + encoding: encoding_rs::UTF_8, + } + } +} + #[cfg(test)] mod test { macro_rules! check { From 6a52ecc2dcc388dc24128186fa744d60b0577090 Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 30 Mar 2022 22:00:09 +0500 Subject: [PATCH 3/6] seq: Split sequence accessors for top-level access and in-map access The following tests were fixed (6): seq::variable_name::variable_size::field_after_list::after seq::variable_name::variable_size::field_before_list::after seq::variable_name::variable_size::two_lists::choice_and_fixed::fixed_after seq::variable_name::variable_size::two_lists::choice_and_fixed::fixed_before seq::variable_name::variable_size::two_lists::fixed_and_choice::fixed_after seq::variable_name::variable_size::two_lists::fixed_and_choice::fixed_before failures (20): seq::fixed_name::fixed_size::field_after_list::overlapped seq::fixed_name::fixed_size::field_before_list::overlapped seq::fixed_name::fixed_size::two_lists::overlapped seq::fixed_name::fixed_size::unknown_items::overlapped seq::fixed_name::variable_size::field_after_list::overlapped seq::fixed_name::variable_size::field_before_list::overlapped seq::fixed_name::variable_size::two_lists::overlapped seq::fixed_name::variable_size::unknown_items::overlapped seq::variable_name::fixed_size::field_after_list::overlapped seq::variable_name::fixed_size::field_before_list::overlapped seq::variable_name::fixed_size::two_lists::choice_and_fixed::overlapped_fixed_after seq::variable_name::fixed_size::two_lists::choice_and_fixed::overlapped_fixed_before seq::variable_name::fixed_size::two_lists::fixed_and_choice::overlapped_fixed_after seq::variable_name::fixed_size::two_lists::fixed_and_choice::overlapped_fixed_before seq::variable_name::variable_size::field_after_list::overlapped seq::variable_name::variable_size::field_before_list::overlapped seq::variable_name::variable_size::two_lists::choice_and_fixed::overlapped_fixed_after seq::variable_name::variable_size::two_lists::choice_and_fixed::overlapped_fixed_before seq::variable_name::variable_size::two_lists::fixed_and_choice::overlapped_fixed_after seq::variable_name::variable_size::two_lists::fixed_and_choice::overlapped_fixed_before Co-authored-by: Daniel Alley --- src/de/map.rs | 194 +++++++++++++++++++++++++++++++++++++++++++++++--- src/de/mod.rs | 2 +- src/de/seq.rs | 23 +++--- 3 files changed, 200 insertions(+), 19 deletions(-) diff --git a/src/de/map.rs b/src/de/map.rs index eb62e82b..dfdd35b9 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -2,14 +2,14 @@ use crate::{ de::escape::EscapedDeserializer, - de::seq::not_in, + de::seq::{not_in, TagFilter}, de::{deserialize_bool, DeEvent, Deserializer, XmlRead, INNER_VALUE, UNFLATTEN_PREFIX}, errors::serialize::DeError, events::attributes::IterState, events::{BytesCData, BytesStart}, reader::Decoder, }; -use serde::de::{self, DeserializeSeed, IntoDeserializer, Visitor}; +use serde::de::{self, DeserializeSeed, IntoDeserializer, SeqAccess, Visitor}; use serde::serde_if_integer128; use std::borrow::Cow; use std::ops::Range; @@ -312,7 +312,10 @@ where // is implicit and equals to the `INNER_VALUE` constant, and the value // is a `Text` or a `CData` event (the value deserializer will see one // of that events) - ValueSource::Text => seed.deserialize(MapValueDeserializer { map: self }), + ValueSource::Text => seed.deserialize(MapValueDeserializer { + map: self, + allow_start: false, + }), // This arm processes the following XML shape: // // ... @@ -320,7 +323,10 @@ where // The whole map represented by an `` element, the map key // is implicit and equals to the `INNER_VALUE` constant, and the value // is a `Start` event (the value deserializer will see that event) - ValueSource::Content => seed.deserialize(MapValueDeserializer { map: self }), + ValueSource::Content => seed.deserialize(MapValueDeserializer { + map: self, + allow_start: false, + }), // This arm processes the following XML shape: // // ... @@ -328,7 +334,10 @@ where // The whole map represented by an `` element, the map key // is a `tag`, and the value is a `Start` event (the value deserializer // will see that event) - ValueSource::Nested => seed.deserialize(&mut *self.de), + ValueSource::Nested => seed.deserialize(MapValueDeserializer { + map: self, + allow_start: true, + }), ValueSource::Unknown => Err(DeError::KeyNotRead), } } @@ -364,6 +373,82 @@ where /// Access to the map that created this deserializer. Gives access to the /// context, such as list of fields, that current map known about. map: &'m mut MapAccess<'de, 'a, R>, + /// Determines, should [`Deserializer::next_text_impl()`] expand the second + /// level of tags or not. + /// + /// If this field is `true`, we process the following XML shape: + /// + /// ```xml + /// + /// ... + /// + /// ``` + /// + /// The whole map represented by an `` element, the map key is a `tag`, + /// and the value starts with is a `Start("tag")` (the value deserializer will + /// see that event first) and extended to the matching `End("tag")` event. + /// In order to deserialize primitives (such as `usize`) we need to allow to + /// look inside the one levels of tags, so the + /// + /// ```xml + /// 42 + /// ``` + /// + /// could be deserialized into `42usize` without problems, and at the same time + /// + /// ```xml + /// + /// + /// + /// + /// + /// ``` + /// could be deserialized to a struct. + /// + /// If this field is `false`, we processes the one of following XML shapes: + /// + /// ```xml + /// + /// text value + /// + /// ``` + /// ```xml + /// + /// + /// + /// ``` + /// ```xml + /// + /// ... + /// + /// ``` + /// + /// The whole map represented by an `` element, the map key is + /// implicit and equals to the [`INNER_VALUE`] constant, and the value is + /// a [`Text`], a [`CData`], or a [`Start`] event (the value deserializer + /// will see one of those events). In the first two cases the value of this + /// field do not matter (because we already see the textual event and there + /// no reasons to look "inside" something), but in the last case the primitives + /// should raise a deserialization error, because that means that you trying + /// to deserialize the following struct: + /// + /// ```ignore + /// struct AnyName { + /// #[serde(rename = "$value")] + /// any_name: String, + /// } + /// ``` + /// which means that `any_name` should get a content of the `` element. + /// + /// Changing this can be valuable for , + /// but those fields should be explicitly marked that they want to get any + /// possible markup as a `String` and that mark is different from marking them + /// as accepting "text content" which the currently `$value` means. + /// + /// [`Text`]: DeEvent::Text + /// [`CData`]: DeEvent::CData + /// [`Start`]: DeEvent::Start + allow_start: bool, } impl<'de, 'a, 'm, R> MapValueDeserializer<'de, 'a, 'm, R> @@ -373,7 +458,7 @@ where /// Returns a text event, used inside [`deserialize_primitives!()`] #[inline] fn next_text(&mut self, unescape: bool) -> Result, DeError> { - self.map.de.next_text_impl(unescape, false) + self.map.de.next_text_impl(unescape, self.allow_start) } /// Returns a decoder, used inside [`deserialize_primitives!()`] @@ -396,10 +481,6 @@ where forward!(deserialize_unit_struct(name: &'static str)); forward!(deserialize_newtype_struct(name: &'static str)); - forward!(deserialize_seq); - forward!(deserialize_tuple(len: usize)); - forward!(deserialize_tuple_struct(name: &'static str, len: usize)); - forward!(deserialize_map); forward!(deserialize_struct( name: &'static str, @@ -414,8 +495,101 @@ where forward!(deserialize_any); forward!(deserialize_ignored_any); + /// Tuple representation is the same as [sequences](#method.deserialize_seq). + fn deserialize_tuple(self, _len: usize, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_seq(visitor) + } + + /// Named tuple representation is the same as [unnamed tuples](#method.deserialize_tuple). + fn deserialize_tuple_struct( + self, + _name: &'static str, + len: usize, + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_tuple(len, visitor) + } + + fn deserialize_seq(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + let filter = if self.allow_start { + match self.map.de.peek()? { + // Clone is cheap if event borrows from the input + DeEvent::Start(e) => TagFilter::Include(e.clone()), + // SAFETY: we use that deserializer with `allow_start == true` + // only from the `MapAccess::next_value_seed` and only when we + // peeked `Start` event + _ => unreachable!(), + } + } else { + TagFilter::Exclude(self.map.fields) + }; + let seq = visitor.visit_seq(MapValueSeqAccess { + map: self.map, + filter, + }); + #[cfg(feature = "overlapped-lists")] + self.map.de.start_replay(); + seq + } + #[inline] fn is_human_readable(&self) -> bool { self.map.de.is_human_readable() } } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// An accessor to sequence elements forming a value for struct field. +/// Technically, this sequence is flattened out into structure and sequence +/// elements are overlapped with other fields of a structure +struct MapValueSeqAccess<'de, 'a, 'm, R> +where + R: XmlRead<'de>, +{ + /// Accessor to a map that creates this accessor and to a deserializer for + /// a sequence items. + map: &'m mut MapAccess<'de, 'a, R>, + /// Filter that determines whether a tag is a part of this sequence. + /// + /// Iteration will stop when found a tag that does not pass this filter. + filter: TagFilter<'de>, +} + +impl<'de, 'a, 'm, R> SeqAccess<'de> for MapValueSeqAccess<'de, 'a, 'm, R> +where + R: XmlRead<'de>, +{ + type Error = DeError; + + fn next_element_seed(&mut self, seed: T) -> Result, DeError> + where + T: DeserializeSeed<'de>, + { + let decoder = self.map.de.reader.decoder(); + match self.map.de.peek()? { + // Stop iteration when list elements ends + DeEvent::Start(e) if !self.filter.is_suitable(&e, decoder)? => Ok(None), + + // Stop iteration after reaching a closing tag + DeEvent::End(e) if e.name() == self.map.start.name() => Ok(None), + // This is a unmatched closing tag, so the XML is invalid + DeEvent::End(e) => Err(DeError::UnexpectedEnd(e.name().to_owned())), + // We cannot get `Eof` legally, because we always inside of the + // opened tag `self.map.start` + DeEvent::Eof => Err(DeError::UnexpectedEof), + + // Start(tag), Text, CData + _ => seed.deserialize(&mut *self.map.de).map(Some), + } + } +} diff --git a/src/de/mod.rs b/src/de/mod.rs index 424155eb..e5f1f74c 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -649,7 +649,7 @@ where where V: Visitor<'de>, { - visitor.visit_seq(seq::SeqAccess::new(self)?) + visitor.visit_seq(seq::TopLevelSeqAccess::new(self)?) } fn deserialize_map(self, visitor: V) -> Result diff --git a/src/de/seq.rs b/src/de/seq.rs index 8e01a479..ae0d359c 100644 --- a/src/de/seq.rs +++ b/src/de/seq.rs @@ -1,7 +1,7 @@ use crate::de::{DeError, DeEvent, Deserializer, XmlRead}; use crate::events::BytesStart; use crate::reader::Decoder; -use serde::de::{self, DeserializeSeed}; +use serde::de::{DeserializeSeed, SeqAccess}; #[cfg(not(feature = "encoding"))] use std::borrow::Cow; @@ -43,7 +43,7 @@ pub fn not_in( /// `'de` represents a lifetime of the XML input, when filter stores the /// dedicated tag name #[derive(Debug)] -enum TagFilter<'de> { +pub enum TagFilter<'de> { /// A `SeqAccess` interested only in tags with specified name to deserialize /// an XML like this: /// @@ -65,7 +65,7 @@ enum TagFilter<'de> { } impl<'de> TagFilter<'de> { - fn is_suitable(&self, start: &BytesStart, decoder: Decoder) -> Result { + pub fn is_suitable(&self, start: &BytesStart, decoder: Decoder) -> Result { match self { Self::Include(n) => Ok(n.name() == start.name()), Self::Exclude(fields) => not_in(fields, start, decoder), @@ -74,21 +74,23 @@ impl<'de> TagFilter<'de> { } /// A SeqAccess -pub struct SeqAccess<'de, 'a, R> +pub struct TopLevelSeqAccess<'de, 'a, R> where R: XmlRead<'de>, { /// Deserializer used to deserialize sequence items de: &'a mut Deserializer<'de, R>, /// Filter that determines whether a tag is a part of this sequence. + /// + /// Iteration will stop when found a tag that does not pass this filter. filter: TagFilter<'de>, } -impl<'a, 'de, R> SeqAccess<'de, 'a, R> +impl<'a, 'de, R> TopLevelSeqAccess<'de, 'a, R> where R: XmlRead<'de>, { - /// Get a new SeqAccess + /// Creates a new accessor to a top-level sequence of XML elements. pub fn new(de: &'a mut Deserializer<'de, R>) -> Result { let filter = if de.has_value_field { TagFilter::Exclude(&[]) @@ -104,7 +106,7 @@ where } } -impl<'de, 'a, R> de::SeqAccess<'de> for SeqAccess<'de, 'a, R> +impl<'de, 'a, R> SeqAccess<'de> for TopLevelSeqAccess<'de, 'a, R> where R: XmlRead<'de>, { @@ -116,8 +118,13 @@ where { let decoder = self.de.reader.decoder(); match self.de.peek()? { - DeEvent::Eof | DeEvent::End(_) => Ok(None), + // Stop iteration when list elements ends DeEvent::Start(e) if !self.filter.is_suitable(e, decoder)? => Ok(None), + // This is unmatched End tag at top-level + DeEvent::End(e) => Err(DeError::UnexpectedEnd(e.name().to_owned())), + DeEvent::Eof => Ok(None), + + // Start(tag), Text, CData _ => seed.deserialize(&mut *self.de).map(Some), } } From 1142da4707be0812761164b47609653e525e9f40 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 3 Apr 2022 02:20:16 +0500 Subject: [PATCH 4/6] seq: Move `has_value_field` into `MapAccess`, because it is not used outside it This fixes a data race with nested maps in sequence fields. Also, top-level sequences always have `has_value_field == false`, so no need to check it Co-authored-by: Daniel Alley --- Changelog.md | 3 +++ src/de/map.rs | 12 ++++++++++-- src/de/mod.rs | 11 ----------- src/de/seq.rs | 12 ++++-------- 4 files changed, 17 insertions(+), 21 deletions(-) diff --git a/Changelog.md b/Changelog.md index 4ac17708..69541dab 100644 --- a/Changelog.md +++ b/Changelog.md @@ -15,6 +15,9 @@ - [#9]: Deserialization erroneously was successful in some cases where error is expected. This broke deserialization of untagged enums which rely on error if variant cannot be parsed - [#387]: Allow to have an ordinary elements together with a `$value` field +- [#387]: Internal deserializer state can be broken when deserializing a map with + a sequence field (such as `Vec`), where elements of this sequence contains + another sequence. This error affects only users with the `serialize` feature enabled ### Misc Changes diff --git a/src/de/map.rs b/src/de/map.rs index dfdd35b9..f27d63c2 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -173,6 +173,14 @@ where source: ValueSource, /// List of field names of the struct. It is empty for maps fields: &'static [&'static str], + /// If `true`, then the deserialized struct has a field with a special name: + /// [`INNER_VALUE`]. That field should be deserialized from the text content + /// of an XML node: + /// + /// ```xml + /// value for INNER_VALUE field + /// ``` + has_value_field: bool, /// list of fields yet to unflatten (defined as starting with $unflatten=) unflatten_fields: Vec<&'static [u8]>, } @@ -193,6 +201,7 @@ where iter: IterState::new(0, false), source: ValueSource::Unknown, fields, + has_value_field: fields.contains(&INNER_VALUE), unflatten_fields: fields .iter() .filter(|f| f.starts_with(UNFLATTEN_PREFIX)) @@ -217,7 +226,6 @@ where // FIXME: There error positions counted from end of tag name - need global position let slice = self.start.attributes_raw(); let decoder = self.de.reader.decoder(); - let has_value_field = self.de.has_value_field; if let Some(a) = self.iter.next(slice).transpose()? { // try getting map from attributes (key= "value") @@ -255,7 +263,7 @@ where // } // TODO: This should be handled by #[serde(flatten)] // See https://github.com/serde-rs/serde/issues/1905 - DeEvent::Start(e) if has_value_field && not_in(self.fields, e, decoder)? => { + DeEvent::Start(e) if self.has_value_field && not_in(self.fields, e, decoder)? => { self.source = ValueSource::Content; seed.deserialize(INNER_VALUE.into_deserializer()).map(Some) } diff --git a/src/de/mod.rs b/src/de/mod.rs index e5f1f74c..10b5c987 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -255,14 +255,6 @@ where { reader: R, peek: Option>, - /// Special sing that deserialized struct have a field with the special - /// name (see constant `INNER_VALUE`). That field should be deserialized - /// from the text content of the XML node: - /// - /// ```xml - /// value for INNER_VALUE field - /// ``` - has_value_field: bool, } /// Deserialize an instance of type `T` from a string of XML text. @@ -354,7 +346,6 @@ where Deserializer { reader, peek: None, - has_value_field: false, } } @@ -544,10 +535,8 @@ where // Try to go to the next `...` or `` if let Some(e) = self.next_start()? { let name = e.name().to_vec(); - self.has_value_field = fields.contains(&INNER_VALUE); let map = map::MapAccess::new(self, e, fields)?; let value = visitor.visit_map(map)?; - self.has_value_field = false; self.read_to_end(&name)?; Ok(value) } else { diff --git a/src/de/seq.rs b/src/de/seq.rs index ae0d359c..c5c7f299 100644 --- a/src/de/seq.rs +++ b/src/de/seq.rs @@ -92,15 +92,11 @@ where { /// Creates a new accessor to a top-level sequence of XML elements. pub fn new(de: &'a mut Deserializer<'de, R>) -> Result { - let filter = if de.has_value_field { - TagFilter::Exclude(&[]) + let filter = if let DeEvent::Start(e) = de.peek()? { + // Clone is cheap if event borrows from the input + TagFilter::Include(e.clone()) } else { - if let DeEvent::Start(e) = de.peek()? { - // Clone is cheap if event borrows from the input - TagFilter::Include(e.clone()) - } else { - TagFilter::Exclude(&[]) - } + TagFilter::Exclude(&[]) }; Ok(Self { de, filter }) } From 7a356ec06363f6fcb0abb4336899c26279291dfe Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 30 Mar 2022 00:03:32 +0500 Subject: [PATCH 5/6] seq: Allow overlapping between list items and other items Example of such XML: Here we need to skip `` in order to collect all ``s. So ability to skip events and replay them later was added This fixes all remaining tests Co-authored-by: Daniel Alley --- .github/workflows/rust.yml | 4 + Cargo.toml | 40 ++++ Changelog.md | 5 + src/de/map.rs | 42 ++-- src/de/mod.rs | 382 +++++++++++++++++++++++++++++++++- src/de/seq.rs | 30 ++- tests/serde-de.rs | 410 +++++++++++++++++++++++++++++-------- 7 files changed, 805 insertions(+), 108 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index d4447457..6b4f1077 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -40,6 +40,10 @@ jobs: env: LLVM_PROFILE_FILE: coverage/serialize-escape-html-%p-%m.profraw run: cargo test --features serialize,escape-html + - name: Run tests (all features) + env: + LLVM_PROFILE_FILE: coverage/all-features-%p-%m.profraw + run: cargo test --all-features - name: Prepare coverage information for upload if: runner.os == 'Linux' # --token is required by grcov, but not required by coveralls.io, so pass diff --git a/Cargo.toml b/Cargo.toml index 1113a795..8e503338 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,46 @@ default = [] ## [standard compliant]: https://www.w3.org/TR/xml11/#charencoding encoding = ["encoding_rs"] +## This feature enables support for deserializing lists where tags are overlapped +## with tags that do not correspond to the list. +## +## When this feature is enabled, the XML: +## ```xml +## +## +## +## +## +## +## ``` +## could be deserialized to a struct: +## ```ignore +## #[derive(Deserialize)] +## #[serde(rename_all = "kebab-case")] +## struct AnyName { +## item: Vec<()>, +## another_item: (), +## } +## ``` +## +## When this feature is not enabled (default), only the first element will be +## associated with the field, and the deserialized type will report an error +## (duplicated field) when the deserializer encounters a second ``. +## +## Note, that enabling this feature can lead to high and even unlimited memory +## consumption, because deserializer should check all events up to the end of a +## container tag (`` in that example) to figure out that there are no +## more items for a field. If `` or even EOF is not encountered, the +## parsing will never end which can lead to a denial-of-service (DoS) scenario. +## +## Having several lists and overlapped elements for them in XML could also lead +## to quadratic parsing time, because the deserializer must check the list of +## events as many times as the number of sequence fields present in the schema. +## +## This feature works only with `serialize` feature and has no effect if `serialize` +## is not enabled. +overlapped-lists = [] + ## Enables support for [`serde`] serialization and deserialization serialize = ["serde"] diff --git a/Changelog.md b/Changelog.md index 69541dab..f55c3f5c 100644 --- a/Changelog.md +++ b/Changelog.md @@ -10,6 +10,11 @@ ## Unreleased +### New Features + +- [#387]: Allow overlapping between elements of sequence and other elements + (using new feature `overlapped-lists`) + ### Bug Fixes - [#9]: Deserialization erroneously was successful in some cases where error is expected. diff --git a/src/de/map.rs b/src/de/map.rs index f27d63c2..dfe24676 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -105,6 +105,9 @@ enum ValueSource { /// [list of known fields]: MapAccess::fields Content, /// Next value should be deserialized from an element with a dedicated name. + /// If deserialized type is a sequence, then that sequence will collect all + /// elements with the same name until it will be filled. If not all elements + /// would be consumed, the rest will be ignored. /// /// That state is set when call to [`peek()`] returns a [`Start`] event, which /// [`name()`] represents a field name. That name will be deserialized as a key. @@ -569,7 +572,11 @@ where map: &'m mut MapAccess<'de, 'a, R>, /// Filter that determines whether a tag is a part of this sequence. /// - /// Iteration will stop when found a tag that does not pass this filter. + /// When feature `overlapped-lists` is not activated, iteration will stop + /// when found a tag that does not pass this filter. + /// + /// When feature `overlapped-lists` is activated, all tags, that not pass + /// this check, will be skipped. filter: TagFilter<'de>, } @@ -584,20 +591,29 @@ where T: DeserializeSeed<'de>, { let decoder = self.map.de.reader.decoder(); - match self.map.de.peek()? { - // Stop iteration when list elements ends - DeEvent::Start(e) if !self.filter.is_suitable(&e, decoder)? => Ok(None), + loop { + break match self.map.de.peek()? { + // If we see a tag that we not interested, skip it + #[cfg(feature = "overlapped-lists")] + DeEvent::Start(e) if !self.filter.is_suitable(&e, decoder)? => { + self.map.de.skip()?; + continue; + } + // Stop iteration when list elements ends + #[cfg(not(feature = "overlapped-lists"))] + DeEvent::Start(e) if !self.filter.is_suitable(&e, decoder)? => Ok(None), - // Stop iteration after reaching a closing tag - DeEvent::End(e) if e.name() == self.map.start.name() => Ok(None), - // This is a unmatched closing tag, so the XML is invalid - DeEvent::End(e) => Err(DeError::UnexpectedEnd(e.name().to_owned())), - // We cannot get `Eof` legally, because we always inside of the - // opened tag `self.map.start` - DeEvent::Eof => Err(DeError::UnexpectedEof), + // Stop iteration after reaching a closing tag + DeEvent::End(e) if e.name() == self.map.start.name() => Ok(None), + // This is a unmatched closing tag, so the XML is invalid + DeEvent::End(e) => Err(DeError::UnexpectedEnd(e.name().to_owned())), + // We cannot get `Eof` legally, because we always inside of the + // opened tag `self.map.start` + DeEvent::Eof => Err(DeError::UnexpectedEof), - // Start(tag), Text, CData - _ => seed.deserialize(&mut *self.map.de).map(Some), + // Start(tag), Text, CData + _ => seed.deserialize(&mut *self.map.de).map(Some), + }; } } } diff --git a/src/de/mod.rs b/src/de/mod.rs index 10b5c987..eb4a7976 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -226,6 +226,8 @@ use crate::{ }; use serde::de::{self, Deserialize, DeserializeOwned, Visitor}; use std::borrow::Cow; +#[cfg(feature = "overlapped-lists")] +use std::collections::VecDeque; use std::io::BufRead; pub(crate) const INNER_VALUE: &str = "$value"; @@ -248,12 +250,35 @@ pub enum DeEvent<'a> { Eof, } -/// An xml deserializer +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// A structure that deserializes XML into Rust values. pub struct Deserializer<'de, R> where R: XmlRead<'de>, { + /// An XML reader that streams events into this deserializer reader: R, + + /// When deserializing sequences sometimes we have to skip unwanted events. + /// That events should be stored and then replayed. This is a replay buffer, + /// that streams events while not empty. When it exhausted, events will + /// requested from [`Self::reader`]. + #[cfg(feature = "overlapped-lists")] + read: VecDeque>, + /// When deserializing sequences sometimes we have to skip events, because XML + /// is tolerant to elements order and even if in the XSD order is strictly + /// specified (using `xs:sequence`) most of XML parsers allows order violations. + /// That means, that elements, forming a sequence, could be overlapped with + /// other elements, do not related to that sequence. + /// + /// In order to support this, deserializer will scan events and skip unwanted + /// events, store them here. After call [`Self::start_replay()`] all events + /// moved from this to [`Self::read`]. + #[cfg(feature = "overlapped-lists")] + write: VecDeque>, + + #[cfg(not(feature = "overlapped-lists"))] peek: Option>, } @@ -345,6 +370,13 @@ where pub fn new(reader: R) -> Self { Deserializer { reader, + + #[cfg(feature = "overlapped-lists")] + read: VecDeque::new(), + #[cfg(feature = "overlapped-lists")] + write: VecDeque::new(), + + #[cfg(not(feature = "overlapped-lists"))] peek: None, } } @@ -355,6 +387,20 @@ where Self::new(reader) } + #[cfg(feature = "overlapped-lists")] + fn peek(&mut self) -> Result<&DeEvent<'de>, DeError> { + if self.read.is_empty() { + self.read.push_front(self.reader.next()?); + } + if let Some(event) = self.read.front() { + return Ok(&event); + } + // SAFETY: `self.read` was filled in the code above. + // NOTE: Can be replaced with `unsafe { std::hint::unreachable_unchecked() }` + // if unsafe code will be allowed + unreachable!() + } + #[cfg(not(feature = "overlapped-lists"))] fn peek(&mut self) -> Result<&DeEvent<'de>, DeError> { if self.peek.is_none() { self.peek = Some(self.reader.next()?); @@ -370,12 +416,69 @@ where } fn next(&mut self) -> Result, DeError> { + // Replay skipped or peeked events + #[cfg(feature = "overlapped-lists")] + if let Some(event) = self.read.pop_front() { + return Ok(event); + } + #[cfg(not(feature = "overlapped-lists"))] if let Some(e) = self.peek.take() { return Ok(e); } self.reader.next() } + /// Extracts XML tree of events from and stores them in the skipped events + /// buffer from which they can be retrieved later. You MUST call + /// [`Self::start_replay()`] after calling this to give access to the skipped + /// events and release internal buffers. + #[cfg(feature = "overlapped-lists")] + fn skip(&mut self) -> Result<(), DeError> { + let event = self.next()?; + self.write.push_back(event); + match self.write.back() { + // Skip all subtree, if we skip a start event + Some(DeEvent::Start(e)) => { + let end = e.name().to_owned(); + let mut depth = 0; + loop { + let event = self.next()?; + match event { + DeEvent::Start(ref e) if e.name() == end => { + self.write.push_back(event); + depth += 1; + } + DeEvent::End(ref e) if e.name() == end => { + self.write.push_back(event); + if depth == 0 { + return Ok(()); + } + depth -= 1; + } + _ => self.write.push_back(event), + } + } + } + _ => Ok(()), + } + } + + /// Moves all buffered events to the end of [`Self::write`] buffer and swaps + /// read and write buffers. + /// + /// After calling this method, [`Self::peek()`] and [`Self::next()`] starts + /// return events that was skipped previously by calling [`Self::skip()`], + /// and only when all that events will be consumed, the deserializer starts + /// to drain events from underlying reader. + /// + /// This method MUST be called if any number of [`Self::skip()`] was called + /// after [`Self::new()`] or `start_replay()` or you'll lost events. + #[cfg(feature = "overlapped-lists")] + fn start_replay(&mut self) { + self.write.append(&mut self.read); + std::mem::swap(&mut self.read, &mut self.write); + } + fn next_start(&mut self) -> Result>, DeError> { loop { let e = self.next()?; @@ -465,6 +568,33 @@ where self.reader.decoder() } + /// Drops all events until event with [name](BytesEnd::name()) `name` won't be + /// dropped. This method should be called after [`Self::next()`] + #[cfg(feature = "overlapped-lists")] + fn read_to_end(&mut self, name: &[u8]) -> Result<(), DeError> { + let mut depth = 0; + loop { + match self.read.pop_front() { + Some(DeEvent::Start(e)) if e.name() == name => { + depth += 1; + } + Some(DeEvent::End(e)) if e.name() == name => { + if depth == 0 { + return Ok(()); + } + depth -= 1; + } + + // Drop all other skipped events + Some(_) => continue, + + // If we do not have skipped events, use effective reading that will + // not allocate memory for events + None => return self.reader.read_to_end(name), + } + } + } + #[cfg(not(feature = "overlapped-lists"))] fn read_to_end(&mut self, name: &[u8]) -> Result<(), DeError> { // First one might be in self.peek match self.next()? { @@ -638,7 +768,10 @@ where where V: Visitor<'de>, { - visitor.visit_seq(seq::TopLevelSeqAccess::new(self)?) + let seq = visitor.visit_seq(seq::TopLevelSeqAccess::new(self)?); + #[cfg(feature = "overlapped-lists")] + self.start_replay(); + seq } fn deserialize_map(self, visitor: V) -> Result @@ -793,6 +926,251 @@ mod tests { use super::*; use pretty_assertions::assert_eq; + #[cfg(feature = "overlapped-lists")] + mod skip { + use super::*; + use crate::de::DeEvent::*; + use crate::events::{BytesEnd, BytesText}; + use pretty_assertions::assert_eq; + + /// Checks that `peek()` and `read()` behaves correctly after `skip()` + #[test] + fn read_and_peek() { + let mut de = Deserializer::from_slice( + br#" + + + text + + + + + + "#, + ); + + // Initial conditions - both are empty + assert_eq!(de.read, vec![]); + assert_eq!(de.write, vec![]); + + assert_eq!( + de.next().unwrap(), + Start(BytesStart::borrowed_name(b"root")) + ); + assert_eq!( + de.peek().unwrap(), + &Start(BytesStart::borrowed_name(b"inner")) + ); + + // Should skip first tree + de.skip().unwrap(); + assert_eq!(de.read, vec![]); + assert_eq!( + de.write, + vec![ + Start(BytesStart::borrowed_name(b"inner")), + Text(BytesText::from_escaped_str("text")), + Start(BytesStart::borrowed_name(b"inner")), + End(BytesEnd::borrowed(b"inner")), + End(BytesEnd::borrowed(b"inner")), + ] + ); + + // Consume . Now unconsumed XML looks like: + // + // + // text + // + // + // + // + assert_eq!( + de.next().unwrap(), + Start(BytesStart::borrowed_name(b"next")) + ); + assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"next"))); + + // We finish writing. Next call to `next()` should start replay that messages: + // + // + // text + // + // + // + // and after that stream that messages: + // + // + // + de.start_replay(); + assert_eq!( + de.read, + vec![ + Start(BytesStart::borrowed_name(b"inner")), + Text(BytesText::from_escaped_str("text")), + Start(BytesStart::borrowed_name(b"inner")), + End(BytesEnd::borrowed(b"inner")), + End(BytesEnd::borrowed(b"inner")), + ] + ); + assert_eq!(de.write, vec![]); + assert_eq!( + de.next().unwrap(), + Start(BytesStart::borrowed_name(b"inner")) + ); + + // Skip `#text` node and consume after it + de.skip().unwrap(); + assert_eq!( + de.read, + vec![ + Start(BytesStart::borrowed_name(b"inner")), + End(BytesEnd::borrowed(b"inner")), + End(BytesEnd::borrowed(b"inner")), + ] + ); + assert_eq!( + de.write, + vec![ + // This comment here to keep the same formatting of both arrays + // otherwise rustfmt suggest one-line it + Text(BytesText::from_escaped_str("text")), + ] + ); + + assert_eq!( + de.next().unwrap(), + Start(BytesStart::borrowed_name(b"inner")) + ); + assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"inner"))); + + // We finish writing. Next call to `next()` should start replay messages: + // + // text + // + // + // and after that stream that messages: + // + // + // + de.start_replay(); + assert_eq!( + de.read, + vec![ + Text(BytesText::from_escaped_str("text")), + End(BytesEnd::borrowed(b"inner")), + ] + ); + assert_eq!(de.write, vec![]); + assert_eq!( + de.next().unwrap(), + Text(BytesText::from_escaped_str("text")) + ); + assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"inner"))); + assert_eq!( + de.next().unwrap(), + Start(BytesStart::borrowed_name(b"target")) + ); + assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"target"))); + assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"root"))); + } + + /// Checks that `read_to_end()` behaves correctly after `skip()` + #[test] + fn read_to_end() { + let mut de = Deserializer::from_slice( + br#" + + + text + + + + + + + "#, + ); + + // Initial conditions - both are empty + assert_eq!(de.read, vec![]); + assert_eq!(de.write, vec![]); + + assert_eq!( + de.next().unwrap(), + Start(BytesStart::borrowed_name(b"root")) + ); + + // Skip the tree + de.skip().unwrap(); + assert_eq!(de.read, vec![]); + assert_eq!( + de.write, + vec![ + Start(BytesStart::borrowed_name(b"skip")), + Text(BytesText::from_escaped_str("text")), + Start(BytesStart::borrowed_name(b"skip")), + End(BytesEnd::borrowed(b"skip")), + End(BytesEnd::borrowed(b"skip")), + ] + ); + + // Drop all events thet represents tree. Now unconsumed XML looks like: + // + // + // text + // + // + // + assert_eq!( + de.next().unwrap(), + Start(BytesStart::borrowed_name(b"target")) + ); + de.read_to_end(b"target").unwrap(); + assert_eq!(de.read, vec![]); + assert_eq!( + de.write, + vec![ + Start(BytesStart::borrowed_name(b"skip")), + Text(BytesText::from_escaped_str("text")), + Start(BytesStart::borrowed_name(b"skip")), + End(BytesEnd::borrowed(b"skip")), + End(BytesEnd::borrowed(b"skip")), + ] + ); + + // We finish writing. Next call to `next()` should start replay that messages: + // + // + // text + // + // + // + // and after that stream that messages: + // + // + de.start_replay(); + assert_eq!( + de.read, + vec![ + Start(BytesStart::borrowed_name(b"skip")), + Text(BytesText::from_escaped_str("text")), + Start(BytesStart::borrowed_name(b"skip")), + End(BytesEnd::borrowed(b"skip")), + End(BytesEnd::borrowed(b"skip")), + ] + ); + assert_eq!(de.write, vec![]); + + assert_eq!( + de.next().unwrap(), + Start(BytesStart::borrowed_name(b"skip")) + ); + de.read_to_end(b"skip").unwrap(); + + assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"root"))); + } + } + #[test] fn read_to_end() { use crate::de::DeEvent::*; diff --git a/src/de/seq.rs b/src/de/seq.rs index c5c7f299..a1a5b9de 100644 --- a/src/de/seq.rs +++ b/src/de/seq.rs @@ -82,7 +82,11 @@ where de: &'a mut Deserializer<'de, R>, /// Filter that determines whether a tag is a part of this sequence. /// - /// Iteration will stop when found a tag that does not pass this filter. + /// When feature `overlapped-lists` is not activated, iteration will stop + /// when found a tag that does not pass this filter. + /// + /// When feature `overlapped-lists` is activated, all tags, that not pass + /// this check, will be skipped. filter: TagFilter<'de>, } @@ -113,15 +117,23 @@ where T: DeserializeSeed<'de>, { let decoder = self.de.reader.decoder(); - match self.de.peek()? { - // Stop iteration when list elements ends - DeEvent::Start(e) if !self.filter.is_suitable(e, decoder)? => Ok(None), - // This is unmatched End tag at top-level - DeEvent::End(e) => Err(DeError::UnexpectedEnd(e.name().to_owned())), - DeEvent::Eof => Ok(None), + loop { + break match self.de.peek()? { + // If we see a tag that we not interested, skip it + #[cfg(feature = "overlapped-lists")] + DeEvent::Start(e) if !self.filter.is_suitable(e, decoder)? => { + self.de.skip()?; + continue; + } + // Stop iteration when list elements ends + #[cfg(not(feature = "overlapped-lists"))] + DeEvent::Start(e) if !self.filter.is_suitable(e, decoder)? => Ok(None), + DeEvent::End(_) => Ok(None), + DeEvent::Eof => Ok(None), - // Start(tag), Text, CData - _ => seed.deserialize(&mut *self.de).map(Some), + // Start(tag), Text, CData + _ => seed.deserialize(&mut *self.de).map(Some), + }; } } } diff --git a/tests/serde-de.rs b/tests/serde-de.rs index 278f3580..1b100ff1 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -738,6 +738,8 @@ mod seq { /// That fields should be skipped during deserialization mod unknown_items { use super::*; + #[cfg(not(feature = "overlapped-lists"))] + use pretty_assertions::assert_eq; #[test] fn before() { @@ -771,7 +773,7 @@ mod seq { #[test] fn overlapped() { - from_str::( + let data = from_str::( r#" @@ -780,8 +782,21 @@ mod seq { "#, - ) - .unwrap(); + ); + + #[cfg(feature = "overlapped-lists")] + data.unwrap(); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "invalid length 1, expected an array of length 3") + } + e => panic!( + r#"Expected Err(Custom("invalid length 1, expected an array of length 3")), got {:?}"#, + e + ), + } } } @@ -791,6 +806,8 @@ mod seq { /// fields comes in an arbitrary order mod field_before_list { use super::*; + #[cfg(not(feature = "overlapped-lists"))] + use pretty_assertions::assert_eq; #[derive(Debug, PartialEq, Deserialize)] struct Root { @@ -830,7 +847,7 @@ mod seq { #[test] fn overlapped() { - from_str::( + let data = from_str::( r#" @@ -839,8 +856,21 @@ mod seq { "#, - ) - .unwrap(); + ); + + #[cfg(feature = "overlapped-lists")] + data.unwrap(); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "invalid length 1, expected an array of length 3") + } + e => panic!( + r#"Expected Err(Custom("invalid length 1, expected an array of length 3")), got {:?}"#, + e + ), + } } } @@ -850,6 +880,8 @@ mod seq { /// fields comes in an arbitrary order mod field_after_list { use super::*; + #[cfg(not(feature = "overlapped-lists"))] + use pretty_assertions::assert_eq; #[derive(Debug, PartialEq, Deserialize)] struct Root { @@ -889,7 +921,7 @@ mod seq { #[test] fn overlapped() { - from_str::( + let data = from_str::( r#" @@ -898,8 +930,21 @@ mod seq { "#, - ) - .unwrap(); + ); + + #[cfg(feature = "overlapped-lists")] + data.unwrap(); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "invalid length 1, expected an array of length 3") + } + e => panic!( + r#"Expected Err(Custom("invalid length 1, expected an array of length 3")), got {:?}"#, + e + ), + } } } @@ -907,6 +952,8 @@ mod seq { /// Lists should be deserialized even when them overlaps mod two_lists { use super::*; + #[cfg(not(feature = "overlapped-lists"))] + use pretty_assertions::assert_eq; #[derive(Debug, PartialEq, Deserialize)] struct Pair { @@ -932,7 +979,7 @@ mod seq { #[test] fn overlapped() { - from_str::( + let data = from_str::( r#" @@ -942,8 +989,21 @@ mod seq { "#, - ) - .unwrap(); + ); + + #[cfg(feature = "overlapped-lists")] + data.unwrap(); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "invalid length 1, expected an array of length 3") + } + e => panic!( + r#"Expected Err(Custom("invalid length 1, expected an array of length 3")), got {:?}"#, + e + ), + } } } @@ -1117,7 +1177,7 @@ mod seq { #[test] fn overlapped() { - let data: List = from_str( + let data = from_str::( r#" @@ -1126,15 +1186,24 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), List { item: vec![(), (), ()], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => assert_eq!(e, "duplicate field `item`"), + e => panic!( + r#"Expected Err(Custom("duplicate field `item`")), got {:?}"#, + e + ), + } } } @@ -1200,7 +1269,7 @@ mod seq { #[test] fn overlapped() { - let data: Root = from_str( + let data = from_str::( r#" @@ -1209,16 +1278,27 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Root { node: (), item: vec![(), (), ()], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "duplicate field `item`") + } + e => panic!( + r#"Expected Err(Custom("duplicate field `item`")), got {:?}"#, + e + ), + } } } @@ -1284,7 +1364,7 @@ mod seq { #[test] fn overlapped() { - let data: Root = from_str( + let data = from_str::( r#" @@ -1293,16 +1373,27 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Root { item: vec![(), (), ()], node: (), } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "duplicate field `item`") + } + e => panic!( + r#"Expected Err(Custom("duplicate field `item`")), got {:?}"#, + e + ), + } } } @@ -1344,7 +1435,7 @@ mod seq { #[test] fn overlapped() { - let data: Pair = from_str( + let data = from_str::( r#" @@ -1354,16 +1445,25 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Pair { item: vec![(), (), ()], element: vec![(), ()], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => assert_eq!(e, "duplicate field `item`"), + e => panic!( + r#"Expected Err(Custom("duplicate field `item`")), got {:?}"#, + e + ), + } } } @@ -1713,7 +1813,7 @@ mod seq { #[test] fn overlapped() { - let data: Root = from_str( + let data = from_str::( r#" @@ -1722,16 +1822,27 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Root { node: (), item: [Choice::One, Choice::Two, Choice::Other("three".into())], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "invalid length 1, expected an array of length 3") + } + e => panic!( + r#"Expected Err(Custom("invalid length 1, expected an array of length 3")), got {:?}"#, + e + ), + } } } @@ -1798,7 +1909,7 @@ mod seq { #[test] fn overlapped() { - let data: Root = from_str( + let data = from_str::( r#" @@ -1807,16 +1918,27 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Root { item: [Choice::One, Choice::Two, Choice::Other("three".into())], node: (), } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "invalid length 1, expected an array of length 3") + } + e => panic!( + r#"Expected Err(Custom("invalid length 1, expected an array of length 3")), got {:?}"#, + e + ), + } } } @@ -1894,7 +2016,7 @@ mod seq { /// elements in an XML, and the first element is a fixed-name one #[test] fn overlapped_fixed_before() { - let data: Pair = from_str( + let data = from_str::( r#" @@ -1904,23 +2026,34 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Pair { item: [Choice::One, Choice::Two, Choice::Other("three".into())], element: [(), ()], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "invalid length 1, expected an array of length 2") + } + e => panic!( + r#"Expected Err(Custom("invalid length 1, expected an array of length 2")), got {:?}"#, + e + ), + } } /// A list with fixed-name elements are mixed with a list with variable-name /// elements in an XML, and the first element is a variable-name one #[test] fn overlapped_fixed_after() { - let data: Pair = from_str( + let data = from_str::( r#" @@ -1930,16 +2063,27 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Pair { item: [Choice::One, Choice::Two, Choice::Other("three".into())], element: [(), ()], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "invalid length 1, expected an array of length 3") + } + e => panic!( + r#"Expected Err(Custom("invalid length 1, expected an array of length 3")), got {:?}"#, + e + ), + } } } @@ -2012,7 +2156,7 @@ mod seq { /// elements in an XML, and the first element is a fixed-name one #[test] fn overlapped_fixed_before() { - let data: Pair = from_str( + let data = from_str::( r#" @@ -2022,23 +2166,34 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Pair { item: [Choice::One, Choice::Two, Choice::Other("three".into())], element: [(), ()], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "invalid length 1, expected an array of length 2") + } + e => panic!( + r#"Expected Err(Custom("invalid length 1, expected an array of length 2")), got {:?}"#, + e + ), + } } /// A list with fixed-name elements are mixed with a list with variable-name /// elements in an XML, and the first element is a variable-name one #[test] fn overlapped_fixed_after() { - let data: Pair = from_str( + let data = from_str::( r#" @@ -2048,16 +2203,27 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Pair { item: [Choice::One, Choice::Two, Choice::Other("three".into())], element: [(), ()], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "invalid length 1, expected an array of length 3") + } + e => panic!( + r#"Expected Err(Custom("invalid length 1, expected an array of length 3")), got {:?}"#, + e + ), + } } } @@ -2105,7 +2271,7 @@ mod seq { #[test] #[ignore = "There is no way to associate XML elements with `item` or `element` without extra knowledge from type"] fn overlapped() { - let data: Pair = from_str( + let data = from_str::( r#" @@ -2115,16 +2281,27 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Pair { item: [Choice::One, Choice::Two, Choice::Other("three".into())], element: [Choice2::First, Choice2::Second], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "invalid length 1, expected an array of length 3") + } + e => panic!( + r#"Expected Err(Custom("invalid length 1, expected an array of length 3")), got {:?}"#, + e + ), + } } } } @@ -2331,7 +2508,7 @@ mod seq { #[test] fn overlapped() { - let data: Root = from_str( + let data = from_str::( r#" @@ -2340,16 +2517,25 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Root { node: (), item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => assert_eq!(e, "duplicate field `$value`"), + e => panic!( + r#"Expected Err(Custom("duplicate field `$value`")), got {:?}"#, + e + ), + } } } @@ -2416,7 +2602,7 @@ mod seq { #[test] fn overlapped() { - let data: Root = from_str( + let data = from_str::( r#" @@ -2425,16 +2611,25 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Root { item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], node: (), } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => assert_eq!(e, "duplicate field `$value`"), + e => panic!( + r#"Expected Err(Custom("duplicate field `$value`")), got {:?}"#, + e + ), + } } } @@ -2512,7 +2707,7 @@ mod seq { /// elements in an XML, and the first element is a fixed-name one #[test] fn overlapped_fixed_before() { - let data: Pair = from_str( + let data = from_str::( r#" @@ -2522,23 +2717,32 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Pair { item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], element: vec![(), ()], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => assert_eq!(e, "duplicate field `element`"), + e => panic!( + r#"Expected Err(Custom("duplicate field `element`")), got {:?}"#, + e + ), + } } /// A list with fixed-name elements are mixed with a list with variable-name /// elements in an XML, and the first element is a variable-name one #[test] fn overlapped_fixed_after() { - let data: Pair = from_str( + let data = from_str::( r#" @@ -2548,16 +2752,25 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Pair { item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], element: vec![(), ()], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => assert_eq!(e, "duplicate field `$value`"), + e => panic!( + r#"Expected Err(Custom("duplicate field `$value`")), got {:?}"#, + e + ), + } } } @@ -2630,7 +2843,7 @@ mod seq { /// elements in an XML, and the first element is a fixed-name one #[test] fn overlapped_fixed_before() { - let data: Pair = from_str( + let data = from_str::( r#" @@ -2640,23 +2853,32 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Pair { element: vec![(), ()], item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => assert_eq!(e, "duplicate field `element`"), + e => panic!( + r#"Expected Err(Custom("duplicate field `element`")), got {:?}"#, + e + ), + } } /// A list with fixed-name elements are mixed with a list with variable-name /// elements in an XML, and the first element is a variable-name one #[test] fn overlapped_fixed_after() { - let data: Pair = from_str( + let data = from_str::( r#" @@ -2666,16 +2888,25 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Pair { element: vec![(), ()], item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => assert_eq!(e, "duplicate field `$value`"), + e => panic!( + r#"Expected Err(Custom("duplicate field `$value`")), got {:?}"#, + e + ), + } } } @@ -2723,7 +2954,7 @@ mod seq { #[test] #[ignore = "There is no way to associate XML elements with `item` or `element` without extra knowledge from type"] fn overlapped() { - let data: Pair = from_str( + let data = from_str::( r#" @@ -2733,16 +2964,27 @@ mod seq { "#, - ) - .unwrap(); + ); + #[cfg(feature = "overlapped-lists")] assert_eq!( - data, + data.unwrap(), Pair { item: vec![Choice::One, Choice::Two, Choice::Other("three".into())], element: vec![Choice2::First, Choice2::Second], } ); + + #[cfg(not(feature = "overlapped-lists"))] + match data { + Err(DeError::Custom(e)) => { + assert_eq!(e, "invalid length 1, expected an array of length 3") + } + e => panic!( + r#"Expected Err(Custom("invalid length 1, expected an array of length 3")), got {:?}"#, + e + ), + } } } } From 59a5c76bafccaef189df7534a73bbe77bb13cd1b Mon Sep 17 00:00:00 2001 From: Mingun Date: Fri, 13 May 2022 22:06:25 +0500 Subject: [PATCH 6/6] seq: Allow to limit number of skipped events to prevent huge memory consumption Co-authored-by: Daniel Alley --- Cargo.toml | 6 +++ src/de/mod.rs | 127 ++++++++++++++++++++++++++++++++++++++++++++++++-- src/errors.rs | 8 ++++ 3 files changed, 137 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 8e503338..0da6e32e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -78,8 +78,14 @@ encoding = ["encoding_rs"] ## to quadratic parsing time, because the deserializer must check the list of ## events as many times as the number of sequence fields present in the schema. ## +## To reduce negative consequences, always [limit] the maximum number of events +## that [`Deserializer`] will buffer. +## ## This feature works only with `serialize` feature and has no effect if `serialize` ## is not enabled. +## +## [limit]: crate::de::Deserializer::event_buffer_size +## [`Deserializer`]: crate::de::Deserializer overlapped-lists = [] ## Enables support for [`serde`] serialization and deserialization diff --git a/src/de/mod.rs b/src/de/mod.rs index eb4a7976..71ba50c7 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -229,6 +229,8 @@ use std::borrow::Cow; #[cfg(feature = "overlapped-lists")] use std::collections::VecDeque; use std::io::BufRead; +#[cfg(feature = "overlapped-lists")] +use std::num::NonZeroUsize; pub(crate) const INNER_VALUE: &str = "$value"; pub(crate) const UNFLATTEN_PREFIX: &str = "$unflatten="; @@ -277,6 +279,12 @@ where /// moved from this to [`Self::read`]. #[cfg(feature = "overlapped-lists")] write: VecDeque>, + /// Maximum number of events that can be skipped when processing sequences + /// that occur out-of-order. This field is used to prevent potential + /// denial-of-service (DoS) attacks which could cause infinite memory + /// consumption when parsing a very large amount of XML into a sequence field. + #[cfg(feature = "overlapped-lists")] + limit: Option, #[cfg(not(feature = "overlapped-lists"))] peek: Option>, @@ -375,6 +383,8 @@ where read: VecDeque::new(), #[cfg(feature = "overlapped-lists")] write: VecDeque::new(), + #[cfg(feature = "overlapped-lists")] + limit: None, #[cfg(not(feature = "overlapped-lists"))] peek: None, @@ -387,6 +397,71 @@ where Self::new(reader) } + /// Set the maximum number of events that could be skipped during deserialization + /// of sequences. + /// + /// If `` contains more than specified nested elements, `#text` or + /// CDATA nodes, then [`DeError::TooManyEvents`] will be returned during + /// deserialization of sequence field (any type that uses [`deserialize_seq`] + /// for the deserialization, for example, `Vec`). + /// + /// This method can be used to prevent a [DoS] attack and infinite memory + /// consumption when parsing a very large XML to a sequence field. + /// + /// It is strongly recommended to set limit to some value when you parse data + /// from untrusted sources. You should choose a value that your typical XMLs + /// can have _between_ different elements that corresponds to the same sequence. + /// + /// # Examples + /// + /// Let's imagine, that we deserialize such structure: + /// ``` + /// struct List { + /// item: Vec<()>, + /// } + /// ``` + /// + /// The XML that we try to parse look like this: + /// ```xml + /// + /// + /// + /// + /// with text + /// + /// + /// + /// + /// + /// + /// + /// ``` + /// + /// There, when we deserialize the `item` field, we need to buffer 7 events, + /// before we can deserialize the second ``: + /// + /// - `` + /// - `` + /// - `#text(with text)` + /// - `` + /// - `` (virtual start event) + /// - `` (vitrual end event) + /// - `` + /// + /// Note, that `` internally represented as 2 events: + /// one for the start tag and one for the end tag. In the future this can be + /// eliminated, but for now we use [auto-expanding feature] of a reader, + /// because this simplifies deserializer code. + /// + /// [`deserialize_seq`]: serde::Deserializer::deserialize_seq + /// [DoS]: https://en.wikipedia.org/wiki/Denial-of-service_attack + /// [auto-expanding feature]: Reader::expand_empty_elements + #[cfg(feature = "overlapped-lists")] + pub fn event_buffer_size(&mut self, limit: Option) -> &mut Self { + self.limit = limit; + self + } + #[cfg(feature = "overlapped-lists")] fn peek(&mut self) -> Result<&DeEvent<'de>, DeError> { if self.read.is_empty() { @@ -435,7 +510,7 @@ where #[cfg(feature = "overlapped-lists")] fn skip(&mut self) -> Result<(), DeError> { let event = self.next()?; - self.write.push_back(event); + self.skip_event(event)?; match self.write.back() { // Skip all subtree, if we skip a start event Some(DeEvent::Start(e)) => { @@ -445,17 +520,17 @@ where let event = self.next()?; match event { DeEvent::Start(ref e) if e.name() == end => { - self.write.push_back(event); + self.skip_event(event)?; depth += 1; } DeEvent::End(ref e) if e.name() == end => { - self.write.push_back(event); + self.skip_event(event)?; if depth == 0 { return Ok(()); } depth -= 1; } - _ => self.write.push_back(event), + _ => self.skip_event(event)?, } } } @@ -463,6 +538,18 @@ where } } + #[cfg(feature = "overlapped-lists")] + #[inline] + fn skip_event(&mut self, event: DeEvent<'de>) -> Result<(), DeError> { + if let Some(max) = self.limit { + if self.write.len() >= max.get() { + return Err(DeError::TooManyEvents(max)); + } + } + self.write.push_back(event); + Ok(()) + } + /// Moves all buffered events to the end of [`Self::write`] buffer and swaps /// read and write buffers. /// @@ -1169,6 +1256,38 @@ mod tests { assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"root"))); } + + /// Checks that limiting buffer size works correctly + #[test] + fn limit() { + use serde::Deserialize; + + #[derive(Debug, Deserialize)] + #[allow(unused)] + struct List { + item: Vec<()>, + } + + let mut de = Deserializer::from_slice( + br#" + + + + with text + + + + + + "#, + ); + de.event_buffer_size(NonZeroUsize::new(3)); + + match List::deserialize(&mut de) { + Err(DeError::TooManyEvents(count)) => assert_eq!(count.get(), 3), + e => panic!("Expected `Err(TooManyEvents(3))`, but found {:?}", e), + } + } } #[test] diff --git a/src/errors.rs b/src/errors.rs index cc92c7d0..54a87205 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -116,6 +116,8 @@ pub mod serialize { use super::*; use crate::utils::write_byte_string; use std::fmt; + #[cfg(feature = "overlapped-lists")] + use std::num::NonZeroUsize; use std::num::{ParseFloatError, ParseIntError}; /// (De)serialization error @@ -159,6 +161,10 @@ pub mod serialize { ExpectedStart, /// Unsupported operation Unsupported(&'static str), + /// Too many events were skipped while deserializing a sequence, event limit + /// exceeded. The limit was provided as an argument + #[cfg(feature = "overlapped-lists")] + TooManyEvents(NonZeroUsize), } impl fmt::Display for DeError { @@ -183,6 +189,8 @@ pub mod serialize { DeError::UnexpectedEof => write!(f, "Unexpected `Event::Eof`"), DeError::ExpectedStart => write!(f, "Expecting `Event::Start`"), DeError::Unsupported(s) => write!(f, "Unsupported operation {}", s), + #[cfg(feature = "overlapped-lists")] + DeError::TooManyEvents(s) => write!(f, "Deserializer buffers {} events, limit exceeded", s), } } }