Skip to content

Commit

Permalink
xs:list: Implement deserialization of xs:lists
Browse files Browse the repository at this point in the history
  • Loading branch information
Mingun committed Mar 27, 2022
1 parent 210ca02 commit fd1ef88
Show file tree
Hide file tree
Showing 4 changed files with 218 additions and 13 deletions.
8 changes: 2 additions & 6 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,8 @@
- test: add tests for trivial documents (empty / only comment / `<root>...</root>` -- one tag with content)
- fix: CDATA was not handled in many cases where it should
- fix: do not unescape CDATA content because it is never escaped by design
([#311](https://github.com/tafia/quick-xml/issues/311)).

NOTE: now text content when deserialized into bytes (`Vec<u8>` / `&[u8]`), also unescaped.
It is impossible to get a raw XML data in bytes buffer. Actually, deserializing of bytes
should be prohibited, because XML cannot store raw byte data. You should store binary
data in a string hex- or base64- or any-other-schema-encoded.
([#311](https://github.com/tafia/quick-xml/issues/311))
- feat: add support for XML Schema `xs:list` ([#376](https://github.com/tafia/quick-xml/pull/376))

## 0.23.0-alpha3

Expand Down
48 changes: 42 additions & 6 deletions src/de/map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

use crate::{
de::escape::EscapedDeserializer,
de::simple_type::SimpleTypeDeserializer,
de::{BorrowingReader, DeEvent, Deserializer, INNER_VALUE, UNFLATTEN_PREFIX},
errors::serialize::DeError,
events::attributes::Attribute,
Expand All @@ -18,12 +19,23 @@ enum State {
/// `next_key_seed` checked the attributes list and found that it is not exhausted yet.
/// Next call to the `next_value_seed` will deserialize type from the attribute value
Attribute,
/// The same as `InnerValue`
/// Next event returned will be a [`DeEvent::Start`], which represents a key.
/// Value should be deserialized from that XML node:
///
/// ```xml
/// <any-tag>
/// <key>...</key>
/// <!--^^^^^^^^^^^^^^ - this node will be used to deserialize map value -->
/// </any-tag>
/// ```
Nested,
/// Value should be deserialized from the text content of the XML node:
///
/// ```xml
/// <...>text content for field value<...>
/// <any-tag>
/// <key>text content</key>
/// <!-- ^^^^^^^^^^^^ - this will be used to deserialize map value -->
/// </any-tag>
/// ```
InnerValue,
}
Expand Down Expand Up @@ -124,7 +136,7 @@ impl<'de, 'a, R: BorrowingReader<'de>> de::MapAccess<'de> for MapAccess<'de, 'a,
// TODO: This should be handled by #[serde(flatten)]
// See https://github.com/serde-rs/serde/issues/1905
DeEvent::Start(_) if has_value_field => {
self.state = State::InnerValue;
self.state = State::Nested;
seed.deserialize(INNER_VALUE.into_deserializer()).map(Some)
}
DeEvent::Start(e) => {
Expand All @@ -144,7 +156,7 @@ impl<'de, 'a, R: BorrowingReader<'de>> de::MapAccess<'de> for MapAccess<'de, 'a,
// #[serde(rename = "$unflatten=xxx")]
// xxx: String,
// }
self.state = State::InnerValue;
self.state = State::Nested;
seed.deserialize(self.unflatten_fields.remove(p).into_deserializer())
} else {
let name = Cow::Borrowed(e.local_name());
Expand All @@ -166,12 +178,36 @@ impl<'de, 'a, R: BorrowingReader<'de>> de::MapAccess<'de> for MapAccess<'de, 'a,
State::Attribute => {
let decoder = self.de.reader.decoder();
match self.next_attr()? {
Some(a) => seed.deserialize(EscapedDeserializer::new(a.value, decoder, true)),
Some(a) => {
//FIXME: we have to clone value because of wrong lifetimes on `a`
// It should be bound to the input lifetime, but it is instead bound
// to a deserializer lifetime
let value: Vec<_> = a.value.into_owned();
seed.deserialize(SimpleTypeDeserializer::new(value.into(), true, decoder))
}
// We set `Attribute` state only when we are sure that `next_attr()` returns a value
None => unreachable!(),
}
}
State::Nested | State::InnerValue => seed.deserialize(&mut *self.de),
// This case is checked by the "de::tests::xml_schema_lists::element" tests
State::InnerValue => {
let decoder = self.de.reader.decoder();
match self.de.next()? {
DeEvent::Text(e) => {
//TODO: It is better to store event content as part of state
seed.deserialize(SimpleTypeDeserializer::new(e.into_inner(), true, decoder))
}
// It is better to format similar code similarly, but rustfmt disagrees
#[rustfmt::skip]
DeEvent::CData(e) => {
//TODO: It is better to store event content as part of state
seed.deserialize(SimpleTypeDeserializer::new(e.into_inner(), false, decoder))
}
// SAFETY: We set `InnerValue` only when we have seen `Text` or `CData`
_ => unreachable!(),
}
}
State::Nested => seed.deserialize(&mut *self.de),
State::Empty => Err(DeError::EndOfAttributes),
}
}
Expand Down
169 changes: 168 additions & 1 deletion src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1031,7 +1031,11 @@ mod tests {
in_struct!(char_: char = "<root>r</root>", 'r');

in_struct!(string: String = "<root>escaped&#x20;string</root>", "escaped string".into());
in_struct!(byte_buf: ByteBuf = "<root>escaped&#x20;byte_buf</root>", ByteBuf(r"escaped byte_buf".into()));
// Byte buffers give access to the raw data from the input, so the content is never unescaped
// TODO: It is a bit unusual and it would be better to completely forbid deserialization
// into bytes, because XML cannot store any bytes natively. User should use some sort
// of encoding to string, for example, hex or base64
in_struct!(byte_buf: ByteBuf = "<root>escaped&#x20;byte_buf</root>", ByteBuf(r"escaped&#x20;byte_buf".into()));
}

/// Tests deserialization from CDATA content in a tag.
Expand Down Expand Up @@ -2328,4 +2332,167 @@ mod tests {
}
}
}

/// https://www.w3schools.com/xml/el_list.asp
mod xml_schema_lists {
use super::*;

// Generates a `#[test]` function called `$name` which deserializes `$xml`
// into `List<$type>` and checks that the `list` field is equal to `$result`.
// `List` and `from_str` are resolved at the place where the macro is invoked.
macro_rules! list {
    ($name:ident: $type:ty = $xml:literal => $result:expr) => {
        #[test]
        fn $name() {
            let expected = List { list: $result };
            let actual: List<$type> = from_str($xml).unwrap();

            assert_eq!(actual, expected);
        }
    };
}

// Generates a `#[test]` function called `$name` which expects that deserialization
// of `$xml` into `List<$type>` fails with the `DeError::$kind` variant whose payload
// is equal to `$err`. Any other error variant makes the generated test panic.
// `List`, `from_str` and `DeError` are resolved at the place where the macro is invoked.
macro_rules! err {
    ($name:ident: $type:ty = $xml:literal => $kind:ident($err:literal)) => {
        #[test]
        fn $name() {
            match from_str::<List<$type>>($xml).unwrap_err() {
                DeError::$kind(message) => assert_eq!(message, $err),
                other => panic!(
                    "Expected `{}({})`, found `{:?}`",
                    stringify!($kind),
                    $err,
                    other
                ),
            }
        }
    };
}

/// Checks that sequences can be deserialized from an XML attribute content
/// according to the `xs:list` XML Schema type
mod attribute {
use super::*;

// The field is named `list`, the same as the `list="..."` attribute
// of the `<root>` element used by every test below
#[derive(Debug, Deserialize, PartialEq)]
struct List<T> {
list: Vec<T>,
}

// Signed integers: items are delimited by spaces, negative values are accepted
list!(i8_: i8 = r#"<root list="1 -2 3"/>"# => vec![1, -2, 3]);
list!(i16_: i16 = r#"<root list="1 -2 3"/>"# => vec![1, -2, 3]);
list!(i32_: i32 = r#"<root list="1 -2 3"/>"# => vec![1, -2, 3]);
list!(i64_: i64 = r#"<root list="1 -2 3"/>"# => vec![1, -2, 3]);

// Unsigned integers
list!(u8_: u8 = r#"<root list="1 2 3"/>"# => vec![1, 2, 3]);
list!(u16_: u16 = r#"<root list="1 2 3"/>"# => vec![1, 2, 3]);
list!(u32_: u32 = r#"<root list="1 2 3"/>"# => vec![1, 2, 3]);
list!(u64_: u64 = r#"<root list="1 2 3"/>"# => vec![1, 2, 3]);

// 128-bit integers are tested only when serde is built with their support
serde_if_integer128! {
list!(i128_: i128 = r#"<root list="1 -2 3"/>"# => vec![1, -2, 3]);
list!(u128_: u128 = r#"<root list="1 2 3"/>"# => vec![1, 2, 3]);
}

list!(f32_: f32 = r#"<root list="1.23 -4.56 7.89"/>"# => vec![1.23, -4.56, 7.89]);
list!(f64_: f64 = r#"<root list="1.23 -4.56 7.89"/>"# => vec![1.23, -4.56, 7.89]);

list!(bool_: bool = r#"<root list="true false true"/>"# => vec![true, false, true]);
list!(char_: char = r#"<root list="4 2 j"/>"# => vec!['4', '2', 'j']);

// The escaped space `&#x20;` does not delimit items: it is expected
// to become a literal space inside of the third item
list!(string: String = r#"<root list="first second third&#x20;3"/>"# => vec![
"first".to_string(),
"second".to_string(),
"third 3".to_string(),
]);
// Byte buffers are expected to be rejected as list items with `DeError::Unsupported`
err!(byte_buf: ByteBuf = r#"<root list="first second third&#x20;3"/>"#
=> Unsupported("byte arrays are not supported as `xs:list` items"));

// Each item is expected to produce `()` regardless of its text,
// so only the number of items is checked
list!(unit: () = r#"<root list="1 second false"/>"# => vec![(), (), ()]);
}

/// Checks that sequences can be deserialized from an XML text content
/// according to the `xs:list` XML Schema type
mod element {
use super::*;

/// Target type for the tests of this module: a struct with a single
/// sequence field that is renamed to the special `$value` name.
#[derive(Debug, Deserialize, PartialEq)]
struct List<T> {
// Give it a special name that means text content of the XML node
#[serde(rename = "$value")]
list: Vec<T>,
}

// Checks deserialization of lists from the plain text content of the `<root>` element
mod text {
use super::*;

// Signed integers: items are delimited by spaces, negative values are accepted
list!(i8_: i8 = "<root>1 -2 3</root>" => vec![1, -2, 3]);
list!(i16_: i16 = "<root>1 -2 3</root>" => vec![1, -2, 3]);
list!(i32_: i32 = "<root>1 -2 3</root>" => vec![1, -2, 3]);
list!(i64_: i64 = "<root>1 -2 3</root>" => vec![1, -2, 3]);

// Unsigned integers
list!(u8_: u8 = "<root>1 2 3</root>" => vec![1, 2, 3]);
list!(u16_: u16 = "<root>1 2 3</root>" => vec![1, 2, 3]);
list!(u32_: u32 = "<root>1 2 3</root>" => vec![1, 2, 3]);
list!(u64_: u64 = "<root>1 2 3</root>" => vec![1, 2, 3]);

// 128-bit integers are tested only when serde is built with their support
serde_if_integer128! {
list!(i128_: i128 = "<root>1 -2 3</root>" => vec![1, -2, 3]);
list!(u128_: u128 = "<root>1 2 3</root>" => vec![1, 2, 3]);
}

list!(f32_: f32 = "<root>1.23 -4.56 7.89</root>" => vec![1.23, -4.56, 7.89]);
list!(f64_: f64 = "<root>1.23 -4.56 7.89</root>" => vec![1.23, -4.56, 7.89]);

list!(bool_: bool = "<root>true false true</root>" => vec![true, false, true]);
list!(char_: char = "<root>4 2 j</root>" => vec!['4', '2', 'j']);

// The escaped space `&#x20;` does not delimit items: it is expected
// to become a literal space inside of the third item
list!(string: String = "<root>first second third&#x20;3</root>" => vec![
"first".to_string(),
"second".to_string(),
"third 3".to_string(),
]);
// Byte buffers are expected to be rejected as list items with `DeError::Unsupported`
err!(byte_buf: ByteBuf = "<root>first second third&#x20;3</root>"
=> Unsupported("byte arrays are not supported as `xs:list` items"));

// Each item is expected to produce `()` regardless of its text,
// so only the number of items is checked
list!(unit: () = "<root>1 second false</root>" => vec![(), (), ()]);
}

// Checks deserialization of lists from the CDATA content of the `<root>` element.
// Every input of this module is wrapped into `<![CDATA[...]]>`, so the content
// reaches the deserializer as CDATA and not as ordinary text.
mod cdata {
    use super::*;

    // Signed integers: items are delimited by spaces, negative values are accepted
    list!(i8_: i8 = "<root><![CDATA[1 -2 3]]></root>" => vec![1, -2, 3]);
    list!(i16_: i16 = "<root><![CDATA[1 -2 3]]></root>" => vec![1, -2, 3]);
    list!(i32_: i32 = "<root><![CDATA[1 -2 3]]></root>" => vec![1, -2, 3]);
    list!(i64_: i64 = "<root><![CDATA[1 -2 3]]></root>" => vec![1, -2, 3]);

    // Unsigned integers
    list!(u8_: u8 = "<root><![CDATA[1 2 3]]></root>" => vec![1, 2, 3]);
    list!(u16_: u16 = "<root><![CDATA[1 2 3]]></root>" => vec![1, 2, 3]);
    list!(u32_: u32 = "<root><![CDATA[1 2 3]]></root>" => vec![1, 2, 3]);
    list!(u64_: u64 = "<root><![CDATA[1 2 3]]></root>" => vec![1, 2, 3]);

    // 128-bit integers are tested only when serde is built with their support
    serde_if_integer128! {
        list!(i128_: i128 = "<root><![CDATA[1 -2 3]]></root>" => vec![1, -2, 3]);
        list!(u128_: u128 = "<root><![CDATA[1 2 3]]></root>" => vec![1, 2, 3]);
    }

    list!(f32_: f32 = "<root><![CDATA[1.23 -4.56 7.89]]></root>" => vec![1.23, -4.56, 7.89]);
    list!(f64_: f64 = "<root><![CDATA[1.23 -4.56 7.89]]></root>" => vec![1.23, -4.56, 7.89]);

    list!(bool_: bool = "<root><![CDATA[true false true]]></root>" => vec![true, false, true]);
    list!(char_: char = "<root><![CDATA[4 2 j]]></root>" => vec!['4', '2', 'j']);

    // Cannot get whitespace in the value in any way if CDATA used:
    // - literal spaces means list item delimiters
    // - escaped sequences are not decoded in CDATA
    list!(string: String = "<root><![CDATA[first second third&#x20;3]]></root>" => vec![
        "first".to_string(),
        "second".to_string(),
        "third&#x20;3".to_string(),
    ]);
    // The input must be wrapped into CDATA, otherwise this test merely
    // repeats the `text::byte_buf` test and never touches the CDATA code path
    err!(byte_buf: ByteBuf = "<root><![CDATA[first second third&#x20;3]]></root>"
        => Unsupported("byte arrays are not supported as `xs:list` items"));

    // The same applies here: each CDATA item is expected to produce `()`
    // regardless of its text, so only the number of items is checked
    list!(unit: () = "<root><![CDATA[1 second false]]></root>" => vec![(), (), ()]);
}
}
}
}
6 changes: 6 additions & 0 deletions src/events/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,12 @@ impl<'a> BytesText<'a> {
Self::from_plain(content.as_bytes())
}

/// Extracts the inner `Cow` from the `BytesText` event container.
#[inline]
pub fn into_inner(self) -> Cow<'a, [u8]> {
self.content
}

/// Ensures that all data is owned to extend the object's lifetime if
/// necessary.
#[inline]
Expand Down

0 comments on commit fd1ef88

Please sign in to comment.