diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 2dfba93f..3216d9cf 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -22,6 +22,8 @@ jobs:
run: cargo test --features encoding,serialize
- name: Run tests (escape-html+serialize)
run: cargo test --features escape-html,serialize
+ - name: Run tests (all features)
+ run: cargo test --all-features
- name: Check fmt
run: cargo fmt -- --check
diff --git a/Cargo.toml b/Cargo.toml
index 1113a795..cfa8de1a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -42,6 +42,46 @@ default = []
## [standard compliant]: https://www.w3.org/TR/xml11/#charencoding
encoding = ["encoding_rs"]
+## This feature enables deserialization of lists whose elements are interleaved
+## with elements that do not belong to the list.
+##
+## When this feature is enabled, the following XML:
+## ```xml
+## <any-name>
+##   <item/>
+##   <another-item/>
+##   <item/>
+##   <item/>
+## </any-name>
+## ```
+## could be deserialized to a struct:
+## ```ignore
+## #[derive(Deserialize)]
+## #[serde(rename_all = "kebab-case")]
+## struct AnyName {
+## item: Vec<()>,
+## another_item: (),
+## }
+## ```
+##
+## When the feature is not enabled (default), only the first element will be
+## associated with a field, and the deserializer will report an error when it
+## encounters a second `<item>`.
+##
+## Note that enabling this feature can lead to high and even unlimited memory
+## consumption, because the deserializer has to check all events up to the end of
+## the container tag (`</any-name>` in that example) to figure out that there are
+## no more items for a field. If neither `</any-name>` nor EOF is ever encountered,
+## the parsing will never end, which can lead to DoS.
+##
+## Having several lists and overlapped elements for them in XML could also lead
+## to quadratic parsing time, because the deserializer has to check the list of
+## events as many times as there are sequence fields present.
+##
+## This feature works only with `serialize` feature and has no effect if `serialize`
+## is not enabled.
+overlapped-lists = []
+
## Enables support for [`serde`] serialization and deserialization
serialize = ["serde"]
diff --git a/Changelog.md b/Changelog.md
index f31b4287..95b226a3 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -10,6 +10,11 @@
## Unreleased
+### New Features
+
+- [#12]: Allow overlapping between elements of sequence and other elements
+ (using new feature `overlapped-lists`)
+
### Bug Fixes
- [#9]: Deserialization erroneously was successful in some cases where error is expected.
diff --git a/src/de/map.rs b/src/de/map.rs
index 97db27ae..167311b4 100644
--- a/src/de/map.rs
+++ b/src/de/map.rs
@@ -105,6 +105,9 @@ enum ValueSource {
/// [list of known fields]: MapAccess::fields
Content,
/// Next value should be deserialized from an element with a dedicated name.
+ /// If the deserialized type is a sequence, then that sequence will collect all
+ /// elements with the same name until it is filled. If not all elements
+ /// are consumed, the rest will be ignored.
///
/// That state is set when call to [`peek()`] returns a [`Start`] event, which
/// [`name()`] represents a field name. That name will be deserialized as a key.
@@ -585,20 +588,29 @@ where
T: DeserializeSeed<'de>,
{
let decoder = self.map.de.reader.decoder();
- match self.map.de.peek()? {
- // Stop iteration when list elements ends
- DeEvent::Start(e) if !self.filter.is_suitable(&e, decoder)? => Ok(None),
+ loop {
+ break match self.map.de.peek()? {
+ // If we see a tag that we not interested, skip it
+ #[cfg(feature = "overlapped-lists")]
+ DeEvent::Start(e) if !self.filter.is_suitable(&e, decoder)? => {
+ self.map.de.skip()?;
+ continue;
+ }
+ // Stop iteration when list elements ends
+ #[cfg(not(feature = "overlapped-lists"))]
+ DeEvent::Start(e) if !self.filter.is_suitable(&e, decoder)? => Ok(None),
- // Stop iteration after reaching a closing tag
- DeEvent::End(e) if e.name() == self.map.start.name() => Ok(None),
- // This is a unmatched closing tag, so the XML is invalid
- DeEvent::End(e) => Err(DeError::UnexpectedEnd(e.name().to_owned())),
- // We cannot get `Eof` legally, because we always inside of the
- // opened tag `self.map.start`
- DeEvent::Eof => Err(DeError::UnexpectedEof),
+ // Stop iteration after reaching a closing tag
+ DeEvent::End(e) if e.name() == self.map.start.name() => Ok(None),
+ // This is a unmatched closing tag, so the XML is invalid
+ DeEvent::End(e) => Err(DeError::UnexpectedEnd(e.name().to_owned())),
+ // We cannot get `Eof` legally, because we always inside of the
+ // opened tag `self.map.start`
+ DeEvent::Eof => Err(DeError::UnexpectedEof),
- // Start(tag), Text, CData
- _ => seed.deserialize(&mut *self.map.de).map(Some),
+ // Start(tag), Text, CData
+ _ => seed.deserialize(&mut *self.map.de).map(Some),
+ };
}
}
}
diff --git a/src/de/mod.rs b/src/de/mod.rs
index 10b5c987..4b8f2290 100644
--- a/src/de/mod.rs
+++ b/src/de/mod.rs
@@ -226,6 +226,8 @@ use crate::{
};
use serde::de::{self, Deserialize, DeserializeOwned, Visitor};
use std::borrow::Cow;
+#[cfg(feature = "overlapped-lists")]
+use std::collections::VecDeque;
use std::io::BufRead;
pub(crate) const INNER_VALUE: &str = "$value";
@@ -248,12 +250,35 @@ pub enum DeEvent<'a> {
Eof,
}
-/// An xml deserializer
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// A structure that deserializes XML into Rust values.
pub struct Deserializer<'de, R>
where
R: XmlRead<'de>,
{
+ /// An XML reader that streams events into this deserializer.
reader: R,
+
+ /// When deserializing sequences we sometimes have to skip unwanted events.
+ /// Those events are stored and replayed later. This is the replay buffer:
+ /// while it is not empty, events are taken from it. Once it is exhausted,
+ /// events are requested from [`Self::reader`].
+ #[cfg(feature = "overlapped-lists")]
+ read: VecDeque>,
+ /// When deserializing sequences we sometimes have to skip events, because XML
+ /// is tolerant to element order: even if the XSD specifies the order strictly
+ /// (using `xs:sequence`), most XML parsers allow order violations.
+ /// That means that the elements forming a sequence could be interleaved with
+ /// other elements that are not related to that sequence.
+ ///
+ /// In order to support this, the deserializer scans events and stores the
+ /// skipped ones here. After a call to [`Self::start_replay()`] all events are
+ /// moved from this buffer to [`Self::read`].
+ #[cfg(feature = "overlapped-lists")]
+ write: VecDeque>,
+
+ #[cfg(not(feature = "overlapped-lists"))]
peek: Option>,
}
@@ -345,6 +370,13 @@ where
pub fn new(reader: R) -> Self {
Deserializer {
reader,
+
+ #[cfg(feature = "overlapped-lists")]
+ read: VecDeque::new(), // replay buffer starts empty: nothing to replay yet
+ #[cfg(feature = "overlapped-lists")]
+ write: VecDeque::new(), // no events have been skipped yet
+
+ #[cfg(not(feature = "overlapped-lists"))]
peek: None,
}
}
@@ -355,6 +387,20 @@ where
Self::new(reader)
}
+    /// Returns a reference to the next event without consuming it.
+    ///
+    /// If the replay buffer [`Self::read`] already holds an event, its front
+    /// element is returned. Otherwise one event is pulled from [`Self::reader`]
+    /// and parked in the buffer first, so that a following [`Self::next()`]
+    /// yields that same event.
+    ///
+    /// # Errors
+    /// Propagates any error reported by the underlying reader.
+    #[cfg(feature = "overlapped-lists")]
+    fn peek(&mut self) -> Result<&DeEvent<'de>, DeError> {
+        if self.read.is_empty() {
+            let fetched = self.reader.next()?;
+            self.read.push_front(fetched);
+        }
+        match self.read.front() {
+            Some(head) => Ok(head),
+            // The buffer cannot be empty here: it was refilled just above.
+            // NOTE: Can be replaced with `unsafe { std::hint::unreachable_unchecked() }`
+            // if unsafe code will be allowed
+            None => unreachable!(),
+        }
+    }
+ #[cfg(not(feature = "overlapped-lists"))]
fn peek(&mut self) -> Result<&DeEvent<'de>, DeError> {
if self.peek.is_none() {
self.peek = Some(self.reader.next()?);
@@ -370,12 +416,69 @@ where
}
fn next(&mut self) -> Result, DeError> {
+ // Replay skipped or peeked events first; the reader is consulted only when none are buffered
+ #[cfg(feature = "overlapped-lists")]
+ if let Some(event) = self.read.pop_front() {
+ return Ok(event);
+ }
+ #[cfg(not(feature = "overlapped-lists"))]
if let Some(e) = self.peek.take() {
return Ok(e);
}
self.reader.next()
}
+    /// Extracts an XML tree of events from the input and stores them in the
+    /// skipped events buffer ([`Self::write`]), from which they can be retrieved
+    /// later. You MUST call [`Self::start_replay()`] after calling this to give
+    /// access to the skipped events.
+    ///
+    /// If the next event is a `Start`, every event up to and including its
+    /// matching `End` is buffered; nested elements with the same name are
+    /// counted so that the correct closing tag ends the skip. Any other event
+    /// is buffered on its own.
+    ///
+    /// # Errors
+    /// Propagates errors of [`Self::next()`] and returns
+    /// [`DeError::UnexpectedEof`] if the input ends before the skipped element
+    /// is closed.
+    #[cfg(feature = "overlapped-lists")]
+    fn skip(&mut self) -> Result<(), DeError> {
+        let event = self.next()?;
+        self.write.push_back(event);
+        match self.write.back() {
+            // Skip all subtree, if we skip a start event
+            Some(DeEvent::Start(e)) => {
+                let end = e.name().to_owned();
+                // Number of currently open nested elements that share the name
+                // of the element being skipped
+                let mut depth = 0;
+                loop {
+                    let event = self.next()?;
+                    match event {
+                        DeEvent::Start(ref e) if e.name() == end => {
+                            self.write.push_back(event);
+                            depth += 1;
+                        }
+                        DeEvent::End(ref e) if e.name() == end => {
+                            self.write.push_back(event);
+                            if depth == 0 {
+                                return Ok(());
+                            }
+                            depth -= 1;
+                        }
+                        // Input ended inside the skipped element. No matching
+                        // `End` can arrive any more, so stop here instead of
+                        // looping (and buffering) forever.
+                        DeEvent::Eof => return Err(DeError::UnexpectedEof),
+                        _ => self.write.push_back(event),
+                    }
+                }
+            }
+            _ => Ok(()),
+        }
+    }
+
+    /// Makes previously skipped events available for reading again.
+    ///
+    /// Events still waiting in [`Self::read`] are appended after the events
+    /// collected in [`Self::write`], then the two buffers are swapped. As a
+    /// result [`Self::peek()`] and [`Self::next()`] first return the events
+    /// stored by [`Self::skip()`], then the events that were already queued for
+    /// replay, and only after that new events from the underlying reader.
+    ///
+    /// This method MUST be called if [`Self::skip()`] was called any number of
+    /// times since [`Self::new()`] or the previous `start_replay()`, otherwise
+    /// the skipped events will be lost.
+    #[cfg(feature = "overlapped-lists")]
+    fn start_replay(&mut self) {
+        // Keep stream order: skipped events first, not-yet-consumed ones after
+        let pending = self.read.drain(..);
+        self.write.extend(pending);
+        std::mem::swap(&mut self.write, &mut self.read);
+    }
+
fn next_start(&mut self) -> Result