Skip to content

Commit

Permalink
seq: Allow to limit number of skipped events to prevent huge memory c…
Browse files Browse the repository at this point in the history
…onsumption

Co-authored-by: Daniel Alley <dalley@redhat.com>
  • Loading branch information
Mingun and dralley committed Jun 2, 2022
1 parent ef1f1e8 commit 3ee447f
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 4 deletions.
6 changes: 6 additions & 0 deletions Cargo.toml
Expand Up @@ -78,8 +78,14 @@ encoding = ["encoding_rs"]
## to quadratic parsing time, because the deserializer must check the list of
## events as many times as the number of sequence fields present in the schema.
##
## ## To reduce negative consequences, always [limit] the maximum number of events
## that [`Deserializer`] will buffer.
##
## This feature works only with `serialize` feature and has no effect if `serialize`
## is not enabled.
##
## [limit]: crate::de::Deserializer::event_buffer_size
## [`Deserializer`]: crate::de::Deserializer
overlapped-lists = []

## Enables support for [`serde`] serialization and deserialization
Expand Down
127 changes: 123 additions & 4 deletions src/de/mod.rs
Expand Up @@ -229,6 +229,8 @@ use std::borrow::Cow;
#[cfg(feature = "overlapped-lists")]
use std::collections::VecDeque;
use std::io::BufRead;
#[cfg(feature = "overlapped-lists")]
use std::num::NonZeroUsize;

pub(crate) const INNER_VALUE: &str = "$value";
pub(crate) const UNFLATTEN_PREFIX: &str = "$unflatten=";
Expand Down Expand Up @@ -277,6 +279,12 @@ where
/// moved from this to [`Self::read`].
#[cfg(feature = "overlapped-lists")]
write: VecDeque<DeEvent<'de>>,
/// Maximum number of events that can be skipped when processing sequences
/// that occur out-of-order. This field is used to prevent potential
/// denial-of-service (DoS) attacks which could cause infinite memory
/// consumption when parsing a very large amount of XML into a sequence field.
#[cfg(feature = "overlapped-lists")]
limit: Option<NonZeroUsize>,

#[cfg(not(feature = "overlapped-lists"))]
peek: Option<DeEvent<'de>>,
Expand Down Expand Up @@ -375,6 +383,8 @@ where
read: VecDeque::new(),
#[cfg(feature = "overlapped-lists")]
write: VecDeque::new(),
#[cfg(feature = "overlapped-lists")]
limit: None,

#[cfg(not(feature = "overlapped-lists"))]
peek: None,
Expand All @@ -387,6 +397,71 @@ where
Self::new(reader)
}

/// Set the maximum number of events that could be skipped during deserialization
/// of sequences.
///
/// If `<element>` contains more than specified nested elements, `#text` or
/// CDATA nodes, then [`DeError::TooManyEvents`] will be returned during
/// deserialization of sequence field (any type that uses [`deserialize_seq`]
/// for the deserialization, for example, `Vec<T>`).
///
/// This method can be used to prevent a [DoS] attack and infinite memory
/// consumption when parsing a very large XML to a sequence field.
///
/// It is strongly recommended to set limit to some value when you parse data
/// from untrusted sources. You should choose a value that your typical XMLs
/// can have _between_ different elements that corresponds to the same sequence.
///
/// # Examples
///
/// Let's imagine, that we deserialize such structure:
/// ```
/// struct List {
/// item: Vec<()>,
/// }
/// ```
///
/// The XML that we try to parse look like this:
/// ```xml
/// <any-name>
/// <item/>
/// <!-- Bufferization starts at this point -->
/// <another-item>
/// <some-element>with text</some-element>
/// <yet-another-element/>
/// </another-item>
/// <!-- Buffer will be emptied at this point; 7 events were buffered -->
/// <item/>
/// <!-- There is nothing to buffer, because elements follows each other -->
/// <item/>
/// </any-name>
/// ```
///
/// There, when we deserialize the `item` field, we need to buffer 7 events,
/// before we can deserialize the second `<item/>`:
///
/// - `<another-item>`
/// - `<some-element>`
/// - `#text(with text)`
/// - `</some-element>`
/// - `<yet-another-element/>` (virtual start event)
/// - `<yet-another-element/>` (vitrual end event)
/// - `</another-item>`
///
/// Note, that `<yet-another-element/>` internally represented as 2 events:
/// one for the start tag and one for the end tag. In the future this can be
/// eliminated, but for now we use [auto-expanding feature] of a reader,
/// because this simplifies deserializer code.
///
/// [`deserialize_seq`]: serde::Deserializer::deserialize_seq
/// [DoS]: https://en.wikipedia.org/wiki/Denial-of-service_attack
/// [auto-expanding feature]: Reader::expand_empty_elements
#[cfg(feature = "overlapped-lists")]
pub fn event_buffer_size(&mut self, limit: Option<NonZeroUsize>) -> &mut Self {
self.limit = limit;
self
}

#[cfg(feature = "overlapped-lists")]
fn peek(&mut self) -> Result<&DeEvent<'de>, DeError> {
if self.read.is_empty() {
Expand Down Expand Up @@ -435,7 +510,7 @@ where
#[cfg(feature = "overlapped-lists")]
fn skip(&mut self) -> Result<(), DeError> {
let event = self.next()?;
self.write.push_back(event);
self.skip_event(event)?;
match self.write.back() {
// Skip all subtree, if we skip a start event
Some(DeEvent::Start(e)) => {
Expand All @@ -445,24 +520,36 @@ where
let event = self.next()?;
match event {
DeEvent::Start(ref e) if e.name() == end => {
self.write.push_back(event);
self.skip_event(event)?;
depth += 1;
}
DeEvent::End(ref e) if e.name() == end => {
self.write.push_back(event);
self.skip_event(event)?;
if depth == 0 {
return Ok(());
}
depth -= 1;
}
_ => self.write.push_back(event),
_ => self.skip_event(event)?,
}
}
}
_ => Ok(()),
}
}

#[cfg(feature = "overlapped-lists")]
#[inline]
fn skip_event(&mut self, event: DeEvent<'de>) -> Result<(), DeError> {
if let Some(max) = self.limit {
if self.write.len() >= max.get() {
return Err(DeError::TooManyEvents(max));
}
}
self.write.push_back(event);
Ok(())
}

/// Moves all buffered events to the end of [`Self::write`] buffer and swaps
/// read and write buffers.
///
Expand Down Expand Up @@ -1169,6 +1256,38 @@ mod tests {

assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"root")));
}

/// Checks that limiting buffer size works correctly
#[test]
fn limit() {
use serde::Deserialize;

#[derive(Debug, Deserialize)]
#[allow(unused)]
struct List {
item: Vec<()>,
}

let mut de = Deserializer::from_slice(
br#"
<any-name>
<item/>
<another-item>
<some-element>with text</some-element>
<yet-another-element/>
</another-item>
<item/>
<item/>
</any-name>
"#,
);
de.event_buffer_size(NonZeroUsize::new(3));

match List::deserialize(&mut de) {
Err(DeError::TooManyEvents(count)) => assert_eq!(count.get(), 3),
e => panic!("Expected `Err(TooManyEvents(3))`, but found {:?}", e),
}
}
}

#[test]
Expand Down
8 changes: 8 additions & 0 deletions src/errors.rs
Expand Up @@ -116,6 +116,8 @@ pub mod serialize {
use super::*;
use crate::utils::write_byte_string;
use std::fmt;
#[cfg(feature = "overlapped-lists")]
use std::num::NonZeroUsize;
use std::num::{ParseFloatError, ParseIntError};

/// (De)serialization error
Expand Down Expand Up @@ -159,6 +161,10 @@ pub mod serialize {
ExpectedStart,
/// Unsupported operation
Unsupported(&'static str),
/// Too many events were skipped while deserializing a sequence, event limit
/// exceeded. The limit was provided as an argument
#[cfg(feature = "overlapped-lists")]
TooManyEvents(NonZeroUsize),
}

impl fmt::Display for DeError {
Expand All @@ -183,6 +189,8 @@ pub mod serialize {
DeError::UnexpectedEof => write!(f, "Unexpected `Event::Eof`"),
DeError::ExpectedStart => write!(f, "Expecting `Event::Start`"),
DeError::Unsupported(s) => write!(f, "Unsupported operation {}", s),
#[cfg(feature = "overlapped-lists")]
DeError::TooManyEvents(s) => write!(f, "Deserializer buffers {} events, limit exceeded", s),
}
}
}
Expand Down

0 comments on commit 3ee447f

Please sign in to comment.