diff --git a/Cargo.toml b/Cargo.toml index 8e503338..0da6e32e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -78,8 +78,14 @@ encoding = ["encoding_rs"] ## to quadratic parsing time, because the deserializer must check the list of ## events as many times as the number of sequence fields present in the schema. ## +## To reduce negative consequences, always [limit] the maximum number of events +## that [`Deserializer`] will buffer. +## ## This feature works only with `serialize` feature and has no effect if `serialize` ## is not enabled. +## +## [limit]: crate::de::Deserializer::event_buffer_size +## [`Deserializer`]: crate::de::Deserializer overlapped-lists = [] ## Enables support for [`serde`] serialization and deserialization diff --git a/src/de/mod.rs b/src/de/mod.rs index eb4a7976..71ba50c7 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -229,6 +229,8 @@ use std::borrow::Cow; #[cfg(feature = "overlapped-lists")] use std::collections::VecDeque; use std::io::BufRead; +#[cfg(feature = "overlapped-lists")] +use std::num::NonZeroUsize; pub(crate) const INNER_VALUE: &str = "$value"; pub(crate) const UNFLATTEN_PREFIX: &str = "$unflatten="; @@ -277,6 +279,12 @@ where /// moved from this to [`Self::read`]. #[cfg(feature = "overlapped-lists")] write: VecDeque>, + /// Maximum number of events that can be skipped when processing sequences + /// that occur out-of-order. This field is used to prevent potential + /// denial-of-service (DoS) attacks which could cause infinite memory + /// consumption when parsing a very large amount of XML into a sequence field. + #[cfg(feature = "overlapped-lists")] + limit: Option, #[cfg(not(feature = "overlapped-lists"))] peek: Option>, @@ -375,6 +383,8 @@ where read: VecDeque::new(), #[cfg(feature = "overlapped-lists")] write: VecDeque::new(), + #[cfg(feature = "overlapped-lists")] + limit: None, #[cfg(not(feature = "overlapped-lists"))] peek: None, @@ -387,6 +397,71 @@ where Self::new(reader) } + /// Set the maximum number of events that could be skipped during deserialization + /// of sequences. + /// + /// If `` contains more than specified nested elements, `#text` or + /// CDATA nodes, then [`DeError::TooManyEvents`] will be returned during + /// deserialization of sequence field (any type that uses [`deserialize_seq`] + /// for the deserialization, for example, `Vec`). + /// + /// This method can be used to prevent a [DoS] attack and infinite memory + /// consumption when parsing a very large XML to a sequence field. + /// + /// It is strongly recommended to set limit to some value when you parse data + /// from untrusted sources. You should choose a value that your typical XMLs + /// can have _between_ different elements that corresponds to the same sequence. + /// + /// # Examples + /// + /// Let's imagine, that we deserialize such structure: + /// ``` + /// struct List { + /// item: Vec<()>, + /// } + /// ``` + /// + /// The XML that we try to parse look like this: + /// ```xml + /// + /// + /// + /// + /// with text + /// + /// + /// + /// + /// + /// + /// + /// ``` + /// + /// There, when we deserialize the `item` field, we need to buffer 7 events, + /// before we can deserialize the second ``: + /// + /// - `` + /// - `` + /// - `#text(with text)` + /// - `` + /// - `` (virtual start event) + /// - `` (vitrual end event) + /// - `` + /// + /// Note, that `` internally represented as 2 events: + /// one for the start tag and one for the end tag. In the future this can be + /// eliminated, but for now we use [auto-expanding feature] of a reader, + /// because this simplifies deserializer code. + /// + /// [`deserialize_seq`]: serde::Deserializer::deserialize_seq + /// [DoS]: https://en.wikipedia.org/wiki/Denial-of-service_attack + /// [auto-expanding feature]: Reader::expand_empty_elements + #[cfg(feature = "overlapped-lists")] + pub fn event_buffer_size(&mut self, limit: Option) -> &mut Self { + self.limit = limit; + self + } + #[cfg(feature = "overlapped-lists")] fn peek(&mut self) -> Result<&DeEvent<'de>, DeError> { if self.read.is_empty() { @@ -435,7 +510,7 @@ where #[cfg(feature = "overlapped-lists")] fn skip(&mut self) -> Result<(), DeError> { let event = self.next()?; - self.write.push_back(event); + self.skip_event(event)?; match self.write.back() { // Skip all subtree, if we skip a start event Some(DeEvent::Start(e)) => { @@ -445,17 +520,17 @@ where let event = self.next()?; match event { DeEvent::Start(ref e) if e.name() == end => { - self.write.push_back(event); + self.skip_event(event)?; depth += 1; } DeEvent::End(ref e) if e.name() == end => { - self.write.push_back(event); + self.skip_event(event)?; if depth == 0 { return Ok(()); } depth -= 1; } - _ => self.write.push_back(event), + _ => self.skip_event(event)?, } } } @@ -463,6 +538,18 @@ where } } + #[cfg(feature = "overlapped-lists")] + #[inline] + fn skip_event(&mut self, event: DeEvent<'de>) -> Result<(), DeError> { + if let Some(max) = self.limit { + if self.write.len() >= max.get() { + return Err(DeError::TooManyEvents(max)); + } + } + self.write.push_back(event); + Ok(()) + } + /// Moves all buffered events to the end of [`Self::write`] buffer and swaps /// read and write buffers. /// @@ -1169,6 +1256,38 @@ mod tests { assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"root"))); } + + /// Checks that limiting buffer size works correctly + #[test] + fn limit() { + use serde::Deserialize; + + #[derive(Debug, Deserialize)] + #[allow(unused)] + struct List { + item: Vec<()>, + } + + let mut de = Deserializer::from_slice( + br#" + + + + with text + + + + + + "#, + ); + de.event_buffer_size(NonZeroUsize::new(3)); + + match List::deserialize(&mut de) { + Err(DeError::TooManyEvents(count)) => assert_eq!(count.get(), 3), + e => panic!("Expected `Err(TooManyEvents(3))`, but found {:?}", e), + } + } } #[test] diff --git a/src/errors.rs b/src/errors.rs index cc92c7d0..54a87205 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -116,6 +116,8 @@ pub mod serialize { use super::*; use crate::utils::write_byte_string; use std::fmt; + #[cfg(feature = "overlapped-lists")] + use std::num::NonZeroUsize; use std::num::{ParseFloatError, ParseIntError}; /// (De)serialization error @@ -159,6 +161,10 @@ pub mod serialize { ExpectedStart, /// Unsupported operation Unsupported(&'static str), + /// Too many events were skipped while deserializing a sequence, event limit + /// exceeded. The limit was provided as an argument + #[cfg(feature = "overlapped-lists")] + TooManyEvents(NonZeroUsize), } impl fmt::Display for DeError { @@ -183,6 +189,8 @@ pub mod serialize { DeError::UnexpectedEof => write!(f, "Unexpected `Event::Eof`"), DeError::ExpectedStart => write!(f, "Expecting `Event::Start`"), DeError::Unsupported(s) => write!(f, "Unsupported operation {}", s), + #[cfg(feature = "overlapped-lists")] + DeError::TooManyEvents(s) => write!(f, "Deserializer buffers {} events, limit exceeded", s), } } }