diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 570a8814..ba8ef70a 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -377,7 +377,7 @@ impl NsReader { #[cfg(test)] mod test { use super::TokioAdapter; - use crate::reader::test::check; + use crate::reader::test::{check, small_buffers}; check!( #[tokio::test] @@ -387,4 +387,10 @@ mod test { &mut Vec::new(), async, await ); + + small_buffers!( + #[tokio::test] + read_event_into_async: tokio::io::BufReader<_>, + async, await + ); } diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 4f83b6a2..6bd21b90 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -117,7 +117,7 @@ macro_rules! impl_buffered_source { // somewhere sane rather than at the EOF Ok(n) if n.is_empty() => return Err(bang_type.to_err()), Ok(available) => { - if let Some((consumed, used)) = bang_type.parse(available, read) { + if let Some((consumed, used)) = bang_type.parse(buf, available) { buf.extend_from_slice(consumed); self $(.$reader)? .consume(used); @@ -406,7 +406,7 @@ impl Reader> { #[cfg(test)] mod test { - use crate::reader::test::check; + use crate::reader::test::{check, small_buffers}; use crate::reader::XmlSource; /// Default buffer constructor just pass the byte array from the test @@ -422,6 +422,11 @@ mod test { &mut Vec::new() ); + small_buffers!( + #[test] + read_event_into: std::io::BufReader<_> + ); + #[cfg(feature = "encoding")] mod encoding { use crate::events::Event; diff --git a/src/reader/mod.rs b/src/reader/mod.rs index bc448300..0d94b2a4 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -742,25 +742,50 @@ impl BangType { /// If element is finished, returns its content up to `>` symbol and /// an index of this symbol, otherwise returns `None` + /// + /// # Parameters + /// - `buf`: buffer with data consumed on previous iterations + /// - `chunk`: data read on current iteration and not yet consumed from reader #[inline(always)] - fn parse<'b>(&self, chunk: &'b [u8], offset: usize) -> Option<(&'b [u8], usize)> { + fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { for i in memchr::memchr_iter(b'>', chunk) { match self { // Need to read at least 6 symbols (`!---->`) for properly finished comment // - XML comment // 012345 - i - Self::Comment => { - if offset + i > 4 && chunk[..i].ends_with(b"--") { + Self::Comment if buf.len() + i > 4 => { + if chunk[..i].ends_with(b"--") { // We cannot strip last `--` from the buffer because we need it in case of // check_comments enabled option. XML standard requires that comment // will not end with `--->` sequence because this is a special case of // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments) return Some((&chunk[..i], i + 1)); // +1 for `>` } + // End sequence `-|->` was splitted at | + // buf --/ \-- chunk + if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' { + return Some((&chunk[..i], i + 1)); // +1 for `>` + } + // End sequence `--|>` was splitted at | + // buf --/ \-- chunk + if i == 0 && buf.ends_with(b"--") { + return Some((&[], i + 1)); // +1 for `>` + } } + Self::Comment => {} Self::CData => { if chunk[..i].ends_with(b"]]") { - return Some((&chunk[..i - 2], i + 1)); // +1 for `>` + return Some((&chunk[..i], i + 1)); // +1 for `>` + } + // End sequence `]|]>` was splitted at | + // buf --/ \-- chunk + if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' { + return Some((&chunk[..i], i + 1)); // +1 for `>` + } + // End sequence `]]|>` was splitted at | + // buf --/ \-- chunk + if i == 0 && buf.ends_with(b"]]") { + return Some((&[], i + 1)); // +1 for `>` } } Self::DocType => { @@ -1021,7 +1046,7 @@ mod test { $(.$await)? .unwrap() .map(|(ty, data)| (ty, Bytes(data))), - Some((BangType::CData, Bytes(b"![CDATA["))) + Some((BangType::CData, Bytes(b"![CDATA[]]"))) ); assert_eq!(position, 11); } @@ -1042,7 +1067,7 @@ mod test { $(.$await)? .unwrap() .map(|(ty, data)| (ty, Bytes(data))), - Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content"))) + Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]"))) ); assert_eq!(position, 28); } @@ -1751,8 +1776,157 @@ mod test { }; } - // Export a macro for the child modules: + /// Tests for https://github.com/tafia/quick-xml/issues/469 + macro_rules! small_buffers { + ( + #[$test:meta] + $read_event:ident: $BufReader:ty + $(, $async:ident, $await:ident)? + ) => { + mod small_buffers { + use crate::events::{BytesCData, BytesDecl, BytesStart, BytesText, Event}; + use crate::reader::Reader; + use pretty_assertions::assert_eq; + + #[$test] + $($async)? fn decl() { + let xml = ""; + // ^^^^^^^ data that fit into buffer + let size = xml.match_indices("?>").next().unwrap().0 + 1; + let br = <$BufReader>::with_capacity(size, xml.as_bytes()); + let mut reader = Reader::from_reader(br); + let mut buf = Vec::new(); + + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3))) + ); + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::Eof + ); + } + + #[$test] + $($async)? fn pi() { + let xml = ""; + // ^^^^^ data that fit into buffer + let size = xml.match_indices("?>").next().unwrap().0 + 1; + let br = <$BufReader>::with_capacity(size, xml.as_bytes()); + let mut reader = Reader::from_reader(br); + let mut buf = Vec::new(); + + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::PI(BytesText::new("pi")) + ); + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::Eof + ); + } + + #[$test] + $($async)? fn empty() { + let xml = ""; + // ^^^^^^^ data that fit into buffer + let size = xml.match_indices("/>").next().unwrap().0 + 1; + let br = <$BufReader>::with_capacity(size, xml.as_bytes()); + let mut reader = Reader::from_reader(br); + let mut buf = Vec::new(); + + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::Empty(BytesStart::new("empty")) + ); + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::Eof + ); + } + + #[$test] + $($async)? fn cdata1() { + let xml = ""; + // ^^^^^^^^^^^^^^^ data that fit into buffer + let size = xml.match_indices("]]>").next().unwrap().0 + 1; + let br = <$BufReader>::with_capacity(size, xml.as_bytes()); + let mut reader = Reader::from_reader(br); + let mut buf = Vec::new(); + + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::CData(BytesCData::new("cdata")) + ); + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::Eof + ); + } + + #[$test] + $($async)? fn cdata2() { + let xml = ""; + // ^^^^^^^^^^^^^^^^ data that fit into buffer + let size = xml.match_indices("]]>").next().unwrap().0 + 2; + let br = <$BufReader>::with_capacity(size, xml.as_bytes()); + let mut reader = Reader::from_reader(br); + let mut buf = Vec::new(); + + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::CData(BytesCData::new("cdata")) + ); + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::Eof + ); + } + + #[$test] + $($async)? fn comment1() { + let xml = ""; + // ^^^^^^^^^^^^ data that fit into buffer + let size = xml.match_indices("-->").next().unwrap().0 + 1; + let br = <$BufReader>::with_capacity(size, xml.as_bytes()); + let mut reader = Reader::from_reader(br); + let mut buf = Vec::new(); + + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::Comment(BytesText::new("comment")) + ); + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::Eof + ); + } + + #[$test] + $($async)? fn comment2() { + let xml = ""; + // ^^^^^^^^^^^^^ data that fit into buffer + let size = xml.match_indices("-->").next().unwrap().0 + 2; + let br = <$BufReader>::with_capacity(size, xml.as_bytes()); + let mut reader = Reader::from_reader(br); + let mut buf = Vec::new(); + + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::Comment(BytesText::new("comment")) + ); + assert_eq!( + reader.$read_event(&mut buf) $(.$await)? .unwrap(), + Event::Eof + ); + } + } + }; + } + + // Export macros for the child modules: // - buffered_reader // - slice_reader pub(super) use check; + pub(super) use small_buffers; } diff --git a/src/reader/parser.rs b/src/reader/parser.rs index 7c3e9eba..a8f70add 100644 --- a/src/reader/parser.rs +++ b/src/reader/parser.rs @@ -90,6 +90,7 @@ impl Parser { let len = buf.len(); match bang_type { BangType::Comment if buf.starts_with(b"!--") => { + debug_assert!(buf.ends_with(b"--")); if self.check_comments { // search if '--' not in comments if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2]) @@ -105,7 +106,11 @@ impl Parser { ))) } BangType::CData if uncased_starts_with(buf, b"![CDATA[") => { - Ok(Event::CData(BytesCData::wrap(&buf[8..], self.decoder()))) + debug_assert!(buf.ends_with(b"]]")); + Ok(Event::CData(BytesCData::wrap( + &buf[8..len - 2], + self.decoder(), + ))) } BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { let start = buf[8..] diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index b90fa524..906437a8 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -289,7 +289,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { let bang_type = BangType::new(self[1..].first().copied())?; - if let Some((bytes, i)) = bang_type.parse(self, 0) { + if let Some((bytes, i)) = bang_type.parse(&[], self) { *position += i; *self = &self[i..]; return Ok(Some((bang_type, bytes)));