diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 9ecbabd4..6bd21b90 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -117,7 +117,7 @@ macro_rules! impl_buffered_source { // somewhere sane rather than at the EOF Ok(n) if n.is_empty() => return Err(bang_type.to_err()), Ok(available) => { - if let Some((consumed, used)) = bang_type.parse(available, read) { + if let Some((consumed, used)) = bang_type.parse(buf, available) { buf.extend_from_slice(consumed); self $(.$reader)? .consume(used); diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 4e547067..0d94b2a4 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -742,25 +742,50 @@ impl BangType { /// If element is finished, returns its content up to `>` symbol and /// an index of this symbol, otherwise returns `None` + /// + /// # Parameters + /// - `buf`: buffer with data consumed on previous iterations + /// - `chunk`: data read on current iteration and not yet consumed from reader #[inline(always)] - fn parse<'b>(&self, chunk: &'b [u8], offset: usize) -> Option<(&'b [u8], usize)> { + fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { for i in memchr::memchr_iter(b'>', chunk) { match self { // Need to read at least 6 symbols (`!---->`) for properly finished comment // - XML comment // 012345 - i - Self::Comment => { - if offset + i > 4 && chunk[..i].ends_with(b"--") { + Self::Comment if buf.len() + i > 4 => { + if chunk[..i].ends_with(b"--") { // We cannot strip last `--` from the buffer because we need it in case of // check_comments enabled option. XML standard requires that comment // will not end with `--->` sequence because this is a special case of // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments) return Some((&chunk[..i], i + 1)); // +1 for `>` } + // End sequence `-|->` was splitted at | + // buf --/ \-- chunk + if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' { + return Some((&chunk[..i], i + 1)); // +1 for `>` + } + // End sequence `--|>` was splitted at | + // buf --/ \-- chunk + if i == 0 && buf.ends_with(b"--") { + return Some((&[], i + 1)); // +1 for `>` + } } + Self::Comment => {} Self::CData => { if chunk[..i].ends_with(b"]]") { - return Some((&chunk[..i - 2], i + 1)); // +1 for `>` + return Some((&chunk[..i], i + 1)); // +1 for `>` + } + // End sequence `]|]>` was splitted at | + // buf --/ \-- chunk + if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' { + return Some((&chunk[..i], i + 1)); // +1 for `>` + } + // End sequence `]]|>` was splitted at | + // buf --/ \-- chunk + if i == 0 && buf.ends_with(b"]]") { + return Some((&[], i + 1)); // +1 for `>` } } Self::DocType => { @@ -1021,7 +1046,7 @@ mod test { $(.$await)? .unwrap() .map(|(ty, data)| (ty, Bytes(data))), - Some((BangType::CData, Bytes(b"![CDATA["))) + Some((BangType::CData, Bytes(b"![CDATA[]]"))) ); assert_eq!(position, 11); } @@ -1042,7 +1067,7 @@ mod test { $(.$await)? .unwrap() .map(|(ty, data)| (ty, Bytes(data))), - Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content"))) + Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]"))) ); assert_eq!(position, 28); } diff --git a/src/reader/parser.rs b/src/reader/parser.rs index 7c3e9eba..a8f70add 100644 --- a/src/reader/parser.rs +++ b/src/reader/parser.rs @@ -90,6 +90,7 @@ impl Parser { let len = buf.len(); match bang_type { BangType::Comment if buf.starts_with(b"!--") => { + debug_assert!(buf.ends_with(b"--")); if self.check_comments { // search if '--' not in comments if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2]) @@ -105,7 +106,11 @@ impl Parser { ))) } BangType::CData if uncased_starts_with(buf, b"![CDATA[") => { - Ok(Event::CData(BytesCData::wrap(&buf[8..], self.decoder()))) + debug_assert!(buf.ends_with(b"]]")); + Ok(Event::CData(BytesCData::wrap( + &buf[8..len - 2], + self.decoder(), + ))) } BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { let start = buf[8..] diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index b90fa524..906437a8 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -289,7 +289,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { let bang_type = BangType::new(self[1..].first().copied())?; - if let Some((bytes, i)) = bang_type.parse(self, 0) { + if let Some((bytes, i)) = bang_type.parse(&[], self) { *position += i; *self = &self[i..]; return Ok(Some((bang_type, bytes)));