From 914bca90124bdf0cb82570c5a6c397fb7aa7725a Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 7 Mar 2022 02:44:26 +0500 Subject: [PATCH] Fix internal panic when parsing not fully forming comment, CDATA and DOCTYPE. Fix tafia/quick-xml#344 --- Changelog.md | 2 + src/reader.rs | 100 +++++++++++++++++++++++++++----------------------- 2 files changed, 57 insertions(+), 45 deletions(-) diff --git a/Changelog.md b/Changelog.md index 7383e952..72fde96b 100644 --- a/Changelog.md +++ b/Changelog.md @@ -29,6 +29,8 @@ - fix: produce consistent error positions in buffered and borrowed readers - feat: `Error::UnexpectedBang` now provide the byte found - refactor: unify code for buffered and borrowed readers +- fix: fix internal panic message when parse malformed XML + ([#344](https://github.com/tafia/quick-xml/issues/344)) ## 0.23.0-alpha3 diff --git a/src/reader.rs b/src/reader.rs index 83602379..9825d278 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -274,7 +274,7 @@ impl Reader { // ` match self.reader.read_bang_element(buf, &mut self.buf_position) { Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_bang(bytes), + Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), Err(e) => Err(e), }, // ` Reader { /// reads `BytesElement` starting with a `!`, /// return `Comment`, `CData` or `DocType` event - fn read_bang<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result> { + fn read_bang<'a, 'b>(&'a mut self, bang_type: BangType, buf: &'b [u8]) -> Result> { let uncased_starts_with = |string: &[u8], prefix: &[u8]| { string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) }; let len = buf.len(); - if buf.starts_with(b"!--") { - // FIXME: actually, isn't, it misses - debug_assert!(len >= 5, "Minimum length guaranteed by read_bang_element"); - if self.check_comments { - // search if '--' not in comments - if let Some(p) = - memchr::memchr_iter(b'-', &buf[3..len - 2]).position(|p| buf[3 + p + 1] == b'-') - { - self.buf_position += len - p; - return Err(Error::UnexpectedToken("--".to_string())); + match bang_type { + BangType::Comment if buf.starts_with(b"!--") => { + if self.check_comments { + // search if '--' not in comments + if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2]) + .position(|p| buf[3 + p + 1] == b'-') + { + self.buf_position += len - p; + return Err(Error::UnexpectedToken("--".to_string())); + } } + Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2]))) } - Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2]))) - } else if uncased_starts_with(buf, b"![CDATA[") { - Ok(Event::CData(BytesText::from_plain(&buf[8..]))) - } else if uncased_starts_with(buf, b"!DOCTYPE") { - let start = buf[8..] - .iter() - .position(|b| !is_whitespace(*b)) - .unwrap_or_else(|| len - 8); - debug_assert!(start < len - 8, "DocType must have a name"); - Ok(Event::DocType(BytesText::from_escaped(&buf[8 + start..]))) - } else { - unreachable!("Proper bang start guaranteed by read_bang_element"); + BangType::CData if uncased_starts_with(buf, b"![CDATA[") => { + Ok(Event::CData(BytesText::from_plain(&buf[8..]))) + } + BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { + let start = buf[8..] + .iter() + .position(|b| !is_whitespace(*b)) + .unwrap_or_else(|| len - 8); + debug_assert!(start < len - 8, "DocType must have a name"); + Ok(Event::DocType(BytesText::from_escaped(&buf[8 + start..]))) + } + _ => Err(bang_type.to_err()), } } @@ -976,7 +977,11 @@ where /// - `position`: Will be increased by amount of bytes consumed /// /// [events]: crate::events::Event - fn read_bang_element(&mut self, buf: B, position: &mut usize) -> Result>; + fn read_bang_element( + &mut self, + buf: B, + position: &mut usize, + ) -> Result>; fn read_element(&mut self, buf: B, position: &mut usize) -> Result>; @@ -1042,7 +1047,7 @@ impl<'b, 'i, R: BufRead + 'i> BufferedInput<'b, 'i, &'b mut Vec> for R { &mut self, buf: &'b mut Vec, position: &mut usize, - ) -> Result> { + ) -> Result> { // Peeked one bang ('!') before being called, so it's guaranteed to // start with it. let start = buf.len(); @@ -1085,7 +1090,7 @@ impl<'b, 'i, R: BufRead + 'i> BufferedInput<'b, 'i, &'b mut Vec> for R { if read == 0 { Ok(None) } else { - Ok(Some(&buf[start..])) + Ok(Some((bang_type, &buf[start..]))) } } @@ -1259,7 +1264,11 @@ impl<'a> BufferedInput<'a, 'a, ()> for &'a [u8] { })) } - fn read_bang_element(&mut self, _buf: (), position: &mut usize) -> Result> { + fn read_bang_element( + &mut self, + _buf: (), + position: &mut usize, + ) -> Result> { // Peeked one bang ('!') before being called, so it's guaranteed to // start with it. debug_assert_eq!(self[0], b'!'); @@ -1269,7 +1278,7 @@ impl<'a> BufferedInput<'a, 'a, ()> for &'a [u8] { if let Some((bytes, i)) = bang_type.parse(self, 0) { *position += i; *self = &self[i..]; - return Ok(Some(bytes)); + return Ok(Some((bang_type, bytes))); } // Note: Do not update position, so the error points to @@ -1353,6 +1362,7 @@ impl<'a> BufferedInput<'a, 'a, ()> for &'a [u8] { } /// Possible elements started with ` CData, @@ -1741,7 +1751,7 @@ mod test { /// Checks that reading CDATA content works correctly mod cdata { use crate::errors::Error; - use crate::reader::BufferedInput; + use crate::reader::{BangType, BufferedInput}; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1798,8 +1808,8 @@ mod test { input .read_bang_element(buf, &mut position) .unwrap() - .map(Bytes), - Some(Bytes(b"![CDATA[")) + .map(|(ty, data)| (ty, Bytes(data))), + Some((BangType::CData, Bytes(b"![CDATA["))) ); assert_eq!(position, 11); } @@ -1818,8 +1828,8 @@ mod test { input .read_bang_element(buf, &mut position) .unwrap() - .map(Bytes), - Some(Bytes(b"![CDATA[cdata]] ]>content")) + .map(|(ty, data)| (ty, Bytes(data))), + Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content"))) ); assert_eq!(position, 28); } @@ -1843,7 +1853,7 @@ mod test { /// [specification]: https://www.w3.org/TR/xml11/#dt-comment mod comment { use crate::errors::Error; - use crate::reader::BufferedInput; + use crate::reader::{BangType, BufferedInput}; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1949,8 +1959,8 @@ mod test { input .read_bang_element(buf, &mut position) .unwrap() - .map(Bytes), - Some(Bytes(b"!----")) + .map(|(ty, data)| (ty, Bytes(data))), + Some((BangType::Comment, Bytes(b"!----"))) ); assert_eq!(position, 6); } @@ -1966,8 +1976,8 @@ mod test { input .read_bang_element(buf, &mut position) .unwrap() - .map(Bytes), - Some(Bytes(b"!--->comment<---")) + .map(|(ty, data)| (ty, Bytes(data))), + Some((BangType::Comment, Bytes(b"!--->comment<---"))) ); assert_eq!(position, 17); } @@ -1977,7 +1987,7 @@ mod test { mod doctype { mod uppercase { use crate::errors::Error; - use crate::reader::BufferedInput; + use crate::reader::{BangType, BufferedInput}; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -2028,8 +2038,8 @@ mod test { input .read_bang_element(buf, &mut position) .unwrap() - .map(Bytes), - Some(Bytes(b"!DOCTYPE")) + .map(|(ty, data)| (ty, Bytes(data))), + Some((BangType::DocType, Bytes(b"!DOCTYPE"))) ); assert_eq!(position, 9); } @@ -2055,7 +2065,7 @@ mod test { mod lowercase { use crate::errors::Error; - use crate::reader::BufferedInput; + use crate::reader::{BangType, BufferedInput}; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -2106,8 +2116,8 @@ mod test { input .read_bang_element(buf, &mut position) .unwrap() - .map(Bytes), - Some(Bytes(b"!doctype")) + .map(|(ty, data)| (ty, Bytes(data))), + Some((BangType::DocType, Bytes(b"!doctype"))) ); assert_eq!(position, 9); }