Skip to content

Commit

Permalink
Merge pull request #471 from Mingun/fix-buffered-parsing
Browse files Browse the repository at this point in the history
Fix #469 - parsing from buffered reader
  • Loading branch information
Mingun committed Sep 10, 2022
2 parents f8b292b + 75823d5 commit a10b1c3
Show file tree
Hide file tree
Showing 5 changed files with 202 additions and 12 deletions.
8 changes: 7 additions & 1 deletion src/reader/async_tokio.rs
Expand Up @@ -377,7 +377,7 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
#[cfg(test)]
mod test {
use super::TokioAdapter;
use crate::reader::test::check;
use crate::reader::test::{check, small_buffers};

check!(
#[tokio::test]
Expand All @@ -387,4 +387,10 @@ mod test {
&mut Vec::new(),
async, await
);

small_buffers!(
#[tokio::test]
read_event_into_async: tokio::io::BufReader<_>,
async, await
);
}
9 changes: 7 additions & 2 deletions src/reader/buffered_reader.rs
Expand Up @@ -117,7 +117,7 @@ macro_rules! impl_buffered_source {
// somewhere sane rather than at the EOF
Ok(n) if n.is_empty() => return Err(bang_type.to_err()),
Ok(available) => {
if let Some((consumed, used)) = bang_type.parse(available, read) {
if let Some((consumed, used)) = bang_type.parse(buf, available) {
buf.extend_from_slice(consumed);

self $(.$reader)? .consume(used);
Expand Down Expand Up @@ -406,7 +406,7 @@ impl Reader<BufReader<File>> {

#[cfg(test)]
mod test {
use crate::reader::test::check;
use crate::reader::test::{check, small_buffers};
use crate::reader::XmlSource;

/// Default buffer constructor just pass the byte array from the test
Expand All @@ -422,6 +422,11 @@ mod test {
&mut Vec::new()
);

small_buffers!(
#[test]
read_event_into: std::io::BufReader<_>
);

#[cfg(feature = "encoding")]
mod encoding {
use crate::events::Event;
Expand Down
188 changes: 181 additions & 7 deletions src/reader/mod.rs
Expand Up @@ -742,25 +742,50 @@ impl BangType {

/// If element is finished, returns its content up to `>` symbol and
/// an index of this symbol, otherwise returns `None`
///
/// # Parameters
/// - `buf`: buffer with data consumed on previous iterations
/// - `chunk`: data read on current iteration and not yet consumed from reader
#[inline(always)]
fn parse<'b>(&self, chunk: &'b [u8], offset: usize) -> Option<(&'b [u8], usize)> {
fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
for i in memchr::memchr_iter(b'>', chunk) {
match self {
// Need to read at least 6 symbols (`!---->`) for properly finished comment
// <!----> - XML comment
// 012345 - i
Self::Comment => {
if offset + i > 4 && chunk[..i].ends_with(b"--") {
Self::Comment if buf.len() + i > 4 => {
if chunk[..i].ends_with(b"--") {
// We cannot strip last `--` from the buffer because we need it in case of
// check_comments enabled option. XML standard requires that comment
// will not end with `--->` sequence because this is a special case of
// `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `-|->` was splitted at |
// buf --/ \-- chunk
if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `--|>` was splitted at |
// buf --/ \-- chunk
if i == 0 && buf.ends_with(b"--") {
return Some((&[], i + 1)); // +1 for `>`
}
}
Self::Comment => {}
Self::CData => {
if chunk[..i].ends_with(b"]]") {
return Some((&chunk[..i - 2], i + 1)); // +1 for `>`
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `]|]>` was splitted at |
// buf --/ \-- chunk
if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `]]|>` was splitted at |
// buf --/ \-- chunk
if i == 0 && buf.ends_with(b"]]") {
return Some((&[], i + 1)); // +1 for `>`
}
}
Self::DocType => {
Expand Down Expand Up @@ -1021,7 +1046,7 @@ mod test {
$(.$await)?
.unwrap()
.map(|(ty, data)| (ty, Bytes(data))),
Some((BangType::CData, Bytes(b"![CDATA[")))
Some((BangType::CData, Bytes(b"![CDATA[]]")))
);
assert_eq!(position, 11);
}
Expand All @@ -1042,7 +1067,7 @@ mod test {
$(.$await)?
.unwrap()
.map(|(ty, data)| (ty, Bytes(data))),
Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content")))
Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]")))
);
assert_eq!(position, 28);
}
Expand Down Expand Up @@ -1751,8 +1776,157 @@ mod test {
};
}

// Export a macro for the child modules:
/// Tests for https://github.com/tafia/quick-xml/issues/469
macro_rules! small_buffers {
(
#[$test:meta]
$read_event:ident: $BufReader:ty
$(, $async:ident, $await:ident)?
) => {
mod small_buffers {
use crate::events::{BytesCData, BytesDecl, BytesStart, BytesText, Event};
use crate::reader::Reader;
use pretty_assertions::assert_eq;

#[$test]
$($async)? fn decl() {
let xml = "<?xml ?>";
// ^^^^^^^ data that fit into buffer
let size = xml.match_indices("?>").next().unwrap().0 + 1;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}

#[$test]
$($async)? fn pi() {
let xml = "<?pi?>";
// ^^^^^ data that fit into buffer
let size = xml.match_indices("?>").next().unwrap().0 + 1;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::PI(BytesText::new("pi"))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}

#[$test]
$($async)? fn empty() {
let xml = "<empty/>";
// ^^^^^^^ data that fit into buffer
let size = xml.match_indices("/>").next().unwrap().0 + 1;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Empty(BytesStart::new("empty"))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}

#[$test]
$($async)? fn cdata1() {
let xml = "<![CDATA[cdata]]>";
// ^^^^^^^^^^^^^^^ data that fit into buffer
let size = xml.match_indices("]]>").next().unwrap().0 + 1;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::CData(BytesCData::new("cdata"))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}

#[$test]
$($async)? fn cdata2() {
let xml = "<![CDATA[cdata]]>";
// ^^^^^^^^^^^^^^^^ data that fit into buffer
let size = xml.match_indices("]]>").next().unwrap().0 + 2;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::CData(BytesCData::new("cdata"))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}

#[$test]
$($async)? fn comment1() {
let xml = "<!--comment-->";
// ^^^^^^^^^^^^ data that fit into buffer
let size = xml.match_indices("-->").next().unwrap().0 + 1;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Comment(BytesText::new("comment"))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}

#[$test]
$($async)? fn comment2() {
let xml = "<!--comment-->";
// ^^^^^^^^^^^^^ data that fit into buffer
let size = xml.match_indices("-->").next().unwrap().0 + 2;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Comment(BytesText::new("comment"))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}
}
};
}

// Export macros for the child modules:
// - buffered_reader
// - slice_reader
pub(super) use check;
pub(super) use small_buffers;
}
7 changes: 6 additions & 1 deletion src/reader/parser.rs
Expand Up @@ -90,6 +90,7 @@ impl Parser {
let len = buf.len();
match bang_type {
BangType::Comment if buf.starts_with(b"!--") => {
debug_assert!(buf.ends_with(b"--"));
if self.check_comments {
// search if '--' not in comments
if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2])
Expand All @@ -105,7 +106,11 @@ impl Parser {
)))
}
BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
Ok(Event::CData(BytesCData::wrap(&buf[8..], self.decoder())))
debug_assert!(buf.ends_with(b"]]"));
Ok(Event::CData(BytesCData::wrap(
&buf[8..len - 2],
self.decoder(),
)))
}
BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
let start = buf[8..]
Expand Down
2 changes: 1 addition & 1 deletion src/reader/slice_reader.rs
Expand Up @@ -289,7 +289,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {

let bang_type = BangType::new(self[1..].first().copied())?;

if let Some((bytes, i)) = bang_type.parse(self, 0) {
if let Some((bytes, i)) = bang_type.parse(&[], self) {
*position += i;
*self = &self[i..];
return Ok(Some((bang_type, bytes)));
Expand Down

0 comments on commit a10b1c3

Please sign in to comment.