Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #469 - parsing from buffered reader #471

Merged
merged 2 commits into from Sep 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/reader/async_tokio.rs
Expand Up @@ -377,7 +377,7 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
#[cfg(test)]
mod test {
use super::TokioAdapter;
use crate::reader::test::check;
use crate::reader::test::{check, small_buffers};

check!(
#[tokio::test]
Expand All @@ -387,4 +387,10 @@ mod test {
&mut Vec::new(),
async, await
);

small_buffers!(
#[tokio::test]
read_event_into_async: tokio::io::BufReader<_>,
async, await
);
}
9 changes: 7 additions & 2 deletions src/reader/buffered_reader.rs
Expand Up @@ -117,7 +117,7 @@ macro_rules! impl_buffered_source {
// somewhere sane rather than at the EOF
Ok(n) if n.is_empty() => return Err(bang_type.to_err()),
Ok(available) => {
if let Some((consumed, used)) = bang_type.parse(available, read) {
if let Some((consumed, used)) = bang_type.parse(buf, available) {
buf.extend_from_slice(consumed);

self $(.$reader)? .consume(used);
Expand Down Expand Up @@ -406,7 +406,7 @@ impl Reader<BufReader<File>> {

#[cfg(test)]
mod test {
use crate::reader::test::check;
use crate::reader::test::{check, small_buffers};
use crate::reader::XmlSource;

/// Default buffer constructor just pass the byte array from the test
Expand All @@ -422,6 +422,11 @@ mod test {
&mut Vec::new()
);

small_buffers!(
#[test]
read_event_into: std::io::BufReader<_>
);

#[cfg(feature = "encoding")]
mod encoding {
use crate::events::Event;
Expand Down
188 changes: 181 additions & 7 deletions src/reader/mod.rs
Expand Up @@ -742,25 +742,50 @@ impl BangType {

/// If element is finished, returns its content up to `>` symbol and
/// an index of this symbol, otherwise returns `None`
///
/// # Parameters
/// - `buf`: buffer with data consumed on previous iterations
/// - `chunk`: data read on current iteration and not yet consumed from reader
#[inline(always)]
fn parse<'b>(&self, chunk: &'b [u8], offset: usize) -> Option<(&'b [u8], usize)> {
fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
for i in memchr::memchr_iter(b'>', chunk) {
match self {
// Need to read at least 6 symbols (`!---->`) for properly finished comment
// <!----> - XML comment
// 012345 - i
Self::Comment => {
if offset + i > 4 && chunk[..i].ends_with(b"--") {
Self::Comment if buf.len() + i > 4 => {
if chunk[..i].ends_with(b"--") {
// We cannot strip last `--` from the buffer because we need it in case of
// check_comments enabled option. XML standard requires that comment
// will not end with `--->` sequence because this is a special case of
// `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `-|->` was splitted at |
// buf --/ \-- chunk
if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `--|>` was splitted at |
// buf --/ \-- chunk
if i == 0 && buf.ends_with(b"--") {
return Some((&[], i + 1)); // +1 for `>`
}
}
Self::Comment => {}
Self::CData => {
if chunk[..i].ends_with(b"]]") {
return Some((&chunk[..i - 2], i + 1)); // +1 for `>`
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `]|]>` was splitted at |
// buf --/ \-- chunk
if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `]]|>` was splitted at |
// buf --/ \-- chunk
if i == 0 && buf.ends_with(b"]]") {
return Some((&[], i + 1)); // +1 for `>`
}
}
Self::DocType => {
Expand Down Expand Up @@ -1021,7 +1046,7 @@ mod test {
$(.$await)?
.unwrap()
.map(|(ty, data)| (ty, Bytes(data))),
Some((BangType::CData, Bytes(b"![CDATA[")))
Some((BangType::CData, Bytes(b"![CDATA[]]")))
);
assert_eq!(position, 11);
}
Expand All @@ -1042,7 +1067,7 @@ mod test {
$(.$await)?
.unwrap()
.map(|(ty, data)| (ty, Bytes(data))),
Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content")))
Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]")))
);
assert_eq!(position, 28);
}
Expand Down Expand Up @@ -1751,8 +1776,157 @@ mod test {
};
}

// Export a macro for the child modules:
/// Tests for https://github.com/tafia/quick-xml/issues/469
macro_rules! small_buffers {
(
#[$test:meta]
$read_event:ident: $BufReader:ty
$(, $async:ident, $await:ident)?
) => {
mod small_buffers {
use crate::events::{BytesCData, BytesDecl, BytesStart, BytesText, Event};
use crate::reader::Reader;
use pretty_assertions::assert_eq;

#[$test]
$($async)? fn decl() {
let xml = "<?xml ?>";
// ^^^^^^^ data that fit into buffer
let size = xml.match_indices("?>").next().unwrap().0 + 1;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}

#[$test]
$($async)? fn pi() {
let xml = "<?pi?>";
// ^^^^^ data that fit into buffer
let size = xml.match_indices("?>").next().unwrap().0 + 1;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::PI(BytesText::new("pi"))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}

#[$test]
$($async)? fn empty() {
let xml = "<empty/>";
// ^^^^^^^ data that fit into buffer
let size = xml.match_indices("/>").next().unwrap().0 + 1;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Empty(BytesStart::new("empty"))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}

#[$test]
$($async)? fn cdata1() {
let xml = "<![CDATA[cdata]]>";
// ^^^^^^^^^^^^^^^ data that fit into buffer
let size = xml.match_indices("]]>").next().unwrap().0 + 1;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::CData(BytesCData::new("cdata"))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}

#[$test]
$($async)? fn cdata2() {
let xml = "<![CDATA[cdata]]>";
// ^^^^^^^^^^^^^^^^ data that fit into buffer
let size = xml.match_indices("]]>").next().unwrap().0 + 2;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::CData(BytesCData::new("cdata"))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}

#[$test]
$($async)? fn comment1() {
let xml = "<!--comment-->";
// ^^^^^^^^^^^^ data that fit into buffer
let size = xml.match_indices("-->").next().unwrap().0 + 1;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Comment(BytesText::new("comment"))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}

#[$test]
$($async)? fn comment2() {
let xml = "<!--comment-->";
// ^^^^^^^^^^^^^ data that fit into buffer
let size = xml.match_indices("-->").next().unwrap().0 + 2;
let br = <$BufReader>::with_capacity(size, xml.as_bytes());
let mut reader = Reader::from_reader(br);
let mut buf = Vec::new();

assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Comment(BytesText::new("comment"))
);
assert_eq!(
reader.$read_event(&mut buf) $(.$await)? .unwrap(),
Event::Eof
);
}
}
};
}

// Export macros for the child modules:
// - buffered_reader
// - slice_reader
pub(super) use check;
pub(super) use small_buffers;
}
7 changes: 6 additions & 1 deletion src/reader/parser.rs
Expand Up @@ -90,6 +90,7 @@ impl Parser {
let len = buf.len();
match bang_type {
BangType::Comment if buf.starts_with(b"!--") => {
debug_assert!(buf.ends_with(b"--"));
if self.check_comments {
// search if '--' not in comments
if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2])
Expand All @@ -105,7 +106,11 @@ impl Parser {
)))
}
BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
Ok(Event::CData(BytesCData::wrap(&buf[8..], self.decoder())))
debug_assert!(buf.ends_with(b"]]"));
Ok(Event::CData(BytesCData::wrap(
&buf[8..len - 2],
self.decoder(),
)))
}
BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
let start = buf[8..]
Expand Down
2 changes: 1 addition & 1 deletion src/reader/slice_reader.rs
Expand Up @@ -289,7 +289,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {

let bang_type = BangType::new(self[1..].first().copied())?;

if let Some((bytes, i)) = bang_type.parse(self, 0) {
if let Some((bytes, i)) = bang_type.parse(&[], self) {
*position += i;
*self = &self[i..];
return Ok(Some((bang_type, bytes)));
Expand Down