Skip to content

Commit

Permalink
Return Span of skipped bytes from read_to_end*
Browse files Browse the repository at this point in the history
  • Loading branch information
Mingun committed Aug 14, 2022
1 parent abec80f commit 7fafac4
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 23 deletions.
2 changes: 2 additions & 0 deletions Changelog.md
Expand Up @@ -40,6 +40,7 @@
- [#439]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()`
under the `quick-xml::encoding` namespace.
- [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers
- [#455]: Change return type of all `read_to_end*` methods to return a span between tags


### Bug Fixes
Expand Down Expand Up @@ -222,6 +223,7 @@
[#440]: https://github.com/tafia/quick-xml/pull/440
[#443]: https://github.com/tafia/quick-xml/pull/443
[#450]: https://github.com/tafia/quick-xml/pull/450
[#455]: https://github.com/tafia/quick-xml/pull/455


## 0.23.0 -- 2022-05-08
Expand Down
6 changes: 4 additions & 2 deletions src/de/mod.rs
Expand Up @@ -951,7 +951,8 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
fn read_to_end(&mut self, name: QName) -> Result<(), DeError> {
match self.reader.read_to_end_into(name, &mut self.buf) {
Err(Error::UnexpectedEof(_)) => Err(DeError::UnexpectedEof),
other => Ok(other?),
Err(e) => Err(e.into()),
Ok(_) => Ok(()),
}
}

Expand Down Expand Up @@ -991,7 +992,8 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
fn read_to_end(&mut self, name: QName) -> Result<(), DeError> {
match self.reader.read_to_end(name) {
Err(Error::UnexpectedEof(_)) => Err(DeError::UnexpectedEof),
other => Ok(other?),
Err(e) => Err(e.into()),
Ok(_) => Ok(()),
}
}

Expand Down
14 changes: 8 additions & 6 deletions src/reader/async_tokio.rs
Expand Up @@ -9,7 +9,9 @@ use tokio::io::{self, AsyncBufRead, AsyncBufReadExt};
use crate::events::Event;
use crate::name::{QName, ResolveResult};
use crate::reader::buffered_reader::impl_buffered_source;
use crate::reader::{is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader};
use crate::reader::{
is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span,
};
use crate::{Error, Result};

/// A struct for reading XML asynchronously from an [`AsyncBufRead`].
Expand Down Expand Up @@ -125,7 +127,7 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
/// // First, we read a start event...
/// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Start(start));
///
/// //...then, we could skip all events to the corresponding end event.
/// // ...then, we could skip all events to the corresponding end event.
/// // This call will correctly handle nested <outer> elements.
/// // Note, however, that this method does not handle namespaces.
/// reader.read_to_end_into_async(end.name(), &mut buf).await.unwrap();
Expand All @@ -142,8 +144,8 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
// We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033
end: QName<'n>,
buf: &mut Vec<u8>,
) -> Result<()> {
read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await)
) -> Result<Span> {
Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await))
}

/// Read until '<' is found and moves reader to an `Opened` state.
Expand Down Expand Up @@ -275,7 +277,7 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
/// (ResolveResult::Bound(ns), Event::Start(start))
/// );
///
/// //...then, we could skip all events to the corresponding end event.
/// // ...then, we could skip all events to the corresponding end event.
/// // This call will correctly handle nested <outer> elements.
/// // Note, however, that this method does not handle namespaces.
/// reader.read_to_end_into_async(end.name(), &mut buf).await.unwrap();
Expand All @@ -295,7 +297,7 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
// We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033
end: QName<'n>,
buf: &mut Vec<u8>,
) -> Result<()> {
) -> Result<Span> {
// According to the https://www.w3.org/TR/xml11/#dt-etag, end name should
// match literally the start name. See `Reader::check_end_names` documentation
self.reader.read_to_end_into_async(end, buf).await
Expand Down
15 changes: 10 additions & 5 deletions src/reader/buffered_reader.rs
Expand Up @@ -10,7 +10,7 @@ use memchr;
use crate::errors::{Error, Result};
use crate::events::Event;
use crate::name::QName;
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource};
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};

macro_rules! impl_buffered_source {
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
Expand Down Expand Up @@ -277,6 +277,10 @@ impl<R: BufRead> Reader<R> {
/// storage for events content. This function is supposed to be called after
/// you already read a [`Start`] event.
///
/// Returns a span that covers the content between `>` of an opening tag and `<` of
/// a closing tag, or an empty span if [`expand_empty_elements`] is set and
/// this method was called after reading an expanded [`Start`] event.
///
/// Manages nested cases where parent and child elements have the same name.
///
/// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`]
Expand Down Expand Up @@ -340,7 +344,7 @@ impl<R: BufRead> Reader<R> {
/// // First, we read a start event...
/// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
///
/// //...then, we could skip all events to the corresponding end event.
/// // ...then, we could skip all events to the corresponding end event.
/// // This call will correctly handle nested <outer> elements.
/// // Note, however, that this method does not handle namespaces.
/// reader.read_to_end_into(end.name(), &mut buf).unwrap();
Expand All @@ -353,12 +357,13 @@ impl<R: BufRead> Reader<R> {
/// [`End`]: Event::End
/// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
/// [`read_to_end()`]: Self::read_to_end
/// [`expand_empty_elements`]: Self::expand_empty_elements
/// [`check_end_names`]: Self::check_end_names
/// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<()> {
read_to_end!(self, end, buf, read_event_impl, {
pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
Ok(read_to_end!(self, end, buf, read_event_impl, {
buf.clear();
})
}))
}

/// Reads optional text between start and end tags.
Expand Down
10 changes: 9 additions & 1 deletion src/reader/mod.rs
Expand Up @@ -2,6 +2,7 @@

#[cfg(feature = "encoding")]
use encoding_rs::Encoding;
use std::ops::Range;

use crate::encoding::Decoder;
use crate::errors::{Error, Result};
Expand Down Expand Up @@ -238,16 +239,18 @@ macro_rules! read_to_end {
$clear:block
$(, $await:ident)?
) => {{
let start = $self.buffer_position();
let mut depth = 0;
loop {
$clear
let end = $self.buffer_position();
match $self.$read_event($buf) $(.$await)? {
Err(e) => return Err(e),

Ok(Event::Start(e)) if e.name() == $end => depth += 1,
Ok(Event::End(e)) if e.name() == $end => {
if depth == 0 {
return Ok(());
break start..end;
}
depth -= 1;
}
Expand All @@ -270,6 +273,11 @@ mod slice_reader;

pub use ns_reader::NsReader;

/// A range of byte offsets in the input that corresponds to some piece of XML.
pub type Span = Range<usize>;

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Possible reader states. The state transition diagram (`true` and `false` show
/// the value of the [`Reader::expand_empty_elements()`] option):
///
Expand Down
20 changes: 15 additions & 5 deletions src/reader/ns_reader.rs
Expand Up @@ -12,7 +12,7 @@ use std::path::Path;
use crate::errors::Result;
use crate::events::Event;
use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult};
use crate::reader::{Reader, XmlSource};
use crate::reader::{Reader, Span, XmlSource};

/// A low level encoding-agnostic XML event reader that performs namespace resolution.
///
Expand Down Expand Up @@ -425,6 +425,10 @@ impl<R: BufRead> NsReader<R> {
/// storage for events content. This function is supposed to be called after
/// you already read a [`Start`] event.
///
/// Returns a span that covers the content between `>` of an opening tag and `<` of
/// a closing tag, or an empty span if [`expand_empty_elements`] is set and
/// this method was called after reading an expanded [`Start`] event.
///
/// Manages nested cases where parent and child elements have the same name
/// ("the same" means that their local names are the same and their prefixes
/// resolve to the same namespace).
Expand Down Expand Up @@ -491,7 +495,7 @@ impl<R: BufRead> NsReader<R> {
/// (ResolveResult::Bound(ns), Event::Start(start))
/// );
///
/// //...then, we could skip all events to the corresponding end event.
/// // ...then, we could skip all events to the corresponding end event.
/// // This call will correctly handle nested <outer> elements.
/// // Note, however, that this method does not handle namespaces.
/// reader.read_to_end_into(end.name(), &mut buf).unwrap();
Expand All @@ -508,8 +512,9 @@ impl<R: BufRead> NsReader<R> {
/// [`UnexpectedEof`]: crate::errors::Error::UnexpectedEof
/// [`read_to_end()`]: Self::read_to_end
/// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
/// [`expand_empty_elements`]: Self::expand_empty_elements
#[inline]
pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<()> {
pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
// According to the https://www.w3.org/TR/xml11/#dt-etag, end name should
// match literally the start name. See `Self::check_end_names` documentation
self.reader.read_to_end_into(end, buf)
Expand Down Expand Up @@ -657,6 +662,10 @@ impl<'i> NsReader<&'i [u8]> {
/// Reads until end element is found. This function is supposed to be called
/// after you already read a [`Start`] event.
///
/// Returns a span that covers the content between `>` of an opening tag and `<` of
/// a closing tag, or an empty span if [`expand_empty_elements`] is set and
/// this method was called after reading an expanded [`Start`] event.
///
/// Manages nested cases where parent and child elements have the same name
/// ("the same" means that their local names are the same and their prefixes
/// resolve to the same namespace).
Expand Down Expand Up @@ -717,7 +726,7 @@ impl<'i> NsReader<&'i [u8]> {
/// (ResolveResult::Bound(ns), Event::Start(start))
/// );
///
/// //...then, we could skip all events to the corresponding end event.
/// // ...then, we could skip all events to the corresponding end event.
/// // This call will correctly handle nested <outer> elements.
/// // Note, however, that this method does not handle namespaces.
/// reader.read_to_end(end.name()).unwrap();
Expand All @@ -734,8 +743,9 @@ impl<'i> NsReader<&'i [u8]> {
/// [`UnexpectedEof`]: crate::errors::Error::UnexpectedEof
/// [`read_to_end()`]: Self::read_to_end
/// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
/// [`expand_empty_elements`]: Self::expand_empty_elements
#[inline]
pub fn read_to_end(&mut self, end: QName) -> Result<()> {
pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
// According to the https://www.w3.org/TR/xml11/#dt-etag, end name should
// match literally the start name. See `Self::check_end_names` documentation
self.reader.read_to_end(end)
Expand Down
13 changes: 9 additions & 4 deletions src/reader/slice_reader.rs
Expand Up @@ -10,7 +10,7 @@ use encoding_rs::UTF_8;
use crate::errors::{Error, Result};
use crate::events::Event;
use crate::name::QName;
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource};
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};

use memchr;

Expand Down Expand Up @@ -74,6 +74,10 @@ impl<'a> Reader<&'a [u8]> {
/// Reads until end element is found. This function is supposed to be called
/// after you already read a [`Start`] event.
///
/// Returns a span that covers the content between `>` of an opening tag and `<` of
/// a closing tag, or an empty span if [`expand_empty_elements`] is set and
/// this method was called after reading an expanded [`Start`] event.
///
/// Manages nested cases where parent and child elements have the same name.
///
/// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`]
Expand Down Expand Up @@ -131,7 +135,7 @@ impl<'a> Reader<&'a [u8]> {
/// // First, we read a start event...
/// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
///
/// //...then, we could skip all events to the corresponding end event.
/// // ...then, we could skip all events to the corresponding end event.
/// // This call will correctly handle nested <outer> elements.
/// // Note, however, that this method does not handle namespaces.
/// reader.read_to_end(end.name()).unwrap();
Expand All @@ -143,10 +147,11 @@ impl<'a> Reader<&'a [u8]> {
/// [`Start`]: Event::Start
/// [`End`]: Event::End
/// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
/// [`expand_empty_elements`]: Self::expand_empty_elements
/// [`check_end_names`]: Self::check_end_names
/// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
pub fn read_to_end(&mut self, end: QName) -> Result<()> {
read_to_end!(self, end, (), read_event_impl, {})
pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
Ok(read_to_end!(self, end, (), read_event_impl, {}))
}
}

Expand Down

0 comments on commit 7fafac4

Please sign in to comment.