Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement read_text - a method that returns a text between two tags #455

Merged
merged 4 commits into from Aug 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 7 additions & 0 deletions Changelog.md
Expand Up @@ -40,6 +40,8 @@
- [#439]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()`
under the `quick-xml::encoding` namespace.
- [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers
- [#455]: Changed return type of all `read_to_end*` methods to return a span between tags
- [#455]: Added `Reader::read_text` method that returns the raw content (including markup) between tags


### Bug Fixes
Expand Down Expand Up @@ -139,6 +141,7 @@
|`*_with_custom_entities`|`*_with`
|`BytesText::unescaped()`|`BytesText::unescape()`
|`Attribute::unescaped_*`|`Attribute::unescape_*`
- [#329]: Also, those functions now borrow from the input instead of the event / attribute

- [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method
added to all events
Expand Down Expand Up @@ -181,6 +184,8 @@
- [#440]: Removed `Deserializer::from_slice` and `quick_xml::de::from_slice` methods because deserializing from a byte
array cannot guarantee borrowing due to possible copying while decoding.

- [#455]: Removed `Reader::read_text_into`, which was nothing more than a wrapper over a match on `Event::Text`

### New Tests

- [#9]: Added tests for incorrect nested tags in input
Expand All @@ -199,6 +204,7 @@
[#180]: https://github.com/tafia/quick-xml/issues/180
[#191]: https://github.com/tafia/quick-xml/issues/191
[#324]: https://github.com/tafia/quick-xml/issues/324
[#329]: https://github.com/tafia/quick-xml/issues/329
[#363]: https://github.com/tafia/quick-xml/issues/363
[#387]: https://github.com/tafia/quick-xml/pull/387
[#391]: https://github.com/tafia/quick-xml/pull/391
Expand All @@ -220,6 +226,7 @@
[#440]: https://github.com/tafia/quick-xml/pull/440
[#443]: https://github.com/tafia/quick-xml/pull/443
[#450]: https://github.com/tafia/quick-xml/pull/450
[#455]: https://github.com/tafia/quick-xml/pull/455


## 0.23.0 -- 2022-05-08
Expand Down
18 changes: 6 additions & 12 deletions examples/read_texts.rs
@@ -1,6 +1,5 @@
fn main() {
use quick_xml::events::Event;
use quick_xml::name::QName;
use quick_xml::Reader;

let xml = "<tag1>text1</tag1><tag1>text2</tag1>\
Expand All @@ -9,23 +8,18 @@ fn main() {
let mut reader = Reader::from_str(xml);
reader.trim_text(true);

let mut txt = Vec::new();
let mut buf = Vec::new();

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) if e.name().as_ref() == b"tag2" => {
txt.push(
reader
.read_text_into(QName(b"tag2"), &mut Vec::new())
.expect("Cannot decode text value"),
);
match reader.read_event() {
Ok(Event::Start(e)) if e.name().as_ref() == b"tag2" => {
// read_text_into for buffered readers not implemented
let txt = reader
.read_text(e.name())
.expect("Cannot decode text value");
println!("{:?}", txt);
}
Ok(Event::Eof) => break, // exits the loop when reaching end of file
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
_ => (), // There are several other `Event`s we do not consider here
}
buf.clear();
}
}
6 changes: 4 additions & 2 deletions src/de/mod.rs
Expand Up @@ -951,7 +951,8 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
fn read_to_end(&mut self, name: QName) -> Result<(), DeError> {
match self.reader.read_to_end_into(name, &mut self.buf) {
Err(Error::UnexpectedEof(_)) => Err(DeError::UnexpectedEof),
other => Ok(other?),
Err(e) => Err(e.into()),
Ok(_) => Ok(()),
}
}

Expand Down Expand Up @@ -991,7 +992,8 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
fn read_to_end(&mut self, name: QName) -> Result<(), DeError> {
match self.reader.read_to_end(name) {
Err(Error::UnexpectedEof(_)) => Err(DeError::UnexpectedEof),
other => Ok(other?),
Err(e) => Err(e.into()),
Ok(_) => Ok(()),
}
}

Expand Down
29 changes: 20 additions & 9 deletions src/events/attributes.rs
Expand Up @@ -41,7 +41,7 @@ impl<'a> Attribute<'a> {
///
/// This method is available only if `encoding` feature is **not** enabled.
#[cfg(any(doc, not(feature = "encoding")))]
pub fn unescape_value(&self) -> XmlResult<Cow<str>> {
pub fn unescape_value(&self) -> XmlResult<Cow<'a, str>> {
self.unescape_value_with(|_| None)
}

Expand All @@ -61,19 +61,26 @@ impl<'a> Attribute<'a> {
pub fn unescape_value_with<'entity>(
&self,
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
) -> XmlResult<Cow<str>> {
) -> XmlResult<Cow<'a, str>> {
// from_utf8 should never fail because content is always UTF-8 encoded
Ok(unescape_with(
std::str::from_utf8(&self.value)?,
resolve_entity,
)?)
let decoded = match &self.value {
Cow::Borrowed(bytes) => Cow::Borrowed(std::str::from_utf8(bytes)?),
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
Cow::Owned(bytes) => Cow::Owned(std::str::from_utf8(bytes)?.to_string()),
};

match unescape_with(&decoded, resolve_entity)? {
// Because the result is borrowed, no replacements were done and we can use the original string
Cow::Borrowed(_) => Ok(decoded),
Cow::Owned(s) => Ok(s.into()),
}
}

/// Decodes then unescapes the value.
///
/// This will allocate if the value contains any escape sequences or is in
/// a non-UTF-8 encoding.
pub fn decode_and_unescape_value<B>(&self, reader: &Reader<B>) -> XmlResult<Cow<str>> {
pub fn decode_and_unescape_value<B>(&self, reader: &Reader<B>) -> XmlResult<Cow<'a, str>> {
self.decode_and_unescape_value_with(reader, |_| None)
}

Expand All @@ -85,8 +92,12 @@ impl<'a> Attribute<'a> {
&self,
reader: &Reader<B>,
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
) -> XmlResult<Cow<str>> {
let decoded = reader.decoder().decode(&*self.value)?;
) -> XmlResult<Cow<'a, str>> {
let decoded = match &self.value {
Cow::Borrowed(bytes) => reader.decoder().decode(bytes)?,
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
Cow::Owned(bytes) => reader.decoder().decode(bytes)?.into_owned().into(),
};

match unescape_with(&decoded, resolve_entity)? {
// Because the result is borrowed, no replacements were done and we can use the original string
Expand Down
14 changes: 8 additions & 6 deletions src/events/mod.rs
Expand Up @@ -732,7 +732,7 @@ impl<'a> BytesText<'a> {
///
/// This will allocate if the value contains any escape sequences or is in
/// a non-UTF-8 encoding.
pub fn unescape(&self) -> Result<Cow<str>> {
pub fn unescape(&self) -> Result<Cow<'a, str>> {
self.unescape_with(|_| None)
}

Expand All @@ -743,8 +743,12 @@ impl<'a> BytesText<'a> {
pub fn unescape_with<'entity>(
&self,
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
) -> Result<Cow<str>> {
let decoded = self.decoder.decode(&*self)?;
) -> Result<Cow<'a, str>> {
let decoded = match &self.content {
Cow::Borrowed(bytes) => self.decoder.decode(bytes)?,
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
Cow::Owned(bytes) => self.decoder.decode(bytes)?.into_owned().into(),
};

match unescape_with(&decoded, resolve_entity)? {
// Because the result is borrowed, no replacements were done and we can use the original string
Expand All @@ -754,11 +758,9 @@ impl<'a> BytesText<'a> {
}

/// Gets content of this text buffer in the specified encoding and optionally
/// unescapes it. Unlike [`Self::unescape`] & Co., the lifetime
/// of the returned `Cow` is bound to the original buffer / input
/// unescapes it.
#[cfg(feature = "serialize")]
pub(crate) fn decode(&self, unescape: bool) -> Result<Cow<'a, str>> {
//TODO: too many copies, can be optimized
let text = match &self.content {
Cow::Borrowed(bytes) => self.decoder.decode(bytes)?,
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
Expand Down
14 changes: 8 additions & 6 deletions src/reader/async_tokio.rs
Expand Up @@ -9,7 +9,9 @@ use tokio::io::{self, AsyncBufRead, AsyncBufReadExt};
use crate::events::Event;
use crate::name::{QName, ResolveResult};
use crate::reader::buffered_reader::impl_buffered_source;
use crate::reader::{is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader};
use crate::reader::{
is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span,
};
use crate::{Error, Result};

/// A struct for reading XML asynchronously from an [`AsyncBufRead`].
Expand Down Expand Up @@ -125,7 +127,7 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
/// // First, we read a start event...
/// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Start(start));
///
/// //...then, we could skip all events to the corresponding end event.
/// // ...then, we could skip all events to the corresponding end event.
/// // This call will correctly handle nested <outer> elements.
/// // Note, however, that this method does not handle namespaces.
/// reader.read_to_end_into_async(end.name(), &mut buf).await.unwrap();
Expand All @@ -142,8 +144,8 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
// We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033
end: QName<'n>,
buf: &mut Vec<u8>,
) -> Result<()> {
read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await)
) -> Result<Span> {
Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await))
}

/// Read until '<' is found and moves reader to an `Opened` state.
Expand Down Expand Up @@ -275,7 +277,7 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
/// (ResolveResult::Bound(ns), Event::Start(start))
/// );
///
/// //...then, we could skip all events to the corresponding end event.
/// // ...then, we could skip all events to the corresponding end event.
/// // This call will correctly handle nested <outer> elements.
/// // Note, however, that this method does not handle namespaces.
/// reader.read_to_end_into_async(end.name(), &mut buf).await.unwrap();
Expand All @@ -295,7 +297,7 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
// We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033
end: QName<'n>,
buf: &mut Vec<u8>,
) -> Result<()> {
) -> Result<Span> {
// According to the https://www.w3.org/TR/xml11/#dt-etag, end name should
// match literally the start name. See `Reader::check_end_names` documentation
self.reader.read_to_end_into_async(end, buf).await
Expand Down
63 changes: 10 additions & 53 deletions src/reader/buffered_reader.rs
Expand Up @@ -10,7 +10,7 @@ use memchr;
use crate::errors::{Error, Result};
use crate::events::Event;
use crate::name::QName;
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource};
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};

macro_rules! impl_buffered_source {
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
Expand Down Expand Up @@ -277,6 +277,10 @@ impl<R: BufRead> Reader<R> {
/// storage for events content. This function is supposed to be called after
/// you already read a [`Start`] event.
///
/// Returns a span that covers the content between `>` of an opening tag and `<` of
/// a closing tag, or an empty span if [`expand_empty_elements`] is set and
/// this method was called after reading an expanded [`Start`] event.
///
/// Manages nested cases where parent and child elements have the same name.
///
/// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`]
Expand Down Expand Up @@ -340,7 +344,7 @@ impl<R: BufRead> Reader<R> {
/// // First, we read a start event...
/// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
///
/// //...then, we could skip all events to the corresponding end event.
/// // ...then, we could skip all events to the corresponding end event.
/// // This call will correctly handle nested <outer> elements.
/// // Note, however, that this method does not handle namespaces.
/// reader.read_to_end_into(end.name(), &mut buf).unwrap();
Expand All @@ -353,60 +357,13 @@ impl<R: BufRead> Reader<R> {
/// [`End`]: Event::End
/// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
/// [`read_to_end()`]: Self::read_to_end
/// [`expand_empty_elements`]: Self::expand_empty_elements
/// [`check_end_names`]: Self::check_end_names
/// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<()> {
read_to_end!(self, end, buf, read_event_impl, {
pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
Ok(read_to_end!(self, end, buf, read_event_impl, {
buf.clear();
})
}

/// Reads optional text between start and end tags.
///
/// If the next event is a [`Text`] event, returns the decoded and unescaped content as a
/// `String`. If the next event is an [`End`] event, returns the empty string. In all other
/// cases, returns an error.
///
/// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8
/// if none is specified).
///
/// # Examples
///
/// ```
/// # use pretty_assertions::assert_eq;
/// use quick_xml::Reader;
/// use quick_xml::events::Event;
///
/// let mut xml = Reader::from_reader(b"
/// <a>&lt;b&gt;</a>
/// <a></a>
/// " as &[u8]);
/// xml.trim_text(true);
///
/// let expected = ["<b>", ""];
/// for &content in expected.iter() {
/// match xml.read_event_into(&mut Vec::new()) {
/// Ok(Event::Start(ref e)) => {
/// assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content);
/// },
/// e => panic!("Expecting Start event, found {:?}", e),
/// }
/// }
/// ```
///
/// [`Text`]: Event::Text
/// [`End`]: Event::End
pub fn read_text_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<String> {
let s = match self.read_event_into(buf) {
Err(e) => return Err(e),

Ok(Event::Text(e)) => e.unescape()?.into_owned(),
Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()),
Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())),
_ => return Err(Error::TextNotFound),
};
self.read_to_end_into(end, buf)?;
Ok(s)
}))
}
}

Expand Down
10 changes: 9 additions & 1 deletion src/reader/mod.rs
Expand Up @@ -2,6 +2,7 @@

#[cfg(feature = "encoding")]
use encoding_rs::Encoding;
use std::ops::Range;

use crate::encoding::Decoder;
use crate::errors::{Error, Result};
Expand Down Expand Up @@ -238,16 +239,18 @@ macro_rules! read_to_end {
$clear:block
$(, $await:ident)?
) => {{
let start = $self.buffer_position();
let mut depth = 0;
loop {
$clear
let end = $self.buffer_position();
match $self.$read_event($buf) $(.$await)? {
Err(e) => return Err(e),

Ok(Event::Start(e)) if e.name() == $end => depth += 1,
Ok(Event::End(e)) if e.name() == $end => {
if depth == 0 {
return Ok(());
break start..end;
}
depth -= 1;
}
Expand All @@ -270,6 +273,11 @@ mod slice_reader;

pub use ns_reader::NsReader;

/// Range of input in bytes, that corresponds to some piece of XML
pub type Span = Range<usize>;

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Possible reader states. The state transition diagram (`true` and `false` shows
/// value of [`Reader::expand_empty_elements()`] option):
///
Expand Down