Skip to content

Commit

Permalink
Merge pull request #455 from Mingun/read-text
Browse files Browse the repository at this point in the history
Implement `read_text` - a method that returns a text between two tags
  • Loading branch information
dralley committed Aug 15, 2022
2 parents 2bf2d2d + 792d23d commit 87d241a
Show file tree
Hide file tree
Showing 10 changed files with 240 additions and 98 deletions.
7 changes: 7 additions & 0 deletions Changelog.md
Expand Up @@ -40,6 +40,8 @@
- [#439]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()`
under the `quick-xml::encoding` namespace.
- [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers
- [#455]: Changed the return type of all `read_to_end*` methods to return a span between tags
- [#455]: Added `Reader::read_text` method to return the raw content (including markup) between tags


### Bug Fixes
Expand Down Expand Up @@ -139,6 +141,7 @@
|`*_with_custom_entities`|`*_with`
|`BytesText::unescaped()`|`BytesText::unescape()`
|`Attribute::unescaped_*`|`Attribute::unescape_*`
- [#329]: Also, those functions now borrow from the input instead of the event / attribute

- [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method
added to all events
Expand Down Expand Up @@ -181,6 +184,8 @@
- [#440]: Removed `Deserializer::from_slice` and `quick_xml::de::from_slice` methods because deserializing from a byte
array cannot guarantee borrowing due to possible copying while decoding.

- [#455]: Removed `Reader::read_text_into`, which was nothing more than a thin wrapper over a match on `Event::Text`

### New Tests

- [#9]: Added tests for incorrect nested tags in input
Expand All @@ -199,6 +204,7 @@
[#180]: https://github.com/tafia/quick-xml/issues/180
[#191]: https://github.com/tafia/quick-xml/issues/191
[#324]: https://github.com/tafia/quick-xml/issues/324
[#329]: https://github.com/tafia/quick-xml/issues/329
[#363]: https://github.com/tafia/quick-xml/issues/363
[#387]: https://github.com/tafia/quick-xml/pull/387
[#391]: https://github.com/tafia/quick-xml/pull/391
Expand All @@ -220,6 +226,7 @@
[#440]: https://github.com/tafia/quick-xml/pull/440
[#443]: https://github.com/tafia/quick-xml/pull/443
[#450]: https://github.com/tafia/quick-xml/pull/450
[#455]: https://github.com/tafia/quick-xml/pull/455


## 0.23.0 -- 2022-05-08
Expand Down
18 changes: 6 additions & 12 deletions examples/read_texts.rs
@@ -1,6 +1,5 @@
fn main() {
use quick_xml::events::Event;
use quick_xml::name::QName;
use quick_xml::Reader;

let xml = "<tag1>text1</tag1><tag1>text2</tag1>\
Expand All @@ -9,23 +8,18 @@ fn main() {
let mut reader = Reader::from_str(xml);
reader.trim_text(true);

let mut txt = Vec::new();
let mut buf = Vec::new();

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) if e.name().as_ref() == b"tag2" => {
txt.push(
reader
.read_text_into(QName(b"tag2"), &mut Vec::new())
.expect("Cannot decode text value"),
);
match reader.read_event() {
Ok(Event::Start(e)) if e.name().as_ref() == b"tag2" => {
// read_text_into for buffered readers not implemented
let txt = reader
.read_text(e.name())
.expect("Cannot decode text value");
println!("{:?}", txt);
}
Ok(Event::Eof) => break, // exits the loop when reaching end of file
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
_ => (), // There are several other `Event`s we do not consider here
}
buf.clear();
}
}
6 changes: 4 additions & 2 deletions src/de/mod.rs
Expand Up @@ -951,7 +951,8 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
fn read_to_end(&mut self, name: QName) -> Result<(), DeError> {
match self.reader.read_to_end_into(name, &mut self.buf) {
Err(Error::UnexpectedEof(_)) => Err(DeError::UnexpectedEof),
other => Ok(other?),
Err(e) => Err(e.into()),
Ok(_) => Ok(()),
}
}

Expand Down Expand Up @@ -991,7 +992,8 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
fn read_to_end(&mut self, name: QName) -> Result<(), DeError> {
match self.reader.read_to_end(name) {
Err(Error::UnexpectedEof(_)) => Err(DeError::UnexpectedEof),
other => Ok(other?),
Err(e) => Err(e.into()),
Ok(_) => Ok(()),
}
}

Expand Down
29 changes: 20 additions & 9 deletions src/events/attributes.rs
Expand Up @@ -41,7 +41,7 @@ impl<'a> Attribute<'a> {
///
/// This method is available only if `encoding` feature is **not** enabled.
#[cfg(any(doc, not(feature = "encoding")))]
pub fn unescape_value(&self) -> XmlResult<Cow<str>> {
pub fn unescape_value(&self) -> XmlResult<Cow<'a, str>> {
self.unescape_value_with(|_| None)
}

Expand All @@ -61,19 +61,26 @@ impl<'a> Attribute<'a> {
pub fn unescape_value_with<'entity>(
&self,
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
) -> XmlResult<Cow<str>> {
) -> XmlResult<Cow<'a, str>> {
// from_utf8 should never fail because content is always UTF-8 encoded
Ok(unescape_with(
std::str::from_utf8(&self.value)?,
resolve_entity,
)?)
let decoded = match &self.value {
Cow::Borrowed(bytes) => Cow::Borrowed(std::str::from_utf8(bytes)?),
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
Cow::Owned(bytes) => Cow::Owned(std::str::from_utf8(bytes)?.to_string()),
};

match unescape_with(&decoded, resolve_entity)? {
// Because result is borrowed, no replacements were done and we can use the original string
Cow::Borrowed(_) => Ok(decoded),
Cow::Owned(s) => Ok(s.into()),
}
}

/// Decodes then unescapes the value.
///
/// This will allocate if the value contains any escape sequences or in
/// non-UTF-8 encoding.
pub fn decode_and_unescape_value<B>(&self, reader: &Reader<B>) -> XmlResult<Cow<str>> {
pub fn decode_and_unescape_value<B>(&self, reader: &Reader<B>) -> XmlResult<Cow<'a, str>> {
self.decode_and_unescape_value_with(reader, |_| None)
}

Expand All @@ -85,8 +92,12 @@ impl<'a> Attribute<'a> {
&self,
reader: &Reader<B>,
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
) -> XmlResult<Cow<str>> {
let decoded = reader.decoder().decode(&*self.value)?;
) -> XmlResult<Cow<'a, str>> {
let decoded = match &self.value {
Cow::Borrowed(bytes) => reader.decoder().decode(bytes)?,
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
Cow::Owned(bytes) => reader.decoder().decode(bytes)?.into_owned().into(),
};

match unescape_with(&decoded, resolve_entity)? {
// Because result is borrowed, no replacements were done and we can use the original string
Expand Down
14 changes: 8 additions & 6 deletions src/events/mod.rs
Expand Up @@ -732,7 +732,7 @@ impl<'a> BytesText<'a> {
///
/// This will allocate if the value contains any escape sequences or in
/// non-UTF-8 encoding.
pub fn unescape(&self) -> Result<Cow<str>> {
pub fn unescape(&self) -> Result<Cow<'a, str>> {
self.unescape_with(|_| None)
}

Expand All @@ -743,8 +743,12 @@ impl<'a> BytesText<'a> {
pub fn unescape_with<'entity>(
&self,
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
) -> Result<Cow<str>> {
let decoded = self.decoder.decode(&*self)?;
) -> Result<Cow<'a, str>> {
let decoded = match &self.content {
Cow::Borrowed(bytes) => self.decoder.decode(bytes)?,
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
Cow::Owned(bytes) => self.decoder.decode(bytes)?.into_owned().into(),
};

match unescape_with(&decoded, resolve_entity)? {
// Because result is borrowed, no replacements were done and we can use the original string
Expand All @@ -754,11 +758,9 @@ impl<'a> BytesText<'a> {
}

/// Gets content of this text buffer in the specified encoding and optionally
/// unescapes it. Unlike [`Self::unescape`] & Co., the lifetime
/// of the returned `Cow` is bound to the original buffer / input
/// unescapes it.
#[cfg(feature = "serialize")]
pub(crate) fn decode(&self, unescape: bool) -> Result<Cow<'a, str>> {
//TODO: too many copies, can be optimized
let text = match &self.content {
Cow::Borrowed(bytes) => self.decoder.decode(bytes)?,
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
Expand Down
14 changes: 8 additions & 6 deletions src/reader/async_tokio.rs
Expand Up @@ -9,7 +9,9 @@ use tokio::io::{self, AsyncBufRead, AsyncBufReadExt};
use crate::events::Event;
use crate::name::{QName, ResolveResult};
use crate::reader::buffered_reader::impl_buffered_source;
use crate::reader::{is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader};
use crate::reader::{
is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span,
};
use crate::{Error, Result};

/// A struct for reading XML asynchronously from an [`AsyncBufRead`].
Expand Down Expand Up @@ -125,7 +127,7 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
/// // First, we read a start event...
/// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Start(start));
///
/// //...then, we could skip all events to the corresponding end event.
/// // ...then, we could skip all events to the corresponding end event.
/// // This call will correctly handle nested <outer> elements.
/// // Note, however, that this method does not handle namespaces.
/// reader.read_to_end_into_async(end.name(), &mut buf).await.unwrap();
Expand All @@ -142,8 +144,8 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
// We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033`
end: QName<'n>,
buf: &mut Vec<u8>,
) -> Result<()> {
read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await)
) -> Result<Span> {
Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await))
}

/// Read until '<' is found and moves reader to an `Opened` state.
Expand Down Expand Up @@ -275,7 +277,7 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
/// (ResolveResult::Bound(ns), Event::Start(start))
/// );
///
/// //...then, we could skip all events to the corresponding end event.
/// // ...then, we could skip all events to the corresponding end event.
/// // This call will correctly handle nested <outer> elements.
/// // Note, however, that this method does not handle namespaces.
/// reader.read_to_end_into_async(end.name(), &mut buf).await.unwrap();
Expand All @@ -295,7 +297,7 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
// We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033`
end: QName<'n>,
buf: &mut Vec<u8>,
) -> Result<()> {
) -> Result<Span> {
// According to the https://www.w3.org/TR/xml11/#dt-etag, end name should
// match literally the start name. See `Reader::check_end_names` documentation
self.reader.read_to_end_into_async(end, buf).await
Expand Down
63 changes: 10 additions & 53 deletions src/reader/buffered_reader.rs
Expand Up @@ -10,7 +10,7 @@ use memchr;
use crate::errors::{Error, Result};
use crate::events::Event;
use crate::name::QName;
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource};
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};

macro_rules! impl_buffered_source {
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
Expand Down Expand Up @@ -277,6 +277,10 @@ impl<R: BufRead> Reader<R> {
/// storage for events content. This function is supposed to be called after
/// you already read a [`Start`] event.
///
/// Returns a span that covers the content between the `>` of the opening tag and
/// the `<` of the closing tag, or an empty span if [`expand_empty_elements`] is
/// set and this method was called after reading an expanded [`Start`] event.
///
/// Manages nested cases where parent and child elements have the same name.
///
/// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`]
Expand Down Expand Up @@ -340,7 +344,7 @@ impl<R: BufRead> Reader<R> {
/// // First, we read a start event...
/// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
///
/// //...then, we could skip all events to the corresponding end event.
/// // ...then, we could skip all events to the corresponding end event.
/// // This call will correctly handle nested <outer> elements.
/// // Note, however, that this method does not handle namespaces.
/// reader.read_to_end_into(end.name(), &mut buf).unwrap();
Expand All @@ -353,60 +357,13 @@ impl<R: BufRead> Reader<R> {
/// [`End`]: Event::End
/// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
/// [`read_to_end()`]: Self::read_to_end
/// [`expand_empty_elements`]: Self::expand_empty_elements
/// [`check_end_names`]: Self::check_end_names
/// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<()> {
read_to_end!(self, end, buf, read_event_impl, {
pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
Ok(read_to_end!(self, end, buf, read_event_impl, {
buf.clear();
})
}

/// Reads optional text between start and end tags.
///
/// If the next event is a [`Text`] event, returns the decoded and unescaped content as a
/// `String`. If the next event is an [`End`] event, returns the empty string. In all other
/// cases, returns an error.
///
/// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8
/// if none is specified).
///
/// # Examples
///
/// ```
/// # use pretty_assertions::assert_eq;
/// use quick_xml::Reader;
/// use quick_xml::events::Event;
///
/// let mut xml = Reader::from_reader(b"
/// <a>&lt;b&gt;</a>
/// <a></a>
/// " as &[u8]);
/// xml.trim_text(true);
///
/// let expected = ["<b>", ""];
/// for &content in expected.iter() {
/// match xml.read_event_into(&mut Vec::new()) {
/// Ok(Event::Start(ref e)) => {
/// assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content);
/// },
/// e => panic!("Expecting Start event, found {:?}", e),
/// }
/// }
/// ```
///
/// [`Text`]: Event::Text
/// [`End`]: Event::End
pub fn read_text_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<String> {
let s = match self.read_event_into(buf) {
Err(e) => return Err(e),

Ok(Event::Text(e)) => e.unescape()?.into_owned(),
Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()),
Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())),
_ => return Err(Error::TextNotFound),
};
self.read_to_end_into(end, buf)?;
Ok(s)
}))
}
}

Expand Down
10 changes: 9 additions & 1 deletion src/reader/mod.rs
Expand Up @@ -2,6 +2,7 @@

#[cfg(feature = "encoding")]
use encoding_rs::Encoding;
use std::ops::Range;

use crate::encoding::Decoder;
use crate::errors::{Error, Result};
Expand Down Expand Up @@ -238,16 +239,18 @@ macro_rules! read_to_end {
$clear:block
$(, $await:ident)?
) => {{
let start = $self.buffer_position();
let mut depth = 0;
loop {
$clear
let end = $self.buffer_position();
match $self.$read_event($buf) $(.$await)? {
Err(e) => return Err(e),

Ok(Event::Start(e)) if e.name() == $end => depth += 1,
Ok(Event::End(e)) if e.name() == $end => {
if depth == 0 {
return Ok(());
break start..end;
}
depth -= 1;
}
Expand All @@ -270,6 +273,11 @@ mod slice_reader;

pub use ns_reader::NsReader;

/// Range of input in bytes that corresponds to some piece of XML
pub type Span = Range<usize>;

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Possible reader states. The state transition diagram (`true` and `false` shows
/// value of [`Reader::expand_empty_elements()`] option):
///
Expand Down

0 comments on commit 87d241a

Please sign in to comment.