Merge pull request #455 from Mingun/read-text

Implement `read_text` - a method that returns a text between two tags
tafia · Aug 15, 2022 · 87d241a · 87d241a
2 parents 2bf2d2d + 792d23d
commit 87d241a
Show file tree

Hide file tree

Showing 10 changed files with 240 additions and 98 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -40,6 +40,8 @@
 - [#439]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()`
   under the `quick-xml::encoding` namespace.
 - [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers
+- [#455]: Change return type of all `read_to_end*` methods to return a span between tags
+- [#455]: Added `Reader::read_text` method to return raw content (including markup) between tags
 
 
 ### Bug Fixes
@@ -139,6 +141,7 @@
   |`*_with_custom_entities`|`*_with`
   |`BytesText::unescaped()`|`BytesText::unescape()`
   |`Attribute::unescaped_*`|`Attribute::unescape_*`
+- [#329]: Also, those functions now borrow from the input instead of the event / attribute
 
 - [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method
   added to all events
@@ -181,6 +184,8 @@
 - [#440]: Removed `Deserializer::from_slice` and `quick_xml::de::from_slice` methods because deserializing from a byte
   array cannot guarantee borrowing due to possible copying while decoding.
 
+- [#455]: Removed `Reader::read_text_into` which is merely a thin wrapper over a match on `Event::Text`
+
 ### New Tests
 
 - [#9]: Added tests for incorrect nested tags in input
@@ -199,6 +204,7 @@
 [#180]: https://github.com/tafia/quick-xml/issues/180
 [#191]: https://github.com/tafia/quick-xml/issues/191
 [#324]: https://github.com/tafia/quick-xml/issues/324
+[#329]: https://github.com/tafia/quick-xml/issues/329
 [#363]: https://github.com/tafia/quick-xml/issues/363
 [#387]: https://github.com/tafia/quick-xml/pull/387
 [#391]: https://github.com/tafia/quick-xml/pull/391
@@ -220,6 +226,7 @@
 [#440]: https://github.com/tafia/quick-xml/pull/440
 [#443]: https://github.com/tafia/quick-xml/pull/443
 [#450]: https://github.com/tafia/quick-xml/pull/450
+[#455]: https://github.com/tafia/quick-xml/pull/455
 
 
 ## 0.23.0 -- 2022-05-08

diff --git a/examples/read_texts.rs b/examples/read_texts.rs
@@ -1,6 +1,5 @@
 fn main() {
     use quick_xml::events::Event;
-    use quick_xml::name::QName;
     use quick_xml::Reader;
 
     let xml = "<tag1>text1</tag1><tag1>text2</tag1>\
@@ -9,23 +8,18 @@ fn main() {
     let mut reader = Reader::from_str(xml);
     reader.trim_text(true);
 
-    let mut txt = Vec::new();
-    let mut buf = Vec::new();
-
     loop {
-        match reader.read_event_into(&mut buf) {
-            Ok(Event::Start(ref e)) if e.name().as_ref() == b"tag2" => {
-                txt.push(
-                    reader
-                        .read_text_into(QName(b"tag2"), &mut Vec::new())
-                        .expect("Cannot decode text value"),
-                );
+        match reader.read_event() {
+            Ok(Event::Start(e)) if e.name().as_ref() == b"tag2" => {
+                // `read_text_into` for buffered readers is not implemented
+                let txt = reader
+                    .read_text(e.name())
+                    .expect("Cannot decode text value");
                 println!("{:?}", txt);
             }
             Ok(Event::Eof) => break, // exits the loop when reaching end of file
             Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
             _ => (), // There are several other `Event`s we do not consider here
         }
-        buf.clear();
     }
 }
diff --git a/src/de/mod.rs b/src/de/mod.rs
@@ -951,7 +951,8 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
     fn read_to_end(&mut self, name: QName) -> Result<(), DeError> {
         match self.reader.read_to_end_into(name, &mut self.buf) {
             Err(Error::UnexpectedEof(_)) => Err(DeError::UnexpectedEof),
-            other => Ok(other?),
+            Err(e) => Err(e.into()),
+            Ok(_) => Ok(()),
         }
     }
 
@@ -991,7 +992,8 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
     fn read_to_end(&mut self, name: QName) -> Result<(), DeError> {
         match self.reader.read_to_end(name) {
             Err(Error::UnexpectedEof(_)) => Err(DeError::UnexpectedEof),
-            other => Ok(other?),
+            Err(e) => Err(e.into()),
+            Ok(_) => Ok(()),
         }
     }
 

diff --git a/src/events/attributes.rs b/src/events/attributes.rs
@@ -41,7 +41,7 @@ impl<'a> Attribute<'a> {
     ///
     /// This method is available only if `encoding` feature is **not** enabled.
     #[cfg(any(doc, not(feature = "encoding")))]
-    pub fn unescape_value(&self) -> XmlResult<Cow<str>> {
+    pub fn unescape_value(&self) -> XmlResult<Cow<'a, str>> {
         self.unescape_value_with(|_| None)
     }
 
@@ -61,19 +61,26 @@ impl<'a> Attribute<'a> {
     pub fn unescape_value_with<'entity>(
         &self,
         resolve_entity: impl Fn(&str) -> Option<&'entity str>,
-    ) -> XmlResult<Cow<str>> {
+    ) -> XmlResult<Cow<'a, str>> {
         // from_utf8 should never fail because content is always UTF-8 encoded
-        Ok(unescape_with(
-            std::str::from_utf8(&self.value)?,
-            resolve_entity,
-        )?)
+        let decoded = match &self.value {
+            Cow::Borrowed(bytes) => Cow::Borrowed(std::str::from_utf8(bytes)?),
+            // Convert to owned, because otherwise Cow will be bound with wrong lifetime
+            Cow::Owned(bytes) => Cow::Owned(std::str::from_utf8(bytes)?.to_string()),
+        };
+
+        match unescape_with(&decoded, resolve_entity)? {
+            // Because result is borrowed, no replacements were done and we can use original string
+            Cow::Borrowed(_) => Ok(decoded),
+            Cow::Owned(s) => Ok(s.into()),
+        }
     }
 
     /// Decodes then unescapes the value.
     ///
     /// This will allocate if the value contains any escape sequences or in
     /// non-UTF-8 encoding.
-    pub fn decode_and_unescape_value<B>(&self, reader: &Reader<B>) -> XmlResult<Cow<str>> {
+    pub fn decode_and_unescape_value<B>(&self, reader: &Reader<B>) -> XmlResult<Cow<'a, str>> {
         self.decode_and_unescape_value_with(reader, |_| None)
     }
 
@@ -85,8 +92,12 @@ impl<'a> Attribute<'a> {
         &self,
         reader: &Reader<B>,
         resolve_entity: impl Fn(&str) -> Option<&'entity str>,
-    ) -> XmlResult<Cow<str>> {
-        let decoded = reader.decoder().decode(&*self.value)?;
+    ) -> XmlResult<Cow<'a, str>> {
+        let decoded = match &self.value {
+            Cow::Borrowed(bytes) => reader.decoder().decode(bytes)?,
+            // Convert to owned, because otherwise Cow will be bound with wrong lifetime
+            Cow::Owned(bytes) => reader.decoder().decode(bytes)?.into_owned().into(),
+        };
 
         match unescape_with(&decoded, resolve_entity)? {
             // Because result is borrowed, no replacements were done and we can use original string

diff --git a/src/events/mod.rs b/src/events/mod.rs
@@ -732,7 +732,7 @@ impl<'a> BytesText<'a> {
     ///
     /// This will allocate if the value contains any escape sequences or in
     /// non-UTF-8 encoding.
-    pub fn unescape(&self) -> Result<Cow<str>> {
+    pub fn unescape(&self) -> Result<Cow<'a, str>> {
         self.unescape_with(|_| None)
     }
 
@@ -743,8 +743,12 @@ impl<'a> BytesText<'a> {
     pub fn unescape_with<'entity>(
         &self,
         resolve_entity: impl Fn(&str) -> Option<&'entity str>,
-    ) -> Result<Cow<str>> {
-        let decoded = self.decoder.decode(&*self)?;
+    ) -> Result<Cow<'a, str>> {
+        let decoded = match &self.content {
+            Cow::Borrowed(bytes) => self.decoder.decode(bytes)?,
+            // Convert to owned, because otherwise Cow will be bound with wrong lifetime
+            Cow::Owned(bytes) => self.decoder.decode(bytes)?.into_owned().into(),
+        };
 
         match unescape_with(&decoded, resolve_entity)? {
             // Because result is borrowed, no replacements were done and we can use original string
@@ -754,11 +758,9 @@ impl<'a> BytesText<'a> {
     }
 
     /// Gets content of this text buffer in the specified encoding and optionally
-    /// unescapes it. Unlike [`Self::unescape`] & Co., the lifetime
-    /// of the returned `Cow` is bound to the original buffer / input
+    /// unescapes it.
     #[cfg(feature = "serialize")]
     pub(crate) fn decode(&self, unescape: bool) -> Result<Cow<'a, str>> {
-        //TODO: too many copies, can be optimized
         let text = match &self.content {
             Cow::Borrowed(bytes) => self.decoder.decode(bytes)?,
             // Convert to owned, because otherwise Cow will be bound with wrong lifetime

diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs
@@ -9,7 +9,9 @@ use tokio::io::{self, AsyncBufRead, AsyncBufReadExt};
 use crate::events::Event;
 use crate::name::{QName, ResolveResult};
 use crate::reader::buffered_reader::impl_buffered_source;
-use crate::reader::{is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader};
+use crate::reader::{
+    is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span,
+};
 use crate::{Error, Result};
 
 /// A struct for reading XML asynchronously from an [`AsyncBufRead`].
@@ -125,7 +127,7 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
     /// // First, we read a start event...
     /// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Start(start));
     ///
-    /// //...then, we could skip all events to the corresponding end event.
+    /// // ...then, we could skip all events to the corresponding end event.
     /// // This call will correctly handle nested <outer> elements.
     /// // Note, however, that this method does not handle namespaces.
     /// reader.read_to_end_into_async(end.name(), &mut buf).await.unwrap();
@@ -142,8 +144,8 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
         // We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033
         end: QName<'n>,
         buf: &mut Vec<u8>,
-    ) -> Result<()> {
-        read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await)
+    ) -> Result<Span> {
+        Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await))
     }
 
     /// Read until '<' is found and moves reader to an `Opened` state.
@@ -275,7 +277,7 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
     ///     (ResolveResult::Bound(ns), Event::Start(start))
     /// );
     ///
-    /// //...then, we could skip all events to the corresponding end event.
+    /// // ...then, we could skip all events to the corresponding end event.
     /// // This call will correctly handle nested <outer> elements.
     /// // Note, however, that this method does not handle namespaces.
     /// reader.read_to_end_into_async(end.name(), &mut buf).await.unwrap();
@@ -295,7 +297,7 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
         // We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033
         end: QName<'n>,
         buf: &mut Vec<u8>,
-    ) -> Result<()> {
+    ) -> Result<Span> {
         // According to the https://www.w3.org/TR/xml11/#dt-etag, end name should
         // match literally the start name. See `Reader::check_end_names` documentation
         self.reader.read_to_end_into_async(end, buf).await

diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs
@@ -10,7 +10,7 @@ use memchr;
 use crate::errors::{Error, Result};
 use crate::events::Event;
 use crate::name::QName;
-use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource};
+use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
 
 macro_rules! impl_buffered_source {
     ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
@@ -277,6 +277,10 @@ impl<R: BufRead> Reader<R> {
     /// storage for events content. This function is supposed to be called after
     /// you already read a [`Start`] event.
     ///
+    /// Returns a span that covers the content between `>` of an opening tag and `<` of
+    /// a closing tag, or an empty span if [`expand_empty_elements`] is set and
+    /// this method was called after reading an expanded [`Start`] event.
+    ///
     /// Manages nested cases where parent and child elements have the same name.
     ///
     /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`]
@@ -340,7 +344,7 @@ impl<R: BufRead> Reader<R> {
     /// // First, we read a start event...
     /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
     ///
-    /// //...then, we could skip all events to the corresponding end event.
+    /// // ...then, we could skip all events to the corresponding end event.
     /// // This call will correctly handle nested <outer> elements.
     /// // Note, however, that this method does not handle namespaces.
     /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
@@ -353,60 +357,13 @@ impl<R: BufRead> Reader<R> {
     /// [`End`]: Event::End
     /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
     /// [`read_to_end()`]: Self::read_to_end
+    /// [`expand_empty_elements`]: Self::expand_empty_elements
     /// [`check_end_names`]: Self::check_end_names
     /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
-    pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<()> {
-        read_to_end!(self, end, buf, read_event_impl, {
+    pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
+        Ok(read_to_end!(self, end, buf, read_event_impl, {
             buf.clear();
-        })
-    }
-
-    /// Reads optional text between start and end tags.
-    ///
-    /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a
-    /// `String`. If the next event is an [`End`] event, returns the empty string. In all other
-    /// cases, returns an error.
-    ///
-    /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8
-    /// if none is specified).
-    ///
-    /// # Examples
-    ///
-    /// ```
-    /// # use pretty_assertions::assert_eq;
-    /// use quick_xml::Reader;
-    /// use quick_xml::events::Event;
-    ///
-    /// let mut xml = Reader::from_reader(b"
-    ///     <a>&lt;b&gt;</a>
-    ///     <a></a>
-    /// " as &[u8]);
-    /// xml.trim_text(true);
-    ///
-    /// let expected = ["<b>", ""];
-    /// for &content in expected.iter() {
-    ///     match xml.read_event_into(&mut Vec::new()) {
-    ///         Ok(Event::Start(ref e)) => {
-    ///             assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content);
-    ///         },
-    ///         e => panic!("Expecting Start event, found {:?}", e),
-    ///     }
-    /// }
-    /// ```
-    ///
-    /// [`Text`]: Event::Text
-    /// [`End`]: Event::End
-    pub fn read_text_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<String> {
-        let s = match self.read_event_into(buf) {
-            Err(e) => return Err(e),
-
-            Ok(Event::Text(e)) => e.unescape()?.into_owned(),
-            Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()),
-            Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())),
-            _ => return Err(Error::TextNotFound),
-        };
-        self.read_to_end_into(end, buf)?;
-        Ok(s)
+        }))
     }
 }
 

diff --git a/src/reader/mod.rs b/src/reader/mod.rs
@@ -2,6 +2,7 @@
 
 #[cfg(feature = "encoding")]
 use encoding_rs::Encoding;
+use std::ops::Range;
 
 use crate::encoding::Decoder;
 use crate::errors::{Error, Result};
@@ -238,16 +239,18 @@ macro_rules! read_to_end {
         $clear:block
         $(, $await:ident)?
     ) => {{
+        let start = $self.buffer_position();
         let mut depth = 0;
         loop {
             $clear
+            let end = $self.buffer_position();
             match $self.$read_event($buf) $(.$await)? {
                 Err(e) => return Err(e),
 
                 Ok(Event::Start(e)) if e.name() == $end => depth += 1,
                 Ok(Event::End(e)) if e.name() == $end => {
                     if depth == 0 {
-                        return Ok(());
+                        break start..end;
                     }
                     depth -= 1;
                 }
@@ -270,6 +273,11 @@ mod slice_reader;
 
 pub use ns_reader::NsReader;
 
+/// Range of input in bytes, that corresponds to some piece of XML
+pub type Span = Range<usize>;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 /// Possible reader states. The state transition diagram (`true` and `false` shows
 /// value of [`Reader::expand_empty_elements()`] option):
 ///