tafia#154: Implement read_text which reads all content between tags …

…as a text, including other markup
Mingun · Aug 14, 2022 · d67f426 · d67f426
1 parent 7fafac4
commit d67f426
Show file tree

Hide file tree

Showing 3 changed files with 145 additions and 0 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -41,6 +41,7 @@
   under the `quick-xml::encoding` namespace.
 - [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers
 - [#455]: Change return type of all `read_to_end*` methods to return a span between tags
+- [#455]: Added the `Reader::read_text` method, which returns the raw content (including markup) between tags
 
 
 ### Bug Fixes

diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs
@@ -4,6 +4,7 @@
 //! [qualified names]: https://www.w3.org/TR/xml-names11/#dt-qualname
 //! [expanded names]: https://www.w3.org/TR/xml-names11/#dt-expname
 
+use std::borrow::Cow;
 use std::fs::File;
 use std::io::{BufRead, BufReader};
 use std::ops::Deref;
@@ -750,6 +751,75 @@ impl<'i> NsReader<&'i [u8]> {
         // match literally the start name. See `Self::check_end_names` documentation
         self.reader.read_to_end(end)
     }
+
+    /// Reads the content between the start and end tags, including any markup,
+    /// and returns it as text. This function is supposed to be called after you
+    /// have already read a [`Start`] event.
+    ///
+    /// Correctly handles nesting where parent and child elements have the same name.
+    ///
+    /// This method does not unescape the data it reads; instead it returns the
+    /// content of the XML document "as is". This is because it does not know
+    /// what kind of text it reads: if, for example, the content contains a CDATA
+    /// section, an attempt to unescape it would corrupt the data.
+    ///
+    /// The text is decoded using the reader's current [`decoder()`].
+    ///
+    /// In effect, this method performs the following:
+    ///
+    /// ```ignore
+    /// let span = reader.read_to_end(end)?;
+    /// let text = reader.decoder().decode(&reader.inner_slice[span]);
+    /// ```
+    ///
+    /// This method simply forwards `end` to the `read_text` method of the
+    /// inner `reader`.
+    ///
+    /// # Errors
+    ///
+    /// Returns whatever error the inner reader's `read_text` reports.
+    ///
+    /// # Examples
+    ///
+    /// This example shows how you can read HTML content from your XML document.
+    ///
+    /// ```
+    /// # use pretty_assertions::assert_eq;
+    /// # use std::borrow::Cow;
+    /// use quick_xml::events::{BytesStart, Event};
+    /// use quick_xml::NsReader;
+    ///
+    /// let mut reader = NsReader::from_str(r#"
+    ///     <html>
+    ///         <title>This is a HTML text</title>
+    ///         <p>Usual XML rules does not apply inside it
+    ///         <p>For example, elements not needed to be &quot;closed&quot;
+    ///     </html>
+    /// "#);
+    /// reader.trim_text(true);
+    ///
+    /// let start = BytesStart::new("html");
+    /// let end   = start.to_end().into_owned();
+    ///
+    /// // First, we read the start event...
+    /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
+    /// // ...and disable checking of end names, because we expect HTML further on...
+    /// reader.check_end_names(false);
+    ///
+    /// // ...then we can read the text content up to the closing tag.
+    /// // This call will correctly handle nested <html> elements.
+    /// let text = reader.read_text(end.name()).unwrap();
+    /// assert_eq!(text, Cow::Borrowed(r#"
+    ///         <title>This is a HTML text</title>
+    ///         <p>Usual XML rules does not apply inside it
+    ///         <p>For example, elements not needed to be &quot;closed&quot;
+    ///     "#));
+    ///
+    /// // Now we can enable the checks again
+    /// reader.check_end_names(true);
+    ///
+    /// // At the end we should get an Eof event, because we consumed the whole XML
+    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
+    /// ```
+    ///
+    /// [`Start`]: Event::Start
+    /// [`decoder()`]: Reader::decoder()
+    #[inline]
+    pub fn read_text(&mut self, end: QName) -> Result<Cow<'i, str>> {
+        self.reader.read_text(end)
+    }
 }
 
 impl<R> Deref for NsReader<R> {

diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs
@@ -2,6 +2,8 @@
 //! underlying byte stream. This implementation supports not using an
 //! intermediate buffer as the byte slice itself can be used to borrow from.
 
+use std::borrow::Cow;
+
 #[cfg(feature = "encoding")]
 use crate::reader::EncodingRef;
 #[cfg(feature = "encoding")]
@@ -153,6 +155,78 @@ impl<'a> Reader<&'a [u8]> {
     pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
         Ok(read_to_end!(self, end, (), read_event_impl, {}))
     }
+
+    /// Reads the content between the start and end tags, including any markup,
+    /// and returns it as text. This function is supposed to be called after you
+    /// have already read a [`Start`] event.
+    ///
+    /// Correctly handles nesting where parent and child elements have the same name.
+    ///
+    /// This method does not unescape the data it reads; instead it returns the
+    /// content of the XML document "as is". This is because it does not know
+    /// what kind of text it reads: if, for example, the content contains a CDATA
+    /// section, an attempt to unescape it would corrupt the data.
+    ///
+    /// The text is decoded using the reader's current [`decoder()`].
+    ///
+    /// In effect, this method performs the following:
+    ///
+    /// ```ignore
+    /// let span = reader.read_to_end(end)?;
+    /// let text = reader.decoder().decode(&reader.inner_slice[span]);
+    /// ```
+    ///
+    /// # Errors
+    ///
+    /// Any error produced by `read_to_end` or by the decoder is returned to
+    /// the caller.
+    ///
+    /// # Examples
+    ///
+    /// This example shows how you can read HTML content from your XML document.
+    ///
+    /// ```
+    /// # use pretty_assertions::assert_eq;
+    /// # use std::borrow::Cow;
+    /// use quick_xml::events::{BytesStart, Event};
+    /// use quick_xml::Reader;
+    ///
+    /// let mut reader = Reader::from_str("
+    ///     <html>
+    ///         <title>This is a HTML text</title>
+    ///         <p>Usual XML rules does not apply inside it
+    ///         <p>For example, elements not needed to be &quot;closed&quot;
+    ///     </html>
+    /// ");
+    /// reader.trim_text(true);
+    ///
+    /// let start = BytesStart::new("html");
+    /// let end   = start.to_end().into_owned();
+    ///
+    /// // First, we read the start event...
+    /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
+    /// // ...and disable checking of end names, because we expect HTML further on...
+    /// reader.check_end_names(false);
+    ///
+    /// // ...then we can read the text content up to the closing tag.
+    /// // This call will correctly handle nested <html> elements.
+    /// let text = reader.read_text(end.name()).unwrap();
+    /// assert_eq!(text, Cow::Borrowed(r#"
+    ///         <title>This is a HTML text</title>
+    ///         <p>Usual XML rules does not apply inside it
+    ///         <p>For example, elements not needed to be &quot;closed&quot;
+    ///     "#));
+    ///
+    /// // Now we can enable the checks again
+    /// reader.check_end_names(true);
+    ///
+    /// // At the end we should get an Eof event, because we consumed the whole XML
+    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
+    /// ```
+    ///
+    /// [`Start`]: Event::Start
+    /// [`decoder()`]: Self::decoder()
+    pub fn read_text(&mut self, end: QName) -> Result<Cow<'a, str>> {
+        // `read_to_end` below changes `self.reader`, so keep a copy of the
+        // slice as it is right now; the returned text is cut from this copy.
+        let buffer = self.reader;
+        let span = self.read_to_end(end)?;
+
+        // Only the length of `span` is used here: the content is taken from
+        // the beginning of the saved `buffer`.
+        self.decoder().decode(&buffer[0..span.len()])
+    }
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////