diff --git a/Changelog.md b/Changelog.md index e75d1407..1bf0fe4f 100644 --- a/Changelog.md +++ b/Changelog.md @@ -41,6 +41,7 @@ under the `quick-xml::encoding` namespace. - [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers - [#455]: Change return type of all `read_to_end*` methods to return a span between tags +- [#455]: Added `Reader::read_text` method to return raw content (including markup) between tags ### Bug Fixes diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs index 379f067c..0c02e5e2 100644 --- a/src/reader/ns_reader.rs +++ b/src/reader/ns_reader.rs @@ -4,6 +4,7 @@ //! [qualified names]: https://www.w3.org/TR/xml-names11/#dt-qualname //! [expanded names]: https://www.w3.org/TR/xml-names11/#dt-expname +use std::borrow::Cow; use std::fs::File; use std::io::{BufRead, BufReader}; use std::ops::Deref; @@ -750,6 +751,75 @@ impl<'i> NsReader<&'i [u8]> { // match literally the start name. See `Self::check_end_names` documentation self.reader.read_to_end(end) } + + /// Reads content between start and end tags, including any markup. This + /// function is supposed to be called after you already read a [`Start`] event. + /// + /// Manages nested cases where parent and child elements have the same name. + /// + /// This method does not unescape read data, instead it returns content + /// "as is" of the XML document. This is because it has no idea what text + /// it reads, and if, for example, it contains a CDATA section, an attempt to + /// unescape its content will spoil the data. + /// + /// Any text will be decoded using the current XML [`decoder()`]. + /// + /// Actually, this method performs the following code: + /// + /// ```ignore + /// let span = reader.read_to_end(end)?; + /// let text = reader.decoder().decode(&reader.inner_slice[span]); + /// ``` + /// + /// # Examples + /// + /// This example shows how you can read HTML content from your XML document.
+ /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// # use std::borrow::Cow; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::NsReader; + /// + /// let mut reader = NsReader::from_str(r#" + /// + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// + /// "#); + /// reader.trim_text(true); + /// + /// let start = BytesStart::new("html"); + /// let end = start.to_end().into_owned(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event().unwrap(), Event::Start(start)); + /// // ...and disable checking of end names because we expect HTML further... + /// reader.check_end_names(false); + /// + /// // ...then, we could read text content until close tag. + /// // This call will correctly handle nested elements. + /// let text = reader.read_text(end.name()).unwrap(); + /// assert_eq!(text, Cow::Borrowed(r#" + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// "#)); + /// + /// // Now we can enable checks again + /// reader.check_end_names(true); + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event().unwrap(), Event::Eof); + /// ``` + /// + /// [`Start`]: Event::Start + /// [`decoder()`]: Reader::decoder() + #[inline] + pub fn read_text(&mut self, end: QName) -> Result> { + self.reader.read_text(end) + } } impl Deref for NsReader { diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 0c2e5dfd..1f196a01 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -2,6 +2,8 @@ //! underlying byte stream. This implementation supports not using an //! intermediate buffer as the byte slice itself can be used to borrow from. +use std::borrow::Cow; + #[cfg(feature = "encoding")] use crate::reader::EncodingRef; #[cfg(feature = "encoding")] @@ -153,6 +155,78 @@ impl<'a> Reader<&'a [u8]> { pub fn read_to_end(&mut self, end: QName) -> Result { Ok(read_to_end!(self, end, (), read_event_impl, {})) } + + /// Reads content between start and end tags, including any markup. This + /// function is supposed to be called after you already read a [`Start`] event. + /// + /// Manages nested cases where parent and child elements have the same name. + /// + /// This method does not unescape read data, instead it returns content + /// "as is" of the XML document. This is because it has no idea what text + /// it reads, and if, for example, it contains a CDATA section, an attempt to + /// unescape its content will spoil the data. + /// + /// Any text will be decoded using the current XML [`decoder()`].
+ /// + /// Actually, this method performs the following code: + /// + /// ```ignore + /// let span = reader.read_to_end(end)?; + /// let text = reader.decoder().decode(&reader.inner_slice[span]); + /// ``` + /// + /// # Examples + /// + /// This example shows how you can read HTML content from your XML document. + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// # use std::borrow::Cow; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::Reader; + /// + /// let mut reader = Reader::from_str(r#" + /// + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// + /// "#); + /// reader.trim_text(true); + /// + /// let start = BytesStart::new("html"); + /// let end = start.to_end().into_owned(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event().unwrap(), Event::Start(start)); + /// // ...and disable checking of end names because we expect HTML further... + /// reader.check_end_names(false); + /// + /// // ...then, we could read text content until close tag. + /// // This call will correctly handle nested elements. + /// let text = reader.read_text(end.name()).unwrap(); + /// assert_eq!(text, Cow::Borrowed(r#" + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// "#)); + /// + /// // Now we can enable checks again + /// reader.check_end_names(true); + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event().unwrap(), Event::Eof); + /// ``` + /// + /// [`Start`]: Event::Start + /// [`decoder()`]: Self::decoder() + pub fn read_text(&mut self, end: QName) -> Result> { + // self.reader will be changed, so store original reference + let buffer = self.reader; + let span = self.read_to_end(end)?; + + self.decoder().decode(&buffer[0..span.len()]) + } } ////////////////////////////////////////////////////////////////////////////////////////////////////