From abec80f8f27a84c0941da929e52ab31b6264846e Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 9 Aug 2022 22:34:23 +0500 Subject: [PATCH 1/4] Fix #329: Borrow from input in `unescape_*` methods Because those methods usually used on events returned by reader, which always borrow content from input / buffer, actual allocation count does not changed --- Changelog.md | 2 ++ src/events/attributes.rs | 29 ++++++++++++++++++++--------- src/events/mod.rs | 14 ++++++++------ 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/Changelog.md b/Changelog.md index 23f5e72b..4fea4ed2 100644 --- a/Changelog.md +++ b/Changelog.md @@ -139,6 +139,7 @@ |`*_with_custom_entities`|`*_with` |`BytesText::unescaped()`|`BytesText::unescape()` |`Attribute::unescaped_*`|`Attribute::unescape_*` +- [#329]: Also, that functions now borrow from the input instead of event / attribute - [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method added to all events @@ -199,6 +200,7 @@ [#180]: https://github.com/tafia/quick-xml/issues/180 [#191]: https://github.com/tafia/quick-xml/issues/191 [#324]: https://github.com/tafia/quick-xml/issues/324 +[#329]: https://github.com/tafia/quick-xml/issues/329 [#363]: https://github.com/tafia/quick-xml/issues/363 [#387]: https://github.com/tafia/quick-xml/pull/387 [#391]: https://github.com/tafia/quick-xml/pull/391 diff --git a/src/events/attributes.rs b/src/events/attributes.rs index ad938f9b..0025003a 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -41,7 +41,7 @@ impl<'a> Attribute<'a> { /// /// This method is available only if `encoding` feature is **not** enabled. 
#[cfg(any(doc, not(feature = "encoding")))] - pub fn unescape_value(&self) -> XmlResult> { + pub fn unescape_value(&self) -> XmlResult> { self.unescape_value_with(|_| None) } @@ -61,19 +61,26 @@ impl<'a> Attribute<'a> { pub fn unescape_value_with<'entity>( &self, resolve_entity: impl Fn(&str) -> Option<&'entity str>, - ) -> XmlResult> { + ) -> XmlResult> { // from_utf8 should never fail because content is always UTF-8 encoded - Ok(unescape_with( - std::str::from_utf8(&self.value)?, - resolve_entity, - )?) + let decoded = match &self.value { + Cow::Borrowed(bytes) => Cow::Borrowed(std::str::from_utf8(bytes)?), + // Convert to owned, because otherwise Cow will be bound with wrong lifetime + Cow::Owned(bytes) => Cow::Owned(std::str::from_utf8(bytes)?.to_string()), + }; + + match unescape_with(&decoded, resolve_entity)? { + // Because result is borrowed, no replacements was done and we can use original string + Cow::Borrowed(_) => Ok(decoded), + Cow::Owned(s) => Ok(s.into()), + } } /// Decodes then unescapes the value. /// /// This will allocate if the value contains any escape sequences or in /// non-UTF-8 encoding. - pub fn decode_and_unescape_value(&self, reader: &Reader) -> XmlResult> { + pub fn decode_and_unescape_value(&self, reader: &Reader) -> XmlResult> { self.decode_and_unescape_value_with(reader, |_| None) } @@ -85,8 +92,12 @@ impl<'a> Attribute<'a> { &self, reader: &Reader, resolve_entity: impl Fn(&str) -> Option<&'entity str>, - ) -> XmlResult> { - let decoded = reader.decoder().decode(&*self.value)?; + ) -> XmlResult> { + let decoded = match &self.value { + Cow::Borrowed(bytes) => reader.decoder().decode(bytes)?, + // Convert to owned, because otherwise Cow will be bound with wrong lifetime + Cow::Owned(bytes) => reader.decoder().decode(bytes)?.into_owned().into(), + }; match unescape_with(&decoded, resolve_entity)? 
{ // Because result is borrowed, no replacements was done and we can use original string diff --git a/src/events/mod.rs b/src/events/mod.rs index e267e6f0..2366d1fe 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -732,7 +732,7 @@ impl<'a> BytesText<'a> { /// /// This will allocate if the value contains any escape sequences or in /// non-UTF-8 encoding. - pub fn unescape(&self) -> Result> { + pub fn unescape(&self) -> Result> { self.unescape_with(|_| None) } @@ -743,8 +743,12 @@ impl<'a> BytesText<'a> { pub fn unescape_with<'entity>( &self, resolve_entity: impl Fn(&str) -> Option<&'entity str>, - ) -> Result> { - let decoded = self.decoder.decode(&*self)?; + ) -> Result> { + let decoded = match &self.content { + Cow::Borrowed(bytes) => self.decoder.decode(bytes)?, + // Convert to owned, because otherwise Cow will be bound with wrong lifetime + Cow::Owned(bytes) => self.decoder.decode(bytes)?.into_owned().into(), + }; match unescape_with(&decoded, resolve_entity)? { // Because result is borrowed, no replacements was done and we can use original string @@ -754,11 +758,9 @@ impl<'a> BytesText<'a> { } /// Gets content of this text buffer in the specified encoding and optionally - /// unescapes it. Unlike [`Self::unescape`] & Co., the lifetime - /// of the returned `Cow` is bound to the original buffer / input + /// unescapes it. 
#[cfg(feature = "serialize")] pub(crate) fn decode(&self, unescape: bool) -> Result> { - //TODO: too many copies, can be optimized let text = match &self.content { Cow::Borrowed(bytes) => self.decoder.decode(bytes)?, // Convert to owned, because otherwise Cow will be bound with wrong lifetime From 7fafac40e89cd6c9ba5f1c0902da00e132aa0866 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 9 Aug 2022 22:46:53 +0500 Subject: [PATCH 2/4] Return `Span` of skipped bytes from `read_to_end*` --- Changelog.md | 2 ++ src/de/mod.rs | 6 ++++-- src/reader/async_tokio.rs | 14 ++++++++------ src/reader/buffered_reader.rs | 15 ++++++++++----- src/reader/mod.rs | 10 +++++++++- src/reader/ns_reader.rs | 20 +++++++++++++++----- src/reader/slice_reader.rs | 13 +++++++++---- 7 files changed, 57 insertions(+), 23 deletions(-) diff --git a/Changelog.md b/Changelog.md index 4fea4ed2..e75d1407 100644 --- a/Changelog.md +++ b/Changelog.md @@ -40,6 +40,7 @@ - [#439]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()` under the `quick-xml::encoding` namespace. 
- [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers +- [#455]: Change return type of all `read_to_end*` methods to return a span between tags ### Bug Fixes @@ -222,6 +223,7 @@ [#440]: https://github.com/tafia/quick-xml/pull/440 [#443]: https://github.com/tafia/quick-xml/pull/443 [#450]: https://github.com/tafia/quick-xml/pull/450 +[#455]: https://github.com/tafia/quick-xml/pull/455 ## 0.23.0 -- 2022-05-08 diff --git a/src/de/mod.rs b/src/de/mod.rs index 0421e661..8f65ff6b 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -951,7 +951,8 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader { fn read_to_end(&mut self, name: QName) -> Result<(), DeError> { match self.reader.read_to_end_into(name, &mut self.buf) { Err(Error::UnexpectedEof(_)) => Err(DeError::UnexpectedEof), - other => Ok(other?), + Err(e) => Err(e.into()), + Ok(_) => Ok(()), } } @@ -991,7 +992,8 @@ impl<'de> XmlRead<'de> for SliceReader<'de> { fn read_to_end(&mut self, name: QName) -> Result<(), DeError> { match self.reader.read_to_end(name) { Err(Error::UnexpectedEof(_)) => Err(DeError::UnexpectedEof), - other => Ok(other?), + Err(e) => Err(e.into()), + Ok(_) => Ok(()), } } diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 1e06b0b8..26ec9ed0 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -9,7 +9,9 @@ use tokio::io::{self, AsyncBufRead, AsyncBufReadExt}; use crate::events::Event; use crate::name::{QName, ResolveResult}; use crate::reader::buffered_reader::impl_buffered_source; -use crate::reader::{is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader}; +use crate::reader::{ + is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span, +}; use crate::{Error, Result}; /// A struct for read XML asynchronously from an [`AsyncBufRead`]. @@ -125,7 +127,7 @@ impl Reader { /// // First, we read a start event... 
/// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Start(start)); /// - /// //...then, we could skip all events to the corresponding end event. + /// // ...then, we could skip all events to the corresponding end event. /// // This call will correctly handle nested elements. /// // Note, however, that this method does not handle namespaces. /// reader.read_to_end_into_async(end.name(), &mut buf).await.unwrap(); @@ -142,8 +144,8 @@ impl Reader { // We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033` end: QName<'n>, buf: &mut Vec, - ) -> Result<()> { - read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await) + ) -> Result { + Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await)) } /// Read until '<' is found and moves reader to an `Opened` state. @@ -275,7 +277,7 @@ impl NsReader { /// (ResolveResult::Bound(ns), Event::Start(start)) /// ); /// - /// //...then, we could skip all events to the corresponding end event. + /// // ...then, we could skip all events to the corresponding end event. /// // This call will correctly handle nested elements. /// // Note, however, that this method does not handle namespaces. /// reader.read_to_end_into_async(end.name(), &mut buf).await.unwrap(); @@ -295,7 +297,7 @@ impl NsReader { // We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033` end: QName<'n>, buf: &mut Vec, - ) -> Result<()> { + ) -> Result { // According to the https://www.w3.org/TR/xml11/#dt-etag, end name should // match literally the start name. 
See `Reader::check_end_names` documentation self.reader.read_to_end_into_async(end, buf).await diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index f09ba706..a32946bf 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -10,7 +10,7 @@ use memchr; use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::QName; -use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource}; +use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource}; macro_rules! impl_buffered_source { ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => { @@ -277,6 +277,10 @@ impl Reader { /// storage for events content. This function is supposed to be called after /// you already read a [`Start`] event. /// + /// Returns a span that cover content between `>` of an opening tag and `<` of + /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and + /// this method was called after reading expanded [`Start`] event. + /// /// Manages nested cases where parent and child elements have the same name. /// /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`] @@ -340,7 +344,7 @@ impl Reader { /// // First, we read a start event... /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start)); /// - /// //...then, we could skip all events to the corresponding end event. + /// // ...then, we could skip all events to the corresponding end event. /// // This call will correctly handle nested elements. /// // Note, however, that this method does not handle namespaces. 
/// reader.read_to_end_into(end.name(), &mut buf).unwrap(); @@ -353,12 +357,13 @@ impl Reader { /// [`End`]: Event::End /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end /// [`read_to_end()`]: Self::read_to_end + /// [`expand_empty_elements`]: Self::expand_empty_elements /// [`check_end_names`]: Self::check_end_names /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag - pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec) -> Result<()> { - read_to_end!(self, end, buf, read_event_impl, { + pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec) -> Result { + Ok(read_to_end!(self, end, buf, read_event_impl, { buf.clear(); - }) + })) } /// Reads optional text between start and end tags. diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 73ff5061..29b55962 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -2,6 +2,7 @@ #[cfg(feature = "encoding")] use encoding_rs::Encoding; +use std::ops::Range; use crate::encoding::Decoder; use crate::errors::{Error, Result}; @@ -238,16 +239,18 @@ macro_rules! read_to_end { $clear:block $(, $await:ident)? ) => {{ + let start = $self.buffer_position(); let mut depth = 0; loop { $clear + let end = $self.buffer_position(); match $self.$read_event($buf) $(.$await)? { Err(e) => return Err(e), Ok(Event::Start(e)) if e.name() == $end => depth += 1, Ok(Event::End(e)) if e.name() == $end => { if depth == 0 { - return Ok(()); + break start..end; } depth -= 1; } @@ -270,6 +273,11 @@ mod slice_reader; pub use ns_reader::NsReader; +/// Range of input in bytes, that corresponds to some piece of XML +pub type Span = Range; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// Possible reader states. 
The state transition diagram (`true` and `false` shows /// value of [`Reader::expand_empty_elements()`] option): /// diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs index 3f56d248..be70382c 100644 --- a/src/reader/ns_reader.rs +++ b/src/reader/ns_reader.rs @@ -12,7 +12,7 @@ use std::path::Path; use crate::errors::Result; use crate::events::Event; use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult}; -use crate::reader::{Reader, XmlSource}; +use crate::reader::{Reader, Span, XmlSource}; /// A low level encoding-agnostic XML event reader that performs namespace resolution. /// @@ -425,6 +425,10 @@ impl NsReader { /// storage for events content. This function is supposed to be called after /// you already read a [`Start`] event. /// + /// Returns a span that cover content between `>` of an opening tag and `<` of + /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and + /// this method was called after reading expanded [`Start`] event. + /// /// Manages nested cases where parent and child elements have the same name /// ("the same" means that their local names are the same and their prefixes /// resolves to the same namespace). @@ -491,7 +495,7 @@ impl NsReader { /// (ResolveResult::Bound(ns), Event::Start(start)) /// ); /// - /// //...then, we could skip all events to the corresponding end event. + /// // ...then, we could skip all events to the corresponding end event. /// // This call will correctly handle nested elements. /// // Note, however, that this method does not handle namespaces. 
/// reader.read_to_end_into(end.name(), &mut buf).unwrap(); @@ -508,8 +512,9 @@ impl NsReader { /// [`UnexpectedEof`]: crate::errors::Error::UnexpectedEof /// [`read_to_end()`]: Self::read_to_end /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end + /// [`expand_empty_elements`]: Self::expand_empty_elements #[inline] - pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec) -> Result<()> { + pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec) -> Result { // According to the https://www.w3.org/TR/xml11/#dt-etag, end name should // match literally the start name. See `Self::check_end_names` documentation self.reader.read_to_end_into(end, buf) @@ -657,6 +662,10 @@ impl<'i> NsReader<&'i [u8]> { /// Reads until end element is found. This function is supposed to be called /// after you already read a [`Start`] event. /// + /// Returns a span that cover content between `>` of an opening tag and `<` of + /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and + /// this method was called after reading expanded [`Start`] event. + /// /// Manages nested cases where parent and child elements have the same name /// ("the same" means that their local names are the same and their prefixes /// resolves to the same namespace). @@ -717,7 +726,7 @@ impl<'i> NsReader<&'i [u8]> { /// (ResolveResult::Bound(ns), Event::Start(start)) /// ); /// - /// //...then, we could skip all events to the corresponding end event. + /// // ...then, we could skip all events to the corresponding end event. /// // This call will correctly handle nested elements. /// // Note, however, that this method does not handle namespaces. 
/// reader.read_to_end(end.name()).unwrap(); @@ -734,8 +743,9 @@ impl<'i> NsReader<&'i [u8]> { /// [`UnexpectedEof`]: crate::errors::Error::UnexpectedEof /// [`read_to_end()`]: Self::read_to_end /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end + /// [`expand_empty_elements`]: Self::expand_empty_elements #[inline] - pub fn read_to_end(&mut self, end: QName) -> Result<()> { + pub fn read_to_end(&mut self, end: QName) -> Result { // According to the https://www.w3.org/TR/xml11/#dt-etag, end name should // match literally the start name. See `Self::check_end_names` documentation self.reader.read_to_end(end) diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index d4bf0d7e..e6312a76 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -10,7 +10,7 @@ use encoding_rs::UTF_8; use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::QName; -use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource}; +use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource}; use memchr; @@ -74,6 +74,10 @@ impl<'a> Reader<&'a [u8]> { /// Reads until end element is found. This function is supposed to be called /// after you already read a [`Start`] event. /// + /// Returns a span that cover content between `>` of an opening tag and `<` of + /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and + /// this method was called after reading expanded [`Start`] event. + /// /// Manages nested cases where parent and child elements have the same name. /// /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`] @@ -131,7 +135,7 @@ impl<'a> Reader<&'a [u8]> { /// // First, we read a start event... /// assert_eq!(reader.read_event().unwrap(), Event::Start(start)); /// - /// //...then, we could skip all events to the corresponding end event. + /// // ...then, we could skip all events to the corresponding end event. 
/// // This call will correctly handle nested elements. /// // Note, however, that this method does not handle namespaces. /// reader.read_to_end(end.name()).unwrap(); @@ -143,10 +147,11 @@ impl<'a> Reader<&'a [u8]> { /// [`Start`]: Event::Start /// [`End`]: Event::End /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end + /// [`expand_empty_elements`]: Self::expand_empty_elements /// [`check_end_names`]: Self::check_end_names /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag - pub fn read_to_end(&mut self, end: QName) -> Result<()> { - read_to_end!(self, end, (), read_event_impl, {}) + pub fn read_to_end(&mut self, end: QName) -> Result { + Ok(read_to_end!(self, end, (), read_event_impl, {})) } } From d67f4263436c92f28550ea02800d62b9a2a485cd Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 9 Aug 2022 23:21:01 +0500 Subject: [PATCH 3/4] #154: Implement `read_text` which read all content between tags as a text, including other markup --- Changelog.md | 1 + src/reader/ns_reader.rs | 70 ++++++++++++++++++++++++++++++++++++ src/reader/slice_reader.rs | 74 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 145 insertions(+) diff --git a/Changelog.md b/Changelog.md index e75d1407..1bf0fe4f 100644 --- a/Changelog.md +++ b/Changelog.md @@ -41,6 +41,7 @@ under the `quick-xml::encoding` namespace. - [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers - [#455]: Change return type of all `read_to_end*` methods to return a span between tags +- [#455]: Added `Reader::read_text` method to return a raw content (including markup) between tags ### Bug Fixes diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs index be70382c..6ad5cabe 100644 --- a/src/reader/ns_reader.rs +++ b/src/reader/ns_reader.rs @@ -4,6 +4,7 @@ //! [qualified names]: https://www.w3.org/TR/xml-names11/#dt-qualname //! 
[expanded names]: https://www.w3.org/TR/xml-names11/#dt-expname +use std::borrow::Cow; use std::fs::File; use std::io::{BufRead, BufReader}; use std::ops::Deref; @@ -750,6 +751,75 @@ impl<'i> NsReader<&'i [u8]> { // match literally the start name. See `Self::check_end_names` documentation self.reader.read_to_end(end) } + + /// Reads content between start and end tags, including any markup. This + /// function is supposed to be called after you already read a [`Start`] event. + /// + /// Manages nested cases where parent and child elements have the same name. + /// + /// This method does not unescape the data it reads; instead it returns the content + /// of the XML document "as is". This is because it has no idea what text + /// it reads, and if, for example, it contains a CDATA section, an attempt to + /// unescape its content will corrupt the data. + /// + /// Any text will be decoded using the current XML [`decoder()`]. + /// + /// In effect, this method performs the following code: + /// + /// ```ignore + /// let span = reader.read_to_end(end)?; + /// let text = reader.decoder().decode(&reader.inner_slice[span]); + /// ``` + /// + /// # Examples + /// + /// This example shows how you can read HTML content from your XML document. + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// # use std::borrow::Cow; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::NsReader; + /// + /// let mut reader = NsReader::from_str(r#" + /// + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// + /// "#); + /// reader.trim_text(true); + /// + /// let start = BytesStart::new("html"); + /// let end = start.to_end().into_owned(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event().unwrap(), Event::Start(start)); + /// // ...and disable checking of end names because we expect HTML further... + /// reader.check_end_names(false); + /// + /// // ...then, we could read text content until close tag. + /// // This call will correctly handle nested elements. + /// let text = reader.read_text(end.name()).unwrap(); + /// assert_eq!(text, Cow::Borrowed(r#" + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// "#)); + /// + /// // Now we can enable checks again + /// reader.check_end_names(true); + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event().unwrap(), Event::Eof); + /// ``` + /// + /// [`Start`]: Event::Start + /// [`decoder()`]: Reader::decoder() + #[inline] + pub fn read_text(&mut self, end: QName) -> Result> { + self.reader.read_text(end) + } } impl Deref for NsReader { diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index e6312a76..4d9c80b4 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -2,6 +2,8 @@ //! underlying byte stream. This implementation supports not using an //! intermediate buffer as the byte slice itself can be used to borrow from. +use std::borrow::Cow; + #[cfg(feature = "encoding")] use crate::reader::EncodingRef; #[cfg(feature = "encoding")] @@ -153,6 +155,78 @@ impl<'a> Reader<&'a [u8]> { pub fn read_to_end(&mut self, end: QName) -> Result { Ok(read_to_end!(self, end, (), read_event_impl, {})) } + + /// Reads content between start and end tags, including any markup. This + /// function is supposed to be called after you already read a [`Start`] event. + /// + /// Manages nested cases where parent and child elements have the same name. + /// + /// This method does not unescape the data it reads; instead it returns the content + /// of the XML document "as is". This is because it has no idea what text + /// it reads, and if, for example, it contains a CDATA section, an attempt to + /// unescape its content will corrupt the data. + /// + /// Any text will be decoded using the current XML [`decoder()`]. 
+ /// + /// In effect, this method performs the following code: + /// + /// ```ignore + /// let span = reader.read_to_end(end)?; + /// let text = reader.decoder().decode(&reader.inner_slice[span]); + /// ``` + /// + /// # Examples + /// + /// This example shows how you can read HTML content from your XML document. + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// # use std::borrow::Cow; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::Reader; + /// + /// let mut reader = Reader::from_str(" + /// + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// + /// "); + /// reader.trim_text(true); + /// + /// let start = BytesStart::new("html"); + /// let end = start.to_end().into_owned(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event().unwrap(), Event::Start(start)); + /// // ...and disable checking of end names because we expect HTML further... + /// reader.check_end_names(false); + /// + /// // ...then, we could read text content until close tag. + /// // This call will correctly handle nested elements. + /// let text = reader.read_text(end.name()).unwrap(); + /// assert_eq!(text, Cow::Borrowed(r#" + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// "#)); + /// + /// // Now we can enable checks again + /// reader.check_end_names(true); + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event().unwrap(), Event::Eof); + /// ``` + /// + /// [`Start`]: Event::Start + /// [`decoder()`]: Self::decoder() + pub fn read_text(&mut self, end: QName) -> Result> { + // self.reader will be changed, so store original reference + let buffer = self.reader; + let span = self.read_to_end(end)?; + + self.decoder().decode(&buffer[0..span.len()]) + } } //////////////////////////////////////////////////////////////////////////////////////////////////// From 792d23d9b0a2207ebd740cdbf599ee2ee4310017 Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 10 Aug 2022 22:04:58 +0500 Subject: [PATCH 4/4] Remove confusing `Reader::read_text_into` which looks similar to `read_text` but works totally different It is better to explicitly match `Event::Text` --- Changelog.md | 2 ++ examples/read_texts.rs | 18 +++++-------- src/reader/buffered_reader.rs | 48 ----------------------------------- 3 files changed, 8 insertions(+), 60 deletions(-) diff --git a/Changelog.md b/Changelog.md index 1bf0fe4f..aa8e287e 100644 --- a/Changelog.md +++ b/Changelog.md @@ -184,6 +184,8 @@ - [#440]: Removed `Deserializer::from_slice` and `quick_xml::de::from_slice` methods because deserializing from a byte array cannot guarantee borrowing due to possible copying while decoding. 
+- [#455]: Removed `Reader::read_text_into`, which was little more than a wrapper over a match on `Event::Text` + ### New Tests - [#9]: Added tests for incorrect nested tags in input diff --git a/examples/read_texts.rs b/examples/read_texts.rs index 40d71e63..9a0dd240 100644 --- a/examples/read_texts.rs +++ b/examples/read_texts.rs @@ -1,6 +1,5 @@ fn main() { use quick_xml::events::Event; - use quick_xml::name::QName; use quick_xml::Reader; let xml = "text1text2\ @@ -9,23 +8,18 @@ fn main() { let mut reader = Reader::from_str(xml); reader.trim_text(true); - let mut txt = Vec::new(); - let mut buf = Vec::new(); - loop { - match reader.read_event_into(&mut buf) { - Ok(Event::Start(ref e)) if e.name().as_ref() == b"tag2" => { - txt.push( - reader - .read_text_into(QName(b"tag2"), &mut Vec::new()) - .expect("Cannot decode text value"), - ); + match reader.read_event() { + Ok(Event::Start(e)) if e.name().as_ref() == b"tag2" => { + // read_text_into for buffered readers not implemented + let txt = reader + .read_text(e.name()) + .expect("Cannot decode text value"); println!("{:?}", txt); } Ok(Event::Eof) => break, // exits the loop when reaching end of file Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), _ => (), // There are several other `Event`s we do not consider here } - buf.clear(); } } diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index a32946bf..ca3c2098 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -365,54 +365,6 @@ impl Reader { buf.clear(); })) } - - /// Reads optional text between start and end tags. - /// - /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a - /// `String`. If the next event is an [`End`] event, returns the empty string. In all other - /// cases, returns an error. - /// - /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 - /// if none is specified). 
- /// - /// # Examples - /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use quick_xml::Reader; - /// use quick_xml::events::Event; - /// - /// let mut xml = Reader::from_reader(b" - /// <b> - /// - /// " as &[u8]); - /// xml.trim_text(true); - /// - /// let expected = ["", ""]; - /// for &content in expected.iter() { - /// match xml.read_event_into(&mut Vec::new()) { - /// Ok(Event::Start(ref e)) => { - /// assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content); - /// }, - /// e => panic!("Expecting Start event, found {:?}", e), - /// } - /// } - /// ``` - /// - /// [`Text`]: Event::Text - /// [`End`]: Event::End - pub fn read_text_into(&mut self, end: QName, buf: &mut Vec) -> Result { - let s = match self.read_event_into(buf) { - Err(e) => return Err(e), - - Ok(Event::Text(e)) => e.unescape()?.into_owned(), - Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()), - Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), - _ => return Err(Error::TextNotFound), - }; - self.read_to_end_into(end, buf)?; - Ok(s) - } } impl Reader> {