diff --git a/Changelog.md b/Changelog.md index 24690d08..64bb395f 100644 --- a/Changelog.md +++ b/Changelog.md @@ -17,6 +17,11 @@ - [#393]: New module `name` with `QName`, `LocalName`, `Namespace`, `Prefix` and `PrefixDeclaration` wrappers around byte arrays and `ResolveResult` with the result of namespace resolution +- [#180]: Make `Decoder` struct public. You already had access to it via the + `Reader::decoder()` method, but could not name it in the code. Now the preferred + way to access decoding functionality is via this struct +- [#191]: New event variant `StartText` emitted for bytes before the XML declaration + or a start comment or a tag. For streams with BOM this event will contain a BOM ### Bug Fixes @@ -56,6 +61,33 @@ - [#393]: Now `BytesStart::name()` and `BytesEnd::name()` returns `QName`, and `BytesStart::local_name()` and `BytesEnd::local_name()` returns `LocalName` +- [#191]: Remove unused `reader.decoder().decode_owned()`. If you ever used it, + use `String::from_utf8` instead (which that function did) +- [#191]: Remove `*_without_bom` methods from the `Attributes` struct because they are useless. + Use the same-named methods without that suffix instead. Attribute values cannot contain BOM +- [#191]: Remove `Reader::decode()` and `Reader::decode_without_bom()`, they are replaced by + `Decoder::decode()` and `Decoder::decode_with_bom_removal()`. + Use `reader.decoder().decode_*(...)` instead of `reader.decode_*(...)` for now. + `Reader::encoding()` is replaced by `Decoder::encoding()` as well +- [#191]: Remove poorly designed `BytesText::unescape_and_decode_without_bom()` and + `BytesText::unescape_and_decode_without_bom_with_custom_entities()`. Although these methods worked + as expected, this was only due to good luck. They was replaced by the + `BytesStartText::decode_with_bom_removal()`: + - conceptually, you should decode BOM only for the first `Text` event from the + reader (since now `StartText` event is emitted instead for this) + - text before the first tag is not an XML content at all, so it is meaningless + to try to unescape something in it + +- [#180]: Eliminated the differences in the decoding API when feature `encoding` enabled and when it is + disabled. Signatures of functions are now the same regardless of whether or not the feature is + enabled, and an error will be returned instead of performing replacements for invalid characters + in both cases. + + Previously, if the `encoding` feature was enabled, decoding functions would return `Result>` + while without this feature they would return `Result<&str>`. With this change, only `Result>` + is returned regardless of the status of the feature. +- [#180]: Error variant `Error::Utf8` replaced by `Error::NonDecodable` + ### New Tests - [#9]: Added tests for incorrect nested tags in input @@ -66,6 +98,8 @@ [#8]: https://github.com/Mingun/fast-xml/pull/8 [#9]: https://github.com/Mingun/fast-xml/pull/9 +[#180]: https://github.com/tafia/quick-xml/issues/180 +[#191]: https://github.com/tafia/quick-xml/issues/191 [#363]: https://github.com/tafia/quick-xml/issues/363 [#387]: https://github.com/tafia/quick-xml/pull/387 [#391]: https://github.com/tafia/quick-xml/pull/391 diff --git a/benches/bench.rs b/benches/bench.rs index 389af6f4..29c9f90f 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -1,6 +1,7 @@ use criterion::{self, criterion_group, criterion_main, Criterion}; use pretty_assertions::assert_eq; use quick_xml::events::Event; +use quick_xml::name::QName; use quick_xml::Reader; static SAMPLE: &[u8] = include_bytes!("../tests/sample_rss.xml"); @@ -173,7 +174,7 @@ fn bytes_text_unescaped(c: &mut Criterion) { /// Benchmarks, how fast individual event parsed fn one_event(c: &mut Criterion) { let mut group = c.benchmark_group("One event"); - group.bench_function("Text", |b| { + group.bench_function("StartText", |b| { let src = "Hello world!".repeat(512 / 12).into_bytes(); let mut buf = Vec::with_capacity(1024); b.iter(|| { @@ -181,7 +182,7 @@ fn one_event(c: &mut Criterion) { let mut nbtxt = criterion::black_box(0); r.check_end_names(false).check_comments(false); match r.read_event(&mut buf) { - Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(), + Ok(Event::StartText(e)) => nbtxt += e.unescaped().unwrap().len(), something_else => panic!("Did not expect {:?}", something_else), }; @@ -310,7 +311,7 @@ fn attributes(c: &mut Criterion) { let mut buf = Vec::new(); loop { match r.read_event(&mut buf) { - Ok(Event::Empty(e)) if e.name() == b"player" => { + Ok(Event::Empty(e)) if e.name() == QName(b"player") => { for name in ["num", "status", "avg"] { if let Some(_attr) = e.try_get_attribute(name).unwrap() { count += 1 diff --git a/src/de/escape.rs b/src/de/escape.rs index 28f99364..e47109e0 100644 --- a/src/de/escape.rs +++ b/src/de/escape.rs @@ -45,12 +45,8 @@ macro_rules! deserialize_num { where V: Visitor<'de>, { - #[cfg(not(feature = "encoding"))] let value = self.decoder.decode(self.escaped_value.as_ref())?.parse()?; - #[cfg(feature = "encoding")] - let value = self.decoder.decode(self.escaped_value.as_ref()).parse()?; - visitor.$visit(value) } }; @@ -71,11 +67,8 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> { V: Visitor<'de>, { let unescaped = self.unescaped()?; - #[cfg(not(feature = "encoding"))] let value = self.decoder.decode(&unescaped)?; - #[cfg(feature = "encoding")] - let value = self.decoder.decode(&unescaped); visitor.visit_str(&value) } diff --git a/src/de/mod.rs b/src/de/mod.rs index 3529f329..e252ada7 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -337,7 +337,7 @@ where { #[cfg(feature = "encoding")] { - let value = decoder.decode(value); + let value = decoder.decode(value)?; // No need to unescape because valid boolean representations cannot be escaped match value.as_ref() { "true" | "1" | "True" | "TRUE" | "t" | "Yes" | "YES" | "yes" | "y" => { @@ -624,7 +624,7 @@ where allow_start: bool, ) -> Result, DeError> { match self.next()? { - DeEvent::Text(e) if unescape => e.unescape().map_err(|e| DeError::InvalidXml(e.into())), + DeEvent::Text(e) if unescape => e.unescape().map_err(Into::into), DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())), DeEvent::CData(e) => Ok(e), DeEvent::Start(e) if allow_start => { @@ -952,6 +952,10 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader { let event = loop { let e = self.reader.read_event(&mut self.buf)?; match e { + //TODO: Probably not the best idea treat StartText as usual text + // Usually this event will represent a BOM + // Changing this requires review of the serde-de::top_level::one_element test + Event::StartText(e) => break Ok(DeEvent::Text(e.into_owned().into())), Event::Start(e) => break Ok(DeEvent::Start(e.into_owned())), Event::End(e) => break Ok(DeEvent::End(e.into_owned())), Event::Text(e) => break Ok(DeEvent::Text(e.into_owned())), @@ -992,6 +996,10 @@ impl<'de> XmlRead<'de> for SliceReader<'de> { loop { let e = self.reader.read_event_unbuffered()?; match e { + //TODO: Probably not the best idea treat StartText as usual text + // Usually this event will represent a BOM + // Changing this requires review of the serde-de::top_level::one_element test + Event::StartText(e) => break Ok(DeEvent::Text(e.into())), Event::Start(e) => break Ok(DeEvent::Start(e)), Event::End(e) => break Ok(DeEvent::End(e)), Event::Text(e) => break Ok(DeEvent::Text(e)), diff --git a/src/de/seq.rs b/src/de/seq.rs index 958f172d..fe4559bd 100644 --- a/src/de/seq.rs +++ b/src/de/seq.rs @@ -2,8 +2,6 @@ use crate::de::{DeError, DeEvent, Deserializer, XmlRead}; use crate::events::BytesStart; use crate::reader::Decoder; use serde::de::{DeserializeSeed, SeqAccess}; -#[cfg(not(feature = "encoding"))] -use std::borrow::Cow; /// Check if tag `start` is included in the `fields` list. `decoder` is used to /// get a string representation of a tag. @@ -14,11 +12,7 @@ pub fn not_in( start: &BytesStart, decoder: Decoder, ) -> Result { - #[cfg(not(feature = "encoding"))] - let tag = Cow::Borrowed(decoder.decode(start.name().into_inner())?); - - #[cfg(feature = "encoding")] - let tag = decoder.decode(start.name().into_inner()); + let tag = decoder.decode(start.name().into_inner())?; Ok(fields.iter().all(|&field| field != tag.as_ref())) } diff --git a/src/errors.rs b/src/errors.rs index 79eb7899..c1af08af 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -4,14 +4,16 @@ use crate::escape::EscapeError; use crate::events::attributes::AttrError; use crate::utils::write_byte_string; use std::str::Utf8Error; +use std::string::FromUtf8Error; /// The error type used by this crate. #[derive(Debug)] pub enum Error { /// IO error Io(::std::io::Error), - /// Utf8 error - Utf8(Utf8Error), + /// Input decoding error. If `encoding` feature is disabled, contains `None`, + /// otherwise contains the UTF-8 decoding error + NonDecodable(Option), /// Unexpected End of File UnexpectedEof(String), /// End event mismatch @@ -46,10 +48,18 @@ impl From<::std::io::Error> for Error { } impl From for Error { - /// Creates a new `Error::Utf8` from the given error + /// Creates a new `Error::NonDecodable` from the given error #[inline] fn from(error: Utf8Error) -> Error { - Error::Utf8(error) + Error::NonDecodable(Some(error)) + } +} + +impl From for Error { + /// Creates a new `Error::Utf8` from the given error + #[inline] + fn from(error: FromUtf8Error) -> Error { + error.utf8_error().into() } } @@ -77,7 +87,8 @@ impl std::fmt::Display for Error { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { Error::Io(e) => write!(f, "I/O error: {}", e), - Error::Utf8(e) => write!(f, "UTF8 error: {}", e), + Error::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"), + Error::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e), Error::UnexpectedEof(e) => write!(f, "Unexpected EOF during reading {}", e), Error::EndEventMismatch { expected, found } => { write!(f, "Expecting found ", expected, found) @@ -109,7 +120,7 @@ impl std::error::Error for Error { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { Error::Io(e) => Some(e), - Error::Utf8(e) => Some(e), + Error::NonDecodable(Some(e)) => Some(e), Error::InvalidAttr(e) => Some(e), Error::EscapeError(e) => Some(e), _ => None, @@ -227,6 +238,7 @@ pub mod serialize { } impl From for DeError { + #[inline] fn from(e: Error) -> Self { Self::InvalidXml(e) } @@ -239,15 +251,17 @@ pub mod serialize { } } - impl From for DeError { - fn from(e: ParseIntError) -> Self { - Self::InvalidInt(e) + impl From for DeError { + #[inline] + fn from(e: Utf8Error) -> Self { + Self::InvalidXml(e.into()) } } - impl From for DeError { - fn from(e: ParseFloatError) -> Self { - Self::InvalidFloat(e) + impl From for DeError { + #[inline] + fn from(e: FromUtf8Error) -> Self { + Self::InvalidXml(e.into()) } } @@ -257,4 +271,18 @@ pub mod serialize { Self::InvalidXml(e.into()) } } + + impl From for DeError { + #[inline] + fn from(e: ParseIntError) -> Self { + Self::InvalidInt(e) + } + } + + impl From for DeError { + #[inline] + fn from(e: ParseFloatError) -> Self { + Self::InvalidFloat(e) + } + } } diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 1d8e2583..d6331bc7 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -108,122 +108,15 @@ impl<'a> Attribute<'a> { } /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - #[cfg(feature = "encoding")] fn do_unescape_and_decode_value( &self, reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> XmlResult { - let decoded = reader.decode(&*self.value); - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } - - #[cfg(not(feature = "encoding"))] - fn do_unescape_and_decode_value( - &self, - reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> XmlResult { - let decoded = reader.decode(&*self.value)?; - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } - - /// helper method to unescape then decode self using the reader encoding - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - #[cfg(feature = "encoding")] - pub fn unescape_and_decode_without_bom( - &self, - reader: &mut Reader, - ) -> XmlResult { - self.do_unescape_and_decode_without_bom(reader, None) - } + let decoded = reader.decoder().decode(&*self.value)?; - /// helper method to unescape then decode self using the reader encoding - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - #[cfg(not(feature = "encoding"))] - pub fn unescape_and_decode_without_bom( - &self, - reader: &Reader, - ) -> XmlResult { - self.do_unescape_and_decode_without_bom(reader, None) - } - - /// helper method to unescape then decode self using the reader encoding with custom entities - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - /// - /// # Pre-condition - /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - #[cfg(feature = "encoding")] - pub fn unescape_and_decode_without_bom_with_custom_entities( - &self, - reader: &mut Reader, - custom_entities: &HashMap, Vec>, - ) -> XmlResult { - self.do_unescape_and_decode_without_bom(reader, Some(custom_entities)) - } - - /// helper method to unescape then decode self using the reader encoding with custom entities - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - /// - /// # Pre-condition - /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - #[cfg(not(feature = "encoding"))] - pub fn unescape_and_decode_without_bom_with_custom_entities( - &self, - reader: &Reader, - custom_entities: &HashMap, Vec>, - ) -> XmlResult { - self.do_unescape_and_decode_without_bom(reader, Some(custom_entities)) - } - - #[cfg(feature = "encoding")] - fn do_unescape_and_decode_without_bom( - &self, - reader: &mut Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> XmlResult { - let decoded = reader.decode_without_bom(&*self.value); - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } - - #[cfg(not(feature = "encoding"))] - fn do_unescape_and_decode_without_bom( - &self, - reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> XmlResult { - let decoded = reader.decode_without_bom(&*self.value)?; - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) + let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; + Ok(String::from_utf8(unescaped.into_owned())?) } } diff --git a/src/events/mod.rs b/src/events/mod.rs index 227d55da..0d799e7a 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -38,17 +38,75 @@ pub mod attributes; #[cfg(feature = "encoding_rs")] use encoding_rs::Encoding; -use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Deref, str::from_utf8}; - +use std::borrow::Cow; +use std::collections::HashMap; +use std::fmt::{self, Debug, Formatter}; +use std::io::BufRead; +use std::ops::Deref; +use std::str::from_utf8; + +use crate::errors::{Error, Result}; use crate::escape::{do_unescape, escape, partial_escape}; use crate::name::{LocalName, QName}; +use crate::reader::{Decoder, Reader}; use crate::utils::write_cow_string; -use crate::{errors::Error, errors::Result, reader::Reader}; use attributes::{Attribute, Attributes}; #[cfg(feature = "serialize")] use crate::escape::EscapeError; +/// Text that appeared before an XML declaration, a start element or a comment. +/// +/// In well-formed XML it could contain a Byte-Order-Mark (BOM). If this event +/// contains something else except BOM, the XML should be considered ill-formed. +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct BytesStartText<'a> { + content: BytesText<'a>, +} + +impl<'a> BytesStartText<'a> { + /// Converts the event into an owned event. + pub fn into_owned(self) -> BytesStartText<'static> { + BytesStartText { + content: self.content.into_owned(), + } + } + + /// Extracts the inner `Cow` from the `BytesStartText` event container. + #[inline] + pub fn into_inner(self) -> Cow<'a, [u8]> { + self.content.into_inner() + } + + /// Decodes bytes of event, stripping byte order mark (BOM) if it is presented + /// in the event. + /// + /// This method does not unescapes content, because no escape sequences can + /// appeared in the BOM or in the text before the first tag. + pub fn decode_with_bom_removal(&self, decoder: Decoder) -> Result { + //TODO: Fix lifetime issue - it should be possible to borrow string + let decoded = decoder.decode_with_bom_removal(&*self)?; + + Ok(decoded.to_string()) + } +} + +impl<'a> Deref for BytesStartText<'a> { + type Target = BytesText<'a>; + + fn deref(&self) -> &Self::Target { + &self.content + } +} + +impl<'a> From> for BytesStartText<'a> { + fn from(content: BytesText<'a>) -> Self { + Self { content } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// Opening tag data (`Event::Start`), with optional attributes. /// /// ``. @@ -249,30 +307,16 @@ impl<'a> BytesStart<'a> { self.do_unescape_and_decode_with_custom_entities(reader, Some(custom_entities)) } - #[cfg(feature = "encoding")] #[inline] fn do_unescape_and_decode_with_custom_entities( &self, reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> Result { - let decoded = reader.decode(&*self); - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } + let decoded = reader.decoder().decode(&*self)?; - #[cfg(not(feature = "encoding"))] - #[inline] - fn do_unescape_and_decode_with_custom_entities( - &self, - reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> Result { - let decoded = reader.decode(&*self)?; - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) + let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; + Ok(String::from_utf8(unescaped.into_owned())?) } /// Edit the name of the BytesStart in-place @@ -368,14 +412,22 @@ impl<'a> BytesStart<'a> { } } -impl<'a> std::fmt::Debug for BytesStart<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl<'a> Debug for BytesStart<'a> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!(f, "BytesStart {{ buf: ")?; write_cow_string(f, &self.buf)?; write!(f, ", name_len: {} }}", self.name_len) } } +impl<'a> Deref for BytesStart<'a> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &*self.buf + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// An XML declaration (`Event::Decl`). @@ -451,7 +503,7 @@ impl<'a> BytesDecl<'a> { Some(Ok(a)) if a.key.as_ref() == b"version" => Ok(a.value), // first attribute was not "version" Some(Ok(a)) => { - let found = from_utf8(a.key.as_ref()).map_err(Error::Utf8)?.to_string(); + let found = from_utf8(a.key.as_ref())?.to_string(); Err(Error::XmlDeclWithoutVersion(Some(found))) } // error parsing attributes @@ -608,6 +660,14 @@ impl<'a> BytesDecl<'a> { } } +impl<'a> Deref for BytesDecl<'a> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &*self.element + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// A struct to manage `Event::End` events @@ -656,14 +716,22 @@ impl<'a> BytesEnd<'a> { } } -impl<'a> std::fmt::Debug for BytesEnd<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl<'a> Debug for BytesEnd<'a> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!(f, "BytesEnd {{ name: ")?; write_cow_string(f, &self.name)?; write!(f, " }}") } } +impl<'a> Deref for BytesEnd<'a> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &*self.name + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// Data from various events (most notably, `Event::Text`) that stored in XML @@ -771,100 +839,6 @@ impl<'a> BytesText<'a> { do_unescape(self, custom_entities).map_err(Error::EscapeError) } - /// helper method to unescape then decode self using the reader encoding - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - #[cfg(feature = "encoding")] - pub fn unescape_and_decode_without_bom( - &self, - reader: &mut Reader, - ) -> Result { - self.do_unescape_and_decode_without_bom(reader, None) - } - - /// helper method to unescape then decode self using the reader encoding - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - #[cfg(not(feature = "encoding"))] - pub fn unescape_and_decode_without_bom( - &self, - reader: &Reader, - ) -> Result { - self.do_unescape_and_decode_without_bom(reader, None) - } - - /// helper method to unescape then decode self using the reader encoding with custom entities - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - /// - /// # Pre-condition - /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - #[cfg(feature = "encoding")] - pub fn unescape_and_decode_without_bom_with_custom_entities( - &self, - reader: &mut Reader, - custom_entities: &HashMap, Vec>, - ) -> Result { - self.do_unescape_and_decode_without_bom(reader, Some(custom_entities)) - } - - /// helper method to unescape then decode self using the reader encoding with custom entities - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - /// - /// # Pre-condition - /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - #[cfg(not(feature = "encoding"))] - pub fn unescape_and_decode_without_bom_with_custom_entities( - &self, - reader: &Reader, - custom_entities: &HashMap, Vec>, - ) -> Result { - self.do_unescape_and_decode_without_bom(reader, Some(custom_entities)) - } - - #[cfg(feature = "encoding")] - fn do_unescape_and_decode_without_bom( - &self, - reader: &mut Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> Result { - let decoded = reader.decode_without_bom(&*self); - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } - - #[cfg(not(feature = "encoding"))] - fn do_unescape_and_decode_without_bom( - &self, - reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> Result { - let decoded = reader.decode_without_bom(&*self)?; - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } - /// helper method to unescape then decode self using the reader encoding /// /// for performance reasons (could avoid allocating a `String`), @@ -893,28 +867,15 @@ impl<'a> BytesText<'a> { self.do_unescape_and_decode_with_custom_entities(reader, Some(custom_entities)) } - #[cfg(feature = "encoding")] fn do_unescape_and_decode_with_custom_entities( &self, reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> Result { - let decoded = reader.decode(&*self); - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } + let decoded = reader.decoder().decode(&*self)?; - #[cfg(not(feature = "encoding"))] - fn do_unescape_and_decode_with_custom_entities( - &self, - reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> Result { - let decoded = reader.decode(&*self)?; - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) + let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; + Ok(String::from_utf8(unescaped.into_owned())?) } /// Gets escaped content. @@ -923,14 +884,28 @@ impl<'a> BytesText<'a> { } } -impl<'a> std::fmt::Debug for BytesText<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl<'a> Debug for BytesText<'a> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!(f, "BytesText {{ content: ")?; write_cow_string(f, &self.content)?; write!(f, " }}") } } +impl<'a> Deref for BytesText<'a> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &*self.content + } +} + +impl<'a> From> for BytesText<'a> { + fn from(content: BytesStartText<'a>) -> Self { + content.content + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// CDATA content contains unescaped data from the reader. If you want to write them as a text, @@ -1013,21 +988,8 @@ impl<'a> BytesCData<'a> { #[cfg(feature = "serialize")] pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result> { Ok(match &self.content { - Cow::Borrowed(bytes) => { - #[cfg(feature = "encoding")] - { - decoder.decode(bytes) - } - #[cfg(not(feature = "encoding"))] - { - decoder.decode(bytes)?.into() - } - } + Cow::Borrowed(bytes) => decoder.decode(bytes)?, Cow::Owned(bytes) => { - #[cfg(feature = "encoding")] - let decoded = decoder.decode(bytes).into_owned(); - - #[cfg(not(feature = "encoding"))] let decoded = decoder.decode(bytes)?.to_string(); decoded.into() @@ -1036,21 +998,75 @@ impl<'a> BytesCData<'a> { } } -impl<'a> std::fmt::Debug for BytesCData<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl<'a> Debug for BytesCData<'a> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!(f, "BytesCData {{ content: ")?; write_cow_string(f, &self.content)?; write!(f, " }}") } } +impl<'a> Deref for BytesCData<'a> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &*self.content + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// Event emitted by [`Reader::read_event`]. -/// -/// [`Reader::read_event`]: ../reader/struct.Reader.html#method.read_event #[derive(Clone, Debug, Eq, PartialEq)] pub enum Event<'a> { + /// Text that appeared before the first opening tag or an [XML declaration]. + /// [According to the XML standard][std], no text allowed before the XML + /// declaration. However, if there is a BOM in the stream, some data may be + /// present. + /// + /// When this event is generated, it is the very first event emitted by the + /// [`Reader`], and there can be the only one such event. + /// + /// The [`Writer`] writes content of this event "as is" without encoding or + /// escaping. If you write it, it should be written first and only one time + /// (but writer does not enforce that). + /// + /// # Examples + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use std::borrow::Cow; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// // XML in UTF-8 with BOM + /// let xml = b"\xEF\xBB\xBF"; + /// let mut reader = Reader::from_bytes(xml); + /// let mut events_processed = 0; + /// loop { + /// match reader.read_event_unbuffered() { + /// Ok(Event::StartText(e)) => { + /// assert_eq!(events_processed, 0); + /// // Content contains BOM + /// assert_eq!(e.into_inner(), Cow::Borrowed(b"\xEF\xBB\xBF")); + /// } + /// Ok(Event::Decl(_)) => { + /// assert_eq!(events_processed, 1); + /// } + /// Ok(Event::Eof) => { + /// assert_eq!(events_processed, 2); + /// break; + /// } + /// e => panic!("Unexpected event {:?}", e), + /// } + /// events_processed += 1; + /// } + /// ``` + /// + /// [XML declaration]: Event::Decl + /// [std]: https://www.w3.org/TR/xml11/#NT-document + /// [`Writer`]: crate::writer::Writer + StartText(BytesStartText<'a>), /// Start tag (with attributes) ``. Start(BytesStart<'a>), /// End tag ``. @@ -1078,6 +1094,7 @@ impl<'a> Event<'a> { /// buffer used when reading but incurring a new, separate allocation. pub fn into_owned(self) -> Event<'static> { match self { + Event::StartText(e) => Event::StartText(e.into_owned()), Event::Start(e) => Event::Start(e.into_owned()), Event::End(e) => Event::End(e.into_owned()), Event::Empty(e) => Event::Empty(e.into_owned()), @@ -1092,48 +1109,12 @@ impl<'a> Event<'a> { } } -//////////////////////////////////////////////////////////////////////////////////////////////////// - -impl<'a> Deref for BytesStart<'a> { - type Target = [u8]; - fn deref(&self) -> &[u8] { - &*self.buf - } -} - -impl<'a> Deref for BytesDecl<'a> { - type Target = [u8]; - fn deref(&self) -> &[u8] { - &*self.element - } -} - -impl<'a> Deref for BytesEnd<'a> { - type Target = [u8]; - fn deref(&self) -> &[u8] { - &*self.name - } -} - -impl<'a> Deref for BytesText<'a> { - type Target = [u8]; - fn deref(&self) -> &[u8] { - &*self.content - } -} - -impl<'a> Deref for BytesCData<'a> { - type Target = [u8]; - - fn deref(&self) -> &[u8] { - &*self.content - } -} - impl<'a> Deref for Event<'a> { type Target = [u8]; + fn deref(&self) -> &[u8] { match *self { + Event::StartText(ref e) => &*e, Event::Start(ref e) | Event::Empty(ref e) => &*e, Event::End(ref e) => &*e, Event::Text(ref e) => &*e, diff --git a/src/lib.rs b/src/lib.rs index 122520f0..378488bb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -156,5 +156,5 @@ mod writer; #[cfg(feature = "serialize")] pub use crate::errors::serialize::DeError; pub use crate::errors::{Error, Result}; -pub use crate::reader::Reader; +pub use crate::reader::{Decoder, Reader}; pub use crate::writer::{ElementWriter, Writer}; diff --git a/src/reader.rs b/src/reader.rs index 10f97439..4c556a8b 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1,12 +1,11 @@ //! A module to handle `Reader` -#[cfg(feature = "encoding")] use std::borrow::Cow; use std::io::{self, BufRead, BufReader}; use std::{fs::File, path::Path, str::from_utf8}; #[cfg(feature = "encoding")] -use encoding_rs::{Encoding, UTF_16BE, UTF_16LE}; +use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8}; use crate::errors::{Error, Result}; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; @@ -14,12 +13,46 @@ use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult}; use memchr; +/// Possible reader states. The state transition diagram (`true` and `false` shows +/// value of [`Reader::expand_empty_elements()`] option): +/// +/// ```mermaid +/// flowchart LR +/// subgraph _ +/// direction LR +/// +/// Init -- "(no event)"\nStartText --> Opened +/// Opened -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> Closed +/// Closed -- "#lt;false#gt;\n(no event)"\nText --> Opened +/// end +/// Closed -- "#lt;true#gt;"\nStart --> Empty +/// Empty -- End --> Closed +/// _ -. Eof .-> Exit +/// ``` #[derive(Clone)] enum TagState { + /// Initial state in which reader stay after creation. Transition from that + /// state could produce a `StartText`, `Decl`, `Comment` or `Start` event. + /// The next state is always `Opened`. The reader will never return to this + /// state. The event emitted during transition to `Opened` is a `StartEvent` + /// if the first symbol not `<`, otherwise no event are emitted. + Init, + /// State after seeing the `<` symbol. Depending on the next symbol all other + /// events (except `StartText`) could be generated. + /// + /// After generating ane event the reader moves to the `Closed` state. Opened, + /// State in which reader searches the `<` symbol of a markup. All bytes before + /// that symbol will be returned in the [`Event::Text`] event. After that + /// the reader moves to the `Opened` state. Closed, + /// This state is used only if option `expand_empty_elements` is set to `true`. + /// Reader enters to this state when it is in a `Closed` state and emits an + /// [`Event::Start`] event. The next event emitted will be an [`Event::End`], + /// after which reader returned to the `Closed` state. Empty, - /// Either Eof or Errored + /// Reader enters this state when `Eof` event generated or an error occurred. + /// This is the last state, the reader stay in it forever. Exit, } @@ -118,14 +151,15 @@ pub struct Reader { is_encoding_set: bool, } +/// Builder methods impl Reader { /// Creates a `Reader` that reads from a reader implementing `BufRead`. - pub fn from_reader(reader: R) -> Reader { - Reader { + pub fn from_reader(reader: R) -> Self { + Self { reader, opened_buffer: Vec::new(), opened_starts: Vec::new(), - tag_state: TagState::Closed, + tag_state: TagState::Init, expand_empty_elements: false, trim_text_start: false, trim_text_end: false, @@ -159,7 +193,7 @@ impl Reader { /// [`Empty`]: events/enum.Event.html#variant.Empty /// [`Start`]: events/enum.Event.html#variant.Start /// [`End`]: events/enum.Event.html#variant.End - pub fn expand_empty_elements(&mut self, val: bool) -> &mut Reader { + pub fn expand_empty_elements(&mut self, val: bool) -> &mut Self { self.expand_empty_elements = val; self } @@ -172,7 +206,7 @@ impl Reader { /// (`false` by default) /// /// [`Text`]: events/enum.Event.html#variant.Text - pub fn trim_text(&mut self, val: bool) -> &mut Reader { + pub fn trim_text(&mut self, val: bool) -> &mut Self { self.trim_text_start = val; self.trim_text_end = val; self @@ -185,7 +219,7 @@ impl Reader { /// (`false` by default) /// /// [`Text`]: events/enum.Event.html#variant.Text - pub fn trim_text_end(&mut self, val: bool) -> &mut Reader { + pub fn trim_text_end(&mut self, val: bool) -> &mut Self { self.trim_text_end = val; self } @@ -201,7 +235,7 @@ impl Reader { /// (`true` by default) /// /// [`End`]: events/enum.Event.html#variant.End - pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Reader { + pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Self { self.trim_markup_names_in_closing_tags = val; self } @@ -223,7 +257,7 @@ impl Reader { /// (`true` by default) /// /// [`End`]: events/enum.Event.html#variant.End - pub fn check_end_names(&mut self, val: bool) -> &mut Reader { + pub fn check_end_names(&mut self, val: bool) -> &mut Self { self.check_end_names = val; self } @@ -238,10 +272,80 @@ impl Reader { /// (`false` by default) /// /// [`Comment`]: events/enum.Event.html#variant.Comment - pub fn check_comments(&mut self, val: bool) -> &mut Reader { + pub fn check_comments(&mut self, val: bool) -> &mut Self { self.check_comments = val; self } +} + +/// Getters +impl Reader { + /// Consumes `Reader` returning the underlying reader + /// + /// Can be used to compute line and column of a parsing error position + /// + /// # Examples + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use std::{str, io::Cursor}; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// let xml = r#" + /// Test + /// Test 2 + /// "#; + /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes())); + /// let mut buf = Vec::new(); + /// + /// fn into_line_and_column(reader: Reader>) -> (usize, usize) { + /// let end_pos = reader.buffer_position(); + /// let mut cursor = reader.into_inner(); + /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned()) + /// .expect("can't make a string"); + /// let mut line = 1; + /// let mut column = 0; + /// for c in s.chars() { + /// if c == '\n' { + /// line += 1; + /// column = 0; + /// } else { + /// column += 1; + /// } + /// } + /// (line, column) + /// } + /// + /// loop { + /// match reader.read_event(&mut buf) { + /// Ok(Event::Start(ref e)) => match e.name().as_ref() { + /// b"tag1" | b"tag2" => (), + /// tag => { + /// assert_eq!(b"tag3", tag); + /// assert_eq!((3, 22), into_line_and_column(reader)); + /// break; + /// } + /// }, + /// Ok(Event::Eof) => unreachable!(), + /// _ => (), + /// } + /// buf.clear(); + /// } + /// ``` + pub fn into_inner(self) -> R { + self.reader + } + + /// Gets a reference to the underlying reader. + pub fn get_ref(&self) -> &R { + &self.reader + } + + /// Gets a mutable reference to the underlying reader. + pub fn get_mut(&mut self) -> &mut R { + &mut self.reader + } /// Gets the current byte position in the input data. /// @@ -256,221 +360,65 @@ impl Reader { } } - /// private function to read until '<' is found - /// return a `Text` event - fn read_until_open<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - self.tag_state = TagState::Opened; - - if self.trim_text_start { - self.reader.skip_whitespace(&mut self.buf_position)?; - } - - // If we already at the `<` symbol, do not try to return an empty Text event - if self.reader.skip_one(b'<', &mut self.buf_position)? { - return self.read_event_buffered(buf); - } - - match self - .reader - .read_bytes_until(b'<', buf, &mut self.buf_position) - { - Ok(Some(bytes)) if self.trim_text_end => { - // Skip the ending '< - let len = bytes - .iter() - .rposition(|&b| !is_whitespace(b)) - .map_or_else(|| bytes.len(), |p| p + 1); - Ok(Event::Text(BytesText::from_escaped(&bytes[..len]))) - } - Ok(Some(bytes)) => Ok(Event::Text(BytesText::from_escaped(bytes))), - Ok(None) => Ok(Event::Eof), - Err(e) => Err(e), - } - } - - /// Private function to read until `>` is found. This function expects that - /// it was called just after encounter a `<` symbol. - fn read_until_close<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - self.tag_state = TagState::Closed; - - match self.reader.peek_one() { - // ` match self.reader.read_bang_element(buf, &mut self.buf_position) { - Ok(None) => Ok(Event::Eof), - Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), - Err(e) => Err(e), - }, - // ` match self - .reader - .read_bytes_until(b'>', buf, &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_end(bytes), - Err(e) => Err(e), - }, - // ` match self - .reader - .read_bytes_until(b'>', buf, &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_question_mark(bytes), - Err(e) => Err(e), - }, - // `<...` - opening or self-closed tag - Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position) { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_start(bytes), - Err(e) => Err(e), - }, - Ok(None) => Ok(Event::Eof), - Err(e) => Err(e), - } + /// Resolves a potentially qualified **event name** into (namespace name, local name). + /// + /// *Qualified* attribute names have the form `prefix:local-name` where the`prefix` is defined + /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix + /// can be defined on the same element as the attribute in question. + /// + /// *Unqualified* event inherits the current *default namespace*. + /// + /// # Lifetimes + /// + /// - `'n`: lifetime of an element name + /// - `'ns`: lifetime of a namespaces buffer, where all found namespaces are stored + #[inline] + pub fn event_namespace<'n, 'ns>( + &self, + name: QName<'n>, + namespace_buffer: &'ns [u8], + ) -> (ResolveResult<'ns>, LocalName<'n>) { + self.ns_resolver.resolve(name, namespace_buffer, true) } - /// reads `BytesElement` starting with a `/`, - /// if `self.check_end_names`, checks that element matches last opened element - /// return `End` event - fn read_end<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result> { - // XML standard permits whitespaces after the markup name in closing tags. - // Let's strip them from the buffer before comparing tag names. - let name = if self.trim_markup_names_in_closing_tags { - if let Some(pos_end_name) = buf[1..].iter().rposition(|&b| !b.is_ascii_whitespace()) { - let (name, _) = buf[1..].split_at(pos_end_name + 1); - name - } else { - &buf[1..] - } - } else { - &buf[1..] - }; - if self.check_end_names { - let mismatch_err = |expected: &[u8], found: &[u8], buf_position: &mut usize| { - *buf_position -= buf.len(); - Err(Error::EndEventMismatch { - expected: from_utf8(expected).unwrap_or("").to_owned(), - found: from_utf8(found).unwrap_or("").to_owned(), - }) - }; - match self.opened_starts.pop() { - Some(start) => { - let expected = &self.opened_buffer[start..]; - if name != expected { - mismatch_err(expected, name, &mut self.buf_position) - } else { - self.opened_buffer.truncate(start); - Ok(Event::End(BytesEnd::borrowed(name))) - } - } - None => mismatch_err(b"", &buf[1..], &mut self.buf_position), - } - } else { - Ok(Event::End(BytesEnd::borrowed(name))) - } + /// Resolves a potentially qualified **attribute name** into (namespace name, local name). + /// + /// *Qualified* attribute names have the form `prefix:local-name` where the`prefix` is defined + /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix + /// can be defined on the same element as the attribute in question. + /// + /// *Unqualified* attribute names do *not* inherit the current *default namespace*. + /// + /// # Lifetimes + /// + /// - `'n`: lifetime of an attribute + /// - `'ns`: lifetime of a namespaces buffer, where all found namespaces are stored + #[inline] + pub fn attribute_namespace<'n, 'ns>( + &self, + name: QName<'n>, + namespace_buffer: &'ns [u8], + ) -> (ResolveResult<'ns>, LocalName<'n>) { + self.ns_resolver.resolve(name, namespace_buffer, false) } - /// reads `BytesElement` starting with a `!`, - /// return `Comment`, `CData` or `DocType` event - fn read_bang<'a, 'b>(&'a mut self, bang_type: BangType, buf: &'b [u8]) -> Result> { - let uncased_starts_with = |string: &[u8], prefix: &[u8]| { - string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) - }; - - let len = buf.len(); - match bang_type { - BangType::Comment if buf.starts_with(b"!--") => { - if self.check_comments { - // search if '--' not in comments - if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2]) - .position(|p| buf[3 + p + 1] == b'-') - { - self.buf_position += len - p; - return Err(Error::UnexpectedToken("--".to_string())); - } - } - Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2]))) - } - BangType::CData if uncased_starts_with(buf, b"![CDATA[") => { - Ok(Event::CData(BytesCData::new(&buf[8..]))) - } - BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { - let start = buf[8..] - .iter() - .position(|b| !is_whitespace(*b)) - .unwrap_or_else(|| len - 8); - debug_assert!(start < len - 8, "DocType must have a name"); - Ok(Event::DocType(BytesText::from_escaped(&buf[8 + start..]))) - } - _ => Err(bang_type.to_err()), - } - } - - /// reads `BytesElement` starting with a `?`, - /// return `Decl` or `PI` event - fn read_question_mark<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result> { - let len = buf.len(); - if len > 2 && buf[len - 1] == b'?' { - if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) { - let event = BytesDecl::from_start(BytesStart::borrowed(&buf[1..len - 1], 3)); - - // Try getting encoding from the declaration event - #[cfg(feature = "encoding")] - if let Some(enc) = event.encoder() { - self.encoding = enc; - self.is_encoding_set = true; - } - - Ok(Event::Decl(event)) - } else { - Ok(Event::PI(BytesText::from_escaped(&buf[1..len - 1]))) - } - } else { - self.buf_position -= len; - Err(Error::UnexpectedEof("XmlDecl".to_string())) - } - } - - #[inline] - fn close_expanded_empty(&mut self) -> Result> { - self.tag_state = TagState::Closed; - let name = self - .opened_buffer - .split_off(self.opened_starts.pop().unwrap()); - Ok(Event::End(BytesEnd::owned(name))) - } - - /// reads `BytesElement` starting with any character except `/`, `!` or ``?` - /// return `Start` or `Empty` event - fn read_start<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result> { - // TODO: do this directly when reading bufreader ... - let len = buf.len(); - let name_end = buf.iter().position(|&b| is_whitespace(b)).unwrap_or(len); - if let Some(&b'/') = buf.last() { - let end = if name_end < len { name_end } else { len - 1 }; - if self.expand_empty_elements { - self.tag_state = TagState::Empty; - self.opened_starts.push(self.opened_buffer.len()); - self.opened_buffer.extend(&buf[..end]); - Ok(Event::Start(BytesStart::borrowed(&buf[..len - 1], end))) - } else { - Ok(Event::Empty(BytesStart::borrowed(&buf[..len - 1], end))) - } - } else { - if self.check_end_names { - self.opened_starts.push(self.opened_buffer.len()); - self.opened_buffer.extend(&buf[..name_end]); - } - Ok(Event::Start(BytesStart::borrowed(buf, name_end))) + /// Get the decoder, used to decode bytes, read by this reader, to the strings. + /// + /// If `encoding` feature is enabled, the used encoding may change after + /// parsing the XML declaration, otherwise encoding is fixed to UTF-8. + /// + /// If `encoding` feature is enabled and no encoding is specified in declaration, + /// defaults to UTF-8. + pub fn decoder(&self) -> Decoder { + Decoder { + #[cfg(feature = "encoding")] + encoding: self.encoding, } } +} +/// Read methods +impl Reader { /// Reads the next `Event`. /// /// This is the main entry point for reading XML `Event`s. @@ -515,72 +463,10 @@ impl Reader { /// println!("Text events: {:?}", txt); /// ``` #[inline] - pub fn read_event<'a, 'b>(&'a mut self, buf: &'b mut Vec) -> Result> { + pub fn read_event<'b>(&mut self, buf: &'b mut Vec) -> Result> { self.read_event_buffered(buf) } - /// Read text into the given buffer, and return an event that borrows from - /// either that buffer or from the input itself, based on the type of the - /// reader. - fn read_event_buffered<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - let event = match self.tag_state { - TagState::Opened => self.read_until_close(buf), - TagState::Closed => self.read_until_open(buf), - TagState::Empty => self.close_expanded_empty(), - TagState::Exit => return Ok(Event::Eof), - }; - match event { - Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, - _ => {} - } - event - } - - /// Resolves a potentially qualified **event name** into (namespace name, local name). - /// - /// *Qualified* attribute names have the form `prefix:local-name` where the`prefix` is defined - /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix - /// can be defined on the same element as the attribute in question. - /// - /// *Unqualified* event inherits the current *default namespace*. - /// - /// # Lifetimes - /// - /// - `'n`: lifetime of an element name - /// - `'ns`: lifetime of a namespaces buffer, where all found namespaces are stored - #[inline] - pub fn event_namespace<'n, 'ns>( - &self, - name: QName<'n>, - namespace_buffer: &'ns [u8], - ) -> (ResolveResult<'ns>, LocalName<'n>) { - self.ns_resolver.resolve(name, namespace_buffer, true) - } - - /// Resolves a potentially qualified **attribute name** into (namespace name, local name). - /// - /// *Qualified* attribute names have the form `prefix:local-name` where the`prefix` is defined - /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix - /// can be defined on the same element as the attribute in question. - /// - /// *Unqualified* attribute names do *not* inherit the current *default namespace*. - /// - /// # Lifetimes - /// - /// - `'n`: lifetime of an attribute - /// - `'ns`: lifetime of a namespaces buffer, where all found namespaces are stored - #[inline] - pub fn attribute_namespace<'n, 'ns>( - &self, - name: QName<'n>, - namespace_buffer: &'ns [u8], - ) -> (ResolveResult<'ns>, LocalName<'n>) { - self.ns_resolver.resolve(name, namespace_buffer, false) - } - /// Reads the next event and resolves its namespace (if applicable). /// /// # Examples @@ -677,261 +563,354 @@ impl Reader { } } - /// Returns the `Reader`s encoding. - /// - /// The used encoding may change after parsing the XML declaration. - /// - /// This encoding will be used by [`decode`]. + /// Reads until end element is found /// - /// [`decode`]: #method.decode - #[cfg(feature = "encoding")] - pub fn encoding(&self) -> &'static Encoding { - self.encoding + /// Manages nested cases where parent and child elements have the same name + pub fn read_to_end>(&mut self, end: K, buf: &mut Vec) -> Result<()> { + let mut depth = 0; + let end = end.as_ref(); + loop { + match self.read_event(buf) { + Ok(Event::End(ref e)) if e.name().as_ref() == end => { + if depth == 0 { + return Ok(()); + } + depth -= 1; + } + Ok(Event::Start(ref e)) if e.name().as_ref() == end => depth += 1, + Err(e) => return Err(e), + Ok(Event::Eof) => { + return Err(Error::UnexpectedEof(format!("", from_utf8(end)))); + } + _ => (), + } + buf.clear(); + } } - /// Decodes a slice using the encoding specified in the XML declaration. + /// Reads optional text between start and end tags. /// - /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. + /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a + /// `String`. If the next event is an [`End`] event, returns the empty string. In all other + /// cases, returns an error. /// - /// If no encoding is specified, defaults to UTF-8. - #[inline] - #[cfg(feature = "encoding")] - pub fn decode<'b, 'c>(&'b self, bytes: &'c [u8]) -> Cow<'c, str> { - self.encoding.decode(bytes).0 - } - - /// Decodes a UTF8 slice without BOM (Byte order mark) regardless of XML declaration. + /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 + /// if none is specified). /// - /// Decode `bytes` without BOM and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. + /// # Examples /// - /// # Note + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; /// - /// If you instead want to use XML declared encoding, use the `encoding` feature - #[inline] - #[cfg(not(feature = "encoding"))] - pub fn decode_without_bom<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { - if bytes.starts_with(b"\xEF\xBB\xBF") { - from_utf8(&bytes[3..]).map_err(Error::Utf8) - } else { - from_utf8(bytes).map_err(Error::Utf8) - } - } - - /// Decodes a slice using without BOM (Byte order mark) the encoding specified in the XML declaration. + /// let mut xml = Reader::from_reader(b" + /// <b> + /// + /// " as &[u8]); + /// xml.trim_text(true); /// - /// Decode `bytes` without BOM and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. + /// let expected = ["", ""]; + /// for &content in expected.iter() { + /// match xml.read_event(&mut Vec::new()) { + /// Ok(Event::Start(ref e)) => { + /// assert_eq!(&xml.read_text(e.name(), &mut Vec::new()).unwrap(), content); + /// }, + /// e => panic!("Expecting Start event, found {:?}", e), + /// } + /// } + /// ``` /// - /// If no encoding is specified, defaults to UTF-8. - #[inline] - #[cfg(feature = "encoding")] - pub fn decode_without_bom<'b, 'c>(&'b mut self, mut bytes: &'c [u8]) -> Cow<'c, str> { - if self.is_encoding_set { - return self.encoding.decode_with_bom_removal(bytes).0; - } - if bytes.starts_with(b"\xEF\xBB\xBF") { - self.is_encoding_set = true; - bytes = &bytes[3..]; - } else if bytes.starts_with(b"\xFF\xFE") { - self.is_encoding_set = true; - self.encoding = UTF_16LE; - bytes = &bytes[2..]; - } else if bytes.starts_with(b"\xFE\xFF") { - self.is_encoding_set = true; - self.encoding = UTF_16BE; - bytes = &bytes[3..]; + /// [`Text`]: events/enum.Event.html#variant.Text + /// [`End`]: events/enum.Event.html#variant.End + pub fn read_text>(&mut self, end: K, buf: &mut Vec) -> Result { + let s = match self.read_event(buf) { + Ok(Event::Text(e)) => e.unescape_and_decode(self), + Ok(Event::End(ref e)) if e.name().as_ref() == end.as_ref() => return Ok("".to_string()), + Err(e) => return Err(e), + Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), + _ => return Err(Error::TextNotFound), }; - self.encoding.decode_without_bom_handling(bytes).0 + self.read_to_end(end, buf)?; + s } +} - /// Decodes a UTF8 slice regardless of XML declaration. - /// - /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. - /// - /// # Note +/// Private methods +impl Reader { + /// Read text into the given buffer, and return an event that borrows from + /// either that buffer or from the input itself, based on the type of the + /// reader. + fn read_event_buffered<'i, B>(&mut self, buf: B) -> Result> + where + R: XmlSource<'i, B>, + { + let event = match self.tag_state { + TagState::Init => self.read_until_open(buf, true), + TagState::Closed => self.read_until_open(buf, false), + TagState::Opened => self.read_until_close(buf), + TagState::Empty => self.close_expanded_empty(), + TagState::Exit => return Ok(Event::Eof), + }; + match event { + Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, + _ => {} + } + event + } + + /// Read until '<' is found and moves reader to an `Opened` state. /// - /// If you instead want to use XML declared encoding, use the `encoding` feature - #[inline] - #[cfg(not(feature = "encoding"))] - pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { - from_utf8(bytes).map_err(Error::Utf8) + /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise + fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result> + where + R: XmlSource<'i, B>, + { + self.tag_state = TagState::Opened; + + if self.trim_text_start { + self.reader.skip_whitespace(&mut self.buf_position)?; + } + + // If we already at the `<` symbol, do not try to return an empty Text event + if self.reader.skip_one(b'<', &mut self.buf_position)? { + return self.read_event_buffered(buf); + } + + match self + .reader + .read_bytes_until(b'<', buf, &mut self.buf_position) + { + Ok(Some(bytes)) => { + #[cfg(feature = "encoding")] + if first { + if let Some(encoding) = detect_encoding(bytes) { + self.encoding = encoding; + self.is_encoding_set = true; + } + } + + let content = if self.trim_text_end { + // Skip the ending '< + let len = bytes + .iter() + .rposition(|&b| !is_whitespace(b)) + .map_or_else(|| bytes.len(), |p| p + 1); + &bytes[..len] + } else { + bytes + }; + + Ok(if first { + Event::StartText(BytesText::from_escaped(content).into()) + } else { + Event::Text(BytesText::from_escaped(content)) + }) + } + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } } - /// Get utf8 decoder - #[cfg(feature = "encoding")] - pub fn decoder(&self) -> Decoder { - Decoder { - encoding: self.encoding, + /// Private function to read until `>` is found. This function expects that + /// it was called just after encounter a `<` symbol. + fn read_until_close<'i, B>(&mut self, buf: B) -> Result> + where + R: XmlSource<'i, B>, + { + self.tag_state = TagState::Closed; + + match self.reader.peek_one() { + // ` match self.reader.read_bang_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), + Err(e) => Err(e), + }, + // ` match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_end(bytes), + Err(e) => Err(e), + }, + // ` match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_question_mark(bytes), + Err(e) => Err(e), + }, + // `<...` - opening or self-closed tag + Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_start(bytes), + Err(e) => Err(e), + }, + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), } } - /// Get utf8 decoder - #[cfg(not(feature = "encoding"))] - pub fn decoder(&self) -> Decoder { - Decoder + /// reads `BytesElement` starting with a `!`, + /// return `Comment`, `CData` or `DocType` event + fn read_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result> { + let uncased_starts_with = |string: &[u8], prefix: &[u8]| { + string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) + }; + + let len = buf.len(); + match bang_type { + BangType::Comment if buf.starts_with(b"!--") => { + if self.check_comments { + // search if '--' not in comments + if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2]) + .position(|p| buf[3 + p + 1] == b'-') + { + self.buf_position += len - p; + return Err(Error::UnexpectedToken("--".to_string())); + } + } + Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2]))) + } + BangType::CData if uncased_starts_with(buf, b"![CDATA[") => { + Ok(Event::CData(BytesCData::new(&buf[8..]))) + } + BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { + let start = buf[8..] + .iter() + .position(|b| !is_whitespace(*b)) + .unwrap_or_else(|| len - 8); + debug_assert!(start < len - 8, "DocType must have a name"); + Ok(Event::DocType(BytesText::from_escaped(&buf[8 + start..]))) + } + _ => Err(bang_type.to_err()), + } } - /// Reads until end element is found - /// - /// Manages nested cases where parent and child elements have the same name - pub fn read_to_end>(&mut self, end: K, buf: &mut Vec) -> Result<()> { - let mut depth = 0; - let end = end.as_ref(); - loop { - match self.read_event(buf) { - Ok(Event::End(ref e)) if e.name().as_ref() == end => { - if depth == 0 { - return Ok(()); + /// reads `BytesElement` starting with a `/`, + /// if `self.check_end_names`, checks that element matches last opened element + /// return `End` event + fn read_end<'b>(&mut self, buf: &'b [u8]) -> Result> { + // XML standard permits whitespaces after the markup name in closing tags. + // Let's strip them from the buffer before comparing tag names. + let name = if self.trim_markup_names_in_closing_tags { + if let Some(pos_end_name) = buf[1..].iter().rposition(|&b| !b.is_ascii_whitespace()) { + let (name, _) = buf[1..].split_at(pos_end_name + 1); + name + } else { + &buf[1..] + } + } else { + &buf[1..] + }; + if self.check_end_names { + let mismatch_err = |expected: &[u8], found: &[u8], buf_position: &mut usize| { + *buf_position -= buf.len(); + Err(Error::EndEventMismatch { + expected: from_utf8(expected).unwrap_or("").to_owned(), + found: from_utf8(found).unwrap_or("").to_owned(), + }) + }; + match self.opened_starts.pop() { + Some(start) => { + let expected = &self.opened_buffer[start..]; + if name != expected { + mismatch_err(expected, name, &mut self.buf_position) + } else { + self.opened_buffer.truncate(start); + Ok(Event::End(BytesEnd::borrowed(name))) } - depth -= 1; - } - Ok(Event::Start(ref e)) if e.name().as_ref() == end => depth += 1, - Err(e) => return Err(e), - Ok(Event::Eof) => { - return Err(Error::UnexpectedEof(format!("", from_utf8(end)))); } - _ => (), + None => mismatch_err(b"", &buf[1..], &mut self.buf_position), } - buf.clear(); + } else { + Ok(Event::End(BytesEnd::borrowed(name))) } } - /// Reads optional text between start and end tags. - /// - /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a - /// `String`. If the next event is an [`End`] event, returns the empty string. In all other - /// cases, returns an error. - /// - /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 - /// if none is specified). - /// - /// # Examples - /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use quick_xml::Reader; - /// use quick_xml::events::Event; - /// - /// let mut xml = Reader::from_reader(b" - /// <b> - /// - /// " as &[u8]); - /// xml.trim_text(true); - /// - /// let expected = ["", ""]; - /// for &content in expected.iter() { - /// match xml.read_event(&mut Vec::new()) { - /// Ok(Event::Start(ref e)) => { - /// assert_eq!(&xml.read_text(e.name(), &mut Vec::new()).unwrap(), content); - /// }, - /// e => panic!("Expecting Start event, found {:?}", e), - /// } - /// } - /// ``` - /// - /// [`Text`]: events/enum.Event.html#variant.Text - /// [`End`]: events/enum.Event.html#variant.End - pub fn read_text>(&mut self, end: K, buf: &mut Vec) -> Result { - let s = match self.read_event(buf) { - Ok(Event::Text(e)) => e.unescape_and_decode(self), - Ok(Event::End(ref e)) if e.name().as_ref() == end.as_ref() => return Ok("".to_string()), - Err(e) => return Err(e), - Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), - _ => return Err(Error::TextNotFound), - }; - self.read_to_end(end, buf)?; - s - } + /// reads `BytesElement` starting with a `?`, + /// return `Decl` or `PI` event + fn read_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result> { + let len = buf.len(); + if len > 2 && buf[len - 1] == b'?' { + if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) { + let event = BytesDecl::from_start(BytesStart::borrowed(&buf[1..len - 1], 3)); - /// Consumes `Reader` returning the underlying reader - /// - /// Can be used to compute line and column of a parsing error position - /// - /// # Examples - /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use std::{str, io::Cursor}; - /// use quick_xml::Reader; - /// use quick_xml::events::Event; - /// - /// let xml = r#" - /// Test - /// Test 2 - /// "#; - /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes())); - /// let mut buf = Vec::new(); - /// - /// fn into_line_and_column(reader: Reader>) -> (usize, usize) { - /// let end_pos = reader.buffer_position(); - /// let mut cursor = reader.into_inner(); - /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned()) - /// .expect("can't make a string"); - /// let mut line = 1; - /// let mut column = 0; - /// for c in s.chars() { - /// if c == '\n' { - /// line += 1; - /// column = 0; - /// } else { - /// column += 1; - /// } - /// } - /// (line, column) - /// } - /// - /// loop { - /// match reader.read_event(&mut buf) { - /// Ok(Event::Start(ref e)) => match e.name().as_ref() { - /// b"tag1" | b"tag2" => (), - /// tag => { - /// assert_eq!(b"tag3", tag); - /// assert_eq!((3, 22), into_line_and_column(reader)); - /// break; - /// } - /// }, - /// Ok(Event::Eof) => unreachable!(), - /// _ => (), - /// } - /// buf.clear(); - /// } - /// ``` - pub fn into_inner(self) -> R { - self.reader + // Try getting encoding from the declaration event + #[cfg(feature = "encoding")] + if let Some(enc) = event.encoder() { + self.encoding = enc; + self.is_encoding_set = true; + } + + Ok(Event::Decl(event)) + } else { + Ok(Event::PI(BytesText::from_escaped(&buf[1..len - 1]))) + } + } else { + self.buf_position -= len; + Err(Error::UnexpectedEof("XmlDecl".to_string())) + } } - /// Gets a reference to the underlying reader. - pub fn get_ref(&self) -> &R { - &self.reader + #[inline] + fn close_expanded_empty(&mut self) -> Result> { + self.tag_state = TagState::Closed; + let name = self + .opened_buffer + .split_off(self.opened_starts.pop().unwrap()); + Ok(Event::End(BytesEnd::owned(name))) } - /// Gets a mutable reference to the underlying reader. - pub fn get_mut(&mut self) -> &mut R { - &mut self.reader + /// reads `BytesElement` starting with any character except `/`, `!` or ``?` + /// return `Start` or `Empty` event + fn read_start<'b>(&mut self, buf: &'b [u8]) -> Result> { + // TODO: do this directly when reading bufreader ... + let len = buf.len(); + let name_end = buf.iter().position(|&b| is_whitespace(b)).unwrap_or(len); + if let Some(&b'/') = buf.last() { + let end = if name_end < len { name_end } else { len - 1 }; + if self.expand_empty_elements { + self.tag_state = TagState::Empty; + self.opened_starts.push(self.opened_buffer.len()); + self.opened_buffer.extend(&buf[..end]); + Ok(Event::Start(BytesStart::borrowed(&buf[..len - 1], end))) + } else { + Ok(Event::Empty(BytesStart::borrowed(&buf[..len - 1], end))) + } + } else { + if self.check_end_names { + self.opened_starts.push(self.opened_buffer.len()); + self.opened_buffer.extend(&buf[..name_end]); + } + Ok(Event::Start(BytesStart::borrowed(buf, name_end))) + } } } impl Reader> { /// Creates an XML reader from a file path. - pub fn from_file>(path: P) -> Result>> { + pub fn from_file>(path: P) -> Result { let file = File::open(path).map_err(Error::Io)?; let reader = BufReader::new(file); - Ok(Reader::from_reader(reader)) + Ok(Self::from_reader(reader)) } } impl<'a> Reader<&'a [u8]> { /// Creates an XML reader from a string slice. - pub fn from_str(s: &'a str) -> Reader<&'a [u8]> { - Reader::from_reader(s.as_bytes()) + pub fn from_str(s: &'a str) -> Self { + Self::from_reader(s.as_bytes()) } /// Creates an XML reader from a slice of bytes. - pub fn from_bytes(s: &'a [u8]) -> Reader<&'a [u8]> { - Reader::from_reader(s) + pub fn from_bytes(s: &'a [u8]) -> Self { + Self::from_reader(s) } /// Read an event that borrows from the input rather than a buffer. @@ -1463,52 +1442,174 @@ pub(crate) fn is_whitespace(b: u8) -> bool { } } -/// Utf8 Decoder -#[cfg(not(feature = "encoding"))] -#[derive(Clone, Copy, Debug)] -pub struct Decoder; +//////////////////////////////////////////////////////////////////////////////////////////////////// -/// Utf8 Decoder -#[cfg(feature = "encoding")] +/// Decoder of byte slices to the strings. This is lightweight object that can be copied. +/// +/// If feature `encoding` is enabled, this encoding taken from the `"encoding"` +/// XML declaration or assumes UTF-8, if XML has no declaration, encoding +/// key is not defined or contains unknown encoding. +/// +/// The library supports any UTF-8 compatible encodings that crate `encoding_rs` +/// is supported. [*UTF-16 is not supported at the present*][utf16]. +/// +/// If feature `encoding` is disabled, the decoder is always UTF-8 decoder: +/// any XML declarations are ignored. +/// +/// [utf16]: https://github.com/tafia/quick-xml/issues/158 #[derive(Clone, Copy, Debug)] pub struct Decoder { + #[cfg(feature = "encoding")] encoding: &'static Encoding, } +#[cfg(not(feature = "encoding"))] impl Decoder { - #[cfg(not(feature = "encoding"))] - pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { - from_utf8(bytes).map_err(Error::Utf8) - } - - #[cfg(not(feature = "encoding"))] - pub fn decode_owned<'c>(&self, bytes: Vec) -> Result { - String::from_utf8(bytes).map_err(|e| Error::Utf8(e.utf8_error())) + /// Decodes a UTF8 slice regardless of XML declaration and ignoring BOM if + /// it is present in the `bytes`. + /// + /// Returns an error in case of malformed sequences in the `bytes`. + /// + /// If you instead want to use XML declared encoding, use the `encoding` feature + #[inline] + pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> { + Ok(Cow::Borrowed(from_utf8(bytes)?)) } - #[cfg(feature = "encoding")] - pub fn decode<'c>(&self, bytes: &'c [u8]) -> Cow<'c, str> { - self.encoding.decode(bytes).0 + /// Decodes a slice regardless of XML declaration with BOM removal if + /// it is present in the `bytes`. + /// + /// Returns an error in case of malformed sequences in the `bytes`. + /// + /// If you instead want to use XML declared encoding, use the `encoding` feature + pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> { + let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") { + &bytes[3..] + } else { + bytes + }; + self.decode(bytes) } } -/// This implementation is required for tests of other parts of the library -#[cfg(test)] -#[cfg(feature = "serialize")] +#[cfg(feature = "encoding")] impl Decoder { - #[cfg(not(feature = "encoding"))] - pub(crate) fn utf8() -> Self { - Decoder + /// Returns the `Reader`s encoding. + /// + /// This encoding will be used by [`decode`]. + /// + /// [`decode`]: Self::decode + pub fn encoding(&self) -> &'static Encoding { + self.encoding } - #[cfg(feature = "encoding")] + /// Decodes specified bytes using encoding, declared in the XML, if it was + /// declared there, or UTF-8 otherwise, and ignoring BOM if it is present + /// in the `bytes`. + /// + /// Returns an error in case of malformed sequences in the `bytes`. + pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> { + match self + .encoding + .decode_without_bom_handling_and_without_replacement(bytes) + { + None => Err(Error::NonDecodable(None)), + Some(s) => Ok(s), + } + } + + /// Decodes a slice with BOM removal if it is present in the `bytes` using + /// the reader encoding. + /// + /// If this method called after reading XML declaration with the `"encoding"` + /// key, then this encoding is used, otherwise UTF-8 is used. + /// + /// If XML declaration is absent in the XML, UTF-8 is used. + /// + /// Returns an error in case of malformed sequences in the `bytes`. + pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> { + self.decode(self.remove_bom(bytes)) + } + /// Copied from [`Encoding::decode_with_bom_removal`] + #[inline] + fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] { + use encoding_rs::*; + + if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") { + return &bytes[3..]; + } + if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") { + return &bytes[2..]; + } + if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") { + return &bytes[2..]; + } + + bytes + } +} + +impl Decoder { + /// This implementation is required for tests of other parts of the library + #[cfg(test)] + #[cfg(feature = "serialize")] pub(crate) fn utf8() -> Self { Decoder { + #[cfg(feature = "encoding")] encoding: encoding_rs::UTF_8, } } } +/// Automatic encoding detection of XML files based using the [recommended algorithm] +/// (https://www.w3.org/TR/xml11/#sec-guessing) +/// +/// The algorithm suggests examine up to the first 4 bytes to determine encoding +/// according to the following table: +/// +/// | Bytes |Detected encoding +/// |-------------|------------------------------------------ +/// |`00 00 FE FF`|UCS-4, big-endian machine (1234 order) +/// |`FF FE 00 00`|UCS-4, little-endian machine (4321 order) +/// |`00 00 FF FE`|UCS-4, unusual octet order (2143) +/// |`FE FF 00 00`|UCS-4, unusual octet order (3412) +/// |`FE FF ## ##`|UTF-16, big-endian +/// |`FF FE ## ##`|UTF-16, little-endian +/// |`EF BB BF` |UTF-8 +/// |-------------|------------------------------------------ +/// |`00 00 00 3C`|UCS-4 or similar (use declared encoding to find the exact one), in big-endian (1234) +/// |`3C 00 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in little-endian (4321) +/// |`00 00 3C 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (2143) +/// |`00 3C 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (3412) +/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one) +/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one) +/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably +/// |`4C 6F A7 94`|EBCDIC (in some flavor; the full encoding declaration must be read to tell which code page is in use) +/// |_Other_ |UTF-8 without an encoding declaration, or else the data stream is mislabeled (lacking a required encoding declaration), corrupt, fragmentary, or enclosed in a wrapper of some kind +/// +/// Because [`encoding_rs`] crate supported only subset of those encodings, only +/// supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE. +/// +/// If encoding is detected, `Some` is returned, otherwise `None` is returned. +#[cfg(feature = "encoding")] +fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> { + match bytes { + // with BOM + _ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE), + _ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE), + _ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8), + + // without BOM + _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2 + _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2 + _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible + + _ => None, + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + #[cfg(test)] mod test { macro_rules! check { @@ -2253,6 +2354,16 @@ mod test { use crate::reader::Reader; use pretty_assertions::assert_eq; + #[test] + fn start_text() { + let mut reader = Reader::from_str("bom"); + + assert_eq!( + reader.read_event_buffered($buf).unwrap(), + Event::StartText(BytesText::from_escaped(b"bom".as_ref()).into()) + ); + } + #[test] fn declaration() { let mut reader = Reader::from_str(""); @@ -2316,9 +2427,15 @@ mod test { ); } + /// Text event cannot be generated without preceding event of another type #[test] fn text() { - let mut reader = Reader::from_str("text"); + let mut reader = Reader::from_str("text"); + + assert_eq!( + reader.read_event_buffered($buf).unwrap(), + Event::Empty(BytesStart::borrowed_name(b"tag")) + ); assert_eq!( reader.read_event_buffered($buf).unwrap(), diff --git a/src/se/mod.rs b/src/se/mod.rs index 92a97c40..bb737802 100644 --- a/src/se/mod.rs +++ b/src/se/mod.rs @@ -23,8 +23,7 @@ pub fn to_writer(writer: W, value: &S) -> Result<(), DeE pub fn to_string(value: &S) -> Result { let mut writer = Vec::new(); to_writer(&mut writer, value)?; - let s = String::from_utf8(writer).map_err(|e| crate::errors::Error::Utf8(e.utf8_error()))?; - Ok(s) + Ok(String::from_utf8(writer)?) } /// A Serializer diff --git a/src/writer.rs b/src/writer.rs index 9b4f484a..d6fcf960 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -91,6 +91,7 @@ impl Writer { pub fn write_event<'a, E: AsRef>>(&mut self, event: E) -> Result<()> { let mut next_should_line_break = true; let result = match *event.as_ref() { + Event::StartText(ref e) => self.write(&e), Event::Start(ref e) => { let result = self.write_wrapped(b"<", e, b">"); if let Some(i) = self.indent.as_mut() { diff --git a/tests/test.rs b/tests/test.rs index 37426391..cf547c8e 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -181,7 +181,10 @@ fn fuzz_101() { fn test_no_trim() { let mut reader = Reader::from_str(" text "); - assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_))); + assert!(matches!( + reader.read_event_unbuffered().unwrap(), + StartText(_) + )); assert!(matches!(reader.read_event_unbuffered().unwrap(), Start(_))); assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_))); assert!(matches!(reader.read_event_unbuffered().unwrap(), End(_))); @@ -193,7 +196,10 @@ fn test_trim_end() { let mut reader = Reader::from_str(" text "); reader.trim_text_end(true); - assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_))); + assert!(matches!( + reader.read_event_unbuffered().unwrap(), + StartText(_) + )); assert!(matches!(reader.read_event_unbuffered().unwrap(), Start(_))); assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_))); assert!(matches!(reader.read_event_unbuffered().unwrap(), End(_))); diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index 21f53fb4..39af252d 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -29,7 +29,7 @@ macro_rules! next_eq_content { ($r:expr, $t:tt, $bytes:expr) => { let mut buf = Vec::new(); match $r.read_event(&mut buf).unwrap() { - $t(ref e) if &**e == $bytes => (), + $t(ref e) if e.as_ref() == $bytes => (), e => panic!( "expecting {}({:?}), found {:?}", stringify!($t), @@ -42,6 +42,7 @@ macro_rules! next_eq_content { } macro_rules! next_eq { + ($r:expr, StartText, $bytes:expr) => (next_eq_content!($r, StartText, $bytes);); ($r:expr, Start, $bytes:expr) => (next_eq_name!($r, Start, $bytes);); ($r:expr, End, $bytes:expr) => (next_eq_name!($r, End, $bytes);); ($r:expr, Empty, $bytes:expr) => (next_eq_name!($r, Empty, $bytes);); @@ -781,82 +782,100 @@ fn test_closing_bracket_in_single_quote_mixed() { next_eq!(r, End, b"a"); } -#[test] -#[cfg(not(feature = "encoding"))] -fn test_unescape_and_decode_without_bom_removes_utf8_bom() { - let input: &str = std::str::from_utf8(b"\xEF\xBB\xBF").unwrap(); +mod decode_with_bom_removal { + use super::*; + use pretty_assertions::assert_eq; - let mut reader = Reader::from_str(&input); - reader.trim_text(true); + #[test] + #[cfg(not(feature = "encoding"))] + fn removes_utf8_bom() { + let input: &str = std::str::from_utf8(b"\xEF\xBB\xBF").unwrap(); - let mut txt = Vec::new(); - let mut buf = Vec::new(); + let mut reader = Reader::from_str(&input); + reader.trim_text(true); - loop { - match reader.read_event(&mut buf) { - Ok(Event::Text(e)) => txt.push(e.unescape_and_decode_without_bom(&reader).unwrap()), - Ok(Event::Eof) => break, - _ => (), + let mut txt = Vec::new(); + let mut buf = Vec::new(); + + loop { + match reader.read_event(&mut buf) { + Ok(Event::StartText(e)) => { + txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()) + } + Ok(Event::Eof) => break, + _ => (), + } } + assert_eq!(txt, vec![""]); } - assert_eq!(txt, vec![""]); -} -#[test] -#[cfg(feature = "encoding")] -fn test_unescape_and_decode_without_bom_removes_utf16be_bom() { - let mut reader = Reader::from_file("./tests/documents/utf16be.xml").unwrap(); - reader.trim_text(true); - - let mut txt = Vec::new(); - let mut buf = Vec::new(); + /// Test is disabled: the input started with `[FE FF 00 3C 00 3F ...]` and currently + /// quick-xml captures `[FE FF 00]` as a `StartText` event, because it is stopped + /// at byte `<` (0x3C). That sequence represents UTF-16 BOM (=BE) and a first byte + /// of the `<` symbol, encoded in UTF-16 BE (`00 3C`). + #[test] + #[cfg(feature = "encoding")] + #[ignore = "Non-ASCII compatible encodings not properly supported yet. See https://github.com/tafia/quick-xml/issues/158"] + fn removes_utf16be_bom() { + let mut reader = Reader::from_file("./tests/documents/utf16be.xml").unwrap(); + reader.trim_text(true); + + let mut txt = Vec::new(); + let mut buf = Vec::new(); - loop { - match reader.read_event(&mut buf) { - Ok(Event::Text(e)) => txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()), - Ok(Event::Eof) => break, - _ => (), + loop { + match reader.read_event(&mut buf) { + Ok(Event::StartText(e)) => { + txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()) + } + Ok(Event::Eof) => break, + _ => (), + } } + assert_eq!(Some(txt[0].as_ref()), Some("")); } - assert_eq!(txt[0], ""); -} -#[test] -#[cfg(feature = "encoding")] -fn test_unescape_and_decode_without_bom_removes_utf16le_bom() { - let mut reader = Reader::from_file("./tests/documents/utf16le.xml").unwrap(); - reader.trim_text(true); + #[test] + #[cfg(feature = "encoding")] + fn removes_utf16le_bom() { + let mut reader = Reader::from_file("./tests/documents/utf16le.xml").unwrap(); + reader.trim_text(true); - let mut txt = Vec::new(); - let mut buf = Vec::new(); + let mut txt = Vec::new(); + let mut buf = Vec::new(); - loop { - match reader.read_event(&mut buf) { - Ok(Event::Text(e)) => txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()), - Ok(Event::Eof) => break, - _ => (), + loop { + match reader.read_event(&mut buf) { + Ok(Event::StartText(e)) => { + txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()) + } + Ok(Event::Eof) => break, + _ => (), + } } + assert_eq!(Some(txt[0].as_ref()), Some("")); } - assert_eq!(txt[0], ""); -} -#[test] -#[cfg(not(feature = "encoding"))] -fn test_unescape_and_decode_without_bom_does_nothing_if_no_bom_exists() { - let input: &str = std::str::from_utf8(b"").unwrap(); + #[test] + #[cfg(not(feature = "encoding"))] + fn does_nothing_if_no_bom_exists() { + let input: &str = std::str::from_utf8(b"").unwrap(); - let mut reader = Reader::from_str(&input); - reader.trim_text(true); + let mut reader = Reader::from_str(&input); + reader.trim_text(true); - let mut txt = Vec::new(); - let mut buf = Vec::new(); + let mut txt = Vec::new(); + let mut buf = Vec::new(); - loop { - match reader.read_event(&mut buf) { - Ok(Event::Text(e)) => txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()), - Ok(Event::Eof) => break, - _ => (), + loop { + match reader.read_event(&mut buf) { + Ok(Event::StartText(e)) => { + txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()) + } + Ok(Event::Eof) => break, + _ => (), + } } + assert!(txt.is_empty()); } - assert!(txt.is_empty()); } diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 12f86594..a23bb03c 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -1,7 +1,6 @@ use quick_xml::events::{BytesStart, Event}; -use quick_xml::name::ResolveResult; -use quick_xml::{Reader, Result}; -use std::borrow::Cow; +use quick_xml::name::{QName, ResolveResult}; +use quick_xml::{Decoder, Reader, Result}; use std::str::from_utf8; #[test] @@ -381,7 +380,7 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) { loop { buf.clear(); let event = reader.read_namespaced_event(&mut buf, &mut ns_buffer); - let line = xmlrs_display(event, &reader); + let line = xmlrs_display(event, reader.decoder()); if let Some((n, spec)) = spec_lines.next() { if spec.trim() == "EndDocument" { break; @@ -420,15 +419,12 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) { } } -fn namespace_name(n: ResolveResult, name: &[u8]) -> String { +fn namespace_name(n: ResolveResult, name: QName, decoder: Decoder) -> String { + let name = decoder.decode(name.as_ref()).unwrap(); match n { - // Produces string '{namespace}local_name' - ResolveResult::Bound(n) => format!( - "{{{}}}{}", - from_utf8(n.as_ref()).unwrap(), - from_utf8(name).unwrap() - ), - _ => from_utf8(name).unwrap().to_owned(), + // Produces string '{namespace}prefixed_name' + ResolveResult::Bound(n) => format!("{{{}}}{}", from_utf8(n.as_ref()).unwrap(), name), + _ => name.to_string(), } } @@ -451,21 +447,11 @@ fn make_attrs(e: &BytesStart) -> ::std::result::Result { Ok(atts.join(", ")) } -// FIXME: The public API differs based on the "encoding" feature -fn decode<'a>(text: &'a [u8], reader: &Reader<&[u8]>) -> Cow<'a, str> { - #[cfg(feature = "encoding")] - let decoded = reader.decode(text); - - #[cfg(not(feature = "encoding"))] - let decoded = Cow::Borrowed(reader.decode(text).unwrap()); - - decoded -} - -fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8]>) -> String { +fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, decoder: Decoder) -> String { match opt_event { + Ok((_, Event::StartText(_))) => "StartText".to_string(), Ok((n, Event::Start(ref e))) => { - let name = namespace_name(n, decode(e.name().as_ref(), reader).as_bytes()); + let name = namespace_name(n, e.name(), decoder); match make_attrs(e) { Ok(ref attrs) if attrs.is_empty() => format!("StartElement({})", &name), Ok(ref attrs) => format!("StartElement({} [{}])", &name, &attrs), @@ -473,7 +459,7 @@ fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8 } } Ok((n, Event::Empty(ref e))) => { - let name = namespace_name(n, decode(e.name().as_ref(), reader).as_bytes()); + let name = namespace_name(n, e.name(), decoder); match make_attrs(e) { Ok(ref attrs) if attrs.is_empty() => format!("EmptyElement({})", &name), Ok(ref attrs) => format!("EmptyElement({} [{}])", &name, &attrs), @@ -481,16 +467,13 @@ fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8 } } Ok((n, Event::End(ref e))) => { - let name = namespace_name(n, decode(e.name().as_ref(), reader).as_bytes()); + let name = namespace_name(n, e.name(), decoder); format!("EndElement({})", name) } Ok((_, Event::Comment(ref e))) => format!("Comment({})", from_utf8(e).unwrap()), Ok((_, Event::CData(ref e))) => format!("CData({})", from_utf8(e).unwrap()), Ok((_, Event::Text(ref e))) => match e.unescaped() { - Ok(c) => match from_utf8(decode(&*c, reader).as_bytes()) { - Ok(c) => format!("Characters({})", c), - Err(ref err) => format!("InvalidUtf8({:?}; {})", e.escaped(), err), - }, + Ok(c) => format!("Characters({})", decoder.decode(c.as_ref()).unwrap()), Err(ref err) => format!("FailedUnescape({:?}; {})", e.escaped(), err), }, Ok((_, Event::Decl(ref e))) => {