diff --git a/Changelog.md b/Changelog.md index 341cb892..190afd59 100644 --- a/Changelog.md +++ b/Changelog.md @@ -37,6 +37,8 @@ | |`resolve` |`event_namespace` |`resolve_element` |`attribute_namespace` |`resolve_attribute` +- [#439]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()` + under the `quick_xml::encoding` namespace. ### Bug Fixes @@ -209,6 +211,8 @@ [#431]: https://github.com/tafia/quick-xml/pull/431 [#434]: https://github.com/tafia/quick-xml/pull/434 [#437]: https://github.com/tafia/quick-xml/pull/437 +[#439]: https://github.com/tafia/quick-xml/pull/439 + ## 0.23.0 -- 2022-05-08 diff --git a/src/de/escape.rs b/src/de/escape.rs index badc3299..e9eff985 100644 --- a/src/de/escape.rs +++ b/src/de/escape.rs @@ -1,9 +1,9 @@ //! Serde `Deserializer` module use crate::de::deserialize_bool; +use crate::encoding::Decoder; use crate::errors::serialize::DeError; use crate::escape::unescape; -use crate::reader::Decoder; use serde::de::{DeserializeSeed, EnumAccess, VariantAccess, Visitor}; use serde::{self, forward_to_deserialize_any, serde_if_integer128}; use std::borrow::Cow; diff --git a/src/de/mod.rs b/src/de/mod.rs index caabbdf8..1d7dd4a7 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -215,10 +215,10 @@ mod var; pub use crate::errors::serialize::DeError; use crate::{ + encoding::Decoder, errors::Error, events::{BytesCData, BytesEnd, BytesStart, BytesText, Event}, name::QName, - reader::Decoder, Reader, }; use serde::de::{self, Deserialize, DeserializeOwned, Visitor}; diff --git a/src/de/seq.rs b/src/de/seq.rs index 8dc9a462..091b968b 100644 --- a/src/de/seq.rs +++ b/src/de/seq.rs @@ -1,6 +1,6 @@ use crate::de::{DeError, DeEvent, Deserializer, XmlRead}; +use crate::encoding::Decoder; use crate::events::BytesStart; -use crate::reader::Decoder; use serde::de::{DeserializeSeed, SeqAccess}; /// Check if tag `start` is included in the `fields` list. 
`decoder` is used to diff --git a/src/de/simple_type.rs b/src/de/simple_type.rs index dc0b157a..580c6312 100644 --- a/src/de/simple_type.rs +++ b/src/de/simple_type.rs @@ -4,9 +4,9 @@ //! [as defined]: https://www.w3.org/TR/xmlschema11-1/#Simple_Type_Definition use crate::de::{deserialize_bool, str2bool}; +use crate::encoding::Decoder; use crate::errors::serialize::DeError; use crate::escape::unescape; -use crate::reader::Decoder; use memchr::memchr; use serde::de::{DeserializeSeed, Deserializer, EnumAccess, SeqAccess, VariantAccess, Visitor}; use serde::{self, serde_if_integer128}; diff --git a/src/encoding.rs b/src/encoding.rs new file mode 100644 index 00000000..04d54495 --- /dev/null +++ b/src/encoding.rs @@ -0,0 +1,187 @@ +//! A module for wrappers that encode / decode data. + +use std::borrow::Cow; + +#[cfg(feature = "encoding")] +use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8}; + +use crate::{Error, Result}; + +/// Decoder of byte slices into strings. +/// +/// If feature `encoding` is enabled, the encoding is taken from the `"encoding"` +/// XML declaration; UTF-8 is assumed if the XML has no declaration, the encoding +/// key is not defined, or it names an unknown encoding. +/// +/// The library supports any UTF-8 compatible encoding that the `encoding_rs` crate +/// supports. [*UTF-16 is not supported at present*][utf16]. +/// +/// If feature `encoding` is disabled, the decoder is always a UTF-8 decoder: +/// any XML declarations are ignored. 
+/// +/// [utf16]: https://github.com/tafia/quick-xml/issues/158 +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Decoder { + #[cfg(feature = "encoding")] + pub(crate) encoding: &'static Encoding, +} + +impl Decoder { + pub(crate) fn utf8() -> Self { + Decoder { + #[cfg(feature = "encoding")] + encoding: UTF_8, + } + } + + #[cfg(all(test, feature = "encoding", feature = "serialize"))] + pub(crate) fn utf16() -> Self { + Decoder { encoding: UTF_16LE } + } +} + +#[cfg(not(feature = "encoding"))] +impl Decoder { + /// Decodes a UTF-8 slice regardless of XML declaration and without BOM handling: + /// a BOM, if present in the `bytes`, is not removed. + /// + /// Returns an error in case of malformed sequences in the `bytes`. + /// + /// If you instead want to use XML declared encoding, use the `encoding` feature. + #[inline] + pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> { + Ok(Cow::Borrowed(std::str::from_utf8(bytes)?)) + } + + /// Decodes a UTF-8 slice regardless of XML declaration, removing the UTF-8 BOM + /// if it is present at the start of the `bytes`. + /// + /// Returns an error in case of malformed sequences in the `bytes`. + /// + /// If you instead want to use XML declared encoding, use the `encoding` feature. + pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> { + let bytes = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) { + &bytes[3..] + } else { + bytes + }; + self.decode(bytes) + } +} + +#[cfg(feature = "encoding")] +impl Decoder { + /// Returns the `Reader`s encoding. + /// + /// This encoding will be used by [`decode`]. + /// + /// [`decode`]: Self::decode + pub fn encoding(&self) -> &'static Encoding { + self.encoding + } + + /// Decodes the specified bytes using the encoding declared in the XML, if it was + /// declared there, or UTF-8 otherwise. No BOM handling is performed: a BOM, if + /// present in the `bytes`, is not removed. + /// + /// Returns an error in case of malformed sequences in the `bytes`. 
+ pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> { + decode(bytes, self.encoding) + } + + /// Decodes a slice with BOM removal if it is present in the `bytes` using + /// the reader encoding. + /// + /// If this method is called after reading an XML declaration with the `"encoding"` + /// key, then that encoding is used, otherwise UTF-8 is used. + /// + /// If the XML declaration is absent, UTF-8 is used. + /// + /// Returns an error in case of malformed sequences in the `bytes`. + pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> { + self.decode(remove_bom(bytes, self.encoding)) + } +} + +/// Decodes the provided bytes using the specified encoding without BOM handling: +/// a BOM, if present in the `bytes`, is not removed. +/// +/// Returns an error in case of malformed sequences in the `bytes`. +#[cfg(feature = "encoding")] +pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b, str>> { + encoding + .decode_without_bom_handling_and_without_replacement(bytes) + .ok_or(Error::NonDecodable(None)) +} + +/// Detects the encoding of a slice, removes the BOM if it is present in the +/// `bytes`, and decodes the rest; UTF-8 is used when detection fails. +/// +/// Returns an error in case of malformed sequences in the `bytes`. 
+#[cfg(feature = "encoding")] +pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result<Cow<'b, str>> { + if let Some(encoding) = detect_encoding(bytes) { + let bytes = remove_bom(bytes, encoding); + decode(bytes, encoding) + } else { + decode(bytes, UTF_8) + } +} + +#[cfg(feature = "encoding")] +fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) { + if encoding == UTF_8 && bytes.starts_with(&[0xEF, 0xBB, 0xBF]) { + bytes.split_at(3) + } else if encoding == UTF_16LE && bytes.starts_with(&[0xFF, 0xFE]) { + bytes.split_at(2) + } else if encoding == UTF_16BE && bytes.starts_with(&[0xFE, 0xFF]) { + bytes.split_at(2) + } else { + (&[], bytes) + } +} + +#[cfg(feature = "encoding")] +fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] { + let (_, bytes) = split_at_bom(bytes, encoding); + bytes +} + +/// Automatic encoding detection of XML files based on the +/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing). +/// +/// If encoding is detected, `Some` is returned, otherwise `None` is returned. +/// +/// Because the [`encoding_rs`] crate supports only a subset of those encodings, only +/// the supported subset is detected, which is UTF-8, UTF-16 BE and UTF-16 LE. 
+/// +/// The algorithm suggests examining up to the first 4 bytes to determine encoding +/// according to the following table: +/// +/// | Bytes |Detected encoding +/// |-------------|------------------------------------------ +/// |`FE FF ## ##`|UTF-16, big-endian +/// |`FF FE ## ##`|UTF-16, little-endian +/// |`EF BB BF` |UTF-8 +/// |-------------|------------------------------------------ +/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one) +/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one) +/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably +#[cfg(feature = "encoding")] +pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> { + match bytes { + // with BOM + _ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE), + _ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE), + _ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8), + + // without BOM + _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2 + _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2 + _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible + + _ => None, + } +} + +// TODO: add some tests for functions diff --git a/src/events/mod.rs b/src/events/mod.rs index 6181a40b..bc9ff0ae 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -43,10 +43,10 @@ use std::fmt::{self, 
Debug, Formatter}; use std::ops::Deref; use std::str::from_utf8; +use crate::encoding::Decoder; use crate::errors::{Error, Result}; use crate::escape::{escape, partial_escape, unescape_with}; use crate::name::{LocalName, QName}; -use crate::reader::Decoder; use crate::utils::write_cow_string; use attributes::{Attribute, Attributes}; diff --git a/src/lib.rs b/src/lib.rs index a210ed18..14249d19 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -44,6 +44,7 @@ #[cfg(feature = "serialize")] pub mod de; +pub mod encoding; mod errors; mod escapei; pub mod escape { @@ -62,8 +63,9 @@ pub mod utils; mod writer; // reexports +pub use crate::encoding::Decoder; #[cfg(feature = "serialize")] pub use crate::errors::serialize::DeError; pub use crate::errors::{Error, Result}; -pub use crate::reader::{Decoder, NsReader, Reader}; +pub use crate::reader::{NsReader, Reader}; pub use crate::writer::{ElementWriter, Writer}; diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 66765b64..a9fed368 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -5,13 +5,13 @@ use std::fs::File; use std::io::{self, BufRead, BufReader}; use std::path::Path; +use memchr; + use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::QName; use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource}; -use memchr; - /// This is an implementation of [`Reader`] for reading from a [`BufRead`] as /// underlying byte stream. impl Reader { diff --git a/src/reader/mod.rs b/src/reader/mod.rs index e4c8f342..ef663f90 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -1,11 +1,13 @@ //! 
A module to handle `Reader` -use std::borrow::Cow; use std::str::from_utf8; #[cfg(feature = "encoding")] -use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8}; +use encoding_rs::{Encoding, UTF_8}; +#[cfg(feature = "encoding")] +use crate::encoding::detect_encoding; +use crate::encoding::Decoder; use crate::errors::{Error, Result}; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; @@ -928,172 +930,6 @@ pub(crate) fn is_whitespace(b: u8) -> bool { //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Decoder of byte slices to the strings. This is lightweight object that can be copied. -/// -/// If feature `encoding` is enabled, this encoding taken from the `"encoding"` -/// XML declaration or assumes UTF-8, if XML has no declaration, encoding -/// key is not defined or contains unknown encoding. -/// -/// The library supports any UTF-8 compatible encodings that crate `encoding_rs` -/// is supported. [*UTF-16 is not supported at the present*][utf16]. -/// -/// If feature `encoding` is disabled, the decoder is always UTF-8 decoder: -/// any XML declarations are ignored. -/// -/// [utf16]: https://github.com/tafia/quick-xml/issues/158 -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub struct Decoder { - #[cfg(feature = "encoding")] - encoding: &'static Encoding, -} - -#[cfg(not(feature = "encoding"))] -impl Decoder { - /// Decodes a UTF8 slice regardless of XML declaration and ignoring BOM if - /// it is present in the `bytes`. - /// - /// Returns an error in case of malformed sequences in the `bytes`. - /// - /// If you instead want to use XML declared encoding, use the `encoding` feature - #[inline] - pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> { - Ok(Cow::Borrowed(from_utf8(bytes)?)) - } - - /// Decodes a slice regardless of XML declaration with BOM removal if - /// it is present in the `bytes`. - /// - /// Returns an error in case of malformed sequences in the `bytes`. 
- /// - /// If you instead want to use XML declared encoding, use the `encoding` feature - pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> { - let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") { - &bytes[3..] - } else { - bytes - }; - self.decode(bytes) - } -} - -#[cfg(feature = "encoding")] -impl Decoder { - /// Returns the `Reader`s encoding. - /// - /// This encoding will be used by [`decode`]. - /// - /// [`decode`]: Self::decode - pub fn encoding(&self) -> &'static Encoding { - self.encoding - } - - /// Decodes specified bytes using encoding, declared in the XML, if it was - /// declared there, or UTF-8 otherwise, and ignoring BOM if it is present - /// in the `bytes`. - /// - /// Returns an error in case of malformed sequences in the `bytes`. - pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> { - match self - .encoding - .decode_without_bom_handling_and_without_replacement(bytes) - { - None => Err(Error::NonDecodable(None)), - Some(s) => Ok(s), - } - } - - /// Decodes a slice with BOM removal if it is present in the `bytes` using - /// the reader encoding. - /// - /// If this method called after reading XML declaration with the `"encoding"` - /// key, then this encoding is used, otherwise UTF-8 is used. - /// - /// If XML declaration is absent in the XML, UTF-8 is used. - /// - /// Returns an error in case of malformed sequences in the `bytes`. 
- pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> { - self.decode(self.remove_bom(bytes)) - } - /// Copied from [`Encoding::decode_with_bom_removal`] - #[inline] - fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] { - if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") { - return &bytes[3..]; - } - if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") { - return &bytes[2..]; - } - if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") { - return &bytes[2..]; - } - - bytes - } -} - -impl Decoder { - pub(crate) fn utf8() -> Self { - Decoder { - #[cfg(feature = "encoding")] - encoding: UTF_8, - } - } - - #[cfg(all(test, feature = "encoding", feature = "serialize"))] - pub(crate) fn utf16() -> Self { - Decoder { encoding: UTF_16LE } - } -} - -/// Automatic encoding detection of XML files based using the [recommended algorithm] -/// (https://www.w3.org/TR/xml11/#sec-guessing) -/// -/// The algorithm suggests examine up to the first 4 bytes to determine encoding -/// according to the following table: -/// -/// | Bytes |Detected encoding -/// |-------------|------------------------------------------ -/// |`00 00 FE FF`|UCS-4, big-endian machine (1234 order) -/// |`FF FE 00 00`|UCS-4, little-endian machine (4321 order) -/// |`00 00 FF FE`|UCS-4, unusual octet order (2143) -/// |`FE FF 00 00`|UCS-4, unusual octet order (3412) -/// |`FE FF ## ##`|UTF-16, big-endian -/// |`FF FE ## ##`|UTF-16, little-endian -/// |`EF BB BF` |UTF-8 -/// |-------------|------------------------------------------ -/// |`00 00 00 3C`|UCS-4 or similar (use declared encoding to find the exact one), in big-endian (1234) -/// |`3C 00 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in little-endian (4321) -/// |`00 00 3C 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (2143) -/// |`00 3C 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders 
(3412) -/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one) -/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one) -/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably -/// |`4C 6F A7 94`|EBCDIC (in some flavor; the full encoding declaration must be read to tell which code page is in use) -/// |_Other_ |UTF-8 without an encoding declaration, or else the data stream is mislabeled (lacking a required encoding declaration), corrupt, fragmentary, or enclosed in a wrapper of some kind -/// -/// Because [`encoding_rs`] crate supported only subset of those encodings, only -/// supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE. -/// -/// If encoding is detected, `Some` is returned, otherwise `None` is returned. 
-#[cfg(feature = "encoding")] -fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> { - match bytes { - // with BOM - _ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE), - _ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE), - _ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8), - - // without BOM - _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2 - _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2 - _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible - - _ => None, - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - #[cfg(test)] mod test { macro_rules! check {