diff --git a/Changelog.md b/Changelog.md index 341cb892..190afd59 100644 --- a/Changelog.md +++ b/Changelog.md @@ -37,6 +37,8 @@ | |`resolve` |`event_namespace` |`resolve_element` |`attribute_namespace` |`resolve_attribute` +- [#439]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()` + under the `quick_xml::encoding` namespace. ### Bug Fixes @@ -209,6 +211,8 @@ [#431]: https://github.com/tafia/quick-xml/pull/431 [#434]: https://github.com/tafia/quick-xml/pull/434 [#437]: https://github.com/tafia/quick-xml/pull/437 +[#439]: https://github.com/tafia/quick-xml/pull/439 + ## 0.23.0 -- 2022-05-08 diff --git a/src/encoding.rs b/src/encoding.rs index 486c3834..52cb49ab 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -60,7 +60,7 @@ impl Decoder { /// /// If you instead want to use XML declared encoding, use the `encoding` feature pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> { - let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") { + let bytes = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) { &bytes[3..] } else { bytes @@ -86,13 +86,7 @@ impl Decoder { /// /// Returns an error in case of malformed sequences in the `bytes`. pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> { - match self - .encoding - .decode_without_bom_handling_and_without_replacement(bytes) - { - None => Err(Error::NonDecodable(None)), - Some(s) => Ok(s), - } + decode(bytes, self.encoding) } /// Decodes a slice with BOM removal if it is present in the `bytes` using @@ -105,25 +99,54 @@ impl Decoder { /// /// Returns an error in case of malformed sequences in the `bytes`.
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> { - self.decode(self.remove_bom(bytes)) + self.decode(remove_bom(bytes, self.encoding)) } - /// Copied from [`Encoding::decode_with_bom_removal`] - #[inline] - fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] { - if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") { - return &bytes[3..]; - } - if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") { - return &bytes[2..]; - } - if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") { - return &bytes[2..]; - } +} + +/// Decodes the provided bytes using the specified encoding without any BOM handling: +/// a BOM present in the `bytes` is decoded like any other data, it is not removed. +/// +/// Returns an error in case of malformed sequences in the `bytes`. +#[cfg(feature = "encoding")] +pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b, str>> { + encoding + .decode_without_bom_handling_and_without_replacement(bytes) + .ok_or(Error::NonDecodable(None)) +} - bytes +/// Decodes a slice with an unknown encoding, removing the BOM if it is present +/// in the bytes. +/// +/// Returns an error in case of malformed sequences in the `bytes`.
+#[cfg(feature = "encoding")] +pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result<Cow<'b, str>> { + if let Some(encoding) = detect_encoding(bytes) { + let bytes = remove_bom(bytes, encoding); + decode(bytes, encoding) + } else { + decode(bytes, UTF_8) + } +} + +#[cfg(feature = "encoding")] +fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) { + if encoding == UTF_8 && bytes.starts_with(&[0xEF, 0xBB, 0xBF]) { + bytes.split_at(3) + } else if encoding == UTF_16LE && bytes.starts_with(&[0xFF, 0xFE]) { + bytes.split_at(2) + } else if encoding == UTF_16BE && bytes.starts_with(&[0xFE, 0xFF]) { + bytes.split_at(2) + } else { + (&[], bytes) } } +#[cfg(feature = "encoding")] +fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] { + let (_, bytes) = split_at_bom(bytes, encoding); + bytes +} + /// Automatic encoding detection of XML files based using the [recommended algorithm] /// (https://www.w3.org/TR/xml11/#sec-guessing) /// @@ -155,7 +178,7 @@ impl Decoder { /// /// If encoding is detected, `Some` is returned, otherwise `None` is returned. #[cfg(feature = "encoding")] -pub(crate) fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> { +pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> { match bytes { // with BOM _ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE), @@ -170,3 +193,5 @@ pub(crate) fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> { _ => None, } } + +// TODO: add some tests for functions