Skip to content

Commit

Permalink
Provide some utilities for decoding entire buffers
Browse files Browse the repository at this point in the history
  • Loading branch information
dralley committed Jul 24, 2022
1 parent bee8ff6 commit faf13a9
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 23 deletions.
4 changes: 4 additions & 0 deletions Changelog.md
Expand Up @@ -37,6 +37,8 @@
| |`resolve`
|`event_namespace` |`resolve_element`
|`attribute_namespace` |`resolve_attribute`
- [#439]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()`
under the `quick_xml::encoding` namespace.


### Bug Fixes
Expand Down Expand Up @@ -209,6 +211,8 @@
[#431]: https://github.com/tafia/quick-xml/pull/431
[#434]: https://github.com/tafia/quick-xml/pull/434
[#437]: https://github.com/tafia/quick-xml/pull/437
[#439]: https://github.com/tafia/quick-xml/pull/439


## 0.23.0 -- 2022-05-08

Expand Down
71 changes: 48 additions & 23 deletions src/encoding.rs
Expand Up @@ -60,7 +60,7 @@ impl Decoder {
///
/// If you instead want to use XML declared encoding, use the `encoding` feature
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") {
let bytes = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
&bytes[3..]
} else {
bytes
Expand All @@ -86,13 +86,7 @@ impl Decoder {
///
/// Returns an error in case of malformed sequences in the `bytes`.
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
match self
.encoding
.decode_without_bom_handling_and_without_replacement(bytes)
{
None => Err(Error::NonDecodable(None)),
Some(s) => Ok(s),
}
decode(bytes, self.encoding)
}

/// Decodes a slice with BOM removal if it is present in the `bytes` using
Expand All @@ -105,25 +99,54 @@ impl Decoder {
///
/// Returns an error in case of malformed sequences in the `bytes`.
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
self.decode(self.remove_bom(bytes))
self.decode(remove_bom(bytes, self.encoding))
}
/// Copied from [`Encoding::decode_with_bom_removal`]
#[inline]
fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] {
if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
return &bytes[3..];
}
if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") {
return &bytes[2..];
}
if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") {
return &bytes[2..];
}
}

/// Decodes the provided bytes using the specified encoding.
///
/// No BOM handling is performed: `bytes` is handed unchanged to
/// `Encoding::decode_without_bom_handling_and_without_replacement`, so a
/// leading byte order mark is not stripped by this function. Use
/// `decode_with_bom_removal` when the BOM should be removed first.
///
/// # Errors
///
/// Returns `Error::NonDecodable` when the underlying decoder reports that
/// `bytes` cannot be decoded without replacement (malformed sequences).
#[cfg(feature = "encoding")]
pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b, str>> {
    encoding
        .decode_without_bom_handling_and_without_replacement(bytes)
        .ok_or(Error::NonDecodable(None))
}

bytes
/// Decodes a slice whose encoding is not known in advance.
///
/// The encoding is guessed with `detect_encoding`. When a guess is available,
/// the BOM belonging to that encoding (if present) is removed before the
/// remaining bytes are decoded with it. When no encoding can be detected the
/// bytes are decoded as UTF-8 exactly as given.
///
/// Returns an error in case of malformed sequences in the `bytes`.
#[cfg(feature = "encoding")]
pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result<Cow<'b, str>> {
    match detect_encoding(bytes) {
        // Strip the BOM that matches the detected encoding, then decode the rest.
        Some(detected) => decode(remove_bom(bytes, detected), detected),
        // Nothing recognised: fall back to UTF-8 on the untouched input.
        None => decode(bytes, UTF_8),
    }
}

/// Splits `bytes` into a leading byte order mark and the remaining payload.
///
/// A BOM is only recognised when it is the one belonging to `encoding`:
/// `EF BB BF` for UTF-8, `FF FE` for UTF-16LE and `FE FF` for UTF-16BE.
/// In every other case the first slice is empty and the second one is
/// `bytes` unchanged.
#[cfg(feature = "encoding")]
fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) {
    // Select the signature for the requested encoding; empty means "has none".
    let bom: &[u8] = if encoding == UTF_8 {
        b"\xEF\xBB\xBF"
    } else if encoding == UTF_16LE {
        b"\xFF\xFE"
    } else if encoding == UTF_16BE {
        b"\xFE\xFF"
    } else {
        b""
    };

    if !bom.is_empty() && bytes.starts_with(bom) {
        bytes.split_at(bom.len())
    } else {
        (&[], bytes)
    }
}

/// Returns `bytes` without the leading BOM of `encoding`.
///
/// Delegates to `split_at_bom` and keeps only the part after the split, so
/// the input is returned as is when it does not start with that BOM.
#[cfg(feature = "encoding")]
fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
    split_at_bom(bytes, encoding).1
}

/// Automatic encoding detection of XML files based on the
/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
///
Expand Down Expand Up @@ -155,7 +178,7 @@ impl Decoder {
///
/// If encoding is detected, `Some` is returned, otherwise `None` is returned.
#[cfg(feature = "encoding")]
pub(crate) fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
match bytes {
// with BOM
_ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE),
Expand All @@ -170,3 +193,5 @@ pub(crate) fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
_ => None,
}
}

// TODO: add some tests for functions

0 comments on commit faf13a9

Please sign in to comment.