From 6daeced30495ab2e211e5a08b52e681cb94557b5 Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Tue, 16 Aug 2022 00:19:30 -0400 Subject: [PATCH] Remove BOM from first-emitted text event --- Changelog.md | 5 +++-- src/encoding.rs | 2 +- src/reader/parser.rs | 34 ++++++++++++++++++++-------------- tests/encodings.rs | 2 ++ tests/xmlrs_reader_tests.rs | 19 +++++++++++++++++++ 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/Changelog.md b/Changelog.md index 941aa22d..d19b06e1 100644 --- a/Changelog.md +++ b/Changelog.md @@ -40,7 +40,7 @@ - [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers - [#455]: Change return type of all `read_to_end*` methods to return a span between tags - [#455]: Added `Reader::read_text` method to return a raw content (including markup) between tags -- [#459]: Added a `Writer::write_bom()` method for inserting a Byte-Order-Mark into the document. +- [#458]: Added a `Writer::write_bom()` method for inserting a Byte-Order-Mark into the document. ### Bug Fixes @@ -180,7 +180,8 @@ - [#456]: Reader and writer stuff grouped under `reader` and `writer` modules. You still can use re-exported definitions from a crate root -- [#459]: Made the `Writer::write()` method non-public as writing random bytes to a document is not generally useful or desirable. +- [#458]: Made the `Writer::write()` method non-public as writing random bytes to a document is not generally useful or desirable. +- [#458]: BOM bytes are no longer emitted as `Event::Text`. To write a BOM, use `Writer::write_bom()`. 
### New Tests diff --git a/src/encoding.rs b/src/encoding.rs index 06520496..8c8717e7 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -152,7 +152,7 @@ fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], } #[cfg(feature = "encoding")] -fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] { +pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] { let (_, bytes) = split_at_bom(bytes, encoding); bytes } diff --git a/src/reader/parser.rs b/src/reader/parser.rs index 3403dbca..6338ee94 100644 --- a/src/reader/parser.rs +++ b/src/reader/parser.rs @@ -1,9 +1,7 @@ #[cfg(feature = "encoding")] use encoding_rs::UTF_8; -#[cfg(feature = "encoding")] -use crate::encoding::detect_encoding; -use crate::encoding::Decoder; +use crate::encoding::{self, Decoder}; use crate::errors::{Error, Result}; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; #[cfg(feature = "encoding")] @@ -68,23 +66,31 @@ impl Parser { /// /// [`Text`]: Event::Text pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result> { - #[cfg(feature = "encoding")] - if first && self.encoding.can_be_refined() { - if let Some(encoding) = detect_encoding(bytes) { - self.encoding = EncodingRef::BomDetected(encoding); - } - } + let mut content = bytes; - let content = if self.trim_text_end { + if self.trim_text_end { // Skip the ending '<' let len = bytes .iter() .rposition(|&b| !is_whitespace(b)) .map_or_else(|| bytes.len(), |p| p + 1); - &bytes[..len] - } else { - bytes - }; + content = &bytes[..len]; + } + + if first { + #[cfg(feature = "encoding")] + if self.encoding.can_be_refined() { + if let Some(encoding) = encoding::detect_encoding(bytes) { + self.encoding = EncodingRef::BomDetected(encoding); + content = encoding::remove_bom(content, encoding); + } + } + #[cfg(not(feature = "encoding"))] + if bytes.starts_with(encoding::UTF8_BOM) { + content = &bytes[encoding::UTF8_BOM.len()..]; 
+ } + } + Ok(Event::Text(BytesText::wrap(content, self.decoder()))) } diff --git a/tests/encodings.rs b/tests/encodings.rs index cd135ff0..496a8450 100644 --- a/tests/encodings.rs +++ b/tests/encodings.rs @@ -1,4 +1,6 @@ +#[allow(unused_imports)] use quick_xml::events::Event; +#[allow(unused_imports)] use quick_xml::Reader; #[cfg(feature = "encoding")] diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 83a47ebb..587f869b 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -51,6 +51,25 @@ fn html5() { ); } +#[test] +fn bom_removed_from_initial_text() { + let expected = r#" + |Characters(asdf) + |StartElement(paired [attr1="value1", attr2="value2"]) + |Characters(text) + |EndElement(paired) + |EndDocument + "#; + + // BOM right up against the text + test( + "\u{FEFF}asdf<paired attr1=\"value1\" attr2=\"value2\">text</paired>", + expected, + true, + ); +} + + #[test] fn escaped_characters() { test(