Skip to content

Commit

Permalink
Remove BOM from first-emitted text event
Browse files Browse the repository at this point in the history
  • Loading branch information
dralley committed Aug 17, 2022
1 parent b302b6f commit 6daeced
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 17 deletions.
5 changes: 3 additions & 2 deletions Changelog.md
Expand Up @@ -40,7 +40,7 @@
- [#450]: Added support for asynchronous [tokio](https://tokio.rs/) readers
- [#455]: Change return type of all `read_to_end*` methods to return a span between tags
- [#455]: Added `Reader::read_text` method to return raw content (including markup) between tags
- [#459]: Added a `Writer::write_bom()` method for inserting a Byte-Order-Mark into the document.
- [#458]: Added a `Writer::write_bom()` method for inserting a Byte-Order-Mark into the document.

### Bug Fixes

Expand Down Expand Up @@ -180,7 +180,8 @@
- [#456]: Reader and writer stuff grouped under `reader` and `writer` modules.
You can still use re-exported definitions from the crate root

- [#459]: Made the `Writer::write()` method non-public as writing random bytes to a document is not generally useful or desirable.
- [#458]: Made the `Writer::write()` method non-public as writing random bytes to a document is not generally useful or desirable.
- [#458]: BOM bytes are no longer emitted as `Event::Text`. To write a BOM, use `Writer::write_bom()`.

### New Tests

Expand Down
2 changes: 1 addition & 1 deletion src/encoding.rs
Expand Up @@ -152,7 +152,7 @@ fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8],
}

/// Returns the part of `bytes` that follows whatever [`split_at_bom`] splits
/// off the front for the given `encoding`.
///
/// Only the second element of the `split_at_bom` result is kept; the first
/// element (presumably the byte-order mark itself, judging by the name) is
/// discarded. The returned slice borrows from `bytes`, so nothing is copied.
#[cfg(feature = "encoding")]
fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
let (_, bytes) = split_at_bom(bytes, encoding);
bytes
}
Expand Down
34 changes: 20 additions & 14 deletions src/reader/parser.rs
@@ -1,9 +1,7 @@
#[cfg(feature = "encoding")]
use encoding_rs::UTF_8;

#[cfg(feature = "encoding")]
use crate::encoding::detect_encoding;
use crate::encoding::Decoder;
use crate::encoding::{self, Decoder};
use crate::errors::{Error, Result};
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
#[cfg(feature = "encoding")]
Expand Down Expand Up @@ -68,23 +66,31 @@ impl Parser {
///
/// [`Text`]: Event::Text
pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result<Event<'b>> {
    // `bytes` is the raw text span; `first` marks the first text event, the
    // only one for which BOM detection and removal are attempted.
    // `content` is narrowed step by step and is what ends up in the event.
    let mut content = bytes;

    if self.trim_text_end {
        // Cut after the last non-whitespace byte. When every byte is
        // whitespace, `rposition` finds nothing and the slice is kept whole.
        let len = bytes
            .iter()
            .rposition(|&b| !is_whitespace(b))
            .map_or_else(|| bytes.len(), |p| p + 1);
        content = &bytes[..len];
    }

    if first {
        #[cfg(feature = "encoding")]
        if self.encoding.can_be_refined() {
            // Detection runs on the untrimmed input, removal on `content`,
            // so the end-trimming above survives.
            if let Some(encoding) = encoding::detect_encoding(bytes) {
                self.encoding = EncodingRef::BomDetected(encoding);
                content = encoding::remove_bom(content, encoding);
            }
        }
        #[cfg(not(feature = "encoding"))]
        if content.starts_with(encoding::UTF8_BOM) {
            // Slice `content`, not `bytes`: re-slicing `bytes` here would
            // silently undo the trailing-whitespace trim performed above.
            content = &content[encoding::UTF8_BOM.len()..];
        }
    }

    Ok(Event::Text(BytesText::wrap(content, self.decoder())))
}

Expand Down
2 changes: 2 additions & 0 deletions tests/encodings.rs
@@ -1,4 +1,6 @@
#[allow(unused_imports)]
use quick_xml::events::Event;
#[allow(unused_imports)]
use quick_xml::Reader;

#[cfg(feature = "encoding")]
Expand Down
19 changes: 19 additions & 0 deletions tests/xmlrs_reader_tests.rs
Expand Up @@ -51,6 +51,25 @@ fn html5() {
);
}

#[test]
fn bom_removed_from_initial_text() {
    // Input document: a BOM right up against the text, then a paired element.
    let document = "\u{FEFF}asdf<paired attr1=\"value1\" attr2=\"value2\">text</paired>";

    // The first `Characters` event is expected to hold just `asdf`, i.e. the
    // BOM must not leak into the emitted text.
    test(
        document,
        r#"
|Characters(asdf)
|StartElement(paired [attr1="value1", attr2="value2"])
|Characters(text)
|EndElement(paired)
|EndDocument
"#,
        true,
    );
}


#[test]
fn escaped_characters() {
test(
Expand Down

0 comments on commit 6daeced

Please sign in to comment.