Skip to content

Commit

Permalink
Remove BOM from first-emitted text event
Browse files Browse the repository at this point in the history
  • Loading branch information
dralley committed Aug 18, 2022
1 parent 08d4a3a commit 340ea04
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 16 deletions.
1 change: 1 addition & 0 deletions Changelog.md
Expand Up @@ -181,6 +181,7 @@
You still can use re-exported definitions from a crate root

- [#459]: Made the `Writer::write()` method non-public as writing random bytes to a document is not generally useful or desirable.
- [#459]: BOM bytes are no longer emitted as `Event::Text`. To write a BOM, use `Writer::write_bom()`.

### New Tests

Expand Down
2 changes: 1 addition & 1 deletion src/encoding.rs
Expand Up @@ -155,7 +155,7 @@ fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8],
}

/// Returns `bytes` without its leading byte order mark.
///
/// Delegates to `split_at_bom` and keeps only the second half of the pair it
/// returns; the first half is discarded.
// NOTE(review): this excerpt is a rendered diff, so both the previous private
// signature and the new `pub(crate)` signature are listed below.
#[cfg(feature = "encoding")]
fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
// Only the remainder is of interest here, so the first element is ignored.
let (_, bytes) = split_at_bom(bytes, encoding);
bytes
}
Expand Down
3 changes: 2 additions & 1 deletion src/reader/mod.rs
Expand Up @@ -1582,12 +1582,13 @@ mod test {
}

// Reads the first event from a document that consists solely of a byte order
// mark (U+FEFF) and compares it with the expected event.
// NOTE(review): this excerpt is a rendered diff, so two expectation lines
// (`Event::Text(..)` and `Event::Eof`) are listed inside `assert_eq!`.
#[$test]
#[should_panic] // Failure is expected until read_until_open() is smart enough to skip over irrelevant text events.
$($async)? fn bom_at_start() {
// The whole input is a single U+FEFF character.
let mut reader = Reader::from_str("\u{feff}");

assert_eq!(
reader.$read_event($buf) $(.$await)? .unwrap(),
Event::Text(BytesText::from_escaped("\u{feff}").into())
Event::Eof
);
}

Expand Down
34 changes: 20 additions & 14 deletions src/reader/parser.rs
@@ -1,9 +1,7 @@
#[cfg(feature = "encoding")]
use encoding_rs::UTF_8;

#[cfg(feature = "encoding")]
use crate::encoding::detect_encoding;
use crate::encoding::Decoder;
use crate::encoding::{self, Decoder};
use crate::errors::{Error, Result};
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
#[cfg(feature = "encoding")]
Expand Down Expand Up @@ -70,23 +68,31 @@ impl Parser {
///
/// [`Text`]: Event::Text
pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result<Event<'b>> {
    // Wraps `bytes` into an `Event::Text`, after optionally trimming trailing
    // whitespace (`trim_text_end`) and, when `first` is set, dropping a
    // leading byte order mark so it never reaches the caller as text.
    let mut content = bytes;

    if self.trim_text_end {
        // Cut everything after the last non-whitespace byte. When no such
        // byte exists the slice is left at its full length.
        let len = bytes
            .iter()
            .rposition(|&b| !is_whitespace(b))
            .map_or_else(|| bytes.len(), |p| p + 1);
        content = &bytes[..len];
    }

    if first {
        #[cfg(feature = "encoding")]
        if self.encoding.can_be_refined() {
            // Detection looks at the raw input; the BOM is then removed from
            // the (possibly already trimmed) `content`.
            if let Some(encoding) = encoding::detect_encoding(bytes) {
                self.encoding = EncodingRef::BomDetected(encoding);
                content = encoding::remove_bom(content, encoding);
            }
        }
        #[cfg(not(feature = "encoding"))]
        if content.starts_with(encoding::UTF8_BOM) {
            // Slice `content`, not `bytes`: re-slicing the raw input here
            // would silently undo the trailing-whitespace trim done above.
            content = &content[encoding::UTF8_BOM.len()..];
        }
    }

    Ok(Event::Text(BytesText::wrap(content, self.decoder())))
}

Expand Down
2 changes: 2 additions & 0 deletions tests/encodings.rs
@@ -1,4 +1,6 @@
#[allow(unused_imports)]
use quick_xml::events::Event;
#[allow(unused_imports)]
use quick_xml::Reader;

#[cfg(feature = "encoding")]
Expand Down
18 changes: 18 additions & 0 deletions tests/xmlrs_reader_tests.rs
Expand Up @@ -51,6 +51,24 @@ fn html5() {
);
}

/// A byte order mark placed directly before the first text must not show up
/// in the reported characters: only `asdf` is expected for the first event.
#[test]
fn bom_removed_from_initial_text() {
    // U+FEFF immediately followed by text, with no whitespace in between.
    let document = "\u{FEFF}asdf<paired attr1=\"value1\" attr2=\"value2\">text</paired>";

    // Event listing the reader is expected to produce for `document`.
    let events = r#"
|Characters(asdf)
|StartElement(paired [attr1="value1", attr2="value2"])
|Characters(text)
|EndElement(paired)
|EndDocument
"#;

    test(document, events, true);
}

#[test]
fn escaped_characters() {
test(
Expand Down

0 comments on commit 340ea04

Please sign in to comment.