From b1a96701e31395bb57fd1204f94207f6ac47292a Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Sat, 13 Aug 2022 19:49:32 -0400 Subject: [PATCH 1/3] Remove StartText StartText would be out of place once all events are expected to contain UTF-8. Additionally the decoder implementation strips BOM bytes out of the bytestream so there's no good way to access them. --- Changelog.md | 11 ---- benches/microbenches.rs | 14 ----- src/de/mod.rs | 8 --- src/events/mod.rs | 122 ------------------------------------ src/reader/mod.rs | 32 ++++++---- src/reader/parser.rs | 15 +---- src/writer.rs | 1 - tests/test.rs | 4 +- tests/unit_tests.rs | 88 -------------------------- tests/xmlrs_reader_tests.rs | 5 -- 10 files changed, 25 insertions(+), 275 deletions(-) diff --git a/Changelog.md b/Changelog.md index f11a770c..e396154c 100644 --- a/Changelog.md +++ b/Changelog.md @@ -20,8 +20,6 @@ - [#180]: Make `Decoder` struct public. You already had access to it via the `Reader::decoder()` method, but could not name it in the code. Now the preferred way to access decoding functionality is via this struct -- [#191]: New event variant `StartText` emitted for bytes before the XML declaration - or a start comment or a tag. For streams with BOM this event will contain a BOM - [#395]: Add support for XML Schema `xs:list` - [#324]: `Reader::from_str` / `Deserializer::from_str` / `from_str` now ignore the XML declared encoding and always use UTF-8 @@ -100,15 +98,6 @@ `Decoder::decode()` and `Decoder::decode_with_bom_removal()`. Use `reader.decoder().decode_*(...)` instead of `reader.decode_*(...)` for now. `Reader::encoding()` is replaced by `Decoder::encoding()` as well -- [#191]: Remove poorly designed `BytesText::unescape_and_decode_without_bom()` and - `BytesText::unescape_and_decode_without_bom_with_custom_entities()`. Although these methods worked - as expected, this was only due to good luck. They was replaced by the - `BytesStartText::decode_with_bom_removal()`: - - conceptually, you should decode BOM only for the first `Text` event from the - reader (since now `StartText` event is emitted instead for this) - - text before the first tag is not an XML content at all, so it is meaningless - to try to unescape something in it - - [#180]: Eliminated the differences in the decoding API when feature `encoding` enabled and when it is disabled. Signatures of functions are now the same regardless of whether or not the feature is enabled, and an error will be returned instead of performing replacements for invalid characters diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 12ef0564..c8556fe4 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -118,20 +118,6 @@ fn read_resolved_event_into(c: &mut Criterion) { /// Benchmarks, how fast individual event parsed fn one_event(c: &mut Criterion) { let mut group = c.benchmark_group("One event"); - group.bench_function("StartText", |b| { - let src = "Hello world!".repeat(512 / 12); - b.iter(|| { - let mut r = Reader::from_str(&src); - let mut nbtxt = criterion::black_box(0); - r.check_end_names(false).check_comments(false); - match r.read_event() { - Ok(Event::StartText(e)) => nbtxt += e.len(), - something_else => panic!("Did not expect {:?}", something_else), - }; - - assert_eq!(nbtxt, 504); - }) - }); group.bench_function("Start", |b| { let src = format!(r#""#, "world".repeat(512 / 5)); diff --git a/src/de/mod.rs b/src/de/mod.rs index 5c90cb93..8927a4b3 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -929,10 +929,6 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader { let event = loop { let e = self.reader.read_event_into(&mut self.buf)?; match e { - //TODO: Probably not the best idea treat StartText as usual text - // Usually this event will represent a BOM - // Changing this requires review of the serde-de::top_level::one_element test - Event::StartText(e) => break Ok(DeEvent::Text(e.into_owned().into())), Event::Start(e) => break Ok(DeEvent::Start(e.into_owned())), Event::End(e) => break Ok(DeEvent::End(e.into_owned())), Event::Text(e) => break Ok(DeEvent::Text(e.into_owned())), @@ -974,10 +970,6 @@ impl<'de> XmlRead<'de> for SliceReader<'de> { loop { let e = self.reader.read_event()?; match e { - //TODO: Probably not the best idea treat StartText as usual text - // Usually this event will represent a BOM - // Changing this requires review of the serde-de::top_level::one_element test - Event::StartText(e) => break Ok(DeEvent::Text(e.into())), Event::Start(e) => break Ok(DeEvent::Start(e)), Event::End(e) => break Ok(DeEvent::End(e)), Event::Text(e) => break Ok(DeEvent::Text(e)), diff --git a/src/events/mod.rs b/src/events/mod.rs index a70f6a55..9ab063ba 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -50,69 +50,6 @@ use crate::name::{LocalName, QName}; use crate::utils::write_cow_string; use attributes::{Attribute, Attributes}; -/// Text that appeared before an XML declaration, a start element or a comment. -/// -/// In well-formed XML it could contain a Byte-Order-Mark (BOM). If this event -/// contains something else except BOM, the XML should be considered ill-formed. -/// -/// This is a reader-only event. If you need to write a text before the first tag, -/// use the [`BytesText`] event. -#[derive(Debug, Clone, Eq, PartialEq)] -pub struct BytesStartText<'a> { - content: BytesText<'a>, -} - -impl<'a> BytesStartText<'a> { - /// Converts the event into an owned event. - pub fn into_owned(self) -> BytesStartText<'static> { - BytesStartText { - content: self.content.into_owned(), - } - } - - /// Extracts the inner `Cow` from the `BytesStartText` event container. - #[inline] - pub fn into_inner(self) -> Cow<'a, [u8]> { - self.content.into_inner() - } - - /// Converts the event into a borrowed event. - #[inline] - pub fn borrow(&self) -> BytesStartText { - BytesStartText { - content: self.content.borrow(), - } - } - - /// Decodes bytes of event, stripping byte order mark (BOM) if it is presented - /// in the event. - /// - /// This method does not unescapes content, because no escape sequences can - /// appeared in the BOM or in the text before the first tag. - pub fn decode_with_bom_removal(&self) -> Result { - //TODO: Fix lifetime issue - it should be possible to borrow string - let decoded = self.content.decoder.decode_with_bom_removal(&*self)?; - - Ok(decoded.to_string()) - } -} - -impl<'a> Deref for BytesStartText<'a> { - type Target = BytesText<'a>; - - fn deref(&self) -> &Self::Target { - &self.content - } -} - -impl<'a> From> for BytesStartText<'a> { - fn from(content: BytesText<'a>) -> Self { - Self { content } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - /// Opening tag data (`Event::Start`), with optional attributes. /// /// ``. @@ -797,12 +734,6 @@ impl<'a> Deref for BytesText<'a> { } } -impl<'a> From> for BytesText<'a> { - fn from(content: BytesStartText<'a>) -> Self { - content.content - } -} - //////////////////////////////////////////////////////////////////////////////////////////////////// /// CDATA content contains unescaped data from the reader. If you want to write them as a text, @@ -941,56 +872,6 @@ impl<'a> Deref for BytesCData<'a> { /// [`Reader::read_event_into`]: crate::reader::Reader::read_event_into #[derive(Clone, Debug, Eq, PartialEq)] pub enum Event<'a> { - /// Text that appeared before the first opening tag or an [XML declaration]. - /// [According to the XML standard][std], no text allowed before the XML - /// declaration. However, if there is a BOM in the stream, some data may be - /// present. - /// - /// When this event is generated, it is the very first event emitted by the - /// [`Reader`], and there can be the only one such event. - /// - /// The [`Writer`] writes content of this event "as is" without encoding or - /// escaping. If you write it, it should be written first and only one time - /// (but writer does not enforce that). - /// - /// # Examples - /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use std::borrow::Cow; - /// use quick_xml::events::Event; - /// use quick_xml::reader::Reader; - /// - /// // XML in UTF-8 with BOM - /// let xml = b"\xEF\xBB\xBF".as_ref(); - /// let mut reader = Reader::from_reader(xml); - /// let mut buf = Vec::new(); - /// let mut events_processed = 0; - /// loop { - /// match reader.read_event_into(&mut buf) { - /// Ok(Event::StartText(e)) => { - /// assert_eq!(events_processed, 0); - /// // Content contains BOM - /// assert_eq!(e.into_inner(), Cow::Borrowed(b"\xEF\xBB\xBF")); - /// } - /// Ok(Event::Decl(_)) => { - /// assert_eq!(events_processed, 1); - /// } - /// Ok(Event::Eof) => { - /// assert_eq!(events_processed, 2); - /// break; - /// } - /// e => panic!("Unexpected event {:?}", e), - /// } - /// events_processed += 1; - /// } - /// ``` - /// - /// [XML declaration]: Event::Decl - /// [std]: https://www.w3.org/TR/xml11/#NT-document - /// [`Reader`]: crate::reader::Reader - /// [`Writer`]: crate::writer::Writer - StartText(BytesStartText<'a>), /// Start tag (with attributes) ``. Start(BytesStart<'a>), /// End tag ``. @@ -1018,7 +899,6 @@ impl<'a> Event<'a> { /// buffer used when reading but incurring a new, separate allocation. pub fn into_owned(self) -> Event<'static> { match self { - Event::StartText(e) => Event::StartText(e.into_owned()), Event::Start(e) => Event::Start(e.into_owned()), Event::End(e) => Event::End(e.into_owned()), Event::Empty(e) => Event::Empty(e.into_owned()), @@ -1036,7 +916,6 @@ impl<'a> Event<'a> { #[inline] pub fn borrow(&self) -> Event { match self { - Event::StartText(e) => Event::StartText(e.borrow()), Event::Start(e) => Event::Start(e.borrow()), Event::End(e) => Event::End(e.borrow()), Event::Empty(e) => Event::Empty(e.borrow()), @@ -1056,7 +935,6 @@ impl<'a> Deref for Event<'a> { fn deref(&self) -> &[u8] { match *self { - Event::StartText(ref e) => &*e, Event::Start(ref e) | Event::Empty(ref e) => &*e, Event::End(ref e) => &*e, Event::Text(ref e) => &*e, diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 0c8ee798..920f2aca 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -286,7 +286,7 @@ pub type Span = Range; /// subgraph _ /// direction LR /// -/// Init -- "(no event)"\nStartText --> OpenedTag +/// Init -- "(no event)"\n --> OpenedTag /// OpenedTag -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> ClosedTag /// ClosedTag -- "#lt;false#gt;\n(no event)"\nText --> OpenedTag /// end @@ -297,13 +297,13 @@ pub type Span = Range; #[derive(Clone)] enum ParseState { /// Initial state in which reader stay after creation. Transition from that - /// state could produce a `StartText`, `Decl`, `Comment` or `Start` event. - /// The next state is always `OpenedTag`. The reader will never return to this - /// state. The event emitted during transition to `OpenedTag` is a `StartEvent` - /// if the first symbol not `<`, otherwise no event are emitted. + /// state could produce a `Text`, `Decl`, `Comment` or `Start` event. The next + /// state is always `OpenedTag`. The reader will never return to this state. The + /// event emitted during transition to `OpenedTag` is a `StartEvent` if the + /// first symbol not `<`, otherwise no event are emitted. Init, /// State after seeing the `<` symbol. Depending on the next symbol all other - /// events (except `StartText`) could be generated. + /// events could be generated. /// /// After generating one event the reader moves to the `ClosedTag` state. OpenedTag, @@ -552,9 +552,7 @@ impl Reader { read_event_impl!(self, buf, read_until_open, read_until_close) } - /// Read until '<' is found and moves reader to an `OpenedTag` state. - /// - /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise + /// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event. fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result> where R: XmlSource<'i, B>, @@ -1574,12 +1572,22 @@ mod test { use pretty_assertions::assert_eq; #[$test] - $($async)? fn start_text() { - let mut reader = Reader::from_str("bom"); + $($async)? fn text_at_start() { + let mut reader = Reader::from_str("text"); + + assert_eq!( + reader.$read_event($buf) $(.$await)? .unwrap(), + Event::Text(BytesText::from_escaped("text").into()) + ); + } + + #[$test] + $($async)? fn bom_at_start() { + let mut reader = Reader::from_str("\u{feff}"); assert_eq!( reader.$read_event($buf) $(.$await)? .unwrap(), - Event::StartText(BytesText::from_escaped("bom").into()) + Event::Text(BytesText::from_escaped("\u{feff}").into()) ); } diff --git a/src/reader/parser.rs b/src/reader/parser.rs index baeacda1..f2747c90 100644 --- a/src/reader/parser.rs +++ b/src/reader/parser.rs @@ -61,17 +61,13 @@ pub(super) struct Parser { } impl Parser { - /// Trims whitespaces from `bytes`, if required, and returns a [`StartText`] - /// or a [`Text`] event. When [`StartText`] is returned, the method can change - /// the encoding of the reader, detecting it from the beginning of the stream. + /// Trims whitespaces from `bytes`, if required, and returns a [`Text`] event. /// /// # Parameters /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<` /// - `first`: if `true`, then this is the first call of that function, - /// i. e. data from the start of stream and [`StartText`] will be returned, - /// otherwise [`Text`] will be returned + /// i. e. data from the start of stream and the bytes will be checked for a BOM. /// - /// [`StartText`]: Event::StartText /// [`Text`]: Event::Text pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result> { #[cfg(feature = "encoding")] @@ -91,12 +87,7 @@ impl Parser { } else { bytes }; - - Ok(if first { - Event::StartText(BytesText::wrap(content, self.decoder()).into()) - } else { - Event::Text(BytesText::wrap(content, self.decoder())) - }) + Ok(Event::Text(BytesText::wrap(content, self.decoder()))) } /// reads `BytesElement` starting with a `!`, diff --git a/src/writer.rs b/src/writer.rs index af3f18c1..7592e70b 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -90,7 +90,6 @@ impl Writer { pub fn write_event<'a, E: AsRef>>(&mut self, event: E) -> Result<()> { let mut next_should_line_break = true; let result = match *event.as_ref() { - Event::StartText(ref e) => self.write(&e), Event::Start(ref e) => { let result = self.write_wrapped(b"<", e, b">"); if let Some(i) = self.indent.as_mut() { diff --git a/tests/test.rs b/tests/test.rs index 5333bafc..32b2a6f8 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -111,7 +111,7 @@ fn test_issue94() { fn test_no_trim() { let mut reader = Reader::from_str(" text "); - assert!(matches!(reader.read_event().unwrap(), StartText(_))); + assert!(matches!(reader.read_event().unwrap(), Text(_))); assert!(matches!(reader.read_event().unwrap(), Start(_))); assert!(matches!(reader.read_event().unwrap(), Text(_))); assert!(matches!(reader.read_event().unwrap(), End(_))); @@ -123,7 +123,7 @@ fn test_trim_end() { let mut reader = Reader::from_str(" text "); reader.trim_text_end(true); - assert!(matches!(reader.read_event().unwrap(), StartText(_))); + assert!(matches!(reader.read_event().unwrap(), Text(_))); assert!(matches!(reader.read_event().unwrap(), Start(_))); assert!(matches!(reader.read_event().unwrap(), Text(_))); assert!(matches!(reader.read_event().unwrap(), End(_))); diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index e98328d8..c269886a 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -41,7 +41,6 @@ macro_rules! next_eq_content { } macro_rules! next_eq { - ($r:expr, StartText, $bytes:expr) => (next_eq_content!($r, StartText, $bytes);); ($r:expr, Start, $bytes:expr) => (next_eq_name!($r, Start, $bytes);); ($r:expr, End, $bytes:expr) => (next_eq_name!($r, End, $bytes);); ($r:expr, Empty, $bytes:expr) => (next_eq_name!($r, Empty, $bytes);); @@ -705,90 +704,3 @@ fn test_closing_bracket_in_single_quote_mixed() { } next_eq!(r, End, b"a"); } - -mod decode_with_bom_removal { - use super::*; - use pretty_assertions::assert_eq; - - #[test] - #[cfg(not(feature = "encoding"))] - fn removes_utf8_bom() { - let input: &str = std::str::from_utf8(b"\xEF\xBB\xBF").unwrap(); - - let mut reader = Reader::from_str(&input); - reader.trim_text(true); - - let mut txt = Vec::new(); - - loop { - match reader.read_event() { - Ok(StartText(e)) => txt.push(e.decode_with_bom_removal().unwrap()), - Ok(Eof) => break, - _ => (), - } - } - assert_eq!(txt, vec![""]); - } - - /// Test is disabled: the input started with `[FE FF 00 3C 00 3F ...]` and currently - /// quick-xml captures `[FE FF 00]` as a `StartText` event, because it is stopped - /// at byte `<` (0x3C). That sequence represents UTF-16 BOM (=BE) and a first byte - /// of the `<` symbol, encoded in UTF-16 BE (`00 3C`). - #[test] - #[cfg(feature = "encoding")] - #[ignore = "Non-ASCII compatible encodings not properly supported yet. See https://github.com/tafia/quick-xml/issues/158"] - fn removes_utf16be_bom() { - let mut reader = Reader::from_reader(include_bytes!("./documents/utf16be.xml").as_ref()); - reader.trim_text(true); - - let mut buf = Vec::new(); - let mut txt = Vec::new(); - - loop { - match reader.read_event_into(&mut buf) { - Ok(StartText(e)) => txt.push(e.decode_with_bom_removal().unwrap()), - Ok(Eof) => break, - _ => (), - } - } - assert_eq!(Some(txt[0].as_ref()), Some("")); - } - - #[test] - #[cfg(feature = "encoding")] - fn removes_utf16le_bom() { - let mut reader = Reader::from_reader(include_bytes!("./documents/utf16le.xml").as_ref()); - reader.trim_text(true); - let mut buf = Vec::new(); - let mut txt = Vec::new(); - - loop { - match reader.read_event_into(&mut buf) { - Ok(StartText(e)) => txt.push(e.decode_with_bom_removal().unwrap()), - Ok(Eof) => break, - _ => (), - } - } - assert_eq!(Some(txt[0].as_ref()), Some("")); - } - - #[test] - #[cfg(not(feature = "encoding"))] - fn does_nothing_if_no_bom_exists() { - let input: &str = std::str::from_utf8(b"").unwrap(); - - let mut reader = Reader::from_str(&input); - reader.trim_text(true); - - let mut txt = Vec::new(); - - loop { - match reader.read_event() { - Ok(StartText(e)) => txt.push(e.decode_with_bom_removal().unwrap()), - Ok(Eof) => break, - _ => (), - } - } - assert!(txt.is_empty()); - } -} diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 0bffc682..83a47ebb 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -374,11 +374,6 @@ fn test_bytes(input: &[u8], output: &[u8], trim: bool) { let mut decoder = reader.decoder(); loop { let line = match reader.read_resolved_event() { - Ok((_, Event::StartText(_))) => { - // BOM could change decoder - decoder = reader.decoder(); - "StartText".to_string() - } Ok((_, Event::Decl(e))) => { // Declaration could change decoder decoder = reader.decoder(); From 08d4a3a814f85862e8cb4eb8a6d3d4b237af4ada Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Tue, 16 Aug 2022 21:16:23 -0400 Subject: [PATCH 2/3] Add a write_bom() method to the Writer --- Changelog.md | 8 +++++--- src/encoding.rs | 26 +++++++++++++++++++------- src/writer.rs | 41 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 63 insertions(+), 12 deletions(-) diff --git a/Changelog.md b/Changelog.md index e396154c..5410e901 100644 --- a/Changelog.md +++ b/Changelog.md @@ -40,7 +40,7 @@ - [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers - [#455]: Change return type of all `read_to_end*` methods to return a span between tags - [#455]: Added `Reader::read_text` method to return a raw content (including markup) between tags - +- [#459]: Added a `Writer::write_bom()` method for inserting a Byte-Order-Mark into the document. ### Bug Fixes @@ -175,11 +175,13 @@ - [#440]: Removed `Deserializer::from_slice` and `quick_xml::de::from_slice` methods because deserializing from a byte array cannot guarantee borrowing due to possible copying while decoding. -- [#455]: Removed `Reader::read_text_into` which is only not a better wrapper over match on `Event::Text` +- [#455]: Removed `Reader::read_text_into` which is just a thin wrapper over match on `Event::Text` - [#456]: Reader and writer stuff grouped under `reader` and `writer` modules. You still can use re-exported definitions from a crate root +- [#459]: Made the `Writer::write()` method non-public as writing random bytes to a document is not generally useful or desirable. + ### New Tests - [#9]: Added tests for incorrect nested tags in input @@ -223,7 +225,7 @@ [#450]: https://github.com/tafia/quick-xml/pull/450 [#455]: https://github.com/tafia/quick-xml/pull/455 [#456]: https://github.com/tafia/quick-xml/pull/456 - +[#459]: https://github.com/tafia/quick-xml/pull/459 ## 0.23.0 -- 2022-05-08 diff --git a/src/encoding.rs b/src/encoding.rs index 3cfd12fd..c2b66676 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -9,6 +9,18 @@ use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8}; use crate::Error; use crate::Result; +/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8. +/// See +pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF]; +/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with little-endian byte order. +/// See +#[cfg(feature = "encoding")] +pub(crate) const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE]; +/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with big-endian byte order. +/// See +#[cfg(feature = "encoding")] +pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF]; + /// Decoder of byte slices into strings. /// /// If feature `encoding` is enabled, this encoding taken from the `"encoding"` @@ -62,7 +74,7 @@ impl Decoder { /// /// If you instead want to use XML declared encoding, use the `encoding` feature pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> { - let bytes = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) { + let bytes = if bytes.starts_with(UTF8_BOM) { &bytes[3..] } else { bytes @@ -131,11 +143,11 @@ pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result> { #[cfg(feature = "encoding")] fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) { - if encoding == UTF_8 && bytes.starts_with(&[0xEF, 0xBB, 0xBF]) { + if encoding == UTF_8 && bytes.starts_with(UTF8_BOM) { bytes.split_at(3) - } else if encoding == UTF_16LE && bytes.starts_with(&[0xFF, 0xFE]) { + } else if encoding == UTF_16LE && bytes.starts_with(UTF16_LE_BOM) { bytes.split_at(2) - } else if encoding == UTF_16BE && bytes.starts_with(&[0xFE, 0xFF]) { + } else if encoding == UTF_16BE && bytes.starts_with(UTF16_BE_BOM) { bytes.split_at(2) } else { (&[], bytes) @@ -172,9 +184,9 @@ fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] { pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> { match bytes { // with BOM - _ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE), - _ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE), - _ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8), + _ if bytes.starts_with(UTF16_BE_BOM) => Some(UTF_16BE), + _ if bytes.starts_with(UTF16_LE_BOM) => Some(UTF_16LE), + _ if bytes.starts_with(UTF8_BOM) => Some(UTF_8), // without BOM _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2 diff --git a/src/writer.rs b/src/writer.rs index 7592e70b..641d632b 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -1,8 +1,10 @@ //! Contains high-level interface for an events-based XML emitter. +use std::io::Write; + +use crate::encoding::UTF8_BOM; use crate::errors::{Error, Result}; use crate::events::{attributes::Attribute, BytesCData, BytesStart, BytesText, Event}; -use std::io::Write; /// XML writer. /// @@ -86,6 +88,40 @@ impl Writer { &mut self.writer } + /// Write a [Byte-Order-Mark] character to the document. + /// + /// # Example + /// + /// ```rust + /// # use quick_xml::Result; + /// # fn main() -> Result<()> { + /// use quick_xml::events::{BytesStart, BytesText, Event}; + /// use quick_xml::writer::Writer; + /// use quick_xml::Error; + /// use std::io::Cursor; + /// + /// let mut buffer = Vec::new(); + /// let mut writer = Writer::new_with_indent(&mut buffer, b' ', 4); + /// + /// writer.write_bom()?; + /// writer + /// .create_element("empty") + /// .with_attribute(("attr1", "value1")) + /// .write_empty() + /// .expect("failure"); + /// + /// assert_eq!( + /// std::str::from_utf8(&buffer).unwrap(), + /// "\u{FEFF}" + /// ); + /// # Ok(()) + /// # } + /// ``` + /// [Byte-Order-Mark]: https://unicode.org/faq/utf_bom.html#BOM + pub fn write_bom(&mut self) -> Result<()> { + self.write(UTF8_BOM) + } + /// Writes the given event to the underlying writer. pub fn write_event<'a, E: AsRef>>(&mut self, event: E) -> Result<()> { let mut next_should_line_break = true; @@ -128,7 +164,7 @@ impl Writer { /// Writes bytes #[inline] - pub fn write(&mut self, value: &[u8]) -> Result<()> { + pub(crate) fn write(&mut self, value: &[u8]) -> Result<()> { self.writer.write_all(value).map_err(Error::Io) } @@ -502,6 +538,7 @@ mod indentation { "# ); } + #[test] fn element_writer_empty() { let mut buffer = Vec::new(); From 340ea044b4aeb929111614e067fdb16dbc75750b Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Tue, 16 Aug 2022 00:19:30 -0400 Subject: [PATCH 3/3] Remove BOM from first-emitted text event --- Changelog.md | 1 + src/encoding.rs | 2 +- src/reader/mod.rs | 3 ++- src/reader/parser.rs | 34 ++++++++++++++++++++-------------- tests/encodings.rs | 2 ++ tests/xmlrs_reader_tests.rs | 18 ++++++++++++++++++ 6 files changed, 44 insertions(+), 16 deletions(-) diff --git a/Changelog.md b/Changelog.md index 5410e901..39c84a52 100644 --- a/Changelog.md +++ b/Changelog.md @@ -181,6 +181,7 @@ You still can use re-exported definitions from a crate root - [#459]: Made the `Writer::write()` method non-public as writing random bytes to a document is not generally useful or desirable. +- [#459]: BOM bytes are no longer emitted as `Event::Text`. To write a BOM, use `Writer::write_bom()`. ### New Tests diff --git a/src/encoding.rs b/src/encoding.rs index c2b66676..673e4012 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -155,7 +155,7 @@ fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], } #[cfg(feature = "encoding")] -fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] { +pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] { let (_, bytes) = split_at_bom(bytes, encoding); bytes } diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 920f2aca..77fa8fd9 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -1582,12 +1582,13 @@ mod test { } #[$test] + #[should_panic] // Failure is expected until read_until_open() is smart enough to skip over irrelevant text events. $($async)? fn bom_at_start() { let mut reader = Reader::from_str("\u{feff}"); assert_eq!( reader.$read_event($buf) $(.$await)? .unwrap(), - Event::Text(BytesText::from_escaped("\u{feff}").into()) + Event::Eof ); } diff --git a/src/reader/parser.rs b/src/reader/parser.rs index f2747c90..158272df 100644 --- a/src/reader/parser.rs +++ b/src/reader/parser.rs @@ -1,9 +1,7 @@ #[cfg(feature = "encoding")] use encoding_rs::UTF_8; -#[cfg(feature = "encoding")] -use crate::encoding::detect_encoding; -use crate::encoding::Decoder; +use crate::encoding::{self, Decoder}; use crate::errors::{Error, Result}; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; #[cfg(feature = "encoding")] @@ -70,23 +68,31 @@ impl Parser { /// /// [`Text`]: Event::Text pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result> { - #[cfg(feature = "encoding")] - if first && self.encoding.can_be_refined() { - if let Some(encoding) = detect_encoding(bytes) { - self.encoding = EncodingRef::BomDetected(encoding); - } - } + let mut content = bytes; - let content = if self.trim_text_end { + if self.trim_text_end { // Skip the ending '<' let len = bytes .iter() .rposition(|&b| !is_whitespace(b)) .map_or_else(|| bytes.len(), |p| p + 1); - &bytes[..len] - } else { - bytes - }; + content = &bytes[..len]; + } + + if first { + #[cfg(feature = "encoding")] + if self.encoding.can_be_refined() { + if let Some(encoding) = encoding::detect_encoding(bytes) { + self.encoding = EncodingRef::BomDetected(encoding); + content = encoding::remove_bom(content, encoding); + } + } + #[cfg(not(feature = "encoding"))] + if bytes.starts_with(encoding::UTF8_BOM) { + content = &bytes[encoding::UTF8_BOM.len()..]; + } + } + Ok(Event::Text(BytesText::wrap(content, self.decoder()))) } diff --git a/tests/encodings.rs b/tests/encodings.rs index cd135ff0..496a8450 100644 --- a/tests/encodings.rs +++ b/tests/encodings.rs @@ -1,4 +1,6 @@ +#[allow(unused_imports)] use quick_xml::events::Event; +#[allow(unused_imports)] use quick_xml::Reader; #[cfg(feature = "encoding")] diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 83a47ebb..cba1305b 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -51,6 +51,24 @@ fn html5() { ); } +#[test] +fn bom_removed_from_initial_text() { + let expected = r#" + |Characters(asdf) + |StartElement(paired [attr1="value1", attr2="value2"]) + |Characters(text) + |EndElement(paired) + |EndDocument + "#; + + // BOM right up against the text + test( + "\u{FEFF}asdftext", + expected, + true, + ); +} + #[test] fn escaped_characters() { test(