Skip to content

Commit

Permalink
Merge pull request #399 from Mingun/bom
Browse files Browse the repository at this point in the history
Rework BOM handling and encoding API
  • Loading branch information
Mingun committed Jun 20, 2022
2 parents 3b37c0e + d49a4b8 commit 8fa6f1e
Show file tree
Hide file tree
Showing 15 changed files with 1,002 additions and 945 deletions.
34 changes: 34 additions & 0 deletions Changelog.md
Expand Up @@ -17,6 +17,11 @@
- [#393]: New module `name` with `QName`, `LocalName`, `Namespace`, `Prefix`
and `PrefixDeclaration` wrappers around byte arrays and `ResolveResult` with
the result of namespace resolution
- [#180]: Make `Decoder` struct public. You already had access to it via the
`Reader::decoder()` method, but could not name it in the code. Now the preferred
way to access decoding functionality is via this struct
- [#191]: New event variant `StartText` emitted for bytes before the XML declaration
or a start comment or a tag. For streams with BOM this event will contain a BOM

### Bug Fixes

Expand Down Expand Up @@ -56,6 +61,33 @@
- [#393]: Now `BytesStart::name()` and `BytesEnd::name()` returns `QName`, and
`BytesStart::local_name()` and `BytesEnd::local_name()` returns `LocalName`

- [#191]: Remove unused `reader.decoder().decode_owned()`. If you ever used it,
use `String::from_utf8` instead (which that function did)
- [#191]: Remove `*_without_bom` methods from the `Attributes` struct because they are useless.
Use the same-named methods without that suffix instead. Attribute values cannot contain BOM
- [#191]: Remove `Reader::decode()` and `Reader::decode_without_bom()`, they are replaced by
`Decoder::decode()` and `Decoder::decode_with_bom_removal()`.
Use `reader.decoder().decode_*(...)` instead of `reader.decode_*(...)` for now.
`Reader::encoding()` is replaced by `Decoder::encoding()` as well
- [#191]: Remove poorly designed `BytesText::unescape_and_decode_without_bom()` and
`BytesText::unescape_and_decode_without_bom_with_custom_entities()`. Although these methods worked
as expected, this was only due to good luck. They was replaced by the
`BytesStartText::decode_with_bom_removal()`:
- conceptually, you should decode BOM only for the first `Text` event from the
reader (since now `StartText` event is emitted instead for this)
- text before the first tag is not an XML content at all, so it is meaningless
to try to unescape something in it

- [#180]: Eliminated the differences in the decoding API when feature `encoding` enabled and when it is
disabled. Signatures of functions are now the same regardless of whether or not the feature is
enabled, and an error will be returned instead of performing replacements for invalid characters
in both cases.

Previously, if the `encoding` feature was enabled, decoding functions would return `Result<Cow<&str>>`
while without this feature they would return `Result<&str>`. With this change, only `Result<Cow<&str>>`
is returned regardless of the status of the feature.
- [#180]: Error variant `Error::Utf8` replaced by `Error::NonDecodable`

### New Tests

- [#9]: Added tests for incorrect nested tags in input
Expand All @@ -66,6 +98,8 @@

[#8]: https://github.com/Mingun/fast-xml/pull/8
[#9]: https://github.com/Mingun/fast-xml/pull/9
[#180]: https://github.com/tafia/quick-xml/issues/180
[#191]: https://github.com/tafia/quick-xml/issues/191
[#363]: https://github.com/tafia/quick-xml/issues/363
[#387]: https://github.com/tafia/quick-xml/pull/387
[#391]: https://github.com/tafia/quick-xml/pull/391
Expand Down
7 changes: 4 additions & 3 deletions benches/bench.rs
@@ -1,6 +1,7 @@
use criterion::{self, criterion_group, criterion_main, Criterion};
use pretty_assertions::assert_eq;
use quick_xml::events::Event;
use quick_xml::name::QName;
use quick_xml::Reader;

static SAMPLE: &[u8] = include_bytes!("../tests/sample_rss.xml");
Expand Down Expand Up @@ -173,15 +174,15 @@ fn bytes_text_unescaped(c: &mut Criterion) {
/// Benchmarks, how fast individual event parsed
fn one_event(c: &mut Criterion) {
let mut group = c.benchmark_group("One event");
group.bench_function("Text", |b| {
group.bench_function("StartText", |b| {
let src = "Hello world!".repeat(512 / 12).into_bytes();
let mut buf = Vec::with_capacity(1024);
b.iter(|| {
let mut r = Reader::from_reader(src.as_ref());
let mut nbtxt = criterion::black_box(0);
r.check_end_names(false).check_comments(false);
match r.read_event(&mut buf) {
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::StartText(e)) => nbtxt += e.unescaped().unwrap().len(),
something_else => panic!("Did not expect {:?}", something_else),
};

Expand Down Expand Up @@ -310,7 +311,7 @@ fn attributes(c: &mut Criterion) {
let mut buf = Vec::new();
loop {
match r.read_event(&mut buf) {
Ok(Event::Empty(e)) if e.name() == b"player" => {
Ok(Event::Empty(e)) if e.name() == QName(b"player") => {
for name in ["num", "status", "avg"] {
if let Some(_attr) = e.try_get_attribute(name).unwrap() {
count += 1
Expand Down
7 changes: 0 additions & 7 deletions src/de/escape.rs
Expand Up @@ -45,12 +45,8 @@ macro_rules! deserialize_num {
where
V: Visitor<'de>,
{
#[cfg(not(feature = "encoding"))]
let value = self.decoder.decode(self.escaped_value.as_ref())?.parse()?;

#[cfg(feature = "encoding")]
let value = self.decoder.decode(self.escaped_value.as_ref()).parse()?;

visitor.$visit(value)
}
};
Expand All @@ -71,11 +67,8 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
V: Visitor<'de>,
{
let unescaped = self.unescaped()?;
#[cfg(not(feature = "encoding"))]
let value = self.decoder.decode(&unescaped)?;

#[cfg(feature = "encoding")]
let value = self.decoder.decode(&unescaped);
visitor.visit_str(&value)
}

Expand Down
12 changes: 10 additions & 2 deletions src/de/mod.rs
Expand Up @@ -337,7 +337,7 @@ where
{
#[cfg(feature = "encoding")]
{
let value = decoder.decode(value);
let value = decoder.decode(value)?;
// No need to unescape because valid boolean representations cannot be escaped
match value.as_ref() {
"true" | "1" | "True" | "TRUE" | "t" | "Yes" | "YES" | "yes" | "y" => {
Expand Down Expand Up @@ -624,7 +624,7 @@ where
allow_start: bool,
) -> Result<BytesCData<'de>, DeError> {
match self.next()? {
DeEvent::Text(e) if unescape => e.unescape().map_err(|e| DeError::InvalidXml(e.into())),
DeEvent::Text(e) if unescape => e.unescape().map_err(Into::into),
DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())),
DeEvent::CData(e) => Ok(e),
DeEvent::Start(e) if allow_start => {
Expand Down Expand Up @@ -952,6 +952,10 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
let event = loop {
let e = self.reader.read_event(&mut self.buf)?;
match e {
//TODO: Probably not the best idea treat StartText as usual text
// Usually this event will represent a BOM
// Changing this requires review of the serde-de::top_level::one_element test
Event::StartText(e) => break Ok(DeEvent::Text(e.into_owned().into())),
Event::Start(e) => break Ok(DeEvent::Start(e.into_owned())),
Event::End(e) => break Ok(DeEvent::End(e.into_owned())),
Event::Text(e) => break Ok(DeEvent::Text(e.into_owned())),
Expand Down Expand Up @@ -992,6 +996,10 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
loop {
let e = self.reader.read_event_unbuffered()?;
match e {
//TODO: Probably not the best idea treat StartText as usual text
// Usually this event will represent a BOM
// Changing this requires review of the serde-de::top_level::one_element test
Event::StartText(e) => break Ok(DeEvent::Text(e.into())),
Event::Start(e) => break Ok(DeEvent::Start(e)),
Event::End(e) => break Ok(DeEvent::End(e)),
Event::Text(e) => break Ok(DeEvent::Text(e)),
Expand Down
8 changes: 1 addition & 7 deletions src/de/seq.rs
Expand Up @@ -2,8 +2,6 @@ use crate::de::{DeError, DeEvent, Deserializer, XmlRead};
use crate::events::BytesStart;
use crate::reader::Decoder;
use serde::de::{DeserializeSeed, SeqAccess};
#[cfg(not(feature = "encoding"))]
use std::borrow::Cow;

/// Check if tag `start` is included in the `fields` list. `decoder` is used to
/// get a string representation of a tag.
Expand All @@ -14,11 +12,7 @@ pub fn not_in(
start: &BytesStart,
decoder: Decoder,
) -> Result<bool, DeError> {
#[cfg(not(feature = "encoding"))]
let tag = Cow::Borrowed(decoder.decode(start.name().into_inner())?);

#[cfg(feature = "encoding")]
let tag = decoder.decode(start.name().into_inner());
let tag = decoder.decode(start.name().into_inner())?;

Ok(fields.iter().all(|&field| field != tag.as_ref()))
}
Expand Down
52 changes: 40 additions & 12 deletions src/errors.rs
Expand Up @@ -4,14 +4,16 @@ use crate::escape::EscapeError;
use crate::events::attributes::AttrError;
use crate::utils::write_byte_string;
use std::str::Utf8Error;
use std::string::FromUtf8Error;

/// The error type used by this crate.
#[derive(Debug)]
pub enum Error {
/// IO error
Io(::std::io::Error),
/// Utf8 error
Utf8(Utf8Error),
/// Input decoding error. If `encoding` feature is disabled, contains `None`,
/// otherwise contains the UTF-8 decoding error
NonDecodable(Option<Utf8Error>),
/// Unexpected End of File
UnexpectedEof(String),
/// End event mismatch
Expand Down Expand Up @@ -46,10 +48,18 @@ impl From<::std::io::Error> for Error {
}

impl From<Utf8Error> for Error {
/// Creates a new `Error::Utf8` from the given error
/// Creates a new `Error::NonDecodable` from the given error
#[inline]
fn from(error: Utf8Error) -> Error {
Error::Utf8(error)
Error::NonDecodable(Some(error))
}
}

impl From<FromUtf8Error> for Error {
/// Creates a new `Error::Utf8` from the given error
#[inline]
fn from(error: FromUtf8Error) -> Error {
error.utf8_error().into()
}
}

Expand Down Expand Up @@ -77,7 +87,8 @@ impl std::fmt::Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Error::Io(e) => write!(f, "I/O error: {}", e),
Error::Utf8(e) => write!(f, "UTF8 error: {}", e),
Error::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"),
Error::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e),
Error::UnexpectedEof(e) => write!(f, "Unexpected EOF during reading {}", e),
Error::EndEventMismatch { expected, found } => {
write!(f, "Expecting </{}> found </{}>", expected, found)
Expand Down Expand Up @@ -109,7 +120,7 @@ impl std::error::Error for Error {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Error::Io(e) => Some(e),
Error::Utf8(e) => Some(e),
Error::NonDecodable(Some(e)) => Some(e),
Error::InvalidAttr(e) => Some(e),
Error::EscapeError(e) => Some(e),
_ => None,
Expand Down Expand Up @@ -227,6 +238,7 @@ pub mod serialize {
}

impl From<Error> for DeError {
#[inline]
fn from(e: Error) -> Self {
Self::InvalidXml(e)
}
Expand All @@ -239,15 +251,17 @@ pub mod serialize {
}
}

impl From<ParseIntError> for DeError {
fn from(e: ParseIntError) -> Self {
Self::InvalidInt(e)
impl From<Utf8Error> for DeError {
#[inline]
fn from(e: Utf8Error) -> Self {
Self::InvalidXml(e.into())
}
}

impl From<ParseFloatError> for DeError {
fn from(e: ParseFloatError) -> Self {
Self::InvalidFloat(e)
impl From<FromUtf8Error> for DeError {
#[inline]
fn from(e: FromUtf8Error) -> Self {
Self::InvalidXml(e.into())
}
}

Expand All @@ -257,4 +271,18 @@ pub mod serialize {
Self::InvalidXml(e.into())
}
}

impl From<ParseIntError> for DeError {
#[inline]
fn from(e: ParseIntError) -> Self {
Self::InvalidInt(e)
}
}

impl From<ParseFloatError> for DeError {
#[inline]
fn from(e: ParseFloatError) -> Self {
Self::InvalidFloat(e)
}
}
}

0 comments on commit 8fa6f1e

Please sign in to comment.