From bddd1a625f9ba583d1c735a10fe2429f80d814cc Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 23 Apr 2022 21:40:08 +0500 Subject: [PATCH 1/8] Group all attribute-related methods in one impl block Also fix incorrect references to `Attributes` instead of `Attribute` --- src/events/mod.rs | 118 +++++++++++++++++++++++----------------------- src/writer.rs | 2 - 2 files changed, 60 insertions(+), 60 deletions(-) diff --git a/src/events/mod.rs b/src/events/mod.rs index 1dca2e98..c42170fe 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -159,20 +159,6 @@ impl<'a> BytesStart<'a> { BytesEnd::borrowed(self.name()) } - /// Consumes `self` and yield a new `BytesStart` with additional attributes from an iterator. - /// - /// The yielded items must be convertible to [`Attribute`] using `Into`. - /// - /// [`Attribute`]: attributes/struct.Attributes.html - pub fn with_attributes<'b, I>(mut self, attributes: I) -> Self - where - I: IntoIterator, - I::Item: Into>, - { - self.extend_attributes(attributes); - self - } - /// Gets the undecoded raw tag name as a `&[u8]`. #[inline] pub fn name(&self) -> &[u8] { @@ -226,39 +212,6 @@ impl<'a> BytesStart<'a> { do_unescape(&*self.buf, custom_entities).map_err(Error::EscapeError) } - /// Returns an iterator over the attributes of this tag. - pub fn attributes(&self) -> Attributes { - Attributes::new(&self.buf, self.name_len) - } - - /// Returns an iterator over the HTML-like attributes of this tag (no mandatory quotes or `=`). - pub fn html_attributes(&self) -> Attributes { - Attributes::html(self, self.name_len) - } - - /// Gets the undecoded raw string with the attributes of this tag as a `&[u8]`, - /// including the whitespace after the tag name if there is any. - #[inline] - pub fn attributes_raw(&self) -> &[u8] { - &self.buf[self.name_len..] - } - - /// Add additional attributes to this tag using an iterator. - /// - /// The yielded items must be convertible to [`Attribute`] using `Into`. 
- /// - /// [`Attribute`]: attributes/struct.Attributes.html - pub fn extend_attributes<'b, I>(&mut self, attributes: I) -> &mut BytesStart<'a> - where - I: IntoIterator, - I::Item: Into>, - { - for attr in attributes { - self.push_attribute(attr); - } - self - } - /// Returns the unescaped and decoded string value. /// /// This allocates a `String` in all cases. For performance reasons it might be a better idea to @@ -323,17 +276,6 @@ impl<'a> BytesStart<'a> { String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) } - /// Adds an attribute to this element. - pub fn push_attribute<'b, A: Into>>(&mut self, attr: A) { - let a = attr.into(); - let bytes = self.buf.to_mut(); - bytes.push(b' '); - bytes.extend_from_slice(a.key); - bytes.extend_from_slice(b"=\""); - bytes.extend_from_slice(&*a.value); - bytes.push(b'"'); - } - /// Edit the name of the BytesStart in-place /// /// # Warning @@ -345,6 +287,49 @@ impl<'a> BytesStart<'a> { self.name_len = name.len(); self } +} + +/// Attribute-related methods +impl<'a> BytesStart<'a> { + /// Consumes `self` and yield a new `BytesStart` with additional attributes from an iterator. + /// + /// The yielded items must be convertible to [`Attribute`] using `Into`. + pub fn with_attributes<'b, I>(mut self, attributes: I) -> Self + where + I: IntoIterator, + I::Item: Into>, + { + self.extend_attributes(attributes); + self + } + + /// Add additional attributes to this tag using an iterator. + /// + /// The yielded items must be convertible to [`Attribute`] using `Into`. + pub fn extend_attributes<'b, I>(&mut self, attributes: I) -> &mut BytesStart<'a> + where + I: IntoIterator, + I::Item: Into>, + { + for attr in attributes { + self.push_attribute(attr); + } + self + } + + /// Adds an attribute to this element. 
+ pub fn push_attribute<'b, A>(&mut self, attr: A) + where + A: Into>, + { + let a = attr.into(); + let bytes = self.buf.to_mut(); + bytes.push(b' '); + bytes.extend_from_slice(a.key); + bytes.extend_from_slice(b"=\""); + bytes.extend_from_slice(&*a.value); + bytes.push(b'"'); + } /// Remove all attributes from the ByteStart pub fn clear_attributes(&mut self) -> &mut BytesStart<'a> { @@ -352,6 +337,23 @@ impl<'a> BytesStart<'a> { self } + /// Returns an iterator over the attributes of this tag. + pub fn attributes(&self) -> Attributes { + Attributes::new(&self.buf, self.name_len) + } + + /// Returns an iterator over the HTML-like attributes of this tag (no mandatory quotes or `=`). + pub fn html_attributes(&self) -> Attributes { + Attributes::html(self, self.name_len) + } + + /// Gets the undecoded raw string with the attributes of this tag as a `&[u8]`, + /// including the whitespace after the tag name if there is any. + #[inline] + pub fn attributes_raw(&self) -> &[u8] { + &self.buf[self.name_len..] + } + /// Try to get an attribute pub fn try_get_attribute + Sized>( &'a self, diff --git a/src/writer.rs b/src/writer.rs index 768c244d..f9edf865 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -240,8 +240,6 @@ impl<'a, W: Write> ElementWriter<'a, W> { /// Add additional attributes to this element using an iterator. /// /// The yielded items must be convertible to [`Attribute`] using `Into`. 
- /// - /// [`Attribute`]: attributes/struct.Attributes.html pub fn with_attributes<'b, I>(mut self, attributes: I) -> Self where I: IntoIterator, From 149635a8de64721d2976837f1e5db3eea9fef7e7 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 10 Apr 2022 20:22:37 +0500 Subject: [PATCH 2/8] Group `Attributes` definition and impl block --- src/events/attributes.rs | 114 ++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 56 deletions(-) diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 2042fc18..ed6fde98 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -7,62 +7,6 @@ use crate::escape::{do_unescape, escape}; use crate::reader::{is_whitespace, Reader}; use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Range}; -/// Iterator over XML attributes. -/// -/// Yields `Result`. An `Err` will be yielded if an attribute is malformed or duplicated. -/// The duplicate check can be turned off by calling [`with_checks(false)`]. -/// -/// [`with_checks(false)`]: #method.with_checks -#[derive(Clone, Debug)] -pub struct Attributes<'a> { - /// slice of `Element` corresponding to attributes - bytes: &'a [u8], - /// current position of the iterator - pub(crate) position: usize, - /// if true, checks for duplicate names - with_checks: bool, - /// allows attribute without quote or `=` - html: bool, - /// if `with_checks`, contains the ranges corresponding to the - /// attribute names already parsed in this `Element` - consumed: Vec>, -} - -impl<'a> Attributes<'a> { - /// Creates a new attribute iterator from a buffer. - pub fn new(buf: &'a [u8], pos: usize) -> Attributes<'a> { - Attributes { - bytes: buf, - position: pos, - html: false, - with_checks: true, - consumed: Vec::new(), - } - } - - /// Creates a new attribute iterator from a buffer, allowing HTML attribute syntax. 
- pub fn html(buf: &'a [u8], pos: usize) -> Attributes<'a> { - Attributes { - bytes: buf, - position: pos, - html: true, - with_checks: true, - consumed: Vec::new(), - } - } - - /// Changes whether attributes should be checked for uniqueness. - /// - /// The XML specification requires attribute keys in the same element to be unique. This check - /// can be disabled to improve performance slightly. - /// - /// (`true` by default) - pub fn with_checks(&mut self, val: bool) -> &mut Attributes<'a> { - self.with_checks = val; - self - } -} - /// A struct representing a key/value XML attribute. /// /// Field `value` stores raw bytes, possibly containing escape-sequences. Most users will likely @@ -333,6 +277,64 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> { } } +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Iterator over XML attributes. +/// +/// Yields `Result`. An `Err` will be yielded if an attribute is malformed or duplicated. +/// The duplicate check can be turned off by calling [`with_checks(false)`]. +/// +/// [`with_checks(false)`]: #method.with_checks +#[derive(Clone, Debug)] +pub struct Attributes<'a> { + /// slice of `Element` corresponding to attributes + bytes: &'a [u8], + /// current position of the iterator + pub(crate) position: usize, + /// if true, checks for duplicate names + with_checks: bool, + /// allows attribute without quote or `=` + html: bool, + /// if `with_checks`, contains the ranges corresponding to the + /// attribute names already parsed in this `Element` + consumed: Vec>, +} + +impl<'a> Attributes<'a> { + /// Creates a new attribute iterator from a buffer. + pub fn new(buf: &'a [u8], pos: usize) -> Attributes<'a> { + Attributes { + bytes: buf, + position: pos, + html: false, + with_checks: true, + consumed: Vec::new(), + } + } + + /// Creates a new attribute iterator from a buffer, allowing HTML attribute syntax. 
+ pub fn html(buf: &'a [u8], pos: usize) -> Attributes<'a> { + Attributes { + bytes: buf, + position: pos, + html: true, + with_checks: true, + consumed: Vec::new(), + } + } + + /// Changes whether attributes should be checked for uniqueness. + /// + /// The XML specification requires attribute keys in the same element to be unique. This check + /// can be disabled to improve performance slightly. + /// + /// (`true` by default) + pub fn with_checks(&mut self, val: bool) -> &mut Attributes<'a> { + self.with_checks = val; + self + } +} + impl<'a> Iterator for Attributes<'a> { type Item = Result>; fn next(&mut self) -> Option { From 6b82fc3a17586af2f71e6c78e83ea49e90583669 Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 11 Apr 2022 23:31:29 +0500 Subject: [PATCH 3/8] Remove period from error messages for consistency and rephrase some messages --- src/errors.rs | 18 ++++++++++-------- tests/documents/html5.txt | 2 +- tests/xmlrs_reader_tests.rs | 4 ++-- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/errors.rs b/src/errors.rs index 2860c49b..6c0dc743 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -27,13 +27,15 @@ pub enum Error { TextNotFound, /// `Event::XmlDecl` must start with *version* attribute XmlDeclWithoutVersion(Option), - /// Attribute Name contains quote + /// Attribute Name contains quote, position relative to start of owning tag is provided NameWithQuote(usize), - /// Attribute key not followed by with `=` + /// Attribute key not followed by with `=`, position relative to start of owning tag is provided NoEqAfterName(usize), - /// Attribute value not quoted + /// Attribute value not quoted, position relative to start of owning tag is provided UnquotedValue(usize), - /// Duplicate attribute + /// Duplicate attribute, positions relative to start of owning tag is provided: + /// - position of the duplicate + /// - previous position DuplicatedAttribute(usize, usize), /// Escape error EscapeError(EscapeError), @@ -73,7 +75,7 @@ impl 
std::fmt::Display for Error { match self { Error::Io(e) => write!(f, "I/O error: {}", e), Error::Utf8(e) => write!(f, "UTF8 error: {}", e), - Error::UnexpectedEof(e) => write!(f, "Unexpected EOF during reading {}.", e), + Error::UnexpectedEof(e) => write!(f, "Unexpected EOF during reading {}", e), Error::EndEventMismatch { expected, found } => { write!(f, "Expecting found ", expected, found) } @@ -92,19 +94,19 @@ impl std::fmt::Display for Error { Error::NameWithQuote(e) => write!( f, "error while parsing attribute at position {}: \ - Attribute key cannot contain quote.", + Attribute key cannot contain quote", e ), Error::NoEqAfterName(e) => write!( f, "error while parsing attribute at position {}: \ - Attribute key must be directly followed by = or space", + Attribute key must be directly followed by `=` or space", e ), Error::UnquotedValue(e) => write!( f, "error while parsing attribute at position {}: \ - Attribute value must start with a quote.", + Attribute value must start with a single or double quote", e ), Error::DuplicatedAttribute(pos1, pos2) => write!( diff --git a/tests/documents/html5.txt b/tests/documents/html5.txt index f1bc908b..df39548a 100644 --- a/tests/documents/html5.txt +++ b/tests/documents/html5.txt @@ -1,7 +1,7 @@ DocType(html) Characters( ) -StartElement(a, attr-error: error while parsing attribute at position 7: Attribute value must start with a quote.) 
+StartElement(a, attr-error: error while parsing attribute at position 7: Attribute value must start with a single or double quote) Characters(Hey) EndElement(a) Characters( diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 25435c25..31e7d772 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -161,7 +161,7 @@ fn sample_ns_short() { fn eof_1() { test( r#" Date: Sun, 10 Apr 2022 23:27:13 +0500 Subject: [PATCH 4/8] Error kind `NameWithQuote` never triggered, remove it .find(|&(_, &b)| b == b'=' || is_whitespace(b)) cannot return Some((_, b'"')) or Some((_, b'\'')) because `"` and `'` are not whitespaces --- src/errors.rs | 8 -------- src/events/attributes.rs | 3 --- 2 files changed, 11 deletions(-) diff --git a/src/errors.rs b/src/errors.rs index 6c0dc743..a1c71cda 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -27,8 +27,6 @@ pub enum Error { TextNotFound, /// `Event::XmlDecl` must start with *version* attribute XmlDeclWithoutVersion(Option), - /// Attribute Name contains quote, position relative to start of owning tag is provided - NameWithQuote(usize), /// Attribute key not followed by with `=`, position relative to start of owning tag is provided NoEqAfterName(usize), /// Attribute value not quoted, position relative to start of owning tag is provided @@ -91,12 +89,6 @@ impl std::fmt::Display for Error { "XmlDecl must start with 'version' attribute, found {:?}", e ), - Error::NameWithQuote(e) => write!( - f, - "error while parsing attribute at position {}: \ - Attribute key cannot contain quote", - e - ), Error::NoEqAfterName(e) => write!( f, "error while parsing attribute at position {}: \ diff --git a/src/events/attributes.rs b/src/events/attributes.rs index ed6fde98..d9b51eda 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -386,9 +386,6 @@ impl<'a> Iterator for Attributes<'a> { .find(|&(_, &b)| b == b'=' || is_whitespace(b)) { Some((i, &b'=')) => i, - Some((i, &b'\'')) | Some((i, 
&b'"')) if self.with_checks => { - err!(Error::NameWithQuote(i)); - } Some((i, _)) => { // consume until `=` or return if html match bytes.by_ref().find(|&(_, &b)| !is_whitespace(b)) { From f93259ad587f224ba508f2b7df93c8db58d3cbb9 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 12 Apr 2022 22:27:05 +0500 Subject: [PATCH 5/8] Use dedicated comparable error type for attribute parsing errors Because the iterator over attributes recovers from errors, users may want to inspect the errors it reports; a comparable error type makes it easy to filter out the irrelevant ones Co-authored-by: Daniel Alley --- src/errors.rs | 38 +++------- src/events/attributes.rs | 142 ++++++++++++++++++++++++++++++++++-- tests/documents/html5.txt | 2 +- tests/xmlrs_reader_tests.rs | 4 +- 4 files changed, 150 insertions(+), 36 deletions(-) diff --git a/src/errors.rs b/src/errors.rs index a1c71cda..fe805f4e 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -1,6 +1,7 @@ //! Error management module use crate::escape::EscapeError; +use crate::events::attributes::AttrError; use std::str::Utf8Error; /// The error type used by this crate. @@ -27,14 +28,8 @@ pub enum Error { TextNotFound, /// `Event::XmlDecl` must start with *version* attribute XmlDeclWithoutVersion(Option), - /// Attribute key not followed by with `=`, position relative to start of owning tag is provided - NoEqAfterName(usize), - /// Attribute value not quoted, position relative to start of owning tag is provided - UnquotedValue(usize), - /// Duplicate attribute, positions relative to start of owning tag is provided: - /// - position of the duplicate - /// - previous position - DuplicatedAttribute(usize, usize), + /// Attribute parsing error + InvalidAttr(AttrError), /// Escape error EscapeError(EscapeError), } @@ -63,6 +58,13 @@ impl From for Error { } } +impl From for Error { + #[inline] + fn from(error: AttrError) -> Self { + Error::InvalidAttr(error) + } +} + /// A specialized `Result` type where the error is hard-wired to [`Error`]. 
/// /// [`Error`]: enum.Error.html @@ -89,24 +91,7 @@ impl std::fmt::Display for Error { "XmlDecl must start with 'version' attribute, found {:?}", e ), - Error::NoEqAfterName(e) => write!( - f, - "error while parsing attribute at position {}: \ - Attribute key must be directly followed by `=` or space", - e - ), - Error::UnquotedValue(e) => write!( - f, - "error while parsing attribute at position {}: \ - Attribute value must start with a single or double quote", - e - ), - Error::DuplicatedAttribute(pos1, pos2) => write!( - f, - "error while parsing attribute at position {0}: \ - Duplicate attribute at position {1} and {0}", - pos1, pos2 - ), + Error::InvalidAttr(e) => write!(f, "error while parsing attribute: {}", e), Error::EscapeError(e) => write!(f, "{}", e), } } @@ -117,6 +102,7 @@ impl std::error::Error for Error { match self { Error::Io(e) => Some(e), Error::Utf8(e) => Some(e), + Error::InvalidAttr(e) => Some(e), Error::EscapeError(e) => Some(e), _ => None, } diff --git a/src/events/attributes.rs b/src/events/attributes.rs index d9b51eda..832995a6 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -5,6 +5,7 @@ use crate::errors::{Error, Result}; use crate::escape::{do_unescape, escape}; use crate::reader::{is_whitespace, Reader}; +use std::fmt::{Debug, Display, Formatter}; use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Range}; /// A struct representing a key/value XML attribute. 
@@ -223,8 +224,8 @@ impl<'a> Attribute<'a> { } } -impl<'a> std::fmt::Debug for Attribute<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl<'a> Debug for Attribute<'a> { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { use crate::utils::{write_byte_string, write_cow_string}; write!(f, "Attribute {{ key: ")?; @@ -394,12 +395,12 @@ impl<'a> Iterator for Attributes<'a> { self.position = j - 1; return attr!(start_key..i, 0..0); } - Some((j, _)) => err!(Error::NoEqAfterName(j)), + Some((j, _)) => err!(AttrError::ExpectedEq(j)), None if self.html => { self.position = len; return attr!(start_key..len, 0..0); } - None => err!(Error::NoEqAfterName(len)), + None => err!(AttrError::ExpectedEq(len)), } } None => return attr!(start_key..len), @@ -413,7 +414,7 @@ impl<'a> Iterator for Attributes<'a> { .find(|r| self.bytes[(*r).clone()] == self.bytes[start_key..end_key]) .map(|ref r| r.start) { - err!(Error::DuplicatedAttribute(start_key, start)); + err!(AttrError::Duplicated(start_key, start)); } self.consumed.push(start_key..end_key); } @@ -426,7 +427,7 @@ impl<'a> Iterator for Attributes<'a> { self.position = j + 1; return attr!(start_key..end_key, i + 1..j); } - None => err!(Error::UnquotedValue(i)), + None => err!(AttrError::UnquotedValue(i)), } } Some((i, _)) if self.html => { @@ -437,12 +438,139 @@ impl<'a> Iterator for Attributes<'a> { self.position = j; return attr!(start_key..end_key, i..j); } - Some((i, _)) => err!(Error::UnquotedValue(i)), + Some((i, _)) => err!(AttrError::UnquotedValue(i)), None => return attr!(start_key..end_key), } } } +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Errors that can be raised during parsing attributes. +/// +/// Recovery position in examples shows the position from which parsing of the +/// next attribute will be attempted. 
+#[derive(Debug, PartialEq)] +pub enum AttrError { + /// Attribute key was not followed by `=`, position relative to the start of + /// the owning tag is provided. + /// + /// Example of input that raises this error: + /// + /// ```xml + /// + /// + /// ``` + /// + /// This error can be raised only when the iterator is in XML mode. + ExpectedEq(usize), + /// Attribute value was not found after `=`, position relative to the start + /// of the owning tag is provided. + /// + /// Example of input that raises this error: + /// + /// ```xml + /// + /// + /// ``` + /// + /// This error can be returned only for the last attribute in the list, + /// because otherwise any content after `=` will be threated as a value. + /// The XML + /// + /// ```xml + /// + /// + /// + /// ``` + /// + /// will be treated as `Attribute { key = b"key", value = b"another-key" }` + /// and or [`Attribute`] is returned, or [`AttrError::UnquotedValue`] is raised, + /// depending on the parsing mode. + ExpectedValue(usize), + /// Attribute value is not quoted, position relative to the start of the + /// owning tag is provided. + /// + /// Example of input that raises this error: + /// + /// ```xml + /// + /// + /// + /// ``` + /// + /// This error can be raised only when the iterator is in XML mode. + UnquotedValue(usize), + /// Attribute value was not finished with a matching quote, position relative + /// to the start of owning tag and a quote is provided. That position is always + /// a last character in the tag content. + /// + /// Example of input that raises this error: + /// + /// ```xml + /// + /// + /// + /// + /// ``` + /// + /// This error is returned only when [`Attributes::with_checks()`] is set + /// to `true` (that is default behavior). 
+ Duplicated(usize, usize), +} + +impl Display for AttrError { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + match self { + Self::ExpectedEq(pos) => write!( + f, + r#"position {}: attribute key must be directly followed by `=` or space"#, + pos + ), + Self::ExpectedValue(pos) => write!( + f, + r#"position {}: `=` must be followed by an attribute value"#, + pos + ), + Self::UnquotedValue(pos) => write!( + f, + r#"position {}: attribute value must be enclosed in `"` or `'`"#, + pos + ), + Self::ExpectedQuote(pos, quote) => write!( + f, + r#"position {}: missing closing quote `{}` in attribute value"#, + pos, *quote as char + ), + Self::Duplicated(pos1, pos2) => write!( + f, + r#"position {}: duplicated attribute, previous declaration at position {}"#, + pos1, pos2 + ), + } + } +} + +impl std::error::Error for AttrError {} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + #[cfg(test)] mod tests { use super::*; diff --git a/tests/documents/html5.txt b/tests/documents/html5.txt index df39548a..91211edb 100644 --- a/tests/documents/html5.txt +++ b/tests/documents/html5.txt @@ -1,7 +1,7 @@ DocType(html) Characters( ) -StartElement(a, attr-error: error while parsing attribute at position 7: Attribute value must start with a single or double quote) +StartElement(a, attr-error: error while parsing attribute: position 7: attribute value must be enclosed in `"` or `'`) Characters(Hey) EndElement(a) Characters( diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 31e7d772..765cdec8 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -227,8 +227,8 @@ fn issue_83_duplicate_attributes() { r#""#, " |StartElement(hello) - |1:30 EmptyElement(some-tag, attr-error: error while parsing \ - attribute at position 16: Duplicate attribute at position 9 and 16) + |1:30 EmptyElement(some-tag, attr-error: error while parsing attribute: \ + position 16: duplicated attribute, 
previous declaration at position 9) |EndElement(hello) |EndDocument ", From 317ab1473fe976c61c6be9dbbce8dcbd21a0cb52 Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 13 Apr 2022 00:47:43 +0500 Subject: [PATCH 6/8] Use `AttrError` in the Attributes iterator return type Because now error type is PartialEq, we can use `assert_eq!` directly, which is especially nice when used with pretty_assertions crate --- Changelog.md | 2 + src/errors.rs | 7 +++ src/events/attributes.rs | 31 +++++------ src/events/mod.rs | 2 +- tests/documents/html5.txt | 2 +- tests/namespaces.rs | 94 ++++++++++++++------------------- tests/test.rs | 38 ++++++-------- tests/unit_tests.rs | 102 +++++++++++++++++++++++------------- tests/xmlrs_reader_tests.rs | 2 +- 9 files changed, 149 insertions(+), 131 deletions(-) diff --git a/Changelog.md b/Changelog.md index 695dc8bf..ad892069 100644 --- a/Changelog.md +++ b/Changelog.md @@ -38,6 +38,8 @@ ([quick-xml#311](https://github.com/tafia/quick-xml/issues/311)) - feat: add `Reader::get_ref()` and `Reader::get_mut()`, rename `Reader::into_underlying_reader()` to `Reader::into_inner()` +- refactor: now `Attributes::next()` returns a new type `AttrError` when attribute parsing failed + ([#4](https://github.com/Mingun/fast-xml/pull/4)) ## 0.23.0-alpha3 diff --git a/src/errors.rs b/src/errors.rs index fe805f4e..b180ac15 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -219,4 +219,11 @@ pub mod serialize { DeError::Float(e) } } + + impl From for DeError { + #[inline] + fn from(e: AttrError) -> Self { + DeError::Xml(e.into()) + } + } } diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 832995a6..f5fbf40b 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -2,7 +2,7 @@ //! //! 
Provides an iterator over attributes key/value pairs -use crate::errors::{Error, Result}; +use crate::errors::{Error, Result as XmlResult}; use crate::escape::{do_unescape, escape}; use crate::reader::{is_whitespace, Reader}; use std::fmt::{Debug, Display, Formatter}; @@ -37,7 +37,7 @@ impl<'a> Attribute<'a> { /// This will allocate if the value contains any escape sequences. /// /// See also [`unescaped_value_with_custom_entities()`](#method.unescaped_value_with_custom_entities) - pub fn unescaped_value(&self) -> Result> { + pub fn unescaped_value(&self) -> XmlResult> { self.make_unescaped_value(None) } @@ -57,14 +57,14 @@ impl<'a> Attribute<'a> { pub fn unescaped_value_with_custom_entities( &self, custom_entities: &HashMap, Vec>, - ) -> Result> { + ) -> XmlResult> { self.make_unescaped_value(Some(custom_entities)) } fn make_unescaped_value( &self, custom_entities: Option<&HashMap, Vec>>, - ) -> Result> { + ) -> XmlResult> { do_unescape(&*self.value, custom_entities).map_err(Error::EscapeError) } @@ -78,7 +78,7 @@ impl<'a> Attribute<'a> { /// /// [`unescaped_value()`]: #method.unescaped_value /// [`Reader::decode()`]: ../../reader/struct.Reader.html#method.decode - pub fn unescape_and_decode_value(&self, reader: &Reader) -> Result { + pub fn unescape_and_decode_value(&self, reader: &Reader) -> XmlResult { self.do_unescape_and_decode_value(reader, None) } @@ -100,7 +100,7 @@ impl<'a> Attribute<'a> { &self, reader: &Reader, custom_entities: &HashMap, Vec>, - ) -> Result { + ) -> XmlResult { self.do_unescape_and_decode_value(reader, Some(custom_entities)) } @@ -110,7 +110,7 @@ impl<'a> Attribute<'a> { &self, reader: &Reader, custom_entities: Option<&HashMap, Vec>>, - ) -> Result { + ) -> XmlResult { let decoded = reader.decode(&*self.value); let unescaped = do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; @@ -122,7 +122,7 @@ impl<'a> Attribute<'a> { &self, reader: &Reader, custom_entities: Option<&HashMap, Vec>>, - ) -> Result { + ) -> 
XmlResult { let decoded = reader.decode(&*self.value)?; let unescaped = do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; @@ -140,7 +140,7 @@ impl<'a> Attribute<'a> { pub fn unescape_and_decode_without_bom( &self, reader: &mut Reader, - ) -> Result { + ) -> XmlResult { self.do_unescape_and_decode_without_bom(reader, None) } @@ -155,7 +155,7 @@ impl<'a> Attribute<'a> { pub fn unescape_and_decode_without_bom( &self, reader: &Reader, - ) -> Result { + ) -> XmlResult { self.do_unescape_and_decode_without_bom(reader, None) } @@ -175,7 +175,7 @@ impl<'a> Attribute<'a> { &self, reader: &mut Reader, custom_entities: &HashMap, Vec>, - ) -> Result { + ) -> XmlResult { self.do_unescape_and_decode_without_bom(reader, Some(custom_entities)) } @@ -195,7 +195,7 @@ impl<'a> Attribute<'a> { &self, reader: &Reader, custom_entities: &HashMap, Vec>, - ) -> Result { + ) -> XmlResult { self.do_unescape_and_decode_without_bom(reader, Some(custom_entities)) } @@ -204,7 +204,7 @@ impl<'a> Attribute<'a> { &self, reader: &mut Reader, custom_entities: Option<&HashMap, Vec>>, - ) -> Result { + ) -> XmlResult { let decoded = reader.decode_without_bom(&*self.value); let unescaped = do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; @@ -216,7 +216,7 @@ impl<'a> Attribute<'a> { &self, reader: &Reader, custom_entities: Option<&HashMap, Vec>>, - ) -> Result { + ) -> XmlResult { let decoded = reader.decode_without_bom(&*self.value)?; let unescaped = do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; @@ -337,7 +337,8 @@ impl<'a> Attributes<'a> { } impl<'a> Iterator for Attributes<'a> { - type Item = Result>; + type Item = Result, AttrError>; + fn next(&mut self) -> Option { let len = self.bytes.len(); diff --git a/src/events/mod.rs b/src/events/mod.rs index c42170fe..86662413 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -456,7 +456,7 @@ impl<'a> BytesDecl<'a> { 
Err(Error::XmlDeclWithoutVersion(Some(found))) } // error parsing attributes - Some(Err(e)) => Err(e), + Some(Err(e)) => Err(e.into()), // no attributes None => Err(Error::XmlDeclWithoutVersion(None)), } diff --git a/tests/documents/html5.txt b/tests/documents/html5.txt index 91211edb..05f200d4 100644 --- a/tests/documents/html5.txt +++ b/tests/documents/html5.txt @@ -1,7 +1,7 @@ DocType(html) Characters( ) -StartElement(a, attr-error: error while parsing attribute: position 7: attribute value must be enclosed in `"` or `'`) +StartElement(a, attr-error: position 7: attribute value must be enclosed in `"` or `'`) Characters(Hey) EndElement(a) Characters( diff --git a/tests/namespaces.rs b/tests/namespaces.rs index 3fc19ff6..4668d069 100644 --- a/tests/namespaces.rs +++ b/tests/namespaces.rs @@ -151,7 +151,7 @@ fn attributes_empty_ns() { e => panic!("Expecting Empty event, got {:?}", e), }; - let mut atts = e + let mut attrs = e .attributes() .map(|ar| ar.expect("Expecting attribute parsing to succeed.")) // we don't care about xmlns attributes for this test @@ -160,23 +160,19 @@ fn attributes_empty_ns() { let (opt_ns, local_name) = r.attribute_namespace(name, &ns_buf); (opt_ns, local_name, value) }); - match atts.next() { - Some((None, b"att1", Cow::Borrowed(b"a"))) => (), - e => panic!("Expecting att1='a' attribute, found {:?}", e), - } - match atts.next() { - Some((Some(ns), b"att2", Cow::Borrowed(b"b"))) => { - assert_eq!(&ns[..], b"urn:example:r"); - } - e => panic!( - "Expecting {{urn:example:r}}att2='b' attribute, found {:?}", - e - ), - } - match atts.next() { - None => (), - e => panic!("Expecting None, found {:?}", e), - } + assert_eq!( + attrs.next(), + Some((None, &b"att1"[..], Cow::Borrowed(&b"a"[..]))) + ); + assert_eq!( + attrs.next(), + Some(( + Some(&b"urn:example:r"[..]), + &b"att2"[..], + Cow::Borrowed(&b"b"[..]) + )) + ); + assert_eq!(attrs.next(), None); } /// Single empty element with qualified attributes. 
@@ -196,7 +192,7 @@ fn attributes_empty_ns_expanded() { e => panic!("Expecting Empty event, got {:?}", e), }; - let mut atts = e + let mut attrs = e .attributes() .map(|ar| ar.expect("Expecting attribute parsing to succeed.")) // we don't care about xmlns attributes for this test @@ -205,23 +201,19 @@ fn attributes_empty_ns_expanded() { let (opt_ns, local_name) = r.attribute_namespace(name, &ns_buf); (opt_ns, local_name, value) }); - match atts.next() { - Some((None, b"att1", Cow::Borrowed(b"a"))) => (), - e => panic!("Expecting att1='a' attribute, found {:?}", e), - } - match atts.next() { - Some((Some(ns), b"att2", Cow::Borrowed(b"b"))) => { - assert_eq!(&ns[..], b"urn:example:r"); - } - e => panic!( - "Expecting {{urn:example:r}}att2='b' attribute, found {:?}", - e - ), - } - match atts.next() { - None => (), - e => panic!("Expecting None, found {:?}", e), - } + assert_eq!( + attrs.next(), + Some((None, &b"att1"[..], Cow::Borrowed(&b"a"[..]))) + ); + assert_eq!( + attrs.next(), + Some(( + Some(&b"urn:example:r"[..]), + &b"att2"[..], + Cow::Borrowed(&b"b"[..]) + )) + ); + assert_eq!(attrs.next(), None); } match r.read_namespaced_event(&mut buf, &mut ns_buf) { @@ -261,7 +253,7 @@ fn default_ns_shadowing_empty() { e => panic!("Expecting Empty event, got {:?}", e), }; - let mut atts = e + let mut attrs = e .attributes() .map(|ar| ar.expect("Expecting attribute parsing to succeed.")) // we don't care about xmlns attributes for this test @@ -272,14 +264,11 @@ fn default_ns_shadowing_empty() { }); // the attribute should _not_ have a namespace name. The default namespace does not // apply to attributes. 
- match atts.next() { - Some((None, b"att1", Cow::Borrowed(b"a"))) => (), - e => panic!("Expecting att1='a' attribute, found {:?}", e), - } - match atts.next() { - None => (), - e => panic!("Expecting None, found {:?}", e), - } + assert_eq!( + attrs.next(), + Some((None, &b"att1"[..], Cow::Borrowed(&b"a"[..]))) + ); + assert_eq!(attrs.next(), None); } // @@ -323,7 +312,7 @@ fn default_ns_shadowing_expanded() { } e => panic!("Expecting Start event (), got {:?}", e), }; - let mut atts = e + let mut attrs = e .attributes() .map(|ar| ar.expect("Expecting attribute parsing to succeed.")) // we don't care about xmlns attributes for this test @@ -334,14 +323,11 @@ fn default_ns_shadowing_expanded() { }); // the attribute should _not_ have a namespace name. The default namespace does not // apply to attributes. - match atts.next() { - Some((None, b"att1", Cow::Borrowed(b"a"))) => (), - e => panic!("Expecting att1='a' attribute, found {:?}", e), - } - match atts.next() { - None => (), - e => panic!("Expecting None, found {:?}", e), - } + assert_eq!( + attrs.next(), + Some((None, &b"att1"[..], Cow::Borrowed(&b"a"[..]))) + ); + assert_eq!(attrs.next(), None); } // virtual diff --git a/tests/test.rs b/tests/test.rs index c074a4ed..327d5771 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -32,25 +32,22 @@ fn test_attributes_empty() { let mut buf = Vec::new(); match r.read_event(&mut buf) { Ok(Empty(e)) => { - let mut atts = e.attributes(); - match atts.next() { + let mut attrs = e.attributes(); + assert_eq!( + attrs.next(), Some(Ok(Attribute { key: b"att1", value: Cow::Borrowed(b"a"), - })) => (), - e => panic!("Expecting att1='a' attribute, found {:?}", e), - } - match atts.next() { + })) + ); + assert_eq!( + attrs.next(), Some(Ok(Attribute { key: b"att2", value: Cow::Borrowed(b"b"), - })) => (), - e => panic!("Expecting att2='b' attribute, found {:?}", e), - } - match atts.next() { - None => (), - e => panic!("Expecting None, found {:?}", e), - } + })) + ); + 
assert_eq!(attrs.next(), None); } e => panic!("Expecting Empty event, got {:?}", e), } @@ -64,18 +61,15 @@ fn test_attribute_equal() { let mut buf = Vec::new(); match r.read_event(&mut buf) { Ok(Empty(e)) => { - let mut atts = e.attributes(); - match atts.next() { + let mut attrs = e.attributes(); + assert_eq!( + attrs.next(), Some(Ok(Attribute { key: b"att1", value: Cow::Borrowed(b"a=b"), - })) => (), - e => panic!("Expecting att1=\"a=b\" attribute, found {:?}", e), - } - match atts.next() { - None => (), - e => panic!("Expecting None, found {:?}", e), - } + })) + ); + assert_eq!(attrs.next(), None); } e => panic!("Expecting Empty event, got {:?}", e), } diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index 75e65c74..99c7e057 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -1,6 +1,8 @@ +use std::borrow::Cow; use std::io::Cursor; use std::str::from_utf8; +use fast_xml::events::attributes::{AttrError, Attribute}; use fast_xml::events::{BytesDecl, BytesEnd, BytesStart, BytesText, Event}; use fast_xml::{events::Event::*, Reader, Result, Writer}; @@ -314,6 +316,8 @@ fn test_write_empty_element_attrs() -> Result<()> { #[test] fn test_write_attrs() -> Result<()> { + type AttrResult = std::result::Result; + let str_from = r#""#; let expected = r#""#; let mut reader = Reader::from_str(str_from); @@ -324,7 +328,7 @@ fn test_write_attrs() -> Result<()> { let event = match reader.read_event(&mut buf)? 
{ Eof => break, Start(elem) => { - let mut attrs = elem.attributes().collect::>>()?; + let mut attrs = elem.attributes().collect::>>()?; attrs.extend_from_slice(&[("a", "b").into(), ("c", "d").into()]); let mut elem = BytesStart::owned(b"copy".to_vec(), 4); elem.extend_attributes(attrs); @@ -671,15 +675,21 @@ fn test_closing_bracket_in_single_quote_attr() { match r.read_event(&mut buf) { Ok(Start(e)) => { let mut attrs = e.attributes(); - match attrs.next() { - Some(Ok(attr)) => assert_eq!(attr, ("attr".as_bytes(), ">".as_bytes()).into()), - x => panic!("expected attribute 'attr', got {:?}", x), - } - match attrs.next() { - Some(Ok(attr)) => assert_eq!(attr, ("check".as_bytes(), "2".as_bytes()).into()), - x => panic!("expected attribute 'check', got {:?}", x), - } - assert!(attrs.next().is_none(), "expected only two attributes"); + assert_eq!( + attrs.next(), + Some(Ok(Attribute { + key: b"attr", + value: Cow::Borrowed(b">"), + })) + ); + assert_eq!( + attrs.next(), + Some(Ok(Attribute { + key: b"check", + value: Cow::Borrowed(b"2"), + })) + ); + assert_eq!(attrs.next(), None); } x => panic!("expected , got {:?}", x), } @@ -694,15 +704,21 @@ fn test_closing_bracket_in_double_quote_attr() { match r.read_event(&mut buf) { Ok(Start(e)) => { let mut attrs = e.attributes(); - match attrs.next() { - Some(Ok(attr)) => assert_eq!(attr, ("attr".as_bytes(), ">".as_bytes()).into()), - x => panic!("expected attribute 'attr', got {:?}", x), - } - match attrs.next() { - Some(Ok(attr)) => assert_eq!(attr, ("check".as_bytes(), "2".as_bytes()).into()), - x => panic!("expected attribute 'check', got {:?}", x), - } - assert!(attrs.next().is_none(), "expected only two attributes"); + assert_eq!( + attrs.next(), + Some(Ok(Attribute { + key: b"attr", + value: Cow::Borrowed(b">"), + })) + ); + assert_eq!( + attrs.next(), + Some(Ok(Attribute { + key: b"check", + value: Cow::Borrowed(b"2"), + })) + ); + assert_eq!(attrs.next(), None); } x => panic!("expected , got {:?}", x), } @@ -717,15 
+733,21 @@ fn test_closing_bracket_in_double_quote_mixed() { match r.read_event(&mut buf) { Ok(Start(e)) => { let mut attrs = e.attributes(); - match attrs.next() { - Some(Ok(attr)) => assert_eq!(attr, ("attr".as_bytes(), "'>'".as_bytes()).into()), - x => panic!("expected attribute 'attr', got {:?}", x), - } - match attrs.next() { - Some(Ok(attr)) => assert_eq!(attr, ("check".as_ref(), "'2'".as_bytes()).into()), - x => panic!("expected attribute 'check', got {:?}", x), - } - assert!(attrs.next().is_none(), "expected only two attributes"); + assert_eq!( + attrs.next(), + Some(Ok(Attribute { + key: b"attr", + value: Cow::Borrowed(b"'>'"), + })) + ); + assert_eq!( + attrs.next(), + Some(Ok(Attribute { + key: b"check", + value: Cow::Borrowed(b"'2'"), + })) + ); + assert_eq!(attrs.next(), None); } x => panic!("expected , got {:?}", x), } @@ -740,15 +762,21 @@ fn test_closing_bracket_in_single_quote_mixed() { match r.read_event(&mut buf) { Ok(Start(e)) => { let mut attrs = e.attributes(); - match attrs.next() { - Some(Ok(attr)) => assert_eq!(attr, ("attr".as_bytes(), "\">\"".as_bytes()).into()), - x => panic!("expected attribute 'attr', got {:?}", x), - } - match attrs.next() { - Some(Ok(attr)) => assert_eq!(attr, ("check".as_bytes(), "\"2\"".as_bytes()).into()), - x => panic!("expected attribute 'check', got {:?}", x), - } - assert!(attrs.next().is_none(), "expected only two attributes"); + assert_eq!( + attrs.next(), + Some(Ok(Attribute { + key: b"attr", + value: Cow::Borrowed(br#"">""#), + })) + ); + assert_eq!( + attrs.next(), + Some(Ok(Attribute { + key: b"check", + value: Cow::Borrowed(br#""2""#), + })) + ); + assert_eq!(attrs.next(), None); } x => panic!("expected , got {:?}", x), } diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 765cdec8..801782f7 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -227,7 +227,7 @@ fn issue_83_duplicate_attributes() { r#""#, " |StartElement(hello) - |1:30 
EmptyElement(some-tag, attr-error: error while parsing attribute: \ + |1:30 EmptyElement(some-tag, attr-error: \ position 16: duplicated attribute, previous declaration at position 9) |EndElement(hello) |EndDocument From d516e45ae464ba1c8dd443adbb0e054c39de845e Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 11 Apr 2022 00:34:06 +0500 Subject: [PATCH 7/8] Properly test all paths of attribute parsing failures (24): events::attributes::html::duplicated::with_check::double_quoted events::attributes::html::duplicated::with_check::key_only events::attributes::html::duplicated::with_check::single_quoted events::attributes::html::duplicated::with_check::unquoted events::attributes::html::single::missed_value events::attributes::html::sparsed::double_quoted events::attributes::html::sparsed::key_contains_invalid events::attributes::html::sparsed::key_only events::attributes::html::sparsed::key_start_invalid events::attributes::html::sparsed::missed_value events::attributes::html::sparsed::single_quoted events::attributes::html::sparsed::unquoted events::attributes::xml::duplicated::with_check::double_quoted events::attributes::xml::duplicated::with_check::key_only events::attributes::xml::duplicated::with_check::single_quoted events::attributes::xml::duplicated::with_check::unquoted events::attributes::xml::duplicated::without_check::key_only events::attributes::xml::duplicated::without_check::unquoted events::attributes::xml::first::key_only events::attributes::xml::first::missed_value events::attributes::xml::first::unquoted events::attributes::xml::single::key_only events::attributes::xml::single::missed_value events::attributes::xml::sparsed::missed_value --- Changelog.md | 1 + src/events/attributes.rs | 1499 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 1442 insertions(+), 58 deletions(-) diff --git a/Changelog.md b/Changelog.md index ad892069..a2a1dde1 100644 --- a/Changelog.md +++ b/Changelog.md @@ -40,6 +40,7 @@ `Reader::into_underlying_reader()` to 
`Reader::into_inner()` - refactor: now `Attributes::next()` returns a new type `AttrError` when attribute parsing failed ([#4](https://github.com/Mingun/fast-xml/pull/4)) +- test: properly test all paths of attributes parsing ([#4](https://github.com/Mingun/fast-xml/pull/4)) ## 0.23.0-alpha3 diff --git a/src/events/attributes.rs b/src/events/attributes.rs index f5fbf40b..bf03eddc 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -572,76 +572,1459 @@ impl std::error::Error for AttrError {} //////////////////////////////////////////////////////////////////////////////////////////////////// +/// Checks, how parsing of XML-style attributes works. Each attribute should +/// have a value, enclosed in single or double quotes. #[cfg(test)] -mod tests { +mod xml { use super::*; use pretty_assertions::assert_eq; - #[test] - fn regular() { - let event = b"name a='a' b = 'b'"; - let mut attributes = Attributes::new(event, 0); - attributes.with_checks(true); - let a = attributes.next().unwrap().unwrap(); - assert_eq!(a.key, b"a"); - assert_eq!(&*a.value, b"a"); - let a = attributes.next().unwrap().unwrap(); - assert_eq!(a.key, b"b"); - assert_eq!(&*a.value, b"b"); - assert!(attributes.next().is_none()); + /// Checked attribute is the single attribute + mod single { + use super::*; + use pretty_assertions::assert_eq; + + /// Attribute have a value enclosed in single quotes + #[test] + fn single_quoted() { + let mut iter = Attributes::new(br#"tag key='value'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value enclosed in double quotes + #[test] + fn double_quoted() { + let mut iter = Attributes::new(br#"tag key="value""#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + 
assert_eq!(iter.next(), None); + } + + /// Attribute have a value, not enclosed in quotes + #[test] + fn unquoted() { + let mut iter = Attributes::new(br#"tag key=value"#, 3); + // 0 ^ = 8 + + assert_eq!(iter.next(), Some(Err(AttrError::UnquotedValue(8)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Only attribute key is present + #[test] + fn key_only() { + let mut iter = Attributes::new(br#"tag key"#, 3); + // 0 ^ = 7 + + assert_eq!(iter.next(), Some(Err(AttrError::ExpectedEq(7)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Key is started with an invalid symbol (a single quote in this test). + /// Because we do not check validity of keys and values during parsing, + /// that invalid attribute will be returned + #[test] + fn key_start_invalid() { + let mut iter = Attributes::new(br#"tag 'key'='value'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"'key'", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Key contains an invalid symbol (an ampersand in this test). 
+ /// Because we do not check validity of keys and values during parsing, + /// that invalid attribute will be returned + #[test] + fn key_contains_invalid() { + let mut iter = Attributes::new(br#"tag key&jey='value'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key&jey", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute value is missing after `=` + #[test] + fn missed_value() { + let mut iter = Attributes::new(br#"tag key="#, 3); + // 0 ^ = 8 + + assert_eq!(iter.next(), Some(Err(AttrError::ExpectedValue(8)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + } + + /// Checked attribute is the first attribute in the list of many attributes + mod first { + use super::*; + use pretty_assertions::assert_eq; + + /// Attribute have a value enclosed in single quotes + #[test] + fn single_quoted() { + let mut iter = Attributes::new(br#"tag key='value' regular='attribute'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"regular", + value: Cow::Borrowed(b"attribute"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value enclosed in double quotes + #[test] + fn double_quoted() { + let mut iter = Attributes::new(br#"tag key="value" regular='attribute'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"regular", + value: Cow::Borrowed(b"attribute"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value, not enclosed in quotes + #[test] + fn unquoted() { + let mut iter = Attributes::new(br#"tag key=value regular='attribute'"#, 3); + // 0 ^ = 8 + + assert_eq!(iter.next(), 
Some(Err(AttrError::UnquotedValue(8)))); + // check error recovery + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"regular", + value: Cow::Borrowed(b"attribute"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Only attribute key is present + #[test] + fn key_only() { + let mut iter = Attributes::new(br#"tag key regular='attribute'"#, 3); + // 0 ^ = 8 + + assert_eq!(iter.next(), Some(Err(AttrError::ExpectedEq(8)))); + // check error recovery + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"regular", + value: Cow::Borrowed(b"attribute"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Key is started with an invalid symbol (a single quote in this test). + /// Because we do not check validity of keys and values during parsing, + /// that invalid attribute will be returned + #[test] + fn key_start_invalid() { + let mut iter = Attributes::new(br#"tag 'key'='value' regular='attribute'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"'key'", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"regular", + value: Cow::Borrowed(b"attribute"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Key contains an invalid symbol (an ampersand in this test). + /// Because we do not check validity of keys and values during parsing, + /// that invalid attribute will be returned + #[test] + fn key_contains_invalid() { + let mut iter = Attributes::new(br#"tag key&jey='value' regular='attribute'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key&jey", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"regular", + value: Cow::Borrowed(b"attribute"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute value is missing after `=`. 
+ #[test] + fn missed_value() { + let mut iter = Attributes::new(br#"tag key= regular='attribute'"#, 3); + // 0 ^ = 9 + + assert_eq!(iter.next(), Some(Err(AttrError::UnquotedValue(9)))); + // Because we do not check validity of keys and values during parsing, + // "error='recovery'" is considered, as unquoted attribute value and + // skipped during recovery and iteration finished + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + + //////////////////////////////////////////////////////////////////// + + let mut iter = Attributes::new(br#"tag key= regular= 'attribute'"#, 3); + // 0 ^ = 9 ^ = 29 + + // In that case "regular=" considered as unquoted value + assert_eq!(iter.next(), Some(Err(AttrError::UnquotedValue(9)))); + // In that case "'attribute'" considered as a key, because we do not check + // validity of key names + assert_eq!(iter.next(), Some(Err(AttrError::ExpectedEq(29)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + + //////////////////////////////////////////////////////////////////// + + let mut iter = Attributes::new(br#"tag key= regular ='attribute'"#, 3); + // 0 ^ = 9 ^ = 29 + + // In that case "regular" considered as unquoted value + assert_eq!(iter.next(), Some(Err(AttrError::UnquotedValue(9)))); + // In that case "='attribute'" considered as a key, because we do not check + // validity of key names + assert_eq!(iter.next(), Some(Err(AttrError::ExpectedEq(29)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + + //////////////////////////////////////////////////////////////////// + + let mut iter = Attributes::new(br#"tag key= regular = 'attribute'"#, 3); + // 0 ^ = 9 ^ = 19 ^ = 30 + + assert_eq!(iter.next(), Some(Err(AttrError::UnquotedValue(9)))); + // In that case second "=" considered as a key, because we do not check + // validity of key names + assert_eq!(iter.next(), Some(Err(AttrError::ExpectedEq(19)))); + // In that case "'attribute'" considered as a key, because we do not check 
+ // validity of key names + assert_eq!(iter.next(), Some(Err(AttrError::ExpectedEq(30)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + } + + /// Copy of single, but with additional spaces in markup + mod sparsed { + use super::*; + use pretty_assertions::assert_eq; + + /// Attribute have a value enclosed in single quotes + #[test] + fn single_quoted() { + let mut iter = Attributes::new(br#"tag key = 'value' "#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value enclosed in double quotes + #[test] + fn double_quoted() { + let mut iter = Attributes::new(br#"tag key = "value" "#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value, not enclosed in quotes + #[test] + fn unquoted() { + let mut iter = Attributes::new(br#"tag key = value "#, 3); + // 0 ^ = 10 + + assert_eq!(iter.next(), Some(Err(AttrError::UnquotedValue(10)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Only attribute key is present + #[test] + fn key_only() { + let mut iter = Attributes::new(br#"tag key "#, 3); + // 0 ^ = 8 + + assert_eq!(iter.next(), Some(Err(AttrError::ExpectedEq(8)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Key is started with an invalid symbol (a single quote in this test). 
+ /// Because we do not check validity of keys and values during parsing, + /// that invalid attribute will be returned + #[test] + fn key_start_invalid() { + let mut iter = Attributes::new(br#"tag 'key' = 'value' "#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"'key'", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Key contains an invalid symbol (an ampersand in this test). + /// Because we do not check validity of keys and values during parsing, + /// that invalid attribute will be returned + #[test] + fn key_contains_invalid() { + let mut iter = Attributes::new(br#"tag key&jey = 'value' "#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key&jey", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute value is missing after `=` + #[test] + fn missed_value() { + let mut iter = Attributes::new(br#"tag key = "#, 3); + // 0 ^ = 10 + + assert_eq!(iter.next(), Some(Err(AttrError::ExpectedValue(10)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + } + + /// Checks that duplicated attributes correctly reported and recovering is + /// possible after that + mod duplicated { + use super::*; + + mod with_check { + use super::*; + use pretty_assertions::assert_eq; + + /// Attribute have a value enclosed in single quotes + #[test] + fn single_quoted() { + let mut iter = Attributes::new(br#"tag key='value' key='dup' another=''"#, 3); + // 0 ^ = 4 ^ = 16 + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), Some(Err(AttrError::Duplicated(16, 4)))); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value enclosed in double 
quotes + #[test] + fn double_quoted() { + let mut iter = Attributes::new(br#"tag key='value' key="dup" another=''"#, 3); + // 0 ^ = 4 ^ = 16 + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), Some(Err(AttrError::Duplicated(16, 4)))); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value, not enclosed in quotes + #[test] + fn unquoted() { + let mut iter = Attributes::new(br#"tag key='value' key=dup another=''"#, 3); + // 0 ^ = 4 ^ = 16 + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), Some(Err(AttrError::Duplicated(16, 4)))); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Only attribute key is present + #[test] + fn key_only() { + let mut iter = Attributes::new(br#"tag key='value' key another=''"#, 3); + // 0 ^ = 20 + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), Some(Err(AttrError::ExpectedEq(20)))); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + } + + /// Check for duplicated names is disabled + mod without_check { + use super::*; + use pretty_assertions::assert_eq; + + /// Attribute have a value enclosed in single quotes + #[test] + fn single_quoted() { + let mut iter = Attributes::new(br#"tag key='value' key='dup' another=''"#, 3); + iter.with_checks(false); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + 
assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"dup"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value enclosed in double quotes + #[test] + fn double_quoted() { + let mut iter = Attributes::new(br#"tag key='value' key="dup" another=''"#, 3); + iter.with_checks(false); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"dup"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value, not enclosed in quotes + #[test] + fn unquoted() { + let mut iter = Attributes::new(br#"tag key='value' key=dup another=''"#, 3); + // 0 ^ = 20 + iter.with_checks(false); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), Some(Err(AttrError::UnquotedValue(20)))); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Only attribute key is present + #[test] + fn key_only() { + let mut iter = Attributes::new(br#"tag key='value' key another=''"#, 3); + // 0 ^ = 20 + iter.with_checks(false); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), Some(Err(AttrError::ExpectedEq(20)))); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + } } 
#[test] fn mixed_quote() { - let event = b"name a='a' b = \"b\" c='cc\"cc'"; - let mut attributes = Attributes::new(event, 0); - attributes.with_checks(true); - let a = attributes.next().unwrap().unwrap(); - assert_eq!(a.key, b"a"); - assert_eq!(&*a.value, b"a"); - let a = attributes.next().unwrap().unwrap(); - assert_eq!(a.key, b"b"); - assert_eq!(&*a.value, b"b"); - let a = attributes.next().unwrap().unwrap(); - assert_eq!(a.key, b"c"); - assert_eq!(&*a.value, b"cc\"cc"); - assert!(attributes.next().is_none()); + let mut iter = Attributes::new(br#"tag a='a' b = "b" c='cc"cc' d="dd'dd""#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"a", + value: Cow::Borrowed(b"a"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"b", + value: Cow::Borrowed(b"b"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"c", + value: Cow::Borrowed(br#"cc"cc"#), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"d", + value: Cow::Borrowed(b"dd'dd"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); } +} - #[test] - fn html_fail() { - let event = b"name a='a' b=b c"; - let mut attributes = Attributes::new(event, 0); - attributes.with_checks(true); - let a = attributes.next().unwrap().unwrap(); - assert_eq!(a.key, b"a"); - assert_eq!(&*a.value, b"a"); - assert!(attributes.next().unwrap().is_err()); +/// Checks, how parsing of HTML-style attributes works. 
Each attribute can be +/// in three forms: +/// - XML-like: have a value, enclosed in single or double quotes +/// - have a value, do not enclosed in quotes +/// - without value, key only +#[cfg(test)] +mod html { + use super::*; + use pretty_assertions::assert_eq; + + /// Checked attribute is the single attribute + mod single { + use super::*; + use pretty_assertions::assert_eq; + + /// Attribute have a value enclosed in single quotes + #[test] + fn single_quoted() { + let mut iter = Attributes::html(br#"tag key='value'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value enclosed in double quotes + #[test] + fn double_quoted() { + let mut iter = Attributes::html(br#"tag key="value""#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value, not enclosed in quotes + #[test] + fn unquoted() { + let mut iter = Attributes::html(br#"tag key=value"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Only attribute key is present + #[test] + fn key_only() { + let mut iter = Attributes::html(br#"tag key"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(&[]), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Key is started with an invalid symbol (a single quote in this test). 
+ /// Because we do not check validity of keys and values during parsing, + /// that invalid attribute will be returned + #[test] + fn key_start_invalid() { + let mut iter = Attributes::html(br#"tag 'key'='value'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"'key'", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Key contains an invalid symbol (an ampersand in this test). + /// Because we do not check validity of keys and values during parsing, + /// that invalid attribute will be returned + #[test] + fn key_contains_invalid() { + let mut iter = Attributes::html(br#"tag key&jey='value'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key&jey", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute value is missing after `=` + #[test] + fn missed_value() { + let mut iter = Attributes::html(br#"tag key="#, 3); + // 0 ^ = 8 + + assert_eq!(iter.next(), Some(Err(AttrError::ExpectedValue(8)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + } + + /// Checked attribute is the first attribute in the list of many attributes + mod first { + use super::*; + use pretty_assertions::assert_eq; + + /// Attribute have a value enclosed in single quotes + #[test] + fn single_quoted() { + let mut iter = Attributes::html(br#"tag key='value' regular='attribute'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"regular", + value: Cow::Borrowed(b"attribute"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value enclosed in double quotes + #[test] + fn double_quoted() { + let mut iter = Attributes::html(br#"tag key="value" regular='attribute'"#, 3); + + assert_eq!( + iter.next(), + 
Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"regular", + value: Cow::Borrowed(b"attribute"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value, not enclosed in quotes + #[test] + fn unquoted() { + let mut iter = Attributes::html(br#"tag key=value regular='attribute'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"regular", + value: Cow::Borrowed(b"attribute"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Only attribute key is present + #[test] + fn key_only() { + let mut iter = Attributes::html(br#"tag key regular='attribute'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(&[]), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"regular", + value: Cow::Borrowed(b"attribute"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Key is started with an invalid symbol (a single quote in this test). + /// Because we do not check validity of keys and values during parsing, + /// that invalid attribute will be returned + #[test] + fn key_start_invalid() { + let mut iter = Attributes::html(br#"tag 'key'='value' regular='attribute'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"'key'", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"regular", + value: Cow::Borrowed(b"attribute"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Key contains an invalid symbol (an ampersand in this test). 
+ /// Because we do not check validity of keys and values during parsing, + /// that invalid attribute will be returned + #[test] + fn key_contains_invalid() { + let mut iter = Attributes::html(br#"tag key&jey='value' regular='attribute'"#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key&jey", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"regular", + value: Cow::Borrowed(b"attribute"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute value is missing after `=` + #[test] + fn missed_value() { + let mut iter = Attributes::html(br#"tag key= regular='attribute'"#, 3); + + // Because we do not check validity of keys and values during parsing, + // "regular='attribute'" is considered as unquoted attribute value + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"regular='attribute'"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + + //////////////////////////////////////////////////////////////////// + + let mut iter = Attributes::html(br#"tag key= regular= 'attribute'"#, 3); + + // Because we do not check validity of keys and values during parsing, + // "regular=" is considered as unquoted attribute value + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"regular="), + })) + ); + // Because we do not check validity of keys and values during parsing, + // "'attribute'" is considered as key-only attribute + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"'attribute'", + value: Cow::Borrowed(&[]), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + + //////////////////////////////////////////////////////////////////// + + let mut iter = Attributes::html(br#"tag key= regular ='attribute'"#, 3); + + // Because we do not check validity of keys and values during parsing, + // "regular" is considered as 
unquoted attribute value + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"regular"), + })) + ); + // Because we do not check validity of keys and values during parsing, + // "='attribute'" is considered as key-only attribute + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"='attribute'", + value: Cow::Borrowed(&[]), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + + //////////////////////////////////////////////////////////////////// + + let mut iter = Attributes::html(br#"tag key= regular = 'attribute'"#, 3); + // 0 ^ = 9 ^ = 19 ^ = 30 + + // Because we do not check validity of keys and values during parsing, + // "regular" is considered as unquoted attribute value + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"regular"), + })) + ); + // Because we do not check validity of keys and values during parsing, + // "=" is considered as key-only attribute + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"=", + value: Cow::Borrowed(&[]), + })) + ); + // Because we do not check validity of keys and values during parsing, + // "'attribute'" is considered as key-only attribute + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"'attribute'", + value: Cow::Borrowed(&[]), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + } + + /// Copy of single, but with additional spaces in markup + mod sparsed { + use super::*; + use pretty_assertions::assert_eq; + + /// Attribute have a value enclosed in single quotes + #[test] + fn single_quoted() { + let mut iter = Attributes::html(br#"tag key = 'value' "#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value enclosed in double quotes + #[test] + fn double_quoted() { + let mut iter = 
Attributes::html(br#"tag key = "value" "#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value, not enclosed in quotes + #[test] + fn unquoted() { + let mut iter = Attributes::html(br#"tag key = value "#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Only attribute key is present + #[test] + fn key_only() { + let mut iter = Attributes::html(br#"tag key "#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(&[]), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Key is started with an invalid symbol (a single quote in this test). + /// Because we do not check validity of keys and values during parsing, + /// that invalid attribute will be returned + #[test] + fn key_start_invalid() { + let mut iter = Attributes::html(br#"tag 'key' = 'value' "#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"'key'", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Key contains an invalid symbol (an ampersand in this test). 
+ /// Because we do not check validity of keys and values during parsing, + /// that invalid attribute will be returned + #[test] + fn key_contains_invalid() { + let mut iter = Attributes::html(br#"tag key&jey = 'value' "#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key&jey", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute value is missing after `=` + #[test] + fn missed_value() { + let mut iter = Attributes::html(br#"tag key = "#, 3); + // 0 ^ = 10 + + assert_eq!(iter.next(), Some(Err(AttrError::ExpectedValue(10)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + } + + /// Checks that duplicated attributes correctly reported and recovering is + /// possible after that + mod duplicated { + use super::*; + + mod with_check { + use super::*; + use pretty_assertions::assert_eq; + + /// Attribute have a value enclosed in single quotes + #[test] + fn single_quoted() { + let mut iter = Attributes::html(br#"tag key='value' key='dup' another=''"#, 3); + // 0 ^ = 4 ^ = 16 + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), Some(Err(AttrError::Duplicated(16, 4)))); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value enclosed in double quotes + #[test] + fn double_quoted() { + let mut iter = Attributes::html(br#"tag key='value' key="dup" another=''"#, 3); + // 0 ^ = 4 ^ = 16 + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), Some(Err(AttrError::Duplicated(16, 4)))); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + 
assert_eq!(iter.next(), None); + } + + /// Attribute have a value, not enclosed in quotes + #[test] + fn unquoted() { + let mut iter = Attributes::html(br#"tag key='value' key=dup another=''"#, 3); + // 0 ^ = 4 ^ = 16 + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), Some(Err(AttrError::Duplicated(16, 4)))); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Only attribute key is present + #[test] + fn key_only() { + let mut iter = Attributes::html(br#"tag key='value' key another=''"#, 3); + // 0 ^ = 4 ^ = 16 + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!(iter.next(), Some(Err(AttrError::Duplicated(16, 4)))); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + } + + /// Check for duplicated names is disabled + mod without_check { + use super::*; + use pretty_assertions::assert_eq; + + /// Attribute have a value enclosed in single quotes + #[test] + fn single_quoted() { + let mut iter = Attributes::html(br#"tag key='value' key='dup' another=''"#, 3); + iter.with_checks(false); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"dup"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value enclosed in double quotes + #[test] + fn double_quoted() { + let mut iter = Attributes::html(br#"tag key='value' key="dup" another=''"#, 3); 
+ iter.with_checks(false); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"dup"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Attribute have a value, not enclosed in quotes + #[test] + fn unquoted() { + let mut iter = Attributes::html(br#"tag key='value' key=dup another=''"#, 3); + iter.with_checks(false); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"dup"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + /// Only attribute key is present + #[test] + fn key_only() { + let mut iter = Attributes::html(br#"tag key='value' key another=''"#, 3); + iter.with_checks(false); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(b"value"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"key", + value: Cow::Borrowed(&[]), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"another", + value: Cow::Borrowed(b""), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + } } #[test] - fn html_ok() { - let event = b"name a='a' e b=b c d ee=ee"; - let mut attributes = Attributes::html(event, 0); - attributes.with_checks(true); - let a = attributes.next().unwrap().unwrap(); - assert_eq!(a.key, b"a"); - assert_eq!(&*a.value, b"a"); - let a = attributes.next().unwrap().unwrap(); - assert_eq!(a.key, b"e"); - assert_eq!(&*a.value, b""); - let a = 
attributes.next().unwrap().unwrap(); - assert_eq!(a.key, b"b"); - assert_eq!(&*a.value, b"b"); - let a = attributes.next().unwrap().unwrap(); - assert_eq!(a.key, b"c"); - assert_eq!(&*a.value, b""); - let a = attributes.next().unwrap().unwrap(); - assert_eq!(a.key, b"d"); - assert_eq!(&*a.value, b""); - let a = attributes.next().unwrap().unwrap(); - assert_eq!(a.key, b"ee"); - assert_eq!(&*a.value, b"ee"); - assert!(attributes.next().is_none()); + fn mixed_quote() { + let mut iter = Attributes::html(br#"tag a='a' b = "b" c='cc"cc' d="dd'dd""#, 3); + + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"a", + value: Cow::Borrowed(b"a"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"b", + value: Cow::Borrowed(b"b"), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"c", + value: Cow::Borrowed(br#"cc"cc"#), + })) + ); + assert_eq!( + iter.next(), + Some(Ok(Attribute { + key: b"d", + value: Cow::Borrowed(b"dd'dd"), + })) + ); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); } } From c81d25c58282d536e08a9128d298a806e1c2f172 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 12 Apr 2022 20:19:41 +0500 Subject: [PATCH 8/8] Reimplement attributes parsing logic and fix all errors Introduce new `Attr` type that stores not only an attribute content, but also its shape --- Changelog.md | 3 + src/de/map.rs | 43 ++- src/events/attributes.rs | 581 ++++++++++++++++++++++++++++++--------- 3 files changed, 475 insertions(+), 152 deletions(-) diff --git a/Changelog.md b/Changelog.md index a2a1dde1..38f02ec5 100644 --- a/Changelog.md +++ b/Changelog.md @@ -41,6 +41,9 @@ - refactor: now `Attributes::next()` returns a new type `AttrError` when attribute parsing failed ([#4](https://github.com/Mingun/fast-xml/pull/4)) - test: properly test all paths of attributes parsing ([#4](https://github.com/Mingun/fast-xml/pull/4)) +- feat: attribute iterator now implements `FusedIterator` 
([#4](https://github.com/Mingun/fast-xml/pull/4)) +- fix: fixed many errors in attribute parsing using iterator, returned from `attributes()` + or `html_attributes()` ([#4](https://github.com/Mingun/fast-xml/pull/4)) ## 0.23.0-alpha3 diff --git a/src/de/map.rs b/src/de/map.rs index ab0b9943..dc9cab93 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -4,11 +4,12 @@ use crate::{ de::escape::EscapedDeserializer, de::{DeEvent, Deserializer, XmlRead, INNER_VALUE, UNFLATTEN_PREFIX}, errors::serialize::DeError, - events::attributes::Attribute, + events::attributes::IterState, events::BytesStart, }; use serde::de::{self, DeserializeSeed, IntoDeserializer}; use std::borrow::Cow; +use std::ops::Range; /// Representing state of the `MapAccess` accessor. enum State { @@ -17,7 +18,7 @@ enum State { Empty, /// `next_key_seed` checked the attributes list and find it is not exhausted yet. /// Next call to the `next_value_seed` will deserialize type from the attribute value - Attribute, + Attribute(Range), /// The same as `InnerValue` Nested, /// Value should be deserialized from the text content of the XML node: @@ -41,7 +42,7 @@ where /// do not store reference to `Attributes` itself but instead create /// a new object on each advance of `Attributes` iterator, so we need /// to restore last position before advance. - position: usize, + iter: IterState, /// Current state of the accessor that determines what next call to API /// methods should return. 
state: State, @@ -59,11 +60,10 @@ where start: BytesStart<'de>, fields: &[&'static str], ) -> Result { - let position = start.attributes().position; Ok(MapAccess { de, start, - position, + iter: IterState::new(0, false), state: State::Empty, unflatten_fields: fields .iter() @@ -72,14 +72,6 @@ where .collect(), }) } - - fn next_attr(&mut self) -> Result, DeError> { - let mut attributes = self.start.attributes(); - attributes.position = self.position; - let next_att = attributes.next().transpose()?; - self.position = attributes.position; - Ok(next_att) - } } impl<'de, 'a, R> de::MapAccess<'de> for MapAccess<'de, 'a, R> @@ -92,16 +84,17 @@ where &mut self, seed: K, ) -> Result, Self::Error> { + // FIXME: There error positions counted from end of tag name - need global position + let slice = self.start.attributes_raw(); let decoder = self.de.reader.decoder(); let has_value_field = self.de.has_value_field; - let mut attributes = self.start.attributes(); - attributes.position = self.position; - if let Some(a) = attributes.next().transpose()? { + if let Some(a) = self.iter.next(slice).transpose()? { // try getting map from attributes (key= "value") - self.state = State::Attribute; + let (key, value) = a.into(); + self.state = State::Attribute(value.unwrap_or_default()); seed.deserialize(EscapedDeserializer::new( - Cow::Borrowed(a.key), + Cow::Borrowed(&slice[key]), decoder, false, )) @@ -172,13 +165,15 @@ where seed: K, ) -> Result { match std::mem::replace(&mut self.state, State::Empty) { - State::Attribute => { + State::Attribute(value) => { + let slice = self.start.attributes_raw(); let decoder = self.de.reader.decoder(); - match self.next_attr()? 
{ - Some(a) => seed.deserialize(EscapedDeserializer::new(a.value, decoder, true)), - // We set `Attribute` state only when we are sure that `next_attr()` returns a value - None => unreachable!(), - } + + seed.deserialize(EscapedDeserializer::new( + Cow::Borrowed(&slice[value]), + decoder, + true, + )) } State::Nested | State::InnerValue => seed.deserialize(&mut *self.de), State::Empty => Err(DeError::EndOfAttributes), diff --git a/src/events/attributes.rs b/src/events/attributes.rs index bf03eddc..77842c3e 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -5,7 +5,9 @@ use crate::errors::{Error, Result as XmlResult}; use crate::escape::{do_unescape, escape}; use crate::reader::{is_whitespace, Reader}; -use std::fmt::{Debug, Display, Formatter}; +use crate::utils::{write_byte_string, write_cow_string, Bytes}; +use std::fmt::{self, Debug, Display, Formatter}; +use std::iter::FusedIterator; use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Range}; /// A struct representing a key/value XML attribute. @@ -225,9 +227,7 @@ impl<'a> Attribute<'a> { } impl<'a> Debug for Attribute<'a> { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - use crate::utils::{write_byte_string, write_cow_string}; - + fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!(f, "Attribute {{ key: ")?; write_byte_string(f, self.key)?; write!(f, ", value: ")?; @@ -278,6 +278,16 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> { } } +impl<'a> From> for Attribute<'a> { + #[inline] + fn from(attr: Attr<&'a [u8]>) -> Self { + Self { + key: attr.key(), + value: Cow::Borrowed(attr.value()), + } + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// Iterator over XML attributes. 
@@ -290,37 +300,24 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> { pub struct Attributes<'a> { /// slice of `Element` corresponding to attributes bytes: &'a [u8], - /// current position of the iterator - pub(crate) position: usize, - /// if true, checks for duplicate names - with_checks: bool, - /// allows attribute without quote or `=` - html: bool, - /// if `with_checks`, contains the ranges corresponding to the - /// attribute names already parsed in this `Element` - consumed: Vec>, + /// Iterator state, independent from the actual source of bytes + state: IterState, } impl<'a> Attributes<'a> { /// Creates a new attribute iterator from a buffer. - pub fn new(buf: &'a [u8], pos: usize) -> Attributes<'a> { - Attributes { + pub fn new(buf: &'a [u8], pos: usize) -> Self { + Self { bytes: buf, - position: pos, - html: false, - with_checks: true, - consumed: Vec::new(), + state: IterState::new(pos, false), } } /// Creates a new attribute iterator from a buffer, allowing HTML attribute syntax. - pub fn html(buf: &'a [u8], pos: usize) -> Attributes<'a> { - Attributes { + pub fn html(buf: &'a [u8], pos: usize) -> Self { + Self { bytes: buf, - position: pos, - html: true, - with_checks: true, - consumed: Vec::new(), + state: IterState::new(pos, true), } } @@ -331,7 +328,7 @@ impl<'a> Attributes<'a> { /// /// (`true` by default) pub fn with_checks(&mut self, val: bool) -> &mut Attributes<'a> { - self.with_checks = val; + self.state.check_duplicates = val; self } } @@ -339,112 +336,18 @@ impl<'a> Attributes<'a> { impl<'a> Iterator for Attributes<'a> { type Item = Result, AttrError>; + #[inline] fn next(&mut self) -> Option { - let len = self.bytes.len(); - - macro_rules! err { - ($err:expr) => {{ - self.position = len; - return Some(Err($err.into())); - }}; - } - - macro_rules! 
attr { - ($key:expr) => {{ - self.position = len; - if self.html { - attr!($key, 0..0) - } else { - None - } - }}; - ($key:expr, $val:expr) => { - Some(Ok(Attribute { - key: &self.bytes[$key], - value: Cow::Borrowed(&self.bytes[$val]), - })) - }; - } - - if len <= self.position { - return None; - } - - let mut bytes = self.bytes.iter().enumerate().skip(self.position); - - // key starts after the whitespace - let start_key = match bytes - .by_ref() - .skip_while(|&(_, &b)| !is_whitespace(b)) - .find(|&(_, &b)| !is_whitespace(b)) - { - Some((i, _)) => i, - None => return attr!(self.position..len), - }; - - // key ends with either whitespace or = - let end_key = match bytes - .by_ref() - .find(|&(_, &b)| b == b'=' || is_whitespace(b)) - { - Some((i, &b'=')) => i, - Some((i, _)) => { - // consume until `=` or return if html - match bytes.by_ref().find(|&(_, &b)| !is_whitespace(b)) { - Some((_, &b'=')) => i, - Some((j, _)) if self.html => { - self.position = j - 1; - return attr!(start_key..i, 0..0); - } - Some((j, _)) => err!(AttrError::ExpectedEq(j)), - None if self.html => { - self.position = len; - return attr!(start_key..len, 0..0); - } - None => err!(AttrError::ExpectedEq(len)), - } - } - None => return attr!(start_key..len), - }; - - if self.with_checks { - if let Some(start) = self - .consumed - .iter() - .filter(|r| r.len() == end_key - start_key) - .find(|r| self.bytes[(*r).clone()] == self.bytes[start_key..end_key]) - .map(|ref r| r.start) - { - err!(AttrError::Duplicated(start_key, start)); - } - self.consumed.push(start_key..end_key); - } - - // value has quote if not html - match bytes.by_ref().find(|&(_, &b)| !is_whitespace(b)) { - Some((i, quote @ &b'\'')) | Some((i, quote @ &b'"')) => { - match bytes.by_ref().find(|&(_, &b)| b == *quote) { - Some((j, _)) => { - self.position = j + 1; - return attr!(start_key..end_key, i + 1..j); - } - None => err!(AttrError::UnquotedValue(i)), - } - } - Some((i, _)) if self.html => { - let j = bytes - .by_ref() - 
.find(|&(_, &b)| is_whitespace(b)) - .map_or(len, |(j, _)| j); - self.position = j; - return attr!(start_key..end_key, i..j); - } - Some((i, _)) => err!(AttrError::UnquotedValue(i)), - None => return attr!(start_key..end_key), + match self.state.next(self.bytes) { + None => None, + Some(Ok(a)) => Some(Ok(a.map(|range| &self.bytes[range]).into())), + Some(Err(e)) => Some(Err(e)), } } } +impl<'a> FusedIterator for Attributes<'a> {} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// Errors that can be raised during parsing attributes. @@ -537,7 +440,7 @@ pub enum AttrError { } impl Display for AttrError { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { match self { Self::ExpectedEq(pos) => write!( f, @@ -572,6 +475,428 @@ impl std::error::Error for AttrError {} //////////////////////////////////////////////////////////////////////////////////////////////////// +/// A struct representing a key/value XML or HTML [attribute]. +/// +/// [attribute]: https://www.w3.org/TR/xml11/#NT-Attribute +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Attr { + /// Attribute with value enclosed in double quotes (`"`). Attribute key and + /// value provided. This is a canonical XML-style attribute. + DoubleQ(T, T), + /// Attribute with value enclosed in single quotes (`'`). Attribute key and + /// value provided. This is an XML-style attribute. + SingleQ(T, T), + /// Attribute with value not enclosed in quotes. Attribute key and value + /// provided. This is HTML-style attribute, it can be returned in HTML-mode + /// parsing only. In an XML mode [`AttrError::UnquotedValue`] will be raised + /// instead. + /// + /// Attribute value can be invalid according to the [HTML specification], + /// in particular, it can contain `"`, `'`, `=`, `<`, and ` + /// characters. 
The absence of the `>` character is nevertheless guaranteed, + /// since the parser extracts [events] based on them even before the start + /// of parsing attributes. + /// + /// [HTML specification]: https://html.spec.whatwg.org/#unquoted + /// [events]: crate::events::Event::Start + Unquoted(T, T), + /// Attribute without value. Attribute key provided. This is HTML-style attribute, + /// it can be returned in HTML-mode parsing only. In XML mode + /// [`AttrError::ExpectedEq`] will be raised instead. + Empty(T), +} + +impl Attr { + /// Maps an `Attr` to `Attr` by applying a function to a contained key and value. + #[inline] + pub fn map(self, mut f: F) -> Attr + where + F: FnMut(T) -> U, + { + match self { + Attr::DoubleQ(key, value) => Attr::DoubleQ(f(key), f(value)), + Attr::SingleQ(key, value) => Attr::SingleQ(f(key), f(value)), + Attr::Empty(key) => Attr::Empty(f(key)), + Attr::Unquoted(key, value) => Attr::Unquoted(f(key), f(value)), + } + } +} + +impl<'a> Attr<&'a [u8]> { + /// Returns the key value + #[inline] + pub fn key(&self) -> &'a [u8] { + match self { + Attr::DoubleQ(key, _) => key, + Attr::SingleQ(key, _) => key, + Attr::Empty(key) => key, + Attr::Unquoted(key, _) => key, + } + } + /// Returns the attribute value. For [`Self::Empty`] variant an empty slice + /// is returned according to the [HTML specification]. 
+ /// + /// [HTML specification]: https://www.w3.org/TR/2012/WD-html-markup-20120329/syntax.html#syntax-attr-empty + #[inline] + pub fn value(&self) -> &'a [u8] { + match self { + Attr::DoubleQ(_, value) => value, + Attr::SingleQ(_, value) => value, + Attr::Empty(_) => &[], + Attr::Unquoted(_, value) => value, + } + } +} + +impl> Debug for Attr { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + match self { + Attr::DoubleQ(key, value) => f + .debug_tuple("Attr::DoubleQ") + .field(&Bytes(key.as_ref())) + .field(&Bytes(value.as_ref())) + .finish(), + Attr::SingleQ(key, value) => f + .debug_tuple("Attr::SingleQ") + .field(&Bytes(key.as_ref())) + .field(&Bytes(value.as_ref())) + .finish(), + Attr::Empty(key) => f + .debug_tuple("Attr::Empty") + // Comment to prevent formatting and keep style consistent + .field(&Bytes(key.as_ref())) + .finish(), + Attr::Unquoted(key, value) => f + .debug_tuple("Attr::Unquoted") + .field(&Bytes(key.as_ref())) + .field(&Bytes(value.as_ref())) + .finish(), + } + } +} + +/// Unpacks attribute key and value into tuple of this two elements. +/// `None` value element is returned only for [`Attr::Empty`] variant. +impl From> for (T, Option) { + #[inline] + fn from(attr: Attr) -> Self { + match attr { + Attr::DoubleQ(key, value) => (key, Some(value)), + Attr::SingleQ(key, value) => (key, Some(value)), + Attr::Empty(key) => (key, None), + Attr::Unquoted(key, value) => (key, Some(value)), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +type AttrResult = Result>, AttrError>; + +#[derive(Clone, Copy, Debug)] +enum State { + /// Iteration finished, iterator will return `None` to all [`IterState::next`] + /// requests. + Done, + /// The last attribute returned was deserialized successfully. Contains an + /// offset from which next attribute should be searched. 
+ Next(usize), + /// The last attribute returns [`AttrError::UnquotedValue`], offset pointed + /// to the beginning of the value. Recover should skip a value + SkipValue(usize), + /// The last attribute returns [`AttrError::Duplicated`], offset pointed to + /// the equal (`=`) sign. Recover should skip it and a value + SkipEqValue(usize), +} + +/// External iterator over spans of attribute key and value +#[derive(Clone, Debug)] +pub(crate) struct IterState { + /// Iteration state that determines what actions should be done before the + /// actual parsing of the next attribute + state: State, + /// If `true`, enables ability to parse unquoted values and key-only (empty) + /// attributes + html: bool, + /// If `true`, checks for duplicate names + check_duplicates: bool, + /// If `check_duplicates` is set, contains the ranges of already parsed attribute + /// names. We store a ranges instead of slices to able to report a previous + /// attribute position + keys: Vec>, +} + +impl IterState { + pub fn new(offset: usize, html: bool) -> Self { + Self { + state: State::Next(offset), + html, + check_duplicates: true, + keys: Vec::new(), + } + } + + /// Recover from an error that could have been made on a previous step. + /// Returns an offset from which parsing should continue. + /// If there no input left, returns `None`. 
+ fn recover(&self, slice: &[u8]) -> Option { + match self.state { + State::Done => None, + State::Next(offset) => Some(offset), + State::SkipValue(offset) => self.skip_value(slice, offset), + State::SkipEqValue(offset) => self.skip_eq_value(slice, offset), + } + } + + /// Skip all characters up to first space symbol or end-of-input + #[inline] + fn skip_value(&self, slice: &[u8], offset: usize) -> Option { + let mut iter = (offset..).zip(slice[offset..].iter()); + + match iter.find(|(_, &b)| is_whitespace(b)) { + // Input: ` key = value ` + // | ^ + // offset e + Some((e, _)) => Some(e), + // Input: ` key = value` + // | ^ + // offset e = len() + None => None, + } + } + + /// Skip all characters up to first space symbol or end-of-input + #[inline] + fn skip_eq_value(&self, slice: &[u8], offset: usize) -> Option { + let mut iter = (offset..).zip(slice[offset..].iter()); + + // Skip all up to the quote and get the quote type + let quote = match iter.find(|(_, &b)| !is_whitespace(b)) { + // Input: ` key = "` + // | ^ + // offset + Some((_, b'"')) => b'"', + // Input: ` key = '` + // | ^ + // offset + Some((_, b'\'')) => b'\'', + + // Input: ` key = x` + // | ^ + // offset + Some((offset, _)) => return self.skip_value(slice, offset), + // Input: ` key = ` + // | ^ + // offset + None => return None, + }; + + match iter.find(|(_, &b)| b == quote) { + // Input: ` key = " "` + // ^ + Some((e, b'"')) => Some(e), + // Input: ` key = ' '` + // ^ + Some((e, _)) => Some(e), + + // Input: ` key = " ` + // Input: ` key = ' ` + // ^ + // Closing quote not found + None => None, + } + } + + #[inline] + fn check_for_duplicates( + &mut self, + slice: &[u8], + key: Range, + ) -> Result, AttrError> { + if self.check_duplicates { + if let Some(prev) = self + .keys + .iter() + .find(|r| slice[(*r).clone()] == slice[key.clone()]) + { + return Err(AttrError::Duplicated(key.start, prev.start)); + } + self.keys.push(key.clone()); + } + Ok(key) + } + + /// # Parameters + /// + /// - `slice`: 
content of the tag, used for checking for duplicates + /// - `key`: Range of key in slice, if iterator in HTML mode + /// - `offset`: Position of error if iterator in XML mode + #[inline] + fn key_only(&mut self, slice: &[u8], key: Range, offset: usize) -> Option { + Some(if self.html { + self.check_for_duplicates(slice, key).map(Attr::Empty) + } else { + Err(AttrError::ExpectedEq(offset)) + }) + } + + #[inline] + fn double_q(&mut self, key: Range, value: Range) -> Option { + self.state = State::Next(value.end + 1); // +1 for `"` + + Some(Ok(Attr::DoubleQ(key, value))) + } + + #[inline] + fn single_q(&mut self, key: Range, value: Range) -> Option { + self.state = State::Next(value.end + 1); // +1 for `'` + + Some(Ok(Attr::SingleQ(key, value))) + } + + pub fn next(&mut self, slice: &[u8]) -> Option { + let mut iter = match self.recover(slice) { + Some(offset) => (offset..).zip(slice[offset..].iter()), + None => return None, + }; + + // Index where next key started + let start_key = match iter.find(|(_, &b)| !is_whitespace(b)) { + // Input: ` key` + // ^ + Some((s, _)) => s, + // Input: ` ` + // ^ + None => { + // Because we reach end-of-input, stop iteration on next call + self.state = State::Done; + return None; + } + }; + // Span of a key + let (key, offset) = match iter.find(|(_, &b)| b == b'=' || is_whitespace(b)) { + // Input: ` key=` + // | ^ + // s e + Some((e, b'=')) => (start_key..e, e), + + // Input: ` key ` + // ^ + Some((e, _)) => match iter.find(|(_, &b)| !is_whitespace(b)) { + // Input: ` key =` + // | | ^ + // start_key e + Some((offset, b'=')) => (start_key..e, offset), + // Input: ` key x` + // | | ^ + // start_key e + // If HTML-like attributes is allowed, this is the result, otherwise error + Some((offset, _)) => { + // In any case, recovering is not required + self.state = State::Next(offset); + return self.key_only(slice, start_key..e, offset); + } + // Input: ` key ` + // | | ^ + // start_key e + // If HTML-like attributes is allowed, this is 
the result, otherwise error + None => { + // Because we reach end-of-input, stop iteration on next call + self.state = State::Done; + return self.key_only(slice, start_key..e, slice.len()); + } + }, + + // Input: ` key` + // | ^ + // s e = len() + // If HTML-like attributes is allowed, this is the result, otherwise error + None => { + // Because we reach end-of-input, stop iteration on next call + self.state = State::Done; + let e = slice.len(); + return self.key_only(slice, start_key..e, e); + } + }; + + let key = match self.check_for_duplicates(slice, key) { + Err(e) => { + self.state = State::SkipEqValue(offset); + return Some(Err(e)); + } + Ok(key) => key, + }; + + //////////////////////////////////////////////////////////////////////// + + // Gets the position of quote and quote type + let (start_value, quote) = match iter.find(|(_, &b)| !is_whitespace(b)) { + // Input: ` key = "` + // ^ + Some((s, b'"')) => (s + 1, b'"'), + // Input: ` key = '` + // ^ + Some((s, b'\'')) => (s + 1, b'\''), + + // Input: ` key = x` + // ^ + // If HTML-like attributes is allowed, this is the start of the value + Some((s, _)) if self.html => { + // We do not check validity of attribute value characters as required + // according to https://html.spec.whatwg.org/#unquoted. 
It can be done + // during validation phase + let end = match iter.find(|(_, &b)| is_whitespace(b)) { + // Input: ` key = value ` + // | ^ + // s e + Some((e, _)) => e, + // Input: ` key = value` + // | ^ + // s e = len() + None => slice.len(), + }; + self.state = State::Next(end); + return Some(Ok(Attr::Unquoted(key, s..end))); + } + // Input: ` key = x` + // ^ + Some((s, _)) => { + self.state = State::SkipValue(s); + return Some(Err(AttrError::UnquotedValue(s))); + } + + // Input: ` key = ` + // ^ + None => { + // Because we reach end-of-input, stop iteration on next call + self.state = State::Done; + return Some(Err(AttrError::ExpectedValue(slice.len()))); + } + }; + + match iter.find(|(_, &b)| b == quote) { + // Input: ` key = " "` + // ^ + Some((e, b'"')) => self.double_q(key, start_value..e), + // Input: ` key = ' '` + // ^ + Some((e, _)) => self.single_q(key, start_value..e), + + // Input: ` key = " ` + // Input: ` key = ' ` + // ^ + // Closing quote not found + None => { + // Because we reach end-of-input, stop iteration on next call + self.state = State::Done; + return Some(Err(AttrError::ExpectedQuote(slice.len(), quote))); + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// Checks, how parsing of XML-style attributes works. Each attribute should /// have a value, enclosed in single or double quotes. #[cfg(test)]