diff --git a/Changelog.md b/Changelog.md index e0ebd96a..a1b4f6fa 100644 --- a/Changelog.md +++ b/Changelog.md @@ -14,8 +14,17 @@ ### Bug Fixes +- [#379]: Improved compliance with the XML attribute value normalization process by + adding `Attribute::normalized_value()` and `Attribute::normalized_value_with()`, + which ought to be used in place of `Attribute::unescape_value()` and + `Attribute::unescape_value_with()` +### Bug Fixes + ### Misc Changes +- [#379]: Added tests for attribute value normalization + +[#379]: https://github.com/tafia/quick-xml/pull/379 ## 0.29.0 -- 2023-06-13 @@ -111,6 +120,10 @@ - [#565]: Fix compilation error when build with serde <1.0.139 +### New Tests + +- [#379]: Added tests for attribute value normalization + [externally tagged]: https://serde.rs/enum-representations.html#externally-tagged [#490]: https://github.com/tafia/quick-xml/pull/490 [#510]: https://github.com/tafia/quick-xml/issues/510 diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index a8cbbd53..8864e74b 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -43,14 +43,13 @@ static INPUTS: &[(&str, &str)] = &[ ("players.xml", PLAYERS), ]; -// TODO: use fully normalized attribute values fn parse_document_from_str(doc: &str) -> XmlResult<()> { let mut r = Reader::from_str(doc); loop { match criterion::black_box(r.read_event()?) { Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { - criterion::black_box(attr?.decode_and_unescape_value(&r)?); + criterion::black_box(attr?.decode_and_normalize_value(&r)?); } } Event::Text(e) => { @@ -67,7 +66,6 @@ fn parse_document_from_str(doc: &str) -> XmlResult<()> { Ok(()) } -// TODO: use fully normalized attribute values fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { let mut r = Reader::from_reader(doc); let mut buf = Vec::new(); @@ -75,7 +73,7 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { match criterion::black_box(r.read_event_into(&mut buf)?) { Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { - criterion::black_box(attr?.decode_and_unescape_value(&r)?); + criterion::black_box(attr?.decode_and_normalize_value(&r)?); } } Event::Text(e) => { @@ -93,7 +91,6 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { Ok(()) } -// TODO: use fully normalized attribute values fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { let mut r = NsReader::from_str(doc); loop { @@ -101,7 +98,7 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { (resolved_ns, Event::Start(e) | Event::Empty(e)) => { criterion::black_box(resolved_ns); for attr in e.attributes() { - criterion::black_box(attr?.decode_and_unescape_value(&r)?); + criterion::black_box(attr?.decode_and_normalize_value(&r)?); } } (resolved_ns, Event::Text(e)) => { @@ -120,7 +117,6 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { Ok(()) } -// TODO: use fully normalized attribute values fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> { let mut r = NsReader::from_reader(doc); let mut buf = Vec::new(); @@ -129,7 +125,7 @@ fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> { (resolved_ns, Event::Start(e) | Event::Empty(e)) => { criterion::black_box(resolved_ns); for attr in e.attributes() { - criterion::black_box(attr?.decode_and_unescape_value(&r)?); + criterion::black_box(attr?.decode_and_normalize_value(&r)?); } } (resolved_ns, Event::Text(e)) => { diff --git a/benches/microbenches.rs b/benches/microbenches.rs index aa5c8b70..0b35f7e7 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -1,6 +1,9 @@ +use std::borrow::Cow; + use criterion::{self, criterion_group, criterion_main, Criterion}; use pretty_assertions::assert_eq; use quick_xml::escape::{escape, unescape}; +use quick_xml::events::attributes::Attribute; use quick_xml::events::Event; use quick_xml::name::QName; use quick_xml::reader::{NsReader, Reader}; @@ -242,6 +245,74 @@ fn attributes(c: &mut Criterion) { assert_eq!(count, 150); }) }); + + group.finish(); +} + +/// Benchmarks normalizing attribute values +fn attribute_value_normalization(c: &mut Criterion) { + let mut group = c.benchmark_group("attribute_value_normalization"); + + group.bench_function("noop_short", |b| { + let attr = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(b"foobar"), + }; + b.iter(|| { + criterion::black_box(attr.normalized_value()).unwrap(); + }) + }); + + group.bench_function("noop_long", |b| { + let attr = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(LOREM_IPSUM_TEXT.as_bytes()), + }; + b.iter(|| { + criterion::black_box(attr.normalized_value()).unwrap(); + }) + }); + + group.bench_function("replacement_chars", |b| { + let attr = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(b"just a bit\n of text without\tany entities"), + }; + b.iter(|| { + criterion::black_box(attr.normalized_value()).unwrap(); + }) + }); + + group.bench_function("char_reference", |b| { + let attr1 = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(b"prefix "some stuff","more stuff""), + }; + let attr2 = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(b"&<"), + }; + b.iter(|| { + criterion::black_box(attr1.normalized_value()).unwrap(); + criterion::black_box(attr2.normalized_value()).unwrap(); + }) + }); + + group.bench_function("entity_reference", |b| { + let attr1 = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(b"age > 72 && age < 21"), + }; + let attr2 = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(b""what's that?""), + }; + b.iter(|| { + criterion::black_box(attr1.normalized_value()).unwrap(); + criterion::black_box(attr2.normalized_value()).unwrap(); + }) + }); + group.finish(); } @@ -354,6 +425,7 @@ criterion_group!( read_resolved_event_into, one_event, attributes, + attribute_value_normalization, escaping, unescaping, ); diff --git a/src/errors.rs b/src/errors.rs index 14cd7a5c..fb5d2d18 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -82,6 +82,7 @@ impl From for Error { } impl From for Error { + /// Creates a new `Error::InvalidAttr` from the given error #[inline] fn from(error: AttrError) -> Self { Error::InvalidAttr(error) diff --git a/src/escapei.rs b/src/escapei.rs index 7ca5da46..ba60e046 100644 --- a/src/escapei.rs +++ b/src/escapei.rs @@ -3,12 +3,13 @@ use memchr::memchr2_iter; use std::borrow::Cow; use std::ops::Range; +use std::slice::Iter; #[cfg(test)] use pretty_assertions::assert_eq; /// Error for XML escape / unescape. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq)] pub enum EscapeError { /// Entity with Null character EntityWithNull(Range), @@ -211,8 +212,166 @@ where } } +const fn is_normalization_char(b: &u8) -> bool { + matches!(*b, b'\t' | b'\r' | b'\n' | b' ' | b'&') +} + +/// Returns the attribute value normalized as per [the XML specification], +/// using a custom entity resolver. +/// +/// Do not use this method with HTML attributes. +/// +/// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` +/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function +/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still +/// take precedence. +/// +/// This will allocate unless the raw attribute value does not require normalization. +/// +/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize +pub(crate) fn normalize_attribute_value<'input, 'entity, F>( + value: &'input str, + resolve_entity: F, +) -> Result, EscapeError> +where + // the lifetime of the output comes from a capture or is `'static` + F: Fn(&str) -> Option<&'entity str>, +{ + const DEPTH: usize = 128; + + let bytes = value.as_bytes(); + let mut iter = bytes.iter(); + + if let Some(i) = iter.position(is_normalization_char) { + let mut normalized = String::new(); + let pos = normalize_step( + &mut normalized, + &mut iter, + value, + 0, + i, + DEPTH, + &resolve_entity, + )?; + + normalize_steps( + &mut normalized, + &mut iter, + value, + pos, + DEPTH, + &resolve_entity, + )?; + return Ok(normalized.into()); + } + Ok(Cow::Borrowed(value)) +} + +fn normalize_steps<'entity, F>( + normalized: &mut String, + iter: &mut Iter, + input: &str, + mut pos: usize, + depth: usize, + resolve_entity: &F, +) -> Result<(), EscapeError> +where + // the lifetime of the output comes from a capture or is `'static` + F: Fn(&str) -> Option<&'entity str>, +{ + while let Some(i) = iter.position(is_normalization_char) { + pos = normalize_step(normalized, iter, input, pos, pos + i, depth, resolve_entity)?; + } + if let Some(rest) = input.get(pos..) { + normalized.push_str(rest); + } + Ok(()) +} + +/// Performs one step of the [normalization algorithm] (but with recursive part): +/// +/// 1. For a character reference, append the referenced character +/// to the normalized value. +/// 2. For an entity reference, recursively apply this algorithm +/// to the replacement text of the entity. +/// 3. For a white space character (#x20, #xD, #xA, #x9), append +/// a space character (#x20) to the normalized value. +/// 4. For another character, append the character to the normalized value. +/// +/// # Parameters +/// +/// - `normalized`: Output of the algorithm. Normalized value will be placed here +/// - `iter`: Iterator over bytes of `input` +/// - `input`: Original non-normalized value +/// - `last_pos`: Index of the last byte in `input` that was processed +/// - `index`: Index of the byte in `input` that should be processed now +/// - `depth`: Current recursion depth. Too deep recursion will interrupt the algorithm +/// - `resolve_entity`: Resolver of entities. Returns `None` for unknown entities +/// +/// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize +fn normalize_step<'entity, F>( + normalized: &mut String, + iter: &mut Iter, + input: &str, + last_pos: usize, + index: usize, + depth: usize, + resolve_entity: &F, +) -> Result +where + // the lifetime of the output comes from a capture or is `'static` + F: Fn(&str) -> Option<&'entity str>, +{ + // 4. For another character, append the character to the normalized value. + normalized.push_str(&input[last_pos..index]); + + match input.as_bytes()[index] { + b'&' => { + let start = index + 1; // +1 - skip `&` + let end = start + + match iter.position(|&b| b == b';') { + Some(end) => end, + None => return Err(EscapeError::UnterminatedEntity(index..input.len())), + }; + + // Content between & and ; - &pat; + let pat = &input[start..end]; + // 1. For a character reference, append the referenced character + // to the normalized value. + if pat.starts_with('#') { + let entity = &pat[1..]; // starts after the # + let codepoint = parse_number(entity, start..end)?; + normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4])); + } else + // 2. For an entity reference, recursively apply this algorithm + // to the replacement text of the entity. + if let Some(value) = resolve_entity(pat) { + normalize_steps( + normalized, + &mut value.as_bytes().iter(), + value, + 0, + depth.saturating_sub(1), + resolve_entity, + )?; + } else { + return Err(EscapeError::UnrecognizedSymbol(start..end, pat.to_string())); + } + Ok(end + 1) // +1 - skip `;` + } + // 3. For a white space character (#x20, #xD, #xA, #x9), append + // a space character (#x20) to the normalized value. + b'\t' | b'\n' | b'\r' | b' ' => { + normalized.push(' '); + Ok(index + 1) // +1 - skip character + } + + _ => unreachable!("Only '\\t', '\\n', '\\r', ' ', and '&' are possible here"), + } +} + #[cfg(not(feature = "escape-html"))] -fn named_entity(name: &str) -> Option<&str> { +const fn named_entity(name: &str) -> Option<&str> { // match over strings are not allowed in const functions let s = match name.as_bytes() { b"lt" => "<", @@ -225,7 +384,7 @@ fn named_entity(name: &str) -> Option<&str> { Some(s) } #[cfg(feature = "escape-html")] -fn named_entity(name: &str) -> Option<&str> { +const fn named_entity(name: &str) -> Option<&str> { // imported from https://dev.w3.org/html5/html-author/charref // match over strings are not allowed in const functions //TODO: automate up-to-dating using https://html.spec.whatwg.org/entities.json @@ -1812,3 +1971,104 @@ fn test_partial_escape() { "prefix_\"a\"b&<>c" ); } + +#[cfg(test)] +mod normalization { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn empty() { + assert_eq!(normalize_attribute_value("", |_| { None }), Ok("".into())); + } + + #[test] + fn only_spaces() { + assert_eq!( + normalize_attribute_value(" ", |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_attribute_value("\t\t\t", |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_attribute_value("\r\r\r", |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_attribute_value("\n\n\n", |_| { None }), + Ok(" ".into()) + ); + } + + #[test] + fn already_normalized() { + assert_eq!( + normalize_attribute_value("already normalized", |_| { None }), + Ok("already normalized".into()) + ); + } + + #[test] + fn characters() { + assert_eq!( + normalize_attribute_value("string with character", |_| { None }), + Ok("string with character".into()) + ); + assert_eq!( + normalize_attribute_value("string with character", |_| { None }), + Ok("string with character".into()) + ); + } + + #[test] + fn entities() { + assert_eq!( + normalize_attribute_value("string with &entity; reference", |_| { + Some("replacement") + }), + Ok("string with replacement reference".into()) + ); + assert_eq!( + normalize_attribute_value("string with &entity-1; reference", |entity| { + match entity { + "entity-1" => Some("recursive &entity-2;"), + "entity-2" => Some("entity 2"), + _ => None, + } + }), + Ok("string with recursive entity 2 reference".into()) + ); + } + + #[test] + fn unclosed_entity() { + assert_eq!( + normalize_attribute_value("string with unclosed &entity reference", |_| { + // 0 ^ = 21 ^ = 38 + Some("replacement") + }), + Err(EscapeError::UnterminatedEntity(21..38)) + ); + assert_eq!( + normalize_attribute_value("string with unclosed (character) reference", |_| { + // 0 ^ = 21 ^ = 47 + None + }), + Err(EscapeError::UnterminatedEntity(21..47)) + ); + } + + #[test] + fn unknown_entity() { + assert_eq!( + normalize_attribute_value("string with unknown &entity; reference", |_| { None }), + // 0 ^ ^ = 21..27 + Err(EscapeError::UnrecognizedSymbol( + 21..27, + "entity".to_string() + )) + ); + } +} diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 2b109aa9..abde25cf 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -4,6 +4,9 @@ use crate::errors::Result as XmlResult; use crate::escape::{escape, unescape_with}; +use crate::escapei; +#[cfg(any(doc, not(feature = "encoding")))] +use crate::escapei::EscapeError; use crate::name::QName; use crate::reader::{is_whitespace, Reader}; use crate::utils::{write_byte_string, write_cow_string, Bytes}; @@ -30,7 +33,100 @@ pub struct Attribute<'a> { } impl<'a> Attribute<'a> { - /// Decodes using UTF-8 then unescapes the value. + /// Returns the attribute value normalized as per [the XML specification]. + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`normalized_value_with()`](#method.normalized_value_with) + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + #[cfg(any(doc, not(feature = "encoding")))] + pub fn normalized_value(&'a self) -> Result, EscapeError> { + self.normalized_value_with(|_| None) + } + + /// Returns the attribute value normalized as per [the XML specification], + /// using a custom entity resolver. + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function + /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still + /// take precedence. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`normalized_value()`](#method.normalized_value) + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + #[cfg(any(doc, not(feature = "encoding")))] + pub fn normalized_value_with<'entity>( + &'a self, + resolve_entity: impl Fn(&str) -> Option<&'entity str>, + ) -> Result, EscapeError> { + escapei::normalize_attribute_value( + std::str::from_utf8(self.value.as_ref()).expect("unable to decode as utf-8"), + resolve_entity, + ) + } + + /// Decodes using a provided reader and returns the attribute value normalized + /// as per [the XML specification]. + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`decode_and_normalize_value_with()`](#method.decode_and_normalize_value_with) + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + pub fn decode_and_normalize_value(&self, reader: &Reader) -> XmlResult> { + self.decode_and_normalize_value_with(reader, |_| None) + } + + /// Decodes using a provided reader and returns the attribute value normalized + /// as per [the XML specification], using a custom entity resolver. + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function + /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still + /// take precedence. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`decode_and_normalize_value()`](#method.decode_and_normalize_value) + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + pub fn decode_and_normalize_value_with<'entity, B>( + &self, + reader: &Reader, + resolve_entity: impl Fn(&str) -> Option<&'entity str>, + ) -> XmlResult> { + let decoded = match &self.value { + Cow::Borrowed(bytes) => reader.decoder().decode(bytes)?, + // Convert to owned, because otherwise Cow will be bound with wrong lifetime + Cow::Owned(bytes) => reader.decoder().decode(bytes)?.into_owned().into(), + }; + + match escapei::normalize_attribute_value(&decoded, resolve_entity)? { + // Because result is borrowed, no replacements was done and we can use original string + Cow::Borrowed(_) => Ok(decoded), + Cow::Owned(s) => Ok(s.into()), + } + } + + /// Returns the unescaped value. /// /// This is normally the value you are interested in. Escape sequences such as `>` are /// replaced with their unescaped equivalents such as `>`. @@ -792,6 +888,91 @@ mod xml { use super::*; use pretty_assertions::assert_eq; + #[cfg(any(doc, not(feature = "encoding")))] + mod attribute_value_normalization { + + use super::*; + use pretty_assertions::assert_eq; + + /// Empty values returned are unchanged + #[test] + fn test_empty() { + let raw_value = "".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Borrowed(""))); + } + + /// Already normalized values are returned unchanged + #[test] + fn test_already_normalized() { + let raw_value = "foobar123".as_bytes(); + let output = "foobar123"; + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Borrowed(output))); + } + + /// Return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with + /// a space character + #[test] + fn test_space_replacement() { + let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes(); + let output = " foo bar baz delta ".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output))); + } + + /// Entities must be terminated + #[test] + fn test_unterminated_entity() { + let raw_value = "abc"def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.normalized_value(), + Err(EscapeError::UnterminatedEntity(3..11)) + ); + } + + /// Unknown entities raise error + #[test] + fn test_unrecognized_entity() { + let raw_value = "abc&unkn;def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.normalized_value(), + Err(EscapeError::UnrecognizedSymbol(4..8, "unkn".to_owned())) // TODO: is this divergence between range behavior of UnterminatedEntity and UnrecognizedSymbol appropriate? existing unescape code behaves the same. (see: start index) + ); + } + + /// custom entity replacement works, entity replacement text processed recursively + #[test] + fn test_entity_replacement() { + let raw_value = "&d;&d;A&a; &a;B&da;".as_bytes(); + let output = " A B ".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + fn custom_resolver(ent: &str) -> Option<&'static str> { + match ent { + "d" => Some(" "), + "a" => Some(" "), + "da" => Some(" "), + _ => None, + } + } + assert_eq!( + attr.normalized_value_with(&custom_resolver), + Ok(Cow::Owned(output)) + ); + } + + #[test] + fn test_char_references() { + // character literal references are substituted without being replaced by spaces + let raw_value = " A B ".as_bytes(); + let output = "\r\rA\n\nB\r\n".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output))); + } + } + /// Checked attribute is the single attribute mod single { use super::*;