Skip to content

Commit

Permalink
More correctly normalize attribute values
Browse files Browse the repository at this point in the history
  • Loading branch information
dralley committed Jan 28, 2023
1 parent add7406 commit c0f0577
Show file tree
Hide file tree
Showing 7 changed files with 308 additions and 12 deletions.
11 changes: 11 additions & 0 deletions Changelog.md
Expand Up @@ -14,6 +14,10 @@

- [#541]: Deserialize specially named `$text` enum variant in [externally tagged]
enums from textual content
- [#379]: Improved compliance with the XML attribute value normalization process by
adding `Attribute::normalized_value()` and `Attribute::normalized_value_with()`,
which ought to be used in place of `Attribute::unescape_value()` and
`Attribute::unescape_value_with()`

### Bug Fixes

Expand All @@ -22,10 +26,15 @@

### Misc Changes

### New Tests

- [#379]: Added tests for attribute value normalization

[externally tagged]: https://serde.rs/enum-representations.html#externally-tagged
[#490]: https://github.com/tafia/quick-xml/pull/490
[#537]: https://github.com/tafia/quick-xml/issues/537
[#541]: https://github.com/tafia/quick-xml/pull/541
[#379]: https://github.com/tafia/quick-xml/pull/379

## 0.27.1 -- 2022-12-28

Expand Down Expand Up @@ -120,6 +129,8 @@
- [#489]: Reduced the size of the package uploaded into the crates.io by excluding
tests, examples, and benchmarks.

### New Tests

[#481]: https://github.com/tafia/quick-xml/pull/481
[#489]: https://github.com/tafia/quick-xml/pull/489

Expand Down
3 changes: 1 addition & 2 deletions benches/macrobenches.rs
Expand Up @@ -43,14 +43,13 @@ static INPUTS: &[(&str, &str)] = &[
("players.xml", PLAYERS),
];

// TODO: use fully normalized attribute values
fn parse_document_from_str(doc: &str) -> XmlResult<()> {
let mut r = Reader::from_str(doc);
loop {
match criterion::black_box(r.read_event()?) {
Event::Start(e) | Event::Empty(e) => {
for attr in e.attributes() {
criterion::black_box(attr?.decode_and_unescape_value(&r)?);
criterion::black_box(attr?.normalized_value()?);
}
}
Event::Text(e) => {
Expand Down
45 changes: 45 additions & 0 deletions benches/microbenches.rs
Expand Up @@ -242,6 +242,50 @@ fn attributes(c: &mut Criterion) {
assert_eq!(count, 150);
})
});

group.finish();
}

/// Benchmarks normalizing attribute values
fn attribute_value_normalization(c: &mut Criterion) {
let mut group = c.benchmark_group("attribute_value_normalization");

group.bench_function("noop_short", |b| {
b.iter(|| {
criterion::black_box(unescape("foobar")).unwrap();
})
});

group.bench_function("noop_long", |b| {
b.iter(|| {
criterion::black_box(unescape("just a bit of text without any entities")).unwrap();
})
});

group.bench_function("replacement_chars", |b| {
b.iter(|| {
criterion::black_box(unescape("just a bit\n of text without\tany entities")).unwrap();
})
});

group.bench_function("char_reference", |b| {
b.iter(|| {
let text = "prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
criterion::black_box(unescape(text)).unwrap();
let text = "&#38;&#60;";
criterion::black_box(unescape(text)).unwrap();
})
});

group.bench_function("entity_reference", |b| {
b.iter(|| {
let text = "age &gt; 72 &amp;&amp; age &lt; 21";
criterion::black_box(unescape(text)).unwrap();
let text = "&quot;what&apos;s that?&quot;";
criterion::black_box(unescape(text)).unwrap();
})
});

group.finish();
}

Expand Down Expand Up @@ -354,6 +398,7 @@ criterion_group!(
read_resolved_event_into,
one_event,
attributes,
attribute_value_normalization,
escaping,
unescaping,
);
Expand Down
1 change: 1 addition & 0 deletions src/errors.rs
Expand Up @@ -77,6 +77,7 @@ impl From<EscapeError> for Error {
}

impl From<AttrError> for Error {
/// Creates a new `Error::InvalidAttr` from the given error
#[inline]
fn from(error: AttrError) -> Self {
Error::InvalidAttr(error)
Expand Down
71 changes: 64 additions & 7 deletions src/escapei.rs
Expand Up @@ -8,7 +8,7 @@ use std::ops::Range;
use pretty_assertions::assert_eq;

/// Error for XML escape / unescape.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, PartialEq)]
pub enum EscapeError {
/// Entity with Null character
EntityWithNull(Range<usize>),
Expand Down Expand Up @@ -212,8 +212,66 @@ where
}
}

/// Returns the attribute value normalized as per the XML specification, using a custom
/// entity resolver.
///
/// https://www.w3.org/TR/xml/#AVNormalize
///
/// Do not use this method with HTML attributes.
///
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
/// take precedence.
///
/// This will allocate unless the raw attribute value does not require normalization.
pub(crate) fn normalize_attribute_value_with<'a, 'entity>(
value: &'a str,
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
) -> Result<Cow<'a, str>, EscapeError> {
// TODO: avoid allocation when not needed
let mut normalized = String::with_capacity(value.len());

let attr = value.as_bytes();
let mut attr_iter = attr.iter().enumerate();

while let Some((idx, ch)) = attr_iter.next() {
match ch {
b' ' | b'\n' | b'\r' | b'\t' => normalized.push(' '),
b'&' => {
let end = idx
+ 1
+ attr_iter
.position(|(_, c)| *c == b';')
.ok_or_else(|| EscapeError::UnterminatedEntity(idx..attr.len()))?;
let entity = &attr[idx + 1..end]; // starts after the &
let entity_str = std::str::from_utf8(entity).expect("failed UTF-8 check");

if entity.starts_with(b"#") {
let entity = &entity_str[1..]; // starts after the #
let codepoint = parse_number(entity, idx..end)?;
normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
} else if let Some(s) = named_entity(entity_str) {
normalized.push_str(s);
} else if let Some(value) = resolve_entity(entity_str) {
// TODO: recursively apply entity substitution
normalized.push_str(&value);
} else {
return Err(EscapeError::UnrecognizedSymbol(
idx + 1..end,
String::from_utf8(entity.to_vec()).expect("failed UTF-8 check"),
));
}
}
_ => normalized.push(*ch as char),
}
}

Ok(Cow::Owned(normalized))
}

#[cfg(not(feature = "escape-html"))]
fn named_entity(name: &str) -> Option<&str> {
pub(crate) fn named_entity(name: &str) -> Option<&str> {
// match over strings are not allowed in const functions
let s = match name.as_bytes() {
b"lt" => "<",
Expand All @@ -226,9 +284,8 @@ fn named_entity(name: &str) -> Option<&str> {
Some(s)
}
#[cfg(feature = "escape-html")]
fn named_entity(name: &str) -> Option<&str> {
pub(crate) fn named_entity(name: &str) -> Option<&str> {
// imported from https://dev.w3.org/html5/html-author/charref
// match over strings are not allowed in const functions
//TODO: automate up-to-dating using https://html.spec.whatwg.org/entities.json
let s = match name.as_bytes() {
b"Tab" => "\u{09}",
Expand Down Expand Up @@ -1690,7 +1747,7 @@ fn named_entity(name: &str) -> Option<&str> {
Some(s)
}

fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
pub fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
let code = if bytes.starts_with('x') {
parse_hexadecimal(&bytes[1..])
} else {
Expand All @@ -1705,7 +1762,7 @@ fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
}
}

fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
pub fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
// maximum code is 0x10FFFF => 6 characters
if bytes.len() > 6 {
return Err(EscapeError::TooLongHexadecimal);
Expand All @@ -1723,7 +1780,7 @@ fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
Ok(code)
}

fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
pub fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
// maximum code is 0x10FFFF = 1114111 => 7 characters
if bytes.len() > 7 {
return Err(EscapeError::TooLongDecimal);
Expand Down

0 comments on commit c0f0577

Please sign in to comment.