Skip to content

Commit

Permalink
Faster escape routines
Browse files Browse the repository at this point in the history
  • Loading branch information
dralley committed Jun 29, 2022
1 parent 0febc2b commit a881c1c
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 5 deletions.
2 changes: 2 additions & 0 deletions Cargo.toml
Expand Up @@ -16,6 +16,8 @@ document-features = { version = "0.2", optional = true }
encoding_rs = { version = "0.8", optional = true }
serde = { version = "1.0", optional = true }
memchr = "2.5"
jetscii = "0.5.2"
once_cell = "1.12.0"

[dev-dependencies]
criterion = "0.3"
Expand Down
53 changes: 48 additions & 5 deletions src/escapei.rs
@@ -1,13 +1,20 @@
//! Manage xml character escapes

use memchr;
use std::borrow::Cow;
use std::collections::HashMap;
use std::ops::Range;

use jetscii::bytes;
use memchr;
use once_cell::sync::Lazy;

#[cfg(test)]
use pretty_assertions::assert_eq;

/// Byte matcher for the five XML special characters: `<`, `>`, `&`, `'` and `"`.
///
/// Wrapped in `Lazy`, so the matcher is constructed by the closure on first access.
// NOTE(review): in the visible call sites this matcher is passed by `partial_escape`,
// while `escape` passes `XML_PARTIAL_ESCAPE_BYTES` — that looks swapped; confirm.
static XML_ESCAPE_BYTES: Lazy<jetscii::BytesConst> =
Lazy::new(|| bytes!(b'<', b'>', b'&', b'\'', b'"'));
/// Byte matcher for the reduced set `<`, `>` and `&` (quotes are not matched).
///
/// Wrapped in `Lazy`, so the matcher is constructed by the closure on first access.
static XML_PARTIAL_ESCAPE_BYTES: Lazy<jetscii::BytesConst> = Lazy::new(|| bytes!(b'<', b'>', b'&'));

/// Error for XML escape/unescape.
#[derive(Debug)]
pub enum EscapeError {
Expand Down Expand Up @@ -73,8 +80,8 @@ pub fn escape(raw: &[u8]) -> Cow<[u8]> {
_ => false,
}
}

_escape(raw, to_escape)
// _escape(raw, to_escape)
simd_escape(raw, &XML_PARTIAL_ESCAPE_BYTES)
}

/// Should only be used for escaping text content. In xml text content, it is allowed
Expand All @@ -89,8 +96,8 @@ pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
_ => false,
}
}

_escape(raw, to_escape)
// _escape(raw, to_escape)
simd_escape(raw, &XML_ESCAPE_BYTES)
}

/// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
Expand Down Expand Up @@ -127,6 +134,42 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
}
}

/// Escapes every byte of `raw` reported by `escape_matcher`, replacing it with its
/// XML entity (`<` → `&lt;`, `>` → `&gt;`, `'` → `&apos;`, `&` → `&amp;`, `"` → `&quot;`).
///
/// Which of the five special characters actually get escaped is decided by
/// `escape_matcher`: bytes it does not report are copied through unchanged.
///
/// Returns `Cow::Borrowed(raw)` when the matcher finds nothing, so no allocation
/// happens for input that needs no escaping; otherwise returns a freshly built
/// `Cow::Owned` buffer.
///
/// # Panics
///
/// Panics if `escape_matcher` reports a byte other than `<`, `>`, `'`, `&` or `"`,
/// because no entity is defined here for any other byte.
pub fn simd_escape<'a>(raw: &'a [u8], escape_matcher: &jetscii::BytesConst) -> Cow<'a, [u8]> {
    // Allocated lazily: stays `None` until the first byte that needs escaping is found.
    let mut escaped: Option<Vec<u8>> = None;
    // Start of the not-yet-copied tail of `raw`; always `<= raw.len()`.
    let mut pos = 0;

    while let Some(i) = escape_matcher.find(&raw[pos..]) {
        let escaped = escaped.get_or_insert_with(|| Vec::with_capacity(raw.len()));
        // `i` is relative to the searched sub-slice; convert to an index into `raw`.
        let new_pos = pos + i;
        // Copy the clean run preceding the matched byte verbatim.
        escaped.extend_from_slice(&raw[pos..new_pos]);
        let entity: &[u8] = match raw[new_pos] {
            b'<' => b"&lt;",
            b'>' => b"&gt;",
            b'\'' => b"&apos;",
            b'&' => b"&amp;",
            b'"' => b"&quot;",
            c => unreachable!(
                "Found {} but only '<', '>', '\\'', '&' and '\"' are escaped",
                c as char
            ),
        };
        escaped.extend_from_slice(entity);
        // Resume the search just past the byte that was replaced.
        pos = new_pos + 1;
    }

    match escaped {
        Some(mut escaped) => {
            // Append whatever follows the last escaped byte (possibly empty).
            escaped.extend_from_slice(&raw[pos..]);
            Cow::Owned(escaped)
        }
        None => Cow::Borrowed(raw),
    }
}

/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value
pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
Expand Down

0 comments on commit a881c1c

Please sign in to comment.