diff --git a/Cargo.toml b/Cargo.toml index 0da6e32e..e463d7a1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,8 @@ document-features = { version = "0.2", optional = true } encoding_rs = { version = "0.8", optional = true } serde = { version = "1.0", optional = true } memchr = "2.5" +jetscii = "0.5.2" +once_cell = "1.12.0" [dev-dependencies] criterion = "0.3" diff --git a/src/escapei.rs b/src/escapei.rs index 64749c27..10b03943 100644 --- a/src/escapei.rs +++ b/src/escapei.rs @@ -1,13 +1,20 @@ //! Manage xml character escapes -use memchr; use std::borrow::Cow; use std::collections::HashMap; use std::ops::Range; +use jetscii::bytes; +use memchr; +use once_cell::sync::Lazy; + #[cfg(test)] use pretty_assertions::assert_eq; +static XML_ESCAPE_BYTES: Lazy = + Lazy::new(|| bytes!(b'<', b'>', b'&', b'\'', b'"')); +static XML_PARTIAL_ESCAPE_BYTES: Lazy = Lazy::new(|| bytes!(b'<', b'>', b'&')); + /// Error for XML escape/unescqpe. #[derive(Debug)] pub enum EscapeError { @@ -73,8 +80,8 @@ pub fn escape(raw: &[u8]) -> Cow<[u8]> { _ => false, } } - - _escape(raw, to_escape) + // _escape(raw, to_escape) + simd_escape(raw, &XML_PARTIAL_ESCAPE_BYTES) } /// Should only be used for escaping text content. In xml text content, it is allowed @@ -89,8 +96,8 @@ pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> { _ => false, } } - - _escape(raw, to_escape) + // _escape(raw, to_escape) + simd_escape(raw, &XML_ESCAPE_BYTES) } /// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their @@ -127,6 +134,42 @@ fn _escape bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> { } } +/// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their +/// corresponding xml escaped value. 
+///
+/// Only the bytes reported by `escape_matcher` are replaced. Input that contains no such byte is
+/// returned as `Cow::Borrowed` without allocating.
+///
+/// # Panics
+///
+/// Panics if `escape_matcher` reports a byte other than `<`, `>`, `&`, `'` or `"`, because no
+/// replacement is defined for it.
+pub fn simd_escape<'a>(raw: &'a [u8], escape_matcher: &jetscii::BytesConst) -> Cow<'a, [u8]> {
+    // Allocated lazily on the first match so that clean input stays borrowed.
+    let mut escaped: Option<Vec<u8>> = None;
+    // Start of the part of `raw` that has not been copied to the output yet.
+    let mut pos = 0;
+    while let Some(i) = escape_matcher.find(&raw[pos..]) {
+        let buf = escaped.get_or_insert_with(|| Vec::with_capacity(raw.len()));
+        // `find` searched `raw[pos..]`, so `i` is relative to `pos`.
+        let new_pos = pos + i;
+        // Copy the untouched run that precedes the matched byte.
+        buf.extend_from_slice(&raw[pos..new_pos]);
+        match raw[new_pos] {
+            b'<' => buf.extend_from_slice(b"&lt;"),
+            b'>' => buf.extend_from_slice(b"&gt;"),
+            b'\'' => buf.extend_from_slice(b"&apos;"),
+            b'&' => buf.extend_from_slice(b"&amp;"),
+            b'"' => buf.extend_from_slice(b"&quot;"),
+            c => unreachable!(
+                "Found {} but only '<', '>', ', '&' and '\"' are escaped",
+                c as char
+            ),
+        }
+        // Resume the search just past the byte that was replaced.
+        pos = new_pos + 1;
+    }
+
+    if let Some(mut escaped) = escaped {
+        // Append whatever follows the last match.
+        if let Some(raw) = raw.get(pos..) {
+            escaped.extend_from_slice(raw);
+        }
+        Cow::Owned(escaped)
+    } else {
+        Cow::Borrowed(raw)
+    }
+}
+
 /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
 /// value
 pub fn unescape(raw: &[u8]) -> Result, EscapeError> {