Skip to content

Commit

Permalink
Merge pull request tafia#685 from Mingun/quoting-level
Browse files Browse the repository at this point in the history
Add ability to set desired level of escaping special characters in XML
  • Loading branch information
Mingun committed Nov 25, 2023
2 parents 9fb181a + d9de2d8 commit c383849
Show file tree
Hide file tree
Showing 6 changed files with 194 additions and 9 deletions.
7 changes: 6 additions & 1 deletion Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@ configuration is serializable.
- [#677]: Added methods `config()` and `config_mut()` to inspect and change the parser
configuration. Previous builder methods on `Reader` / `NsReader` was replaced by
direct access to fields of config using `reader.config_mut().<...>`.
- #[#684]: Added a method `Config::enable_all_checks` to turn on or off all
- [#684]: Added a method `Config::enable_all_checks` to turn on or off all
well-formedness checks.
- [#362]: Added `escape::minimal_escape()` which escapes only `&` and `<`.
- [#362]: Added `BytesCData::minimal_escape()` which escapes only `&` and `<`.
- [#362]: Added `Serializer::set_quote_level()` which allow to set desired level of escaping.

### Bug Fixes

Expand All @@ -47,7 +50,9 @@ configuration is serializable.
- [#684]: Now `<??>` parsed as `Event::PI` with empty content instead of raising
syntax error.
- [#684]: Now `<?xml?>` parsed as `Event::Decl` instead of `Event::PI`.
- [#362]: Now default quote level is `QuoteLevel::Partial` when using serde serializer.

[#362]: https://github.com/tafia/quick-xml/issues/362
[#513]: https://github.com/tafia/quick-xml/issues/513
[#622]: https://github.com/tafia/quick-xml/issues/622
[#675]: https://github.com/tafia/quick-xml/pull/675
Expand Down
50 changes: 50 additions & 0 deletions src/escapei.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,16 @@ impl std::error::Error for EscapeError {}
/// | `&` | `&amp;`
/// | `'` | `&apos;`
/// | `"` | `&quot;`
///
/// This function performs following replacements:
///
/// | Character | Replacement
/// |-----------|------------
/// | `<` | `&lt;`
/// | `>` | `&gt;`
/// | `&` | `&amp;`
/// | `'` | `&apos;`
/// | `"` | `&quot;`
pub fn escape(raw: &str) -> Cow<str> {
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
}
Expand All @@ -88,10 +98,35 @@ pub fn escape(raw: &str) -> Cow<str> {
/// | `<` | `&lt;`
/// | `>` | `&gt;`
/// | `&` | `&amp;`
///
/// This function performs following replacements:
///
/// | Character | Replacement
/// |-----------|------------
/// | `<` | `&lt;`
/// | `>` | `&gt;`
/// | `&` | `&amp;`
pub fn partial_escape(raw: &str) -> Cow<str> {
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
}

/// XML standard [requires] that only `<` and `&` was escaped in text content or
/// attribute value. All other characters not necessary to be escaped, although
/// for compatibility with SGML they also should be escaped. Practically, escaping
/// only those characters is enough.
///
/// This function performs following replacements:
///
/// | Character | Replacement
/// |-----------|------------
/// | `<` | `&lt;`
/// | `&` | `&amp;`
///
/// [requires]: https://www.w3.org/TR/xml11/#syntax
pub fn minimal_escape(raw: &str) -> Cow<str> {
_escape(raw, |ch| matches!(ch, b'<' | b'&'))
}

/// Escapes an `&str` and replaces a subset of xml special characters (`<`, `>`,
/// `&`, `'`, `"`) with their corresponding xml escaped value.
pub(crate) fn _escape<F: Fn(u8) -> bool>(raw: &str, escape_chars: F) -> Cow<str> {
Expand Down Expand Up @@ -1788,6 +1823,7 @@ fn test_escape() {
assert_eq!(unchanged, Cow::Borrowed("test"));
assert!(matches!(unchanged, Cow::Borrowed(_)));

assert_eq!(escape("<&\"'>"), "&lt;&amp;&quot;&apos;&gt;");
assert_eq!(escape("<test>"), "&lt;test&gt;");
assert_eq!(escape("\"a\"bc"), "&quot;a&quot;bc");
assert_eq!(escape("\"a\"b&c"), "&quot;a&quot;b&amp;c");
Expand All @@ -1806,6 +1842,7 @@ fn test_partial_escape() {
assert_eq!(unchanged, Cow::Borrowed("test"));
assert!(matches!(unchanged, Cow::Borrowed(_)));

assert_eq!(partial_escape("<&\"'>"), "&lt;&amp;\"'&gt;");
assert_eq!(partial_escape("<test>"), "&lt;test&gt;");
assert_eq!(partial_escape("\"a\"bc"), "\"a\"bc");
assert_eq!(partial_escape("\"a\"b&c"), "\"a\"b&amp;c");
Expand All @@ -1814,3 +1851,16 @@ fn test_partial_escape() {
"prefix_\"a\"b&amp;&lt;&gt;c"
);
}

#[test]
fn test_minimal_escape() {
assert_eq!(minimal_escape("test"), Cow::Borrowed("test"));
assert_eq!(minimal_escape("<&\"'>"), "&lt;&amp;\"'>");
assert_eq!(minimal_escape("<test>"), "&lt;test>");
assert_eq!(minimal_escape("\"a\"bc"), "\"a\"bc");
assert_eq!(minimal_escape("\"a\"b&c"), "\"a\"b&amp;c");
assert_eq!(
minimal_escape("prefix_\"a\"b&<>c"),
"prefix_\"a\"b&amp;&lt;>c"
);
}
26 changes: 25 additions & 1 deletion src/events/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ use std::str::from_utf8;

use crate::encoding::Decoder;
use crate::errors::{Error, IllFormedError, Result};
use crate::escape::{escape, partial_escape, unescape_with};
use crate::escape::{escape, minimal_escape, partial_escape, unescape_with};
use crate::name::{LocalName, QName};
use crate::reader::is_whitespace;
use crate::utils::write_cow_string;
Expand Down Expand Up @@ -913,6 +913,30 @@ impl<'a> BytesCData<'a> {
))
}

/// Converts this CDATA content to an escaped version, that can be written
/// as an usual text in XML. This method escapes only those characters that
/// must be escaped according to the [specification].
///
/// This function performs following replacements:
///
/// | Character | Replacement
/// |-----------|------------
/// | `<` | `&lt;`
/// | `&` | `&amp;`
///
/// [specification]: https://www.w3.org/TR/xml11/#syntax
pub fn minimal_escape(self) -> Result<BytesText<'a>> {
let decoded = self.decode()?;
Ok(BytesText::wrap(
match minimal_escape(&decoded) {
// Because result is borrowed, no replacements was done and we can use original content
Cow::Borrowed(_) => self.content,
Cow::Owned(escaped) => Cow::Owned(escaped.into_bytes()),
},
Decoder::utf8(),
))
}

/// Gets content of this text buffer in the specified encoding
pub(crate) fn decode(&self) -> Result<Cow<'a, str>> {
Ok(match &self.content {
Expand Down
4 changes: 3 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ pub mod errors;
mod escapei;
pub mod escape {
//! Manage xml character escapes
pub use crate::escapei::{escape, partial_escape, unescape, unescape_with, EscapeError};
pub use crate::escapei::{
escape, minimal_escape, partial_escape, unescape, unescape_with, EscapeError,
};
}
pub mod events;
pub mod name;
Expand Down
108 changes: 106 additions & 2 deletions src/se/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ impl<'w, 'r, W: Write> Serializer<'w, 'r, W> {
Self {
ser: ContentSerializer {
writer,
level: QuoteLevel::Full,
level: QuoteLevel::Partial,
indent: Indent::None,
write_indent: false,
expand_empty_elements: false,
Expand Down Expand Up @@ -526,7 +526,7 @@ impl<'w, 'r, W: Write> Serializer<'w, 'r, W> {
Ok(Self {
ser: ContentSerializer {
writer,
level: QuoteLevel::Full,
level: QuoteLevel::Partial,
indent: Indent::None,
write_indent: false,
expand_empty_elements: false,
Expand Down Expand Up @@ -574,6 +574,14 @@ impl<'w, 'r, W: Write> Serializer<'w, 'r, W> {
self
}

/// Set the level of quoting used when writing texts
///
/// Default: [`QuoteLevel::Minimal`]
pub fn set_quote_level(&mut self, level: QuoteLevel) -> &mut Self {
self.ser.level = level;
self
}

/// Set the indent object for a serializer
pub(crate) fn set_indent(&mut self, indent: Indent<'r>) -> &mut Self {
self.ser.indent = indent;
Expand Down Expand Up @@ -779,3 +787,99 @@ impl<'w, 'r, W: Write> ser::Serializer for Serializer<'w, 'r, W> {
}
}
}

#[cfg(test)]
mod quote_level {
use super::*;
use pretty_assertions::assert_eq;
use serde::Serialize;

#[derive(Debug, PartialEq, Serialize)]
struct Element(&'static str);

#[derive(Debug, PartialEq, Serialize)]
struct Example {
#[serde(rename = "@attribute")]
attribute: &'static str,
element: Element,
}

#[test]
fn default_() {
let example = Example {
attribute: "special chars: &, <, >, \", '",
element: Element("special chars: &, <, >, \", '"),
};

let mut buffer = String::new();
let ser = Serializer::new(&mut buffer);

example.serialize(ser).unwrap();
assert_eq!(
buffer,
"<Example attribute=\"special chars: &amp;, &lt;, &gt;, &quot;, '\">\
<element>special chars: &amp;, &lt;, &gt;, \", '</element>\
</Example>"
);
}

#[test]
fn minimal() {
let example = Example {
attribute: "special chars: &, <, >, \", '",
element: Element("special chars: &, <, >, \", '"),
};

let mut buffer = String::new();
let mut ser = Serializer::new(&mut buffer);
ser.set_quote_level(QuoteLevel::Minimal);

example.serialize(ser).unwrap();
assert_eq!(
buffer,
"<Example attribute=\"special chars: &amp;, &lt;, >, &quot;, '\">\
<element>special chars: &amp;, &lt;, >, \", '</element>\
</Example>"
);
}

#[test]
fn partial() {
let example = Example {
attribute: "special chars: &, <, >, \", '",
element: Element("special chars: &, <, >, \", '"),
};

let mut buffer = String::new();
let mut ser = Serializer::new(&mut buffer);
ser.set_quote_level(QuoteLevel::Partial);

example.serialize(ser).unwrap();
assert_eq!(
buffer,
"<Example attribute=\"special chars: &amp;, &lt;, &gt;, &quot;, '\">\
<element>special chars: &amp;, &lt;, &gt;, \", '</element>\
</Example>"
);
}

#[test]
fn full() {
let example = Example {
attribute: "special chars: &, <, >, \", '",
element: Element("special chars: &, <, >, \", '"),
};

let mut buffer = String::new();
let mut ser = Serializer::new(&mut buffer);
ser.set_quote_level(QuoteLevel::Full);

example.serialize(ser).unwrap();
assert_eq!(
buffer,
"<Example attribute=\"special chars: &amp;, &lt;, &gt;, &quot;, &apos;\">\
<element>special chars: &amp;, &lt;, &gt;, &quot;, &apos;</element>\
</Example>"
);
}
}
8 changes: 4 additions & 4 deletions tests/serde-se.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1807,13 +1807,13 @@ mod with_root {
serialize_as!(char_lt: '<' => "<root>&lt;</root>");
serialize_as!(char_gt: '>' => "<root>&gt;</root>");
serialize_as!(char_amp: '&' => "<root>&amp;</root>");
serialize_as!(char_apos: '\'' => "<root>&apos;</root>");
serialize_as!(char_quot: '"' => "<root>&quot;</root>");
serialize_as!(char_apos: '\'' => "<root>'</root>");
serialize_as!(char_quot: '"' => "<root>\"</root>");
// FIXME: Probably we should trim only for specified types when deserialize
serialize_as_only!(char_space: ' ' => "<root> </root>");

serialize_as!(str_non_escaped: "non-escaped string"; &str => "<root>non-escaped string</root>");
serialize_as!(str_escaped: "<\"escaped & string'>"; String => "<root>&lt;&quot;escaped &amp; string&apos;&gt;</root>");
serialize_as!(str_escaped: "<\"escaped & string'>"; String => "<root>&lt;\"escaped &amp; string'&gt;</root>");

err!(bytes: Bytes(b"<\"escaped & bytes'>") => Unsupported("`serialize_bytes` not supported yet"));

Expand All @@ -1839,7 +1839,7 @@ mod with_root {
serialize_as!(tuple:
// Use to_string() to get owned type that is required for deserialization
("<\"&'>".to_string(), "with\t\r\n spaces", 3usize)
=> "<root>&lt;&quot;&amp;&apos;&gt;</root>\
=> "<root>&lt;\"&amp;'&gt;</root>\
<root>with\t\r\n spaces</root>\
<root>3</root>");
serialize_as!(tuple_struct:
Expand Down

0 comments on commit c383849

Please sign in to comment.