Skip to content

Commit

Permalink
make other Nodes utils that act on whitespace know XML's idea of WS
Browse files Browse the repository at this point in the history
see #39
  • Loading branch information
bodewig committed Apr 10, 2023
1 parent 473c2dc commit aa7ae9f
Show file tree
Hide file tree
Showing 2 changed files with 226 additions and 15 deletions.
144 changes: 135 additions & 9 deletions src/main/net-core/Util/Nodes.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@ public static class Nodes {
/// empty text or CDATA nodes and where all textual content
/// including attribute values or comments are trimmed.
/// </summary>
/// <remarks>
/// <para>
/// Unlike <see cref="StripXmlWhitespace"/> this uses Unicode's idea
/// of whitespace rather than the more restricted subset considered
/// whitespace by XML.
/// </para>
/// </remarks>
public static XmlNode StripWhitespace(XmlNode original) {
XmlNode cloned = original.CloneNode(true);
cloned.Normalize();
Expand All @@ -96,9 +103,16 @@ public static class Nodes {
/// characters XML considers whitespace according to
/// <see href="https://www.w3.org/TR/xml11/#NT-S"/>.
/// </summary>
/// <remarks>
/// <para>
/// Unlike <see cref="StripWhitespace"/> this uses XML's idea
/// of whitespace rather than the more extensive set considered
/// whitespace by Unicode.
/// </para>
/// <para>
/// since XMLUnit 2.10.0
/// </para>
/// </remarks>
public static XmlNode StripXmlWhitespace(XmlNode original) {
XmlNode cloned = original.CloneNode(true);
cloned.Normalize();
Expand All @@ -118,6 +132,15 @@ public static class Nodes {
/// characters are replaced by space characters and
/// consecutive whitespace characters are collapsed.
/// </para>
/// <para>
/// This method is similar to <see cref="StripWhitespace"/>
/// but in addition "normalizes" whitespace.
/// </para>
/// <para>
/// Unlike <see cref="NormalizeXmlWhitespace"/> this uses Unicode's idea
/// of whitespace rather than the more restricted subset considered
/// whitespace by XML.
/// </para>
/// </remarks>
public static XmlNode NormalizeWhitespace(XmlNode original) {
XmlNode cloned = original.CloneNode(true);
Expand All @@ -126,6 +149,38 @@ public static class Nodes {
return cloned;
}

/// <summary>
/// Creates a new Node (of the same type as the original node)
/// that is similar to the original but contains no empty text or
/// CDATA nodes and has all of its textual content - attribute
/// values and comments included - normalized with regard to
/// XML whitespace.
/// </summary>
/// <remarks>
/// <para>
/// "Normalized" in this context means every XML whitespace
/// character is replaced by a space character and runs of
/// consecutive XML whitespace characters are collapsed.
/// </para>
/// <para>
/// This method is similar to <see cref="StripXmlWhitespace"/>
/// but additionally "normalizes" XML whitespace.
/// </para>
/// <para>
/// Unlike <see cref="NormalizeWhitespace"/> this only treats the
/// characters XML considers whitespace as whitespace rather than
/// the more extensive set defined by Unicode.
/// </para>
/// <para>
/// since XMLUnit 2.10.0
/// </para>
/// </remarks>
public static XmlNode NormalizeXmlWhitespace(XmlNode original) {
    // Operate on a deep copy; the node handed in is never modified.
    XmlNode copy = original.CloneNode(true);
    copy.Normalize();
    HandleWsRec(copy, XmlTrimAndNormalizeValue);
    return copy;
}

/// <summary>
/// Creates a new Node (of the same type as the original node)
/// that is similar to the original but doesn't contain any
Expand All @@ -136,27 +191,58 @@ public static class Nodes {
/// This doesn't have any effect if applied to a text or CDATA
/// node itself.
/// </para>
/// <para>
/// Unlike <see cref="StripXmlElementContentWhitespace"/> this uses Unicode's idea
/// of whitespace rather than the more restricted subset considered
/// whitespace by XML.
/// </para>
/// <para>
/// since XMLUnit 2.6.0
/// </para>
/// </remarks>
public static XmlNode StripElementContentWhitespace(XmlNode original) {
    // Work on a deep copy so the node passed in by the caller stays untouched.
    XmlNode cloned = original.CloneNode(true);
    // Merge adjacent text nodes before looking for whitespace-only ones.
    cloned.Normalize();
    // Strip exactly once, using the trimmer-taking overload; TrimValue
    // trims the characters the framework (Unicode) treats as whitespace.
    StripECW(cloned, TrimValue);
    return cloned;
}

/// <summary>
/// Creates a new Node (of the same type as the original node)
/// that is similar to the original but no longer contains any
/// text or CDATA nodes consisting solely of XML whitespace.
/// </summary>
/// <remarks>
/// <para>
/// Applying this to a text or CDATA node itself has no effect.
/// </para>
/// <para>
/// Unlike <see cref="StripElementContentWhitespace"/> this uses XML's idea
/// of whitespace rather than the more extensive set considered
/// whitespace by Unicode.
/// </para>
/// <para>
/// since XMLUnit 2.10.0
/// </para>
/// </remarks>
public static XmlNode StripXmlElementContentWhitespace(XmlNode original) {
    // Operate on a deep copy; the node handed in is never modified.
    XmlNode copy = original.CloneNode(true);
    copy.Normalize();
    StripECW(copy, XmlTrimValue);
    return copy;
}

/// <summary>
/// Yields the node's value with all leading and trailing
/// whitespace removed.
/// </summary>
private static String TrimValue(XmlNode n) {
    String value = n.Value;
    return value.Trim();
}

/// <summary>
/// Yields the node's value trimmed of all whitespace and then
/// normalized via <see cref="Normalize(string)"/>.
/// </summary>
private static String TrimAndNormalizeValue(XmlNode n) {
    String trimmed = TrimValue(n);
    return Normalize(trimmed);
}
Expand All @@ -167,11 +253,18 @@ public static class Nodes {

/// <summary>
/// Yields the node's value with every leading and trailing character
/// that XML considers whitespace removed.
/// </summary>
private static String XmlTrimValue(XmlNode n) {
    String value = n.Value;
    return value.Trim(XML_WHITESPACE_CHARS);
}

/// <summary>
/// Yields the node's value trimmed of all characters XML considers
/// whitespace and then normalized via <see cref="XmlNormalize"/>.
/// </summary>
private static String XmlTrimAndNormalizeValue(XmlNode n) {
    String trimmed = XmlTrimValue(n);
    return XmlNormalize(trimmed);
}

/// <summary>
/// Trims textual content of this node, removes empty text and
/// CDATA children, recurses into its child nodes.
Expand Down Expand Up @@ -206,16 +299,49 @@ public static class Nodes {
/// Normalize a string.
/// </summary>
/// <remarks>
/// <para>
/// "normalized" in this context means all whitespace
/// characters are replaced by space characters and
/// consecutive whitespace characters are collapsed.
/// </para>
/// <para>
/// Unlike <see cref="XmlNormalize"/> this uses Unicode's idea
/// of whitespace rather than the more restricted subset considered
/// whitespace by XML.
/// </para>
/// </remarks>
internal static string Normalize(string s) {
    // Delegate to the predicate-based overload, classifying characters
    // with the framework's own whitespace test.
    Predicate<char> isWhiteSpace = char.IsWhiteSpace;
    return Normalize(s, isWhiteSpace);
}

/// <summary>
/// Normalizes a string with regard to XML whitespace.
/// </summary>
/// <remarks>
/// <para>
/// "Normalized" in this context means every XML whitespace
/// character is replaced by a space character and runs of
/// consecutive XML whitespace characters are collapsed.
/// </para>
/// <para>
/// Unlike <see cref="Normalize(string)"/> this only treats the
/// characters XML considers whitespace as whitespace rather than
/// the more extensive set defined by Unicode.
/// </para>
/// <para>
/// since XMLUnit 2.10.0
/// </para>
/// </remarks>
internal static string XmlNormalize(string s) {
    // A character counts as whitespace only if it is one of the
    // characters listed in XML_WHITESPACE_CHARS.
    Predicate<char> isXmlWhiteSpace = ch => XML_WHITESPACE_CHARS.Contains(ch);
    return Normalize(s, isXmlWhiteSpace);
}

private static string Normalize(string s, Predicate<char> isWhiteSpace) {
StringBuilder sb = new StringBuilder();
bool changed = false;
bool lastCharWasWS = false;
foreach (char c in s) {
if (char.IsWhiteSpace(c)) {
if (isWhiteSpace(c)) {
if (!lastCharWasWS) {
sb.Append(SPACE);
changed |= (c != SPACE);
Expand All @@ -231,13 +357,13 @@ public static class Nodes {
return changed ? sb.ToString() : s;
}

private static void StripECW(XmlNode n) {
private static void StripECW(XmlNode n, Func<XmlNode, String> trimmer) {
LinkedList<XmlNode> toRemove = new LinkedList<XmlNode>();
foreach (XmlNode child in n.ChildNodes) {
StripECW(child);
StripECW(child, trimmer);
if (!(n is XmlAttribute)
&& IsTextualContentNode(child)
&& child.Value.Trim().Length == 0) {
&& trimmer(child).Length == 0) {
toRemove.AddLast(child);
}
}
Expand Down

0 comments on commit aa7ae9f

Please sign in to comment.