Skip to content

Commit

Permalink
Escape ascii control codes in both XML and HTML
Browse files Browse the repository at this point in the history
Required for XML and easier to read for HTML

Fixes #1556.
  • Loading branch information
jhy committed Aug 12, 2021
1 parent 04735f9 commit 2a4c9de
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 4 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Expand Up @@ -4,6 +4,10 @@ jsoup changelog
* Improvement: support Pattern.quote \Q and \E escapes in the selector regex matchers.
<https://github.com/jhy/jsoup/pull/1536>

* Bugfix: when serializing output, escape ASCII control characters below 0x20 (other than tab, line feed, and
carriage return). This improves XML output compatibility, and makes HTML output with these characters easier to
read (as they're otherwise invisible).
<https://github.com/jhy/jsoup/issues/1556>

* Bugfix: the *|el wildcard namespace selector now also matches elements with no namespace.
<https://github.com/jhy/jsoup/issues/1565>

Expand Down
12 changes: 9 additions & 3 deletions src/main/java/org/jsoup/nodes/Entities.java
Expand Up @@ -220,11 +220,17 @@ static void escape(Appendable accum, String string, OutputSettings out,
else
accum.append(c);
break;
// we escape ASCII control characters < 0x20 (other than tab, line feed, carriage return): required for XML compliance, and not required for HTML but easier to read - https://www.w3.org/TR/xml/#charsets
case 0x9:
case 0xA:
case 0xD:
accum.append(c);
break;
default:
if (canEncode(coreCharset, c, encoder))
accum.append(c);
else
if (c < 0x20 || !canEncode(coreCharset, c, encoder))
appendEncoded(accum, escapeMode, codePoint);
else
accum.append(c);
}
} else {
final String c = new String(Character.toChars(codePoint));
Expand Down
13 changes: 13 additions & 0 deletions src/test/java/org/jsoup/nodes/EntitiesTest.java
@@ -1,6 +1,7 @@
package org.jsoup.nodes;

import org.jsoup.Jsoup;
import org.jsoup.parser.Parser;
import org.junit.jupiter.api.Test;

import static org.jsoup.nodes.Document.OutputSettings;
Expand Down Expand Up @@ -150,4 +151,16 @@ public class EntitiesTest {
doc.outputSettings().escapeMode(xhtml);
assertEquals("<a title=\"&lt;p>One&lt;/p>\">One</a>", element.outerHtml());
}

/**
 * Checks that ASCII control characters given as numeric character references ({@code &#x1b;} and
 * {@code &#x7;}) are written back out as those same references, both in an attribute value and in text,
 * when the input is parsed as HTML and when it is parsed with the XML parser.
 */
@Test public void controlCharactersAreEscaped() {
// https://github.com/jhy/jsoup/issues/1556
// we escape ascii control characters in both HTML and XML for compatibility. Required in XML and probably
// easier to read in HTML
String input = "<a foo=\"&#x1b;esc&#x7;bell\">Text &#x1b; &#x7;</a>";
// HTML parse: the serialized body is expected to equal the input exactly
Document doc = Jsoup.parse(input);
assertEquals(input, doc.body().html());

// XML parse of the same input: the serialized document is expected to equal the input exactly
Document xml = Jsoup.parse(input, "", Parser.xmlParser());
assertEquals(input, xml.html());
}
}
2 changes: 1 addition & 1 deletion src/test/java/org/jsoup/parser/HtmlParserTest.java
Expand Up @@ -814,7 +814,7 @@ public class HtmlParserTest {

@Test public void handlesNullInData() {
Document doc = Jsoup.parse("<p id=\u0000>Blah \u0000</p>");
assertEquals("<p id=\"\uFFFD\">Blah \u0000</p>", doc.body().html()); // replaced in attr, NOT replaced in data
assertEquals("<p id=\"\uFFFD\">Blah &#x0;</p>", doc.body().html()); // replaced in attr, NOT replaced in data (but is escaped as control char <0x20)
}

@Test public void handlesNullInComments() {
Expand Down

0 comments on commit 2a4c9de

Please sign in to comment.