From c0c2e70d30f4a4b8aebb9fd6a426528a86323d2e Mon Sep 17 00:00:00 2001
From: Jairam Chandar
Date: Wed, 29 Sep 2021 13:26:02 +0100
Subject: [PATCH 1/4] Allow attributes valid in html when converting
When parsing and converting an HTML document, the "syntax" was hard-coded to XML. This PR checks the document type of the output document and uses that to determine which attributes are valid.
---
src/main/java/org/jsoup/helper/W3CDom.java | 13 +++++++++++--
src/test/java/org/jsoup/helper/W3CDomTest.java | 14 ++++++++++++++
2 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java
index 8d38d74f1c..26c27e12ed 100644
--- a/src/main/java/org/jsoup/helper/W3CDom.java
+++ b/src/main/java/org/jsoup/helper/W3CDom.java
@@ -38,10 +38,12 @@
import java.util.Map;
import java.util.Properties;
import java.util.Stack;
-import java.util.regex.Pattern;
+import java.util.Locale;
import static javax.xml.transform.OutputKeys.METHOD;
+import static org.jsoup.nodes.Document.OutputSettings.Syntax.html;
import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml;
+import static org.jsoup.nodes.Document.OutputSettings.Syntax;
/**
* Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
@@ -342,8 +344,15 @@ public void tail(org.jsoup.nodes.Node source, int depth) {
}
private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
+ final Syntax syntax;
+ if (this.doc.getDoctype() != null &&
+ this.doc.getDoctype().getName().toLowerCase(Locale.ROOT).equals("html")) {
+ syntax = html;
+ } else {
+ syntax = xml;
+ }
for (Attribute attribute : source.attributes()) {
- String key = Attribute.getValidKey(attribute.getKey(), xml);
+ String key = Attribute.getValidKey(attribute.getKey(), syntax);
if (key != null) { // null if couldn't be coerced to validity
el.setAttribute(key, attribute.getValue());
}
diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java
index 1c172a4dd0..89131f3ad9 100644
--- a/src/test/java/org/jsoup/helper/W3CDomTest.java
+++ b/src/test/java/org/jsoup/helper/W3CDomTest.java
@@ -190,6 +190,20 @@ public void handlesInvalidAttributeNames() {
assertEquals("
unicode attr names
";
+ org.jsoup.nodes.Document jsoupDoc;
+ jsoupDoc = Jsoup.parse(html);
+ Element body = jsoupDoc.select("body").first();
+ assertTrue(body.hasAttr("\"")); // actually an attribute with key '"'. Correct per HTML5 spec, but w3c xml dom doesn't dig it
+ assertTrue(body.hasAttr("name\""));
+
+ Document w3Doc = W3CDom.convert(jsoupDoc);
+ String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
+ assertEquals("\nunicode attr names
", out);
+ }
+
@Test
public void handlesInvalidTagAsText() {
org.jsoup.nodes.Document jsoup = Jsoup.parse("<インセンティブで高収入!>Text More
");
From 15a2bca20caac6f8c7d2f00e999a245f7ec978a6 Mon Sep 17 00:00:00 2001
From: Jairam Chandar
Date: Tue, 5 Oct 2021 13:53:21 +0100
Subject: [PATCH 2/4] Pass Parser settings when converting to w3c Doc
---
src/main/java/org/jsoup/helper/W3CDom.java | 37 +++++++++++++---------
1 file changed, 22 insertions(+), 15 deletions(-)
diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java
index 26c27e12ed..7b499917e7 100644
--- a/src/main/java/org/jsoup/helper/W3CDom.java
+++ b/src/main/java/org/jsoup/helper/W3CDom.java
@@ -38,11 +38,8 @@
import java.util.Map;
import java.util.Properties;
import java.util.Stack;
-import java.util.Locale;
import static javax.xml.transform.OutputKeys.METHOD;
-import static org.jsoup.nodes.Document.OutputSettings.Syntax.html;
-import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml;
import static org.jsoup.nodes.Document.OutputSettings.Syntax;
/**
@@ -223,8 +220,16 @@ public void convert(org.jsoup.nodes.Element in, Document out) {
out.setDocumentURI(inDoc.location());
}
- org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.child(0) : in; // skip the #root node if a Document
- NodeTraversor.traverse(new W3CBuilder(out), rootEl);
+ final org.jsoup.nodes.Document.OutputSettings outputSettings;
+ final org.jsoup.nodes.Element rootEl;
+ if (in instanceof org.jsoup.nodes.Document) {
+ outputSettings = ((org.jsoup.nodes.Document) in).outputSettings();
+ rootEl = in.child(0); // skip the #root node if a Document
+ } else {
+ outputSettings = new org.jsoup.nodes.Document.OutputSettings();
+ rootEl = in;
+ }
+ NodeTraversor.traverse(new W3CBuilder(out, outputSettings), rootEl);
}
public NodeList selectXpath(String xpath, Document doc) {
@@ -284,11 +289,13 @@ protected static class W3CBuilder implements NodeVisitor {
private final Document doc;
private final Stack> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn
private Node dest;
+ private final org.jsoup.nodes.Document.OutputSettings outputSettings; // the outputsettings used by the parser of original jsoup Document
- public W3CBuilder(Document doc) {
+ public W3CBuilder(Document doc, org.jsoup.nodes.Document.OutputSettings outputSettings) {
this.doc = doc;
this.namespacesStack.push(new HashMap<>());
this.dest = doc;
+ this.outputSettings = outputSettings;
}
public void head(org.jsoup.nodes.Node source, int depth) {
@@ -308,7 +315,7 @@ public void head(org.jsoup.nodes.Node source, int depth) {
Element el = namespace == null && tagName.contains(":") ?
doc.createElementNS("", tagName) : // doesn't have a real namespace defined
doc.createElementNS(namespace, tagName);
- copyAttributes(sourceEl, el);
+ copyAttributes(sourceEl, el, outputSettings.syntax());
append(el, sourceEl);
dest = el; // descend
} catch (DOMException e) {
@@ -343,14 +350,14 @@ public void tail(org.jsoup.nodes.Node source, int depth) {
namespacesStack.pop();
}
- private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
- final Syntax syntax;
- if (this.doc.getDoctype() != null &&
- this.doc.getDoctype().getName().toLowerCase(Locale.ROOT).equals("html")) {
- syntax = html;
- } else {
- syntax = xml;
- }
+ private void copyAttributes(org.jsoup.nodes.Node source, Element el, Syntax syntax) {
+// final Syntax syntax;
+// if (this.doc.getDoctype() != null &&
+// this.doc.getDoctype().getName().toLowerCase(Locale.ROOT).equals("html")) {
+// syntax = html;
+// } else {
+// syntax = xml;
+// }
for (Attribute attribute : source.attributes()) {
String key = Attribute.getValidKey(attribute.getKey(), syntax);
if (key != null) { // null if couldn't be coerced to validity
From 68456f833b5a1ed2676549785734266a55850361 Mon Sep 17 00:00:00 2001
From: Jairam Chandar
Date: Tue, 5 Oct 2021 13:56:26 +0100
Subject: [PATCH 3/4] Ignore newlines in tests
On Windows, the DOM serializer writes newlines as \r\n, so strip newlines from the output before comparing it to the expected string.
---
src/test/java/org/jsoup/helper/W3CDomTest.java | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java
index 89131f3ad9..7d5379dccc 100644
--- a/src/test/java/org/jsoup/helper/W3CDomTest.java
+++ b/src/test/java/org/jsoup/helper/W3CDomTest.java
@@ -201,7 +201,8 @@ public void handlesAccentedCharsAttributeNames() {
Document w3Doc = W3CDom.convert(jsoupDoc);
String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
- assertEquals("\nunicode attr names
", out);
+ String expected = "unicode attr names
";
+ assertEquals(expected, TextUtil.stripNewlines(out)); // on windows, DOM will write newlines as \r\n
}
@Test
From c0f4d83086089a121e1ac651bae622e2a7aac753 Mon Sep 17 00:00:00 2001
From: jhy
Date: Wed, 6 Oct 2021 21:50:12 +1100
Subject: [PATCH 4/4] Maintain current public method signature
This preserves backward compatibility for existing callers.
---
CHANGES | 6 +++-
src/main/java/org/jsoup/helper/W3CDom.java | 34 ++++++-------------
.../java/org/jsoup/helper/W3CDomTest.java | 24 +++++++++----
3 files changed, 32 insertions(+), 32 deletions(-)
diff --git a/CHANGES b/CHANGES
index be9fc196ce..17cfe0f48b 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,10 @@
jsoup changelog
-*** Release 1.14.3 [PENDING]
+*** Release 1.15.1 [PENDING]
+ * Improvement: when converting jsoup Documents to W3C Documents in W3CDom, preserve HTML valid attribute names if the
+ input document is using the HTML syntax. (Previously, would always coerce using the more restrictive XML syntax.)
+
+*** Release 1.14.3 [2021-Sep-30]
* Improvement: added native XPath support in Element#selectXpath(String)
diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java
index 7b499917e7..3bb1e8c0ab 100644
--- a/src/main/java/org/jsoup/helper/W3CDom.java
+++ b/src/main/java/org/jsoup/helper/W3CDom.java
@@ -214,22 +214,16 @@ public void convert(org.jsoup.nodes.Document in, Document out) {
* @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
*/
public void convert(org.jsoup.nodes.Element in, Document out) {
+ W3CBuilder builder = new W3CBuilder(out);
org.jsoup.nodes.Document inDoc = in.ownerDocument();
if (inDoc != null) {
- if (!StringUtil.isBlank(inDoc.location()))
+ if (!StringUtil.isBlank(inDoc.location())) {
out.setDocumentURI(inDoc.location());
+ }
+ builder.syntax = inDoc.outputSettings().syntax();
}
-
- final org.jsoup.nodes.Document.OutputSettings outputSettings;
- final org.jsoup.nodes.Element rootEl;
- if (in instanceof org.jsoup.nodes.Document) {
- outputSettings = ((org.jsoup.nodes.Document) in).outputSettings();
- rootEl = in.child(0); // skip the #root node if a Document
- } else {
- outputSettings = new org.jsoup.nodes.Document.OutputSettings();
- rootEl = in;
- }
- NodeTraversor.traverse(new W3CBuilder(out, outputSettings), rootEl);
+ org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.child(0) : in; // skip the #root node if a Document
+ NodeTraversor.traverse(builder, rootEl);
}
public NodeList selectXpath(String xpath, Document doc) {
@@ -289,13 +283,12 @@ protected static class W3CBuilder implements NodeVisitor {
private final Document doc;
private final Stack> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn
private Node dest;
- private final org.jsoup.nodes.Document.OutputSettings outputSettings; // the outputsettings used by the parser of original jsoup Document
+ private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available.
- public W3CBuilder(Document doc, org.jsoup.nodes.Document.OutputSettings outputSettings) {
+ public W3CBuilder(Document doc) {
this.doc = doc;
this.namespacesStack.push(new HashMap<>());
this.dest = doc;
- this.outputSettings = outputSettings;
}
public void head(org.jsoup.nodes.Node source, int depth) {
@@ -315,7 +308,7 @@ public void head(org.jsoup.nodes.Node source, int depth) {
Element el = namespace == null && tagName.contains(":") ?
doc.createElementNS("", tagName) : // doesn't have a real namespace defined
doc.createElementNS(namespace, tagName);
- copyAttributes(sourceEl, el, outputSettings.syntax());
+ copyAttributes(sourceEl, el);
append(el, sourceEl);
dest = el; // descend
} catch (DOMException e) {
@@ -350,14 +343,7 @@ public void tail(org.jsoup.nodes.Node source, int depth) {
namespacesStack.pop();
}
- private void copyAttributes(org.jsoup.nodes.Node source, Element el, Syntax syntax) {
-// final Syntax syntax;
-// if (this.doc.getDoctype() != null &&
-// this.doc.getDoctype().getName().toLowerCase(Locale.ROOT).equals("html")) {
-// syntax = html;
-// } else {
-// syntax = xml;
-// }
+ private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
for (Attribute attribute : source.attributes()) {
String key = Attribute.getValidKey(attribute.getKey(), syntax);
if (key != null) { // null if couldn't be coerced to validity
diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java
index 7d5379dccc..f1ff904454 100644
--- a/src/test/java/org/jsoup/helper/W3CDomTest.java
+++ b/src/test/java/org/jsoup/helper/W3CDomTest.java
@@ -191,18 +191,28 @@ public void handlesInvalidAttributeNames() {
}
@Test
- public void handlesAccentedCharsAttributeNames() {
- String html = "unicode attr names
";
+ public void htmlInputDocMaintainsHtmlAttributeNames() {
+ String html = "unicode attr names
";
org.jsoup.nodes.Document jsoupDoc;
jsoupDoc = Jsoup.parse(html);
- Element body = jsoupDoc.select("body").first();
- assertTrue(body.hasAttr("\"")); // actually an attribute with key '"'. Correct per HTML5 spec, but w3c xml dom doesn't dig it
- assertTrue(body.hasAttr("name\""));
Document w3Doc = W3CDom.convert(jsoupDoc);
String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
- String expected = "unicode attr names
";
- assertEquals(expected, TextUtil.stripNewlines(out)); // on windows, DOM will write newlines as \r\n
+ String expected = "unicode attr names
";
+ assertEquals(expected, TextUtil.stripNewlines(out));
+ }
+
+ @Test
+ public void xmlInputDocMaintainsHtmlAttributeNames() {
+ String html = "unicode attr names coerced
";
+ org.jsoup.nodes.Document jsoupDoc;
+ jsoupDoc = Jsoup.parse(html);
+ jsoupDoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
+
+ Document w3Doc = W3CDom.convert(jsoupDoc);
+ String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
+ String expected = "unicode attr names coerced
";
+ assertEquals(expected, TextUtil.stripNewlines(out));
}
@Test
", xml);
}
+ @Test
+ public void handlesAccentedCharsAttributeNames() {
+ String html = "