diff --git a/CHANGES b/CHANGES
index be9fc196ce..17cfe0f48b 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,10 @@
jsoup changelog
-*** Release 1.14.3 [PENDING]
+*** Release 1.15.1 [PENDING]
+ * Improvement: when converting jsoup Documents to W3C Documents in W3CDom, preserve HTML-valid attribute names if the
+ input document uses the HTML syntax. (Previously, names were always coerced using the more restrictive XML syntax.)
+
+*** Release 1.14.3 [2021-Sep-30]
* Improvement: added native XPath support in Element#selectXpath(String)
diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java
index 2ebc3ba741..3b0b5df692 100644
--- a/src/main/java/org/jsoup/helper/W3CDom.java
+++ b/src/main/java/org/jsoup/helper/W3CDom.java
@@ -38,10 +38,9 @@
import java.util.Map;
import java.util.Properties;
import java.util.Stack;
-import java.util.regex.Pattern;
import static javax.xml.transform.OutputKeys.METHOD;
-import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml;
+import static org.jsoup.nodes.Document.OutputSettings.Syntax;
/**
* Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
@@ -215,14 +214,16 @@ public void convert(org.jsoup.nodes.Document in, Document out) {
* @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
*/
public void convert(org.jsoup.nodes.Element in, Document out) {
+ W3CBuilder builder = new W3CBuilder(out); // created before traversal so its syntax can be set from the input document below
org.jsoup.nodes.Document inDoc = in.ownerDocument();
if (inDoc != null) {
- if (!StringUtil.isBlank(inDoc.location()))
+ if (!StringUtil.isBlank(inDoc.location())) {
out.setDocumentURI(inDoc.location());
+ }
+ builder.syntax = inDoc.outputSettings().syntax(); // validate attribute names against the input document's syntax instead of the builder's XML default
}
-
org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.child(0) : in; // skip the #root node if a Document
- NodeTraversor.traverse(new W3CBuilder(out), rootEl);
+ NodeTraversor.traverse(builder, rootEl); // walk the input tree with the configured builder as the visitor
}
public NodeList selectXpath(String xpath, Document doc) {
@@ -282,6 +283,7 @@ protected static class W3CBuilder implements NodeVisitor {
private final Document doc;
private final Stack> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn
private Node dest;
+ private Syntax syntax = Syntax.xml; // syntax whose naming rules attribute keys are coerced to when copied; defaults to XML, replaced by the input document's syntax if available
public W3CBuilder(Document doc) {
this.doc = doc;
@@ -343,7 +345,7 @@ public void tail(org.jsoup.nodes.Node source, int depth) {
private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
for (Attribute attribute : source.attributes()) {
- String key = Attribute.getValidKey(attribute.getKey(), xml);
+ String key = Attribute.getValidKey(attribute.getKey(), syntax);
if (key != null) { // null if couldn't be coerced to validity
el.setAttribute(key, attribute.getValue());
}
diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java
index 1c172a4dd0..f1ff904454 100644
--- a/src/test/java/org/jsoup/helper/W3CDomTest.java
+++ b/src/test/java/org/jsoup/helper/W3CDomTest.java
@@ -190,6 +190,31 @@ public void handlesInvalidAttributeNames() {
assertEquals("", xml);
}
+ @Test
+ public void htmlInputDocMaintainsHtmlAttributeNames() {
+ String html = "
unicode attr names
";
+ org.jsoup.nodes.Document jsoupDoc;
+ jsoupDoc = Jsoup.parse(html); // no syntax override here: the document keeps whatever syntax Jsoup.parse assigns (presumably HTML -- TODO confirm)
+
+ Document w3Doc = W3CDom.convert(jsoupDoc); // conversion under test
+ String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
+ String expected = "unicode attr names
";
+ assertEquals(expected, TextUtil.stripNewlines(out)); // compare with newlines stripped from the serialized output
+ }
+
+ @Test
+ public void xmlInputDocCoercesAttributeNames() { // renamed: with XML syntax the names are coerced, not maintained
+ String html = "unicode attr names coerced
";
+ org.jsoup.nodes.Document jsoupDoc;
+ jsoupDoc = Jsoup.parse(html);
+ jsoupDoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml); // switch the input document to XML syntax before converting
+
+ Document w3Doc = W3CDom.convert(jsoupDoc); // conversion under test; attribute names should now be coerced to XML-valid names
+ String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
+ String expected = "unicode attr names coerced
";
+ assertEquals(expected, TextUtil.stripNewlines(out)); // compare with newlines stripped from the serialized output
+ }
+
@Test
public void handlesInvalidTagAsText() {
org.jsoup.nodes.Document jsoup = Jsoup.parse("<インセンティブで高収入!>Text More
");