Allow attributes valid in html when converting (#1648)

When converting a parsed HTML document to a W3C DOM, the attribute-name syntax was hard-coded to XML. This PR reads the output syntax of the input jsoup document and uses it to determine which attribute names are valid. Co-authored-by: jhy <jonathan@hedley.net>. Fixes #1647.
jhy · Oct 6, 2021 · 6b1fbb5 · 6b1fbb5
1 parent 9ce6ae1
commit 6b1fbb5
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 7 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,6 +1,10 @@
 jsoup changelog
 
-*** Release 1.14.3 [PENDING]
+*** Release 1.15.1 [PENDING]
+  * Improvement: when converting jsoup Documents to W3C Documents in W3CDom, preserve attribute names that are valid
+    in HTML if the input document uses the HTML syntax. (Previously, names were always coerced using the more restrictive XML syntax.)
+
+*** Release 1.14.3 [2021-Sep-30]
   * Improvement: added native XPath support in Element#selectXpath(String)
     <https://github.com/jhy/jsoup/pull/1629>
 

diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java
@@ -38,10 +38,9 @@
 import java.util.Map;
 import java.util.Properties;
 import java.util.Stack;
-import java.util.regex.Pattern;
 
 import static javax.xml.transform.OutputKeys.METHOD;
-import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml;
+import static org.jsoup.nodes.Document.OutputSettings.Syntax;
 
 /**
  * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
@@ -215,14 +214,16 @@ public void convert(org.jsoup.nodes.Document in, Document out) {
      * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
      */
     public void convert(org.jsoup.nodes.Element in, Document out) {
+        W3CBuilder builder = new W3CBuilder(out);
         org.jsoup.nodes.Document inDoc = in.ownerDocument();
         if (inDoc != null) {
-            if (!StringUtil.isBlank(inDoc.location()))
+            if (!StringUtil.isBlank(inDoc.location())) {
                 out.setDocumentURI(inDoc.location());
+            }
+            builder.syntax = inDoc.outputSettings().syntax();
         }
-
         org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.child(0) : in; // skip the #root node if a Document
-        NodeTraversor.traverse(new W3CBuilder(out), rootEl);
+        NodeTraversor.traverse(builder, rootEl);
     }
 
     public NodeList selectXpath(String xpath, Document doc) {
@@ -282,6 +283,7 @@ protected static class W3CBuilder implements NodeVisitor {
         private final Document doc;
         private final Stack<HashMap<String, String>> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn
         private Node dest;
+        private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available.
 
         public W3CBuilder(Document doc) {
             this.doc = doc;
@@ -343,7 +345,7 @@ public void tail(org.jsoup.nodes.Node source, int depth) {
 
         private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
             for (Attribute attribute : source.attributes()) {
-                String key = Attribute.getValidKey(attribute.getKey(), xml);
+                String key = Attribute.getValidKey(attribute.getKey(), syntax);
                 if (key != null) { // null if couldn't be coerced to validity
                     el.setAttribute(key, attribute.getValue());
                 }

diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java
@@ -190,6 +190,31 @@ public void handlesInvalidAttributeNames() {
         assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html><head/><body name=\"\" style=\"color: red\"/></html>", xml);
     }
 
+    @Test
+    public void htmlInputDocMaintainsHtmlAttributeNames() {
+        String html = "<!DOCTYPE html><html><head></head><body><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>";
+        org.jsoup.nodes.Document jsoupDoc;
+        jsoupDoc = Jsoup.parse(html);
+
+        Document w3Doc = W3CDom.convert(jsoupDoc);
+        String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
+        String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>";
+        assertEquals(expected, TextUtil.stripNewlines(out));
+    }
+
+    @Test
+    public void xmlInputDocMaintainsHtmlAttributeNames() {
+        String html = "<!DOCTYPE html><html><head></head><body><p hành=\"1\" hình=\"2\">unicode attr names coerced</p></body></html>";
+        org.jsoup.nodes.Document jsoupDoc;
+        jsoupDoc = Jsoup.parse(html);
+        jsoupDoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
+
+        Document w3Doc = W3CDom.convert(jsoupDoc);
+        String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
+        String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p hnh=\"2\">unicode attr names coerced</p></body></html>";
+        assertEquals(expected, TextUtil.stripNewlines(out));
+    }
+
     @Test
     public void handlesInvalidTagAsText() {
         org.jsoup.nodes.Document jsoup = Jsoup.parse("<インセンティブで高収入！>Text <p>More</p>");