From c0c2e70d30f4a4b8aebb9fd6a426528a86323d2e Mon Sep 17 00:00:00 2001
From: Jairam Chandar <jairamc23@gmail.com>
Date: Wed, 29 Sep 2021 13:26:02 +0100
Subject: [PATCH 1/4] Allow attributes valid in html when converting

When converting a parsed HTML document to a W3C DOM, the attribute-name "syntax" was hard-coded to XML. This PR checks the doctype of the output document and uses it to determine which attribute names are valid.
---
 src/main/java/org/jsoup/helper/W3CDom.java     | 13 +++++++++++--
 src/test/java/org/jsoup/helper/W3CDomTest.java | 14 ++++++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java
index 8d38d74f1c..26c27e12ed 100644
--- a/src/main/java/org/jsoup/helper/W3CDom.java
+++ b/src/main/java/org/jsoup/helper/W3CDom.java
@@ -38,10 +38,12 @@
 import java.util.Map;
 import java.util.Properties;
 import java.util.Stack;
-import java.util.regex.Pattern;
+import java.util.Locale;
 
 import static javax.xml.transform.OutputKeys.METHOD;
+import static org.jsoup.nodes.Document.OutputSettings.Syntax.html;
 import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml;
+import static org.jsoup.nodes.Document.OutputSettings.Syntax;
 
 /**
  * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
@@ -342,8 +344,15 @@ public void tail(org.jsoup.nodes.Node source, int depth) {
         }
 
         private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
+            final Syntax syntax;
+            if (this.doc.getDoctype() != null &&
+                    this.doc.getDoctype().getName().toLowerCase(Locale.ROOT).equals("html")) {
+                syntax = html;
+            } else {
+                syntax = xml;
+            }
             for (Attribute attribute : source.attributes()) {
-                String key = Attribute.getValidKey(attribute.getKey(), xml);
+                String key = Attribute.getValidKey(attribute.getKey(), syntax);
                 if (key != null) { // null if couldn't be coerced to validity
                     el.setAttribute(key, attribute.getValue());
                 }
diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java
index 1c172a4dd0..89131f3ad9 100644
--- a/src/test/java/org/jsoup/helper/W3CDomTest.java
+++ b/src/test/java/org/jsoup/helper/W3CDomTest.java
@@ -190,6 +190,20 @@ public void handlesInvalidAttributeNames() {
         assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html><head/><body name=\"\" style=\"color: red\"/></html>", xml);
     }
 
+    @Test
+    public void handlesAccentedCharsAttributeNames() {
+        String html = "<!DOCTYPE html><html><head></head><body style=\"color: red\" \" name\"><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>";
+        org.jsoup.nodes.Document jsoupDoc;
+        jsoupDoc = Jsoup.parse(html);
+        Element body = jsoupDoc.select("body").first();
+        assertTrue(body.hasAttr("\"")); // actually an attribute with key '"'. Correct per HTML5 spec, but w3c xml dom doesn't dig it
+        assertTrue(body.hasAttr("name\""));
+
+        Document w3Doc = W3CDom.convert(jsoupDoc);
+        String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
+        assertEquals("<!DOCTYPE html SYSTEM \"about:legacy-compat\">\n<html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body name=\"\" style=\"color: red\"><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>", out);
+    }
+
     @Test
     public void handlesInvalidTagAsText() {
         org.jsoup.nodes.Document jsoup = Jsoup.parse("<インセンティブで高収入！>Text <p>More</p>");

From 15a2bca20caac6f8c7d2f00e999a245f7ec978a6 Mon Sep 17 00:00:00 2001
From: Jairam Chandar <jairamc23@gmail.com>
Date: Tue, 5 Oct 2021 13:53:21 +0100
Subject: [PATCH 2/4] Pass Parser settings when converting to w3c Doc

---
 src/main/java/org/jsoup/helper/W3CDom.java | 37 +++++++++++++---------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java
index 26c27e12ed..7b499917e7 100644
--- a/src/main/java/org/jsoup/helper/W3CDom.java
+++ b/src/main/java/org/jsoup/helper/W3CDom.java
@@ -38,11 +38,8 @@
 import java.util.Map;
 import java.util.Properties;
 import java.util.Stack;
-import java.util.Locale;
 
 import static javax.xml.transform.OutputKeys.METHOD;
-import static org.jsoup.nodes.Document.OutputSettings.Syntax.html;
-import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml;
 import static org.jsoup.nodes.Document.OutputSettings.Syntax;
 
 /**
@@ -223,8 +220,16 @@ public void convert(org.jsoup.nodes.Element in, Document out) {
                 out.setDocumentURI(inDoc.location());
         }
 
-        org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.child(0) : in; // skip the #root node if a Document
-        NodeTraversor.traverse(new W3CBuilder(out), rootEl);
+        final org.jsoup.nodes.Document.OutputSettings outputSettings;
+        final org.jsoup.nodes.Element rootEl;
+        if (in instanceof org.jsoup.nodes.Document) {
+            outputSettings = ((org.jsoup.nodes.Document) in).outputSettings();
+            rootEl = in.child(0); // skip the #root node if a Document
+        } else {
+            outputSettings = new org.jsoup.nodes.Document.OutputSettings();
+            rootEl = in;
+        }
+        NodeTraversor.traverse(new W3CBuilder(out, outputSettings), rootEl);
     }
 
     public NodeList selectXpath(String xpath, Document doc) {
@@ -284,11 +289,13 @@ protected static class W3CBuilder implements NodeVisitor {
         private final Document doc;
         private final Stack<HashMap<String, String>> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn
         private Node dest;
+        private final org.jsoup.nodes.Document.OutputSettings outputSettings; // the outputsettings used by the parser of original jsoup Document
 
-        public W3CBuilder(Document doc) {
+        public W3CBuilder(Document doc, org.jsoup.nodes.Document.OutputSettings outputSettings) {
             this.doc = doc;
             this.namespacesStack.push(new HashMap<>());
             this.dest = doc;
+            this.outputSettings = outputSettings;
         }
 
         public void head(org.jsoup.nodes.Node source, int depth) {
@@ -308,7 +315,7 @@ public void head(org.jsoup.nodes.Node source, int depth) {
                     Element el = namespace == null && tagName.contains(":") ?
                         doc.createElementNS("", tagName) : // doesn't have a real namespace defined
                         doc.createElementNS(namespace, tagName);
-                    copyAttributes(sourceEl, el);
+                    copyAttributes(sourceEl, el, outputSettings.syntax());
                     append(el, sourceEl);
                     dest = el; // descend
                 } catch (DOMException e) {
@@ -343,14 +350,14 @@ public void tail(org.jsoup.nodes.Node source, int depth) {
             namespacesStack.pop();
         }
 
-        private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
-            final Syntax syntax;
-            if (this.doc.getDoctype() != null &&
-                    this.doc.getDoctype().getName().toLowerCase(Locale.ROOT).equals("html")) {
-                syntax = html;
-            } else {
-                syntax = xml;
-            }
+        private void copyAttributes(org.jsoup.nodes.Node source, Element el, Syntax syntax) {
+//            final Syntax syntax;
+//            if (this.doc.getDoctype() != null &&
+//                    this.doc.getDoctype().getName().toLowerCase(Locale.ROOT).equals("html")) {
+//                syntax = html;
+//            } else {
+//                syntax = xml;
+//            }
             for (Attribute attribute : source.attributes()) {
                 String key = Attribute.getValidKey(attribute.getKey(), syntax);
                 if (key != null) { // null if couldn't be coerced to validity

From 68456f833b5a1ed2676549785734266a55850361 Mon Sep 17 00:00:00 2001
From: Jairam Chandar <jairamc23@gmail.com>
Date: Tue, 5 Oct 2021 13:56:26 +0100
Subject: [PATCH 3/4] Ignore newlines in tests

On Windows, the DOM output uses \r\n newlines, so strip newlines from the output before comparing.
---
 src/test/java/org/jsoup/helper/W3CDomTest.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java
index 89131f3ad9..7d5379dccc 100644
--- a/src/test/java/org/jsoup/helper/W3CDomTest.java
+++ b/src/test/java/org/jsoup/helper/W3CDomTest.java
@@ -201,7 +201,8 @@ public void handlesAccentedCharsAttributeNames() {
 
         Document w3Doc = W3CDom.convert(jsoupDoc);
         String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
-        assertEquals("<!DOCTYPE html SYSTEM \"about:legacy-compat\">\n<html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body name=\"\" style=\"color: red\"><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>", out);
+        String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body name=\"\" style=\"color: red\"><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>";
+        assertEquals(expected, TextUtil.stripNewlines(out)); // on windows, DOM will write newlines as \r\n
     }
 
     @Test

From c0f4d83086089a121e1ac651bae622e2a7aac753 Mon Sep 17 00:00:00 2001
From: jhy <jonathan@hedley.net>
Date: Wed, 6 Oct 2021 21:50:12 +1100
Subject: [PATCH 4/4] Maintain current public method signature

Restores the existing public W3CBuilder(Document) constructor to preserve backward compatibility.
---
 CHANGES                                       |  6 +++-
 src/main/java/org/jsoup/helper/W3CDom.java    | 34 ++++++-------------
 .../java/org/jsoup/helper/W3CDomTest.java     | 24 +++++++++----
 3 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/CHANGES b/CHANGES
index be9fc196ce..17cfe0f48b 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,10 @@
 jsoup changelog
 
-*** Release 1.14.3 [PENDING]
+*** Release 1.15.1 [PENDING]
+  * Improvement: when converting jsoup Documents to W3C Documents in W3CDom, preserve valid HTML attribute names if the
+    input document is using the HTML syntax. (Previously, it would always coerce using the more restrictive XML syntax.)
+
+*** Release 1.14.3 [2021-Sep-30]
   * Improvement: added native XPath support in Element#selectXpath(String)
     <https://github.com/jhy/jsoup/pull/1629>
 
diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java
index 7b499917e7..3bb1e8c0ab 100644
--- a/src/main/java/org/jsoup/helper/W3CDom.java
+++ b/src/main/java/org/jsoup/helper/W3CDom.java
@@ -214,22 +214,16 @@ public void convert(org.jsoup.nodes.Document in, Document out) {
      * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
      */
     public void convert(org.jsoup.nodes.Element in, Document out) {
+        W3CBuilder builder = new W3CBuilder(out);
         org.jsoup.nodes.Document inDoc = in.ownerDocument();
         if (inDoc != null) {
-            if (!StringUtil.isBlank(inDoc.location()))
+            if (!StringUtil.isBlank(inDoc.location())) {
                 out.setDocumentURI(inDoc.location());
+            }
+            builder.syntax = inDoc.outputSettings().syntax();
         }
-
-        final org.jsoup.nodes.Document.OutputSettings outputSettings;
-        final org.jsoup.nodes.Element rootEl;
-        if (in instanceof org.jsoup.nodes.Document) {
-            outputSettings = ((org.jsoup.nodes.Document) in).outputSettings();
-            rootEl = in.child(0); // skip the #root node if a Document
-        } else {
-            outputSettings = new org.jsoup.nodes.Document.OutputSettings();
-            rootEl = in;
-        }
-        NodeTraversor.traverse(new W3CBuilder(out, outputSettings), rootEl);
+        org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.child(0) : in; // skip the #root node if a Document
+        NodeTraversor.traverse(builder, rootEl);
     }
 
     public NodeList selectXpath(String xpath, Document doc) {
@@ -289,13 +283,12 @@ protected static class W3CBuilder implements NodeVisitor {
         private final Document doc;
         private final Stack<HashMap<String, String>> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn
         private Node dest;
-        private final org.jsoup.nodes.Document.OutputSettings outputSettings; // the outputsettings used by the parser of original jsoup Document
+        private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available.
 
-        public W3CBuilder(Document doc, org.jsoup.nodes.Document.OutputSettings outputSettings) {
+        public W3CBuilder(Document doc) {
             this.doc = doc;
             this.namespacesStack.push(new HashMap<>());
             this.dest = doc;
-            this.outputSettings = outputSettings;
         }
 
         public void head(org.jsoup.nodes.Node source, int depth) {
@@ -315,7 +308,7 @@ public void head(org.jsoup.nodes.Node source, int depth) {
                     Element el = namespace == null && tagName.contains(":") ?
                         doc.createElementNS("", tagName) : // doesn't have a real namespace defined
                         doc.createElementNS(namespace, tagName);
-                    copyAttributes(sourceEl, el, outputSettings.syntax());
+                    copyAttributes(sourceEl, el);
                     append(el, sourceEl);
                     dest = el; // descend
                 } catch (DOMException e) {
@@ -350,14 +343,7 @@ public void tail(org.jsoup.nodes.Node source, int depth) {
             namespacesStack.pop();
         }
 
-        private void copyAttributes(org.jsoup.nodes.Node source, Element el, Syntax syntax) {
-//            final Syntax syntax;
-//            if (this.doc.getDoctype() != null &&
-//                    this.doc.getDoctype().getName().toLowerCase(Locale.ROOT).equals("html")) {
-//                syntax = html;
-//            } else {
-//                syntax = xml;
-//            }
+        private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
             for (Attribute attribute : source.attributes()) {
                 String key = Attribute.getValidKey(attribute.getKey(), syntax);
                 if (key != null) { // null if couldn't be coerced to validity
diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java
index 7d5379dccc..f1ff904454 100644
--- a/src/test/java/org/jsoup/helper/W3CDomTest.java
+++ b/src/test/java/org/jsoup/helper/W3CDomTest.java
@@ -191,18 +191,28 @@ public void handlesInvalidAttributeNames() {
     }
 
     @Test
-    public void handlesAccentedCharsAttributeNames() {
-        String html = "<!DOCTYPE html><html><head></head><body style=\"color: red\" \" name\"><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>";
+    public void htmlInputDocMaintainsHtmlAttributeNames() {
+        String html = "<!DOCTYPE html><html><head></head><body><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>";
         org.jsoup.nodes.Document jsoupDoc;
         jsoupDoc = Jsoup.parse(html);
-        Element body = jsoupDoc.select("body").first();
-        assertTrue(body.hasAttr("\"")); // actually an attribute with key '"'. Correct per HTML5 spec, but w3c xml dom doesn't dig it
-        assertTrue(body.hasAttr("name\""));
 
         Document w3Doc = W3CDom.convert(jsoupDoc);
         String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
-        String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body name=\"\" style=\"color: red\"><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>";
-        assertEquals(expected, TextUtil.stripNewlines(out)); // on windows, DOM will write newlines as \r\n
+        String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>";
+        assertEquals(expected, TextUtil.stripNewlines(out));
+    }
+
+    @Test
+    public void xmlInputDocMaintainsHtmlAttributeNames() {
+        String html = "<!DOCTYPE html><html><head></head><body><p hành=\"1\" hình=\"2\">unicode attr names coerced</p></body></html>";
+        org.jsoup.nodes.Document jsoupDoc;
+        jsoupDoc = Jsoup.parse(html);
+        jsoupDoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
+
+        Document w3Doc = W3CDom.convert(jsoupDoc);
+        String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
+        String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p hnh=\"2\">unicode attr names coerced</p></body></html>";
+        assertEquals(expected, TextUtil.stripNewlines(out));
     }
 
     @Test