diff --git a/CHANGES b/CHANGES index be9fc196ce..17cfe0f48b 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,10 @@ jsoup changelog -*** Release 1.14.3 [PENDING] +*** Release 1.15.1 [PENDING] + * Improvement: when converting jsoup Documents to W3C Documents in W3CDom, preserve HTML valid attribute names if the + input document is using the HTML syntax. (Previously, would always coerce using the more restrictive XML syntax.) + +*** Release 1.14.3 [2021-Sep-30] * Improvement: added native XPath support in Element#selectXpath(String) diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java index 8d38d74f1c..3bb1e8c0ab 100644 --- a/src/main/java/org/jsoup/helper/W3CDom.java +++ b/src/main/java/org/jsoup/helper/W3CDom.java @@ -38,10 +38,9 @@ import java.util.Map; import java.util.Properties; import java.util.Stack; -import java.util.regex.Pattern; import static javax.xml.transform.OutputKeys.METHOD; -import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml; +import static org.jsoup.nodes.Document.OutputSettings.Syntax; /** * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document}, @@ -215,14 +214,16 @@ public void convert(org.jsoup.nodes.Document in, Document out) { * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element) */ public void convert(org.jsoup.nodes.Element in, Document out) { + W3CBuilder builder = new W3CBuilder(out); org.jsoup.nodes.Document inDoc = in.ownerDocument(); if (inDoc != null) { - if (!StringUtil.isBlank(inDoc.location())) + if (!StringUtil.isBlank(inDoc.location())) { out.setDocumentURI(inDoc.location()); + } + builder.syntax = inDoc.outputSettings().syntax(); } - org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.child(0) : in; // skip the #root node if a Document - NodeTraversor.traverse(new W3CBuilder(out), rootEl); + NodeTraversor.traverse(builder, rootEl); } public NodeList selectXpath(String xpath, Document doc) { @@ -282,6 +283,7 @@ protected static class W3CBuilder implements NodeVisitor { private final Document doc; private final Stack> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn private Node dest; + private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available. public W3CBuilder(Document doc) { this.doc = doc; @@ -343,7 +345,7 @@ public void tail(org.jsoup.nodes.Node source, int depth) { private void copyAttributes(org.jsoup.nodes.Node source, Element el) { for (Attribute attribute : source.attributes()) { - String key = Attribute.getValidKey(attribute.getKey(), xml); + String key = Attribute.getValidKey(attribute.getKey(), syntax); if (key != null) { // null if couldn't be coerced to validity el.setAttribute(key, attribute.getValue()); } diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java index 1c172a4dd0..f1ff904454 100644 --- a/src/test/java/org/jsoup/helper/W3CDomTest.java +++ b/src/test/java/org/jsoup/helper/W3CDomTest.java @@ -190,6 +190,31 @@ public void handlesInvalidAttributeNames() { assertEquals("", xml); } + @Test + public void htmlInputDocMaintainsHtmlAttributeNames() { + String html = "

unicode attr names

"; + org.jsoup.nodes.Document jsoupDoc; + jsoupDoc = Jsoup.parse(html); + + Document w3Doc = W3CDom.convert(jsoupDoc); + String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml()); + String expected = "

unicode attr names

"; + assertEquals(expected, TextUtil.stripNewlines(out)); + } + + @Test + public void xmlInputDocMaintainsHtmlAttributeNames() { + String html = "

unicode attr names coerced

"; + org.jsoup.nodes.Document jsoupDoc; + jsoupDoc = Jsoup.parse(html); + jsoupDoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml); + + Document w3Doc = W3CDom.convert(jsoupDoc); + String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml()); + String expected = "

unicode attr names coerced

"; + assertEquals(expected, TextUtil.stripNewlines(out)); + } + @Test public void handlesInvalidTagAsText() { org.jsoup.nodes.Document jsoup = Jsoup.parse("<インセンティブで高収入!>Text

More

");