From c0c2e70d30f4a4b8aebb9fd6a426528a86323d2e Mon Sep 17 00:00:00 2001 From: Jairam Chandar Date: Wed, 29 Sep 2021 13:26:02 +0100 Subject: [PATCH 1/4] Allow attributes valid in html when converting When parsing and converting an html document, the "syntax" was hard-coded to xml. This PR checks the document type of the output document and uses that to determine which attributes are valid. --- src/main/java/org/jsoup/helper/W3CDom.java | 13 +++++++++++-- src/test/java/org/jsoup/helper/W3CDomTest.java | 14 ++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java index 8d38d74f1c..26c27e12ed 100644 --- a/src/main/java/org/jsoup/helper/W3CDom.java +++ b/src/main/java/org/jsoup/helper/W3CDom.java @@ -38,10 +38,12 @@ import java.util.Map; import java.util.Properties; import java.util.Stack; -import java.util.regex.Pattern; +import java.util.Locale; import static javax.xml.transform.OutputKeys.METHOD; +import static org.jsoup.nodes.Document.OutputSettings.Syntax.html; import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml; +import static org.jsoup.nodes.Document.OutputSettings.Syntax; /** * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document}, @@ -342,8 +344,15 @@ public void tail(org.jsoup.nodes.Node source, int depth) { } private void copyAttributes(org.jsoup.nodes.Node source, Element el) { + final Syntax syntax; + if (this.doc.getDoctype() != null && + this.doc.getDoctype().getName().toLowerCase(Locale.ROOT).equals("html")) { + syntax = html; + } else { + syntax = xml; + } for (Attribute attribute : source.attributes()) { - String key = Attribute.getValidKey(attribute.getKey(), xml); + String key = Attribute.getValidKey(attribute.getKey(), syntax); if (key != null) { // null if couldn't be coerced to validity el.setAttribute(key, attribute.getValue()); } diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java index 1c172a4dd0..89131f3ad9 100644 --- a/src/test/java/org/jsoup/helper/W3CDomTest.java +++ b/src/test/java/org/jsoup/helper/W3CDomTest.java @@ -190,6 +190,20 @@ public void handlesInvalidAttributeNames() { assertEquals("", xml); } + @Test + public void handlesAccentedCharsAttributeNames() { + String html = "

unicode attr names

"; + org.jsoup.nodes.Document jsoupDoc; + jsoupDoc = Jsoup.parse(html); + Element body = jsoupDoc.select("body").first(); + assertTrue(body.hasAttr("\"")); // actually an attribute with key '"'. Correct per HTML5 spec, but w3c xml dom doesn't dig it + assertTrue(body.hasAttr("name\"")); + + Document w3Doc = W3CDom.convert(jsoupDoc); + String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml()); + assertEquals("\n

unicode attr names

", out); + } + @Test public void handlesInvalidTagAsText() { org.jsoup.nodes.Document jsoup = Jsoup.parse("<インセンティブで高収入!>Text

More

"); From 15a2bca20caac6f8c7d2f00e999a245f7ec978a6 Mon Sep 17 00:00:00 2001 From: Jairam Chandar Date: Tue, 5 Oct 2021 13:53:21 +0100 Subject: [PATCH 2/4] Pass Parser settings when converting to w3c Doc --- src/main/java/org/jsoup/helper/W3CDom.java | 37 +++++++++++++--------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java index 26c27e12ed..7b499917e7 100644 --- a/src/main/java/org/jsoup/helper/W3CDom.java +++ b/src/main/java/org/jsoup/helper/W3CDom.java @@ -38,11 +38,8 @@ import java.util.Map; import java.util.Properties; import java.util.Stack; -import java.util.Locale; import static javax.xml.transform.OutputKeys.METHOD; -import static org.jsoup.nodes.Document.OutputSettings.Syntax.html; -import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml; import static org.jsoup.nodes.Document.OutputSettings.Syntax; /** @@ -223,8 +220,16 @@ public void convert(org.jsoup.nodes.Element in, Document out) { out.setDocumentURI(inDoc.location()); } - org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.child(0) : in; // skip the #root node if a Document - NodeTraversor.traverse(new W3CBuilder(out), rootEl); + final org.jsoup.nodes.Document.OutputSettings outputSettings; + final org.jsoup.nodes.Element rootEl; + if (in instanceof org.jsoup.nodes.Document) { + outputSettings = ((org.jsoup.nodes.Document) in).outputSettings(); + rootEl = in.child(0); // skip the #root node if a Document + } else { + outputSettings = new org.jsoup.nodes.Document.OutputSettings(); + rootEl = in; + } + NodeTraversor.traverse(new W3CBuilder(out, outputSettings), rootEl); } public NodeList selectXpath(String xpath, Document doc) { @@ -284,11 +289,13 @@ protected static class W3CBuilder implements NodeVisitor { private final Document doc; private final Stack> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn private Node dest; + private final org.jsoup.nodes.Document.OutputSettings outputSettings; // the outputsettings used by the parser of original jsoup Document - public W3CBuilder(Document doc) { + public W3CBuilder(Document doc, org.jsoup.nodes.Document.OutputSettings outputSettings) { this.doc = doc; this.namespacesStack.push(new HashMap<>()); this.dest = doc; + this.outputSettings = outputSettings; } public void head(org.jsoup.nodes.Node source, int depth) { @@ -308,7 +315,7 @@ public void head(org.jsoup.nodes.Node source, int depth) { Element el = namespace == null && tagName.contains(":") ? doc.createElementNS("", tagName) : // doesn't have a real namespace defined doc.createElementNS(namespace, tagName); - copyAttributes(sourceEl, el); + copyAttributes(sourceEl, el, outputSettings.syntax()); append(el, sourceEl); dest = el; // descend } catch (DOMException e) { @@ -343,14 +350,14 @@ public void tail(org.jsoup.nodes.Node source, int depth) { namespacesStack.pop(); } - private void copyAttributes(org.jsoup.nodes.Node source, Element el) { - final Syntax syntax; - if (this.doc.getDoctype() != null && - this.doc.getDoctype().getName().toLowerCase(Locale.ROOT).equals("html")) { - syntax = html; - } else { - syntax = xml; - } + private void copyAttributes(org.jsoup.nodes.Node source, Element el, Syntax syntax) { +// final Syntax syntax; +// if (this.doc.getDoctype() != null && +// this.doc.getDoctype().getName().toLowerCase(Locale.ROOT).equals("html")) { +// syntax = html; +// } else { +// syntax = xml; +// } for (Attribute attribute : source.attributes()) { String key = Attribute.getValidKey(attribute.getKey(), syntax); if (key != null) { // null if couldn't be coerced to validity From 68456f833b5a1ed2676549785734266a55850361 Mon Sep 17 00:00:00 2001 From: Jairam Chandar Date: Tue, 5 Oct 2021 13:56:26 +0100 Subject: [PATCH 3/4] Ignore newlines in tests On windows, DOM will write newlines as \r\n --- src/test/java/org/jsoup/helper/W3CDomTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java index 89131f3ad9..7d5379dccc 100644 --- a/src/test/java/org/jsoup/helper/W3CDomTest.java +++ b/src/test/java/org/jsoup/helper/W3CDomTest.java @@ -201,7 +201,8 @@ public void handlesAccentedCharsAttributeNames() { Document w3Doc = W3CDom.convert(jsoupDoc); String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml()); - assertEquals("\n

unicode attr names

", out); + String expected = "

unicode attr names

"; + assertEquals(expected, TextUtil.stripNewlines(out)); // on windows, DOM will write newlines as \r\n } @Test From c0f4d83086089a121e1ac651bae622e2a7aac753 Mon Sep 17 00:00:00 2001 From: jhy Date: Wed, 6 Oct 2021 21:50:12 +1100 Subject: [PATCH 4/4] Maintain current public method signature To preserve backcompat --- CHANGES | 6 +++- src/main/java/org/jsoup/helper/W3CDom.java | 34 ++++++------------- .../java/org/jsoup/helper/W3CDomTest.java | 24 +++++++++---- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/CHANGES b/CHANGES index be9fc196ce..17cfe0f48b 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,10 @@ jsoup changelog -*** Release 1.14.3 [PENDING] +*** Release 1.15.1 [PENDING] + * Improvement: when converting jsoup Documents to W3C Documents in W3CDom, preserve HTML valid attribute names if the + input document is using the HTML syntax. (Previously, would always coerce using the more restrictive XML syntax.) + +*** Release 1.14.3 [2021-Sep-30] * Improvement: added native XPath support in Element#selectXpath(String) diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java index 7b499917e7..3bb1e8c0ab 100644 --- a/src/main/java/org/jsoup/helper/W3CDom.java +++ b/src/main/java/org/jsoup/helper/W3CDom.java @@ -214,22 +214,16 @@ public void convert(org.jsoup.nodes.Document in, Document out) { * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element) */ public void convert(org.jsoup.nodes.Element in, Document out) { + W3CBuilder builder = new W3CBuilder(out); org.jsoup.nodes.Document inDoc = in.ownerDocument(); if (inDoc != null) { - if (!StringUtil.isBlank(inDoc.location())) + if (!StringUtil.isBlank(inDoc.location())) { out.setDocumentURI(inDoc.location()); + } + builder.syntax = inDoc.outputSettings().syntax(); } - - final org.jsoup.nodes.Document.OutputSettings outputSettings; - final org.jsoup.nodes.Element rootEl; - if (in instanceof org.jsoup.nodes.Document) { - outputSettings = ((org.jsoup.nodes.Document) in).outputSettings(); - rootEl = in.child(0); // skip the #root node if a Document - } else { - outputSettings = new org.jsoup.nodes.Document.OutputSettings(); - rootEl = in; - } - NodeTraversor.traverse(new W3CBuilder(out, outputSettings), rootEl); + org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.child(0) : in; // skip the #root node if a Document + NodeTraversor.traverse(builder, rootEl); } public NodeList selectXpath(String xpath, Document doc) { @@ -289,13 +283,12 @@ protected static class W3CBuilder implements NodeVisitor { private final Document doc; private final Stack> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn private Node dest; - private final org.jsoup.nodes.Document.OutputSettings outputSettings; // the outputsettings used by the parser of original jsoup Document + private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available. - public W3CBuilder(Document doc, org.jsoup.nodes.Document.OutputSettings outputSettings) { + public W3CBuilder(Document doc) { this.doc = doc; this.namespacesStack.push(new HashMap<>()); this.dest = doc; - this.outputSettings = outputSettings; } public void head(org.jsoup.nodes.Node source, int depth) { @@ -315,7 +308,7 @@ public void head(org.jsoup.nodes.Node source, int depth) { Element el = namespace == null && tagName.contains(":") ? doc.createElementNS("", tagName) : // doesn't have a real namespace defined doc.createElementNS(namespace, tagName); - copyAttributes(sourceEl, el, outputSettings.syntax()); + copyAttributes(sourceEl, el); append(el, sourceEl); dest = el; // descend } catch (DOMException e) { @@ -350,14 +343,7 @@ public void tail(org.jsoup.nodes.Node source, int depth) { namespacesStack.pop(); } - private void copyAttributes(org.jsoup.nodes.Node source, Element el, Syntax syntax) { -// final Syntax syntax; -// if (this.doc.getDoctype() != null && -// this.doc.getDoctype().getName().toLowerCase(Locale.ROOT).equals("html")) { -// syntax = html; -// } else { -// syntax = xml; -// } + private void copyAttributes(org.jsoup.nodes.Node source, Element el) { for (Attribute attribute : source.attributes()) { String key = Attribute.getValidKey(attribute.getKey(), syntax); if (key != null) { // null if couldn't be coerced to validity diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java index 7d5379dccc..f1ff904454 100644 --- a/src/test/java/org/jsoup/helper/W3CDomTest.java +++ b/src/test/java/org/jsoup/helper/W3CDomTest.java @@ -191,18 +191,28 @@ public void handlesInvalidAttributeNames() { } @Test - public void handlesAccentedCharsAttributeNames() { - String html = "

unicode attr names

"; + public void htmlInputDocMaintainsHtmlAttributeNames() { + String html = "

unicode attr names

"; org.jsoup.nodes.Document jsoupDoc; jsoupDoc = Jsoup.parse(html); - Element body = jsoupDoc.select("body").first(); - assertTrue(body.hasAttr("\"")); // actually an attribute with key '"'. Correct per HTML5 spec, but w3c xml dom doesn't dig it - assertTrue(body.hasAttr("name\"")); Document w3Doc = W3CDom.convert(jsoupDoc); String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml()); - String expected = "

unicode attr names

"; - assertEquals(expected, TextUtil.stripNewlines(out)); // on windows, DOM will write newlines as \r\n + String expected = "

unicode attr names

"; + assertEquals(expected, TextUtil.stripNewlines(out)); + } + + @Test + public void xmlInputDocMaintainsHtmlAttributeNames() { + String html = "

unicode attr names coerced

"; + org.jsoup.nodes.Document jsoupDoc; + jsoupDoc = Jsoup.parse(html); + jsoupDoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml); + + Document w3Doc = W3CDom.convert(jsoupDoc); + String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml()); + String expected = "

unicode attr names coerced

"; + assertEquals(expected, TextUtil.stripNewlines(out)); } @Test