Pretty-print doctypes on a newline

Fixes #1664
jhy · Jun 19, 2022 · 67b48dd · 67b48dd
1 parent 8733445
commit 67b48dd
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 1 deletion.
diff --git a/CHANGES b/CHANGES
@@ -12,6 +12,9 @@ jsoup changelog
     a null if there is no match, will throw an IllegalArgumentException. This is useful if you want to simply abort
     processing if an expected match is not found.
 
+  * Improvement: when pretty-printing HTML, doctypes are emitted on a newline if there is a preceding comment.
+    <https://github.com/jhy/jsoup/pull/1664>
+
   * Bugfix: when using the readToByteBuffer method, such as in Connection.Response.body(), if the document has not
     already been parsed and must be read fully, and there is any maximum buffer size being applied, only the default
     internal buffer size is read.

diff --git a/src/main/java/org/jsoup/nodes/DocumentType.java b/src/main/java/org/jsoup/nodes/DocumentType.java
@@ -78,6 +78,10 @@ public String nodeName() {
 
     @Override
     void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
+        // add a newline if the doctype has a preceding node (which must be a comment)
+        if (siblingIndex > 0 && out.prettyPrint())
+            accum.append('\n');
+
         if (out.syntax() == Syntax.html && !has(PUBLIC_ID) && !has(SYSTEM_ID)) {
             // looks like a html5 doctype, go lowercase for aesthetics
             accum.append("<!doctype");

diff --git a/src/test/java/org/jsoup/nodes/ElementTest.java b/src/test/java/org/jsoup/nodes/ElementTest.java
@@ -2284,4 +2284,25 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) {
 
         assertEquals("<p><span>One</span> <span>Two</span> <span>Three</span></p>", body.html());
     }
+
+    @Test void doctypeIsPrettyPrinted() { // checks where the doctype lands in serialized output, with and without a leading comment
+        // resolves underlying issue raised in https://github.com/jhy/jsoup/pull/1664
+        Document doc1 = Jsoup.parse("<!--\nlicense\n-->\n \n<!doctype html>\n<html>"); // comment, whitespace, then doctype
+        Document doc2 = Jsoup.parse("\n  <!doctype html><html>"); // leading whitespace only, no comment
+        Document doc3 = Jsoup.parse("<!doctype html>\n<html>"); // doctype is the very first input
+        Document doc4 = Jsoup.parse("\n<!doctype html>\n<html>"); // leading newline only, no comment
+        Document doc5 = Jsoup.parse("\n<!--\n comment \n -->  <!doctype html>\n<html>"); // newline, comment, spaces, doctype
+        Document doc6 = Jsoup.parse("<!--\n comment \n -->  <!doctype html>\n<html>"); // comment, spaces, doctype
+
+        assertEquals("<!--\nlicense\n-->\n<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc1.html()); // pretty-print (not disabled yet): doctype on its own line after the comment
+        doc1.outputSettings().prettyPrint(false); // from here on doc1 is serialized without pretty-printing
+        assertEquals("<!--\nlicense\n--><!doctype html>\n<html><head></head><body></body></html>", doc1.html()); // no newline is inserted before the doctype now
+        // note that the input whitespace between the comment and the doctype is not retained (original note: "in Initial state" - presumably the parser state that drops it; confirm)
+
+        assertEquals("<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc2.html()); // no preceding comment: output starts with the doctype
+        assertEquals("<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc3.html()); // same expected output as doc2
+        assertEquals("<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc4.html()); // same expected output as doc2
+        assertEquals("<!--\n comment \n -->\n<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc5.html()); // comment first, then the doctype on a new line
+        assertEquals("<!--\n comment \n -->\n<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc6.html()); // same expected output as doc5
+    }
 }
diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -1246,7 +1246,7 @@ public void testInvalidTableContents() throws IOException {
         File in = ParseTest.getFile("/htmltests/comments.html");
         Document doc = Jsoup.parse(in, "UTF-8");
 
-        assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --><!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"> <!-- now --> <head> <!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1&gt; (There is a UTF8 hidden BOM at the top of this file.) </body> </html>",
+        assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --> <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"> <!-- now --> <head> <!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1&gt; (There is a UTF8 hidden BOM at the top of this file.) </body> </html>",
             StringUtil.normaliseWhitespace(doc.html()));
 
         assertEquals("A Certain Kind of Test", doc.head().select("title").text());