Skip to content

Commit

Permalink
Pretty-print doctypes on a newline
Browse files Browse the repository at this point in the history
Fixes #1664
  • Loading branch information
jhy committed Jun 19, 2022
1 parent 8733445 commit 67b48dd
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 1 deletion.
3 changes: 3 additions & 0 deletions CHANGES
Expand Up @@ -12,6 +12,9 @@ jsoup changelog
a null if there is no match, will throw an IllegalArgumentException. This is useful if you want to simply abort
processing if an expected match is not found.

* Improvement: when pretty-printing HTML, doctypes are emitted on a newline if there is a preceding comment.
<https://github.com/jhy/jsoup/pull/1664>

* Bugfix: when using the readToByteBuffer method, such as in Connection.Response.body(), if the document has not
already been parsed and must be read fully, and there is any maximum buffer size being applied, only the default
internal buffer size is read.
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/org/jsoup/nodes/DocumentType.java
Expand Up @@ -78,6 +78,10 @@ public String nodeName() {

@Override
void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
// add a newline if the doctype has a preceding node (which must be a comment)
if (siblingIndex > 0 && out.prettyPrint())
accum.append('\n');

if (out.syntax() == Syntax.html && !has(PUBLIC_ID) && !has(SYSTEM_ID)) {
// looks like a html5 doctype, go lowercase for aesthetics
accum.append("<!doctype");
Expand Down
21 changes: 21 additions & 0 deletions src/test/java/org/jsoup/nodes/ElementTest.java
Expand Up @@ -2284,4 +2284,25 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) {

assertEquals("<p><span>One</span> <span>Two</span> <span>Three</span></p>", body.html());
}

/**
 * Checks how a doctype is serialized by {@code Document.html()} when the input has a comment and/or
 * whitespace before the doctype, both with pretty-printing left as-is and with it switched off.
 */
@Test void doctypeIsPrettyPrinted() {
// resolves underlying issue raised in https://github.com/jhy/jsoup/pull/1664
// doc1, doc5 and doc6 place a comment before the doctype; doc2, doc3 and doc4 have at most leading whitespace
Document doc1 = Jsoup.parse("<!--\nlicense\n-->\n \n<!doctype html>\n<html>");
Document doc2 = Jsoup.parse("\n <!doctype html><html>");
Document doc3 = Jsoup.parse("<!doctype html>\n<html>");
Document doc4 = Jsoup.parse("\n<!doctype html>\n<html>");
Document doc5 = Jsoup.parse("\n<!--\n comment \n --> <!doctype html>\n<html>");
Document doc6 = Jsoup.parse("<!--\n comment \n --> <!doctype html>\n<html>");

// pretty-printing has not been disabled yet: expect the doctype on its own line directly after the comment
assertEquals("<!--\nlicense\n-->\n<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc1.html());
// order matters: the same doc1 is re-serialized after turning pretty-printing off
doc1.outputSettings().prettyPrint(false);
// with pretty-printing off, no newline is expected between the comment and the doctype
assertEquals("<!--\nlicense\n--><!doctype html>\n<html><head></head><body></body></html>", doc1.html());
// note that the whitespace between the comment and the doctype is not retained, in Initial state

// no preceding comment: the output is expected to start with the doctype, whatever leading whitespace the input had
assertEquals("<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc2.html());
assertEquals("<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc3.html());
assertEquals("<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc4.html());
// comment followed by a doctype on the same input line: the doctype is still expected on a new line,
// with or without a newline before the comment in the input
assertEquals("<!--\n comment \n -->\n<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc5.html());
assertEquals("<!--\n comment \n -->\n<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc6.html());
}
}
2 changes: 1 addition & 1 deletion src/test/java/org/jsoup/parser/HtmlParserTest.java
Expand Up @@ -1246,7 +1246,7 @@ public void testInvalidTableContents() throws IOException {
File in = ParseTest.getFile("/htmltests/comments.html");
Document doc = Jsoup.parse(in, "UTF-8");

assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --><!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"> <!-- now --> <head> <!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1&gt; (There is a UTF8 hidden BOM at the top of this file.) </body> </html>",
assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --> <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"> <!-- now --> <head> <!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1&gt; (There is a UTF8 hidden BOM at the top of this file.) </body> </html>",
StringUtil.normaliseWhitespace(doc.html()));

assertEquals("A Certain Kind of Test", doc.head().select("title").text());
Expand Down

0 comments on commit 67b48dd

Please sign in to comment.