Improved newline and whitespace normalization

Fixes #1787 In TextNode, if this is a blank, check the next Element if it will indent. If so, can skip. Previously would check if the parent el would indent, which is wrong for inline elements. Also made empty tags (like <img>) not indent, but inline. And fixed up how whitespace is normalized at the end of an element, and after the body tag.
jhy · Jun 17, 2022 · e714ef1 · e714ef1
1 parent 7c21b2e
commit e714ef1
Show file tree

Hide file tree

Showing 13 changed files with 97 additions and 52 deletions.
diff --git a/CHANGES b/CHANGES
@@ -21,6 +21,11 @@ jsoup changelog
     what should have been preformatted output to instead be a run of text.
     <https://github.com/jhy/jsoup/issues/1776>
 
+  * Bugfix: when pretty-print serializing HTML, newlines separating phrasing content (e.g. a <span> tag within a <p> tag
+    would be incorrectly skipped, instead of normalized to a space. Additionally, improved space normalization between
+    other end of line occurences, and whitespace handling after a closing </body>
+    <https://github.com/jhy/jsoup/issues/1787>
+
 *** Release 1.15.1 [2022-May-15]
   * Change: removed previously deprecated methods and classes (including org.jsoup.safety.Whitelist; use
     org.jsoup.safety.Safelist instead).

diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java
@@ -1842,7 +1842,6 @@ private boolean isFormatAsBlock(Document.OutputSettings out) {
 
     private boolean isInlineable(Document.OutputSettings out) {
         return tag().isInline()
-            && !tag().isEmpty()
             && (parent() == null || parent().isBlock())
             && previousSibling() != null
             && !out.outline();

diff --git a/src/main/java/org/jsoup/nodes/TextNode.java b/src/main/java/org/jsoup/nodes/TextNode.java
@@ -1,7 +1,7 @@
 package org.jsoup.nodes;
 
-import org.jsoup.internal.StringUtil;
 import org.jsoup.helper.Validate;
+import org.jsoup.internal.StringUtil;
 
 import java.io.IOException;
 
@@ -80,17 +80,30 @@ public TextNode splitText(int offset) {
         return tailNode;
     }
 
-	void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
+    void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
         final boolean prettyPrint = out.prettyPrint();
         final Element parent = parentNode instanceof Element ? ((Element) parentNode) : null;
-        final boolean parentIndent = parent != null && parent.shouldIndent(out);
         final boolean blank = isBlank();
         final boolean normaliseWhite = prettyPrint && !Element.preserveWhitespace(parentNode);
 
-        if (normaliseWhite && parentIndent && StringUtil.startsWithNewline(coreValue()) && blank) // we are skippable whitespace
-            return;
-
-        if (prettyPrint && ((siblingIndex == 0 && parent != null && parent.tag().formatAsBlock() && !blank) || (out.outline() && siblingNodes().size()>0 && !blank) ))
+        // if this text is just whitespace, and the next node will cause an indent, skip this text:
+        if (normaliseWhite && blank) {
+            boolean canSkip = false;
+            Node next = this.nextSibling();
+            if (next instanceof Element) {
+                Element nextEl = (Element) next;
+                canSkip = nextEl.shouldIndent(out);
+            } else if (next == null && parent != null) { // we are the last child, check parent
+                canSkip = parent.shouldIndent(out);
+            } else if (next instanceof TextNode && (((TextNode) next).isBlank())) {
+                // sometimes get a run of textnodes from parser if nodes are re-parented
+                canSkip = true;
+            }
+            if (canSkip)
+                return;
+        }
+
+        if (prettyPrint && ((siblingIndex == 0 && parent != null && parent.tag().formatAsBlock() && !blank) || (out.outline() && siblingNodes().size() > 0 && !blank)))
             indent(accum, depth, out);
 
         final boolean stripWhite = prettyPrint && parentNode instanceof Document;

diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
@@ -558,6 +558,14 @@ boolean resetInsertionMode() {
         return state != origState;
     }
 
+    /** Places the body back onto the stack and moves to InBody, for cases in AfterBody / AfterAfterBody when more content comes */
+    void resetBody() {
+        if (!onStack("body")) {
+            stack.add(doc.body());
+        }
+        transition(HtmlTreeBuilderState.InBody);
+    }
+
     // todo: tidy up in specific scope methods
     private String[] specificScopeTarget = {null};
 

diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
@@ -718,6 +718,7 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) {
                         return false;
                     } else {
                         // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html
+                        anyOtherEndTag(t, tb);
                         tb.transition(AfterBody);
                     }
                     break;
@@ -1597,13 +1598,14 @@ boolean process(Token t, HtmlTreeBuilder tb) {
                     tb.error(this);
                     return false;
                 } else {
+                    if (tb.onStack("html")) tb.popStackToClose("html");
                     tb.transition(AfterAfterBody);
                 }
             } else if (t.isEOF()) {
                 // chillax! we're done
             } else {
                 tb.error(this);
-                tb.transition(InBody);
+                tb.resetBody();
                 return tb.process(t);
             }
             return true;
@@ -1688,21 +1690,12 @@ boolean process(Token t, HtmlTreeBuilder tb) {
             } else if (t.isDoctype() || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) {
                 return tb.process(t, InBody);
             } else if (isWhitespace(t)) {
-                // allows space after </html>, and put the body back on stack to allow subsequent tags if any
-                // todo - might be better for </body> and </html> to close them, allow trailing space, and then reparent
-                //  that space into body if other tags get re-added. but that's overkill for now
-                Element html = tb.popStackToClose("html");
                 tb.insert(t.asCharacter());
-                if (html != null) {
-                    tb.stack.add(html);
-                    Element body = html.selectFirst("body");
-                    if (body != null) tb.stack.add(body);
-                }
             }else if (t.isEOF()) {
                 // nice work chuck
             } else {
                 tb.error(this);
-                tb.transition(InBody);
+                tb.resetBody();
                 return tb.process(t);
             }
             return true;

diff --git a/src/test/java/org/jsoup/nodes/DocumentTest.java b/src/test/java/org/jsoup/nodes/DocumentTest.java
@@ -109,11 +109,17 @@ public class DocumentTest {
         Document doc = Jsoup.parse("<title>Hello</title> <p>One<p>Two");
         Document clone = doc.clone();
 
-        assertEquals("<html><head><title>Hello</title> </head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(clone.html()));
+        assertEquals("<html><head><title>Hello</title></head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(clone.html()));
         clone.title("Hello there");
-        clone.select("p").first().text("One more").attr("id", "1");
-        assertEquals("<html><head><title>Hello there</title> </head><body><p id=\"1\">One more</p><p>Two</p></body></html>", TextUtil.stripNewlines(clone.html()));
-        assertEquals("<html><head><title>Hello</title> </head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(doc.html()));
+        clone.expectFirst("p").text("One more").attr("id", "1");
+        assertEquals("<html><head><title>Hello there</title></head><body><p id=\"1\">One more</p><p>Two</p></body></html>", TextUtil.stripNewlines(clone.html()));
+        assertEquals("<html><head><title>Hello</title></head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(doc.html()));
+    }
+
+    @Test void testBasicIndent() {
+        Document doc = Jsoup.parse("<title>Hello</title> <p>One<p>Two");
+        String expect = "<html>\n <head>\n  <title>Hello</title>\n </head>\n <body>\n  <p>One</p>\n  <p>Two</p>\n </body>\n</html>";
+        assertEquals(expect, doc.html());
     }
 
     @Test public void testClonesDeclarations() {

diff --git a/src/test/java/org/jsoup/nodes/ElementTest.java b/src/test/java/org/jsoup/nodes/ElementTest.java
@@ -536,7 +536,7 @@ public void testContainerOutput() {
         Document doc = Jsoup.parse("<title>Hello there</title> <div><p>Hello</p><p>there</p></div> <div>Another</div>");
         assertEquals("<title>Hello there</title>", doc.select("title").first().outerHtml());
         assertEquals("<div>\n <p>Hello</p>\n <p>there</p>\n</div>", doc.select("div").first().outerHtml());
-        assertEquals("<div>\n <p>Hello</p>\n <p>there</p>\n</div> \n<div>\n Another\n</div>", doc.select("body").first().html());
+        assertEquals("<div>\n <p>Hello</p>\n <p>there</p>\n</div>\n<div>\n Another\n</div>", doc.select("body").first().html());
     }
 
     @Test
@@ -2269,4 +2269,19 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) {
         }
         assertTrue(threw);
     }
+
+    @Test void spanRunsMaintainSpace() {
+        // https://github.com/jhy/jsoup/issues/1787
+        Document doc = Jsoup.parse("<p><span>One</span>\n<span>Two</span>\n<span>Three</span></p>");
+        String text = "One Two Three";
+        Element body = doc.body();
+        assertEquals(text, body.text());
+
+        Element p = doc.expectFirst("p");
+        String html = p.html();
+        p.html(html);
+        assertEquals(text, body.text());
+
+        assertEquals("<p><span>One</span> <span>Two</span> <span>Three</span></p>", body.html());
+    }
 }
diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -273,13 +273,13 @@ public class HtmlParserTest {
 
     @Test public void handlesNestedImplicitTable() {
         Document doc = Jsoup.parse("<table><td>1</td></tr> <td>2</td></tr> <td> <table><td>3</td> <td>4</td></table> <tr><td>5</table>");
-        assertEquals("<table><tbody><tr><td>1</td></tr> <tr><td>2</td></tr> <tr><td> <table><tbody><tr><td>3</td> <td>4</td></tr></tbody></table> </td></tr><tr><td>5</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
+        assertEquals("<table><tbody><tr><td>1</td></tr><tr><td>2</td></tr><tr><td><table><tbody><tr><td>3</td><td>4</td></tr></tbody></table></td></tr><tr><td>5</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
     }
 
     @Test public void handlesWhatWgExpensesTableExample() {
         // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#examples-0
         Document doc = Jsoup.parse("<table> <colgroup> <col> <colgroup> <col> <col> <col> <thead> <tr> <th> <th>2008 <th>2007 <th>2006 <tbody> <tr> <th scope=rowgroup> Research and development <td> $ 1,109 <td> $ 782 <td> $ 712 <tr> <th scope=row> Percentage of net sales <td> 3.4% <td> 3.3% <td> 3.7% <tbody> <tr> <th scope=rowgroup> Selling, general, and administrative <td> $ 3,761 <td> $ 2,963 <td> $ 2,433 <tr> <th scope=row> Percentage of net sales <td> 11.6% <td> 12.3% <td> 12.6% </table>");
-        assertEquals("<table> <colgroup> <col> </colgroup><colgroup> <col> <col> <col> </colgroup><thead> <tr> <th> </th><th>2008 </th><th>2007 </th><th>2006 </th></tr></thead><tbody> <tr> <th scope=\"rowgroup\"> Research and development </th><td> $ 1,109 </td><td> $ 782 </td><td> $ 712 </td></tr><tr> <th scope=\"row\"> Percentage of net sales </th><td> 3.4% </td><td> 3.3% </td><td> 3.7% </td></tr></tbody><tbody> <tr> <th scope=\"rowgroup\"> Selling, general, and administrative </th><td> $ 3,761 </td><td> $ 2,963 </td><td> $ 2,433 </td></tr><tr> <th scope=\"row\"> Percentage of net sales </th><td> 11.6% </td><td> 12.3% </td><td> 12.6% </td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
+        assertEquals("<table><colgroup><col></colgroup><colgroup><col><col><col></colgroup><thead><tr><th></th><th>2008 </th><th>2007 </th><th>2006 </th></tr></thead><tbody><tr><th scope=\"rowgroup\"> Research and development </th><td> $ 1,109 </td><td> $ 782 </td><td> $ 712 </td></tr><tr><th scope=\"row\"> Percentage of net sales </th><td> 3.4% </td><td> 3.3% </td><td> 3.7% </td></tr></tbody><tbody><tr><th scope=\"rowgroup\"> Selling, general, and administrative </th><td> $ 3,761 </td><td> $ 2,963 </td><td> $ 2,433 </td></tr><tr><th scope=\"row\"> Percentage of net sales </th><td> 11.6% </td><td> 12.3% </td><td> 12.6% </td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
     }
 
     @Test public void handlesTbodyTable() {
@@ -294,7 +294,7 @@ public class HtmlParserTest {
 
     @Test public void noTableDirectInTable() {
         Document doc = Jsoup.parse("<table> <td>One <td><table><td>Two</table> <table><td>Three");
-        assertEquals("<table> <tbody><tr><td>One </td><td><table><tbody><tr><td>Two</td></tr></tbody></table> <table><tbody><tr><td>Three</td></tr></tbody></table></td></tr></tbody></table>",
+        assertEquals("<table><tbody><tr><td>One </td><td><table><tbody><tr><td>Two</td></tr></tbody></table><table><tbody><tr><td>Three</td></tr></tbody></table></td></tr></tbody></table>",
             TextUtil.stripNewlines(doc.body().html()));
     }
 
@@ -472,7 +472,7 @@ public class HtmlParserTest {
         // if a known tag, allow self closing outside of spec, but force an end tag. unknown tags can be self closing.
         String h = "<div id='1' /><script src='/foo' /><div id=2><img /><img></div><a id=3 /><i /><foo /><foo>One</foo> <hr /> hr text <hr> hr text two";
         Document doc = Jsoup.parse(h);
-        assertEquals("<div id=\"1\"></div><script src=\"/foo\"></script><div id=\"2\"><img><img></div><a id=\"3\"></a><i></i><foo /><foo>One</foo> <hr> hr text <hr> hr text two", TextUtil.stripNewlines(doc.body().html()));
+        assertEquals("<div id=\"1\"></div><script src=\"/foo\"></script><div id=\"2\"><img><img></div><a id=\"3\"></a><i></i><foo /><foo>One</foo><hr> hr text <hr> hr text two", TextUtil.stripNewlines(doc.body().html()));
     }
 
     @Test public void handlesKnownEmptyNoFrames() {
@@ -599,7 +599,7 @@ public class HtmlParserTest {
     @Test public void testHgroup() {
         // jsoup used to not allow hgroup in h{n}, but that's not in spec, and browsers are OK
         Document doc = Jsoup.parse("<h1>Hello <h2>There <hgroup><h1>Another<h2>headline</hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup>");
-        assertEquals("<h1>Hello </h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup></h2>", TextUtil.stripNewlines(doc.body().html()));
+        assertEquals("<h1>Hello </h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup><hgroup><h1>More</h1><p>stuff</p></hgroup></h2>", TextUtil.stripNewlines(doc.body().html()));
     }
 
     @Test public void testRelaxedTags() {
@@ -611,7 +611,7 @@ public class HtmlParserTest {
         // h* tags (h1 .. h9) in browsers can handle any internal content other than other h*. which is not per any
         // spec, which defines them as containing phrasing content only. so, reality over theory.
         Document doc = Jsoup.parse("<h1>Hello <div>There</div> now</h1> <h2>More <h3>Content</h3></h2>");
-        assertEquals("<h1>Hello <div>There</div> now</h1> <h2>More </h2><h3>Content</h3>", TextUtil.stripNewlines(doc.body().html()));
+        assertEquals("<h1>Hello <div>There</div> now</h1><h2>More </h2><h3>Content</h3>", TextUtil.stripNewlines(doc.body().html()));
     }
 
     @Test public void testSpanContents() {
@@ -720,7 +720,7 @@ public class HtmlParserTest {
         // and the <i> inside the table and does not leak out.
         String h = "<p><b>One</p> <table><tr><td><p><i>Three<p>Four</i></td></tr></table> <p>Five</p>";
         Document doc = Jsoup.parse(h);
-        String want = "<p><b>One</b></p><b> \n" +
+        String want = "<p><b>One</b></p><b>\n" +
             " <table>\n" +
             "  <tbody>\n" +
             "   <tr>\n" +
@@ -1246,7 +1246,7 @@ public void testInvalidTableContents() throws IOException {
         File in = ParseTest.getFile("/htmltests/comments.html");
         Document doc = Jsoup.parse(in, "UTF-8");
 
-        assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --><!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"><!-- now --> <head><!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1&gt; (There is a UTF8 hidden BOM at the top of this file.) </body> </html>",
+        assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --><!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"> <!-- now --> <head> <!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1&gt; (There is a UTF8 hidden BOM at the top of this file.) </body> </html>",
             StringUtil.normaliseWhitespace(doc.html()));
 
         assertEquals("A Certain Kind of Test", doc.head().select("title").text());
@@ -1399,15 +1399,14 @@ public void testUNewlines() {
         String html = "\n<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n</body>\n</html>\n";
         Document doc = Jsoup.parse(html);
         doc.outputSettings().prettyPrint(false);
-        assertEquals("<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n\n</body></html>\n", doc.outerHtml());
+        assertEquals("<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n</body>\n</html>\n", doc.outerHtml());
     }
 
     @Test public void handleContentAfterBody() {
         String html = "<body>One</body>  <p>Hello!</p></html> <p>There</p>";
-        // todo - ideally would move that space afer /html to the body when the There <p> is seen
         Document doc = Jsoup.parse(html);
         doc.outputSettings().prettyPrint(false);
-        assertEquals("<html><head></head><body>One  <p>Hello!</p><p>There</p></body></html> ", doc.outerHtml());
+        assertEquals("<html><head></head><body>One<p>Hello!</p><p>There</p></body>  </html> ", doc.outerHtml());
     }
 
     @Test public void preservesTabs() {
@@ -1487,7 +1486,7 @@ private boolean didAddElements(String input) {
         String html = "<a>\n<b>\n<div>\n<a>test</a>\n</div>\n</b>\n</a>";
         Document doc = Jsoup.parse(html);
         assertNotNull(doc);
-        assertEquals("<a><b> </b></a><b><div><a></a><a>test</a></div> </b>", TextUtil.stripNewlines(doc.body().html()));
+        assertEquals("<a> <b> </b></a><b><div><a></a><a>test</a></div> </b>", TextUtil.stripNewlines(doc.body().html()));
     }
 
     @Test public void tagsMustStartWithAscii() {

diff --git a/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java b/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java
@@ -75,7 +75,7 @@ public void nestedAnchorElements01() {
         String s = Jsoup.parse(html).toString();
         assertEquals("<html>\n" +
             " <head></head>\n" +
-            " <body><a href=\"#1\"> </a>\n" +
+            " <body> <a href=\"#1\"> </a>\n" +
             "  <div>\n" +
             "   <a href=\"#1\"></a><a href=\"#2\">child</a>\n" +
             "  </div>\n" +
@@ -99,7 +99,7 @@ public void nestedAnchorElements02() {
         String s = Jsoup.parse(html).toString();
         assertEquals("<html>\n" +
             " <head></head>\n" +
-            " <body><a href=\"#1\"> </a>\n" +
+            " <body> <a href=\"#1\"> </a>\n" +
             "  <div>\n" +
             "   <a href=\"#1\"></a>\n" +
             "   <div>\n" +

diff --git a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
@@ -94,7 +94,7 @@ public void testSupplyParserToDataStream() throws IOException, URISyntaxExceptio
     public void testDoesNotForceSelfClosingKnownTags() {
         // html will force "<br>one</br>" to logically "<br />One<br />". XML should be stay "<br>one</br> -- don't recognise tag.
         Document htmlDoc = Jsoup.parse("<br>one</br>");
-        assertEquals("<br>one\n<br>", htmlDoc.body().html());
+        assertEquals("<br>one<br>", htmlDoc.body().html());
 
         Document xmlDoc = Jsoup.parse("<br>one</br>", "", Parser.xmlParser());
         assertEquals("<br>one</br>", xmlDoc.html());