Trim leading and trailing spaces in blocks when appropriate

Improved whitespace and indenting behaviour when pretty printing HTML. This corrects the indenting level of textnodes (that were indented in the original source), and improves the round-trip parse/output of pretty-printed HTML. Trims leading and trailing whitespace for block elements when that whitespace is not part of the text flow. As an implementation note: the pretty-printing code is getting too dispersed in the various node methods. Particularly, some aspects are not symmetric (e.g. elements and textnode indent, or head and tail). It would be a useful refactoring to make a pretty-print class and contain all this logic, and make the traversor use that and include a little more state (if last indented, if this is meaningful whitespace, etc). Fixes #1798
jhy · Jun 24, 2022 · fc41ec9 · fc41ec9
1 parent 67b48dd
commit fc41ec9
Show file tree

Hide file tree

Showing 11 changed files with 86 additions and 47 deletions.
diff --git a/CHANGES b/CHANGES
@@ -12,9 +12,13 @@ jsoup changelog
     a null if there is no match, will throw an IllegalArgumentException. This is useful if you want to simply abort
     processing if an expected match is not found.
 
-  * Improvement: when pretty-printing HTML, doctypes are emitted on a newline if there is a preceeding comment.
+  * Improvement: when pretty-printing HTML, doctypes are emitted on a newline if there is a preceding comment.
     <https://github.com/jhy/jsoup/pull/1664>
 
+  * Improvement: when pretty-printing, trim the leading and trailing spaces of textnodes in block tags when possible,
+    so that they are indented correctly.
+    <https://github.com/jhy/jsoup/issues/1798>
+
   * Bugfix: when using the readToByteBuffer method, such as in Connection.Response.body(), if the document has not
     already been parsed and must be read fully, and there is any maximum buffer size being applied, only the default
     internal buffer size is read.

diff --git a/src/main/java/org/jsoup/nodes/Attribute.java b/src/main/java/org/jsoup/nodes/Attribute.java
@@ -142,7 +142,7 @@ static void htmlNoValidate(String key, @Nullable String val, Appendable accum, D
         accum.append(key);
         if (!shouldCollapseAttribute(key, val, out)) {
             accum.append("=\"");
-            Entities.escape(accum, Attributes.checkNotNull(val) , out, true, false, false);
+            Entities.escape(accum, Attributes.checkNotNull(val) , out, true, false, false, false);
             accum.append('"');
         }
     }

diff --git a/src/main/java/org/jsoup/nodes/Entities.java b/src/main/java/org/jsoup/nodes/Entities.java
@@ -142,7 +142,7 @@ public static String escape(String string, OutputSettings out) {
             return "";
         StringBuilder accum = StringUtil.borrowBuilder();
         try {
-            escape(accum, string, out, false, false, false);
+            escape(accum, string, out, false, false, false, false);
         } catch (IOException e) {
             throw new SerializationException(e); // doesn't happen
         }
@@ -160,9 +160,9 @@ public static String escape(String string) {
         return escape(string, DefaultOutput);
     }
 
-    // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations
+    // this method does a lot, but other breakups cause rescanning and stringbuilder generations
     static void escape(Appendable accum, String string, OutputSettings out,
-                       boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) throws IOException {
+                       boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException {
 
         boolean lastWasWhite = false;
         boolean reachedNonWhite = false;
@@ -172,19 +172,28 @@ static void escape(Appendable accum, String string, OutputSettings out,
         final int length = string.length();
 
         int codePoint;
+        boolean skipped = false;
         for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
             codePoint = string.codePointAt(offset);
 
             if (normaliseWhite) {
                 if (StringUtil.isWhitespace(codePoint)) {
-                    if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite)
+                    if (stripLeadingWhite && !reachedNonWhite) continue;
+                    if (lastWasWhite) continue;
+                    if (trimTrailing) {
+                        skipped = true;
                         continue;
+                    }
                     accum.append(' ');
                     lastWasWhite = true;
                     continue;
                 } else {
                     lastWasWhite = false;
                     reachedNonWhite = true;
+                    if (skipped) {
+                        accum.append(' '); // wasn't the end, so need to place a normalized space
+                        skipped = false;
+                    }
                 }
             }
             // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):

diff --git a/src/main/java/org/jsoup/nodes/TextNode.java b/src/main/java/org/jsoup/nodes/TextNode.java
@@ -83,31 +83,27 @@ public TextNode splitText(int offset) {
     void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
         final boolean prettyPrint = out.prettyPrint();
         final Element parent = parentNode instanceof Element ? ((Element) parentNode) : null;
-        final boolean blank = isBlank();
         final boolean normaliseWhite = prettyPrint && !Element.preserveWhitespace(parentNode);
 
-        // if this text is just whitespace, and the next node will cause an indent, skip this text:
-        if (normaliseWhite && blank) {
-            boolean canSkip = false;
+        boolean trimLeading = false;
+        boolean trimTrailing = false;
+        if (normaliseWhite) {
+            trimLeading = (siblingIndex == 0 && parent != null && parent.tag().isBlock()) ||
+                parentNode instanceof Document;
+            trimTrailing = nextSibling() == null && parent != null && parent.tag().isBlock();
+
+            // if this text is just whitespace, and the next node will cause an indent, skip this text:
             Node next = this.nextSibling();
-            if (next instanceof Element) {
-                Element nextEl = (Element) next;
-                canSkip = nextEl.shouldIndent(out);
-            } else if (next == null && parent != null) { // we are the last child, check parent
-                canSkip = parent.shouldIndent(out);
-            } else if (next instanceof TextNode && (((TextNode) next).isBlank())) {
-                // sometimes get a run of textnodes from parser if nodes are re-parented
-                canSkip = true;
-            }
-            if (canSkip)
-                return;
-        }
+            boolean couldSkip = (next instanceof Element && ((Element) next).shouldIndent(out)) // next will indent
+                || (next instanceof TextNode && (((TextNode) next).isBlank())); // next is blank text, from re-parenting
+            if (couldSkip && isBlank()) return;
 
-        if (prettyPrint && ((siblingIndex == 0 && parent != null && parent.tag().formatAsBlock() && !blank) || (out.outline() && siblingNodes().size() > 0 && !blank)))
-            indent(accum, depth, out);
+            if ((siblingIndex == 0 && parent != null && parent.tag().formatAsBlock() && !isBlank()) ||
+                (out.outline() && siblingNodes().size() > 0 && !isBlank()))
+                indent(accum, depth, out);
+        }
 
-        final boolean stripWhite = prettyPrint && parentNode instanceof Document;
-        Entities.escape(accum, coreValue(), out, false, normaliseWhite, stripWhite);
+        Entities.escape(accum, coreValue(), out, false, normaliseWhite, trimLeading, trimTrailing);
     }
 
 	void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {}

diff --git a/src/main/java/org/jsoup/nodes/XmlDeclaration.java b/src/main/java/org/jsoup/nodes/XmlDeclaration.java
@@ -60,7 +60,7 @@ private void getWholeDeclaration(Appendable accum, Document.OutputSettings out)
                 accum.append(key);
                 if (!val.isEmpty()) {
                     accum.append("=\"");
-                    Entities.escape(accum, val, out, true, false, false);
+                    Entities.escape(accum, val, out, true, false, false, false);
                     accum.append('"');
                 }
             }

diff --git a/src/test/java/org/jsoup/nodes/DocumentTest.java b/src/test/java/org/jsoup/nodes/DocumentTest.java
@@ -55,15 +55,15 @@ public class DocumentTest {
     @Test public void testOutputEncoding() {
         Document doc = Jsoup.parse("<p title=π>π & < > </p>");
         // default is utf-8
-        assertEquals("<p title=\"π\">π &amp; &lt; &gt; </p>", doc.body().html());
+        assertEquals("<p title=\"π\">π &amp; &lt; &gt;</p>", doc.body().html());
         assertEquals("UTF-8", doc.outputSettings().charset().name());
 
         doc.outputSettings().charset("ascii");
         assertEquals(Entities.EscapeMode.base, doc.outputSettings().escapeMode());
-        assertEquals("<p title=\"&#x3c0;\">&#x3c0; &amp; &lt; &gt; </p>", doc.body().html());
+        assertEquals("<p title=\"&#x3c0;\">&#x3c0; &amp; &lt; &gt;</p>", doc.body().html());
 
         doc.outputSettings().escapeMode(Entities.EscapeMode.extended);
-        assertEquals("<p title=\"&pi;\">&pi; &amp; &lt; &gt; </p>", doc.body().html());
+        assertEquals("<p title=\"&pi;\">&pi; &amp; &lt; &gt;</p>", doc.body().html());
     }
 
     @Test public void testXhtmlReferences() {

diff --git a/src/test/java/org/jsoup/nodes/ElementTest.java b/src/test/java/org/jsoup/nodes/ElementTest.java
@@ -15,6 +15,7 @@
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.MethodSource;
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
@@ -625,6 +626,7 @@ public void testAddNewText() {
         Document doc = Jsoup.parse("<div id=1><p>Hello</p></div>");
         Element div = doc.getElementById("1");
         div.appendText(" there & now >");
+        assertEquals ("Hello there & now >", div.text());
         assertEquals("<p>Hello</p> there &amp; now &gt;", TextUtil.stripNewlines(div.html()));
     }
 
@@ -2305,4 +2307,31 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) {
         assertEquals("<!--\n comment \n -->\n<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc5.html());
         assertEquals("<!--\n comment \n -->\n<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc6.html());
     }
+
+    @Test void textnodeInBlockIndent() {
+        String html ="<div>\n{{ msg }} \n </div>\n<div>\n{{ msg }} \n </div>";
+        Document doc = Jsoup.parse(html);
+        assertEquals("<div>\n {{ msg }}\n</div>\n<div>\n {{ msg }}\n</div>", doc.body().html());
+    }
+
+    @Test void stripTrailing() {
+        String html = "<p> This <span>is </span>fine. </p>";
+        Document doc = Jsoup.parse(html);
+        assertEquals("<p>This <span>is </span>fine.</p>", doc.body().html());
+    }
+
+    @Test void elementIndentAndSpaceTrims() {
+        String html = "<body><div> <p> One Two </p> <a>  Hello </a><p>\nSome text \n</p>\n </div>";
+        Document doc = Jsoup.parse(html);
+        assertEquals("<div>\n" +
+            " <p>One Two</p> <a> Hello </a>\n" +
+            " <p>Some text</p>\n" +
+            "</div>", doc.body().html());
+    }
+
+    @Test void divAInlineable() {
+        String html = "<body><div> <a>Text</a>";
+        Document doc = Jsoup.parse(html);
+        assertEquals("<div><a>Text</a>\n</div>", doc.body().html());
+    }
 }
diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -158,7 +158,7 @@ public class HtmlParserTest {
 
     @Test public void testSpaceAfterTag() {
         Document doc = Jsoup.parse("<div > <a name=\"top\"></a ><p id=1 >Hello</p></div>");
-        assertEquals("<div> <a name=\"top\"></a><p id=\"1\">Hello</p></div>", TextUtil.stripNewlines(doc.body().html()));
+        assertEquals("<div><a name=\"top\"></a><p id=\"1\">Hello</p></div>", TextUtil.stripNewlines(doc.body().html()));
     }
 
     @Test public void createsDocumentStructure() {
@@ -279,7 +279,7 @@ public class HtmlParserTest {
     @Test public void handlesWhatWgExpensesTableExample() {
         // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#examples-0
         Document doc = Jsoup.parse("<table> <colgroup> <col> <colgroup> <col> <col> <col> <thead> <tr> <th> <th>2008 <th>2007 <th>2006 <tbody> <tr> <th scope=rowgroup> Research and development <td> $ 1,109 <td> $ 782 <td> $ 712 <tr> <th scope=row> Percentage of net sales <td> 3.4% <td> 3.3% <td> 3.7% <tbody> <tr> <th scope=rowgroup> Selling, general, and administrative <td> $ 3,761 <td> $ 2,963 <td> $ 2,433 <tr> <th scope=row> Percentage of net sales <td> 11.6% <td> 12.3% <td> 12.6% </table>");
-        assertEquals("<table><colgroup><col></colgroup><colgroup><col><col><col></colgroup><thead><tr><th></th><th>2008 </th><th>2007 </th><th>2006 </th></tr></thead><tbody><tr><th scope=\"rowgroup\"> Research and development </th><td> $ 1,109 </td><td> $ 782 </td><td> $ 712 </td></tr><tr><th scope=\"row\"> Percentage of net sales </th><td> 3.4% </td><td> 3.3% </td><td> 3.7% </td></tr></tbody><tbody><tr><th scope=\"rowgroup\"> Selling, general, and administrative </th><td> $ 3,761 </td><td> $ 2,963 </td><td> $ 2,433 </td></tr><tr><th scope=\"row\"> Percentage of net sales </th><td> 11.6% </td><td> 12.3% </td><td> 12.6% </td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
+        assertEquals("<table><colgroup><col></colgroup><colgroup><col><col><col></colgroup><thead><tr><th></th><th>2008</th><th>2007</th><th>2006</th></tr></thead><tbody><tr><th scope=\"rowgroup\">Research and development</th><td>$ 1,109</td><td>$ 782</td><td>$ 712</td></tr><tr><th scope=\"row\">Percentage of net sales</th><td>3.4%</td><td>3.3%</td><td>3.7%</td></tr></tbody><tbody><tr><th scope=\"rowgroup\">Selling, general, and administrative</th><td>$ 3,761</td><td>$ 2,963</td><td>$ 2,433</td></tr><tr><th scope=\"row\">Percentage of net sales</th><td>11.6%</td><td>12.3%</td><td>12.6%</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
     }
 
     @Test public void handlesTbodyTable() {
@@ -294,7 +294,7 @@ public class HtmlParserTest {
 
     @Test public void noTableDirectInTable() {
         Document doc = Jsoup.parse("<table> <td>One <td><table><td>Two</table> <table><td>Three");
-        assertEquals("<table><tbody><tr><td>One </td><td><table><tbody><tr><td>Two</td></tr></tbody></table><table><tbody><tr><td>Three</td></tr></tbody></table></td></tr></tbody></table>",
+        assertEquals("<table><tbody><tr><td>One</td><td><table><tbody><tr><td>Two</td></tr></tbody></table><table><tbody><tr><td>Three</td></tr></tbody></table></td></tr></tbody></table>",
             TextUtil.stripNewlines(doc.body().html()));
     }
 
@@ -568,7 +568,7 @@ public class HtmlParserTest {
     @Test public void normalisesDocument() {
         String h = "<!doctype html>One<html>Two<head>Three<link></head>Four<body>Five </body>Six </html>Seven ";
         Document doc = Jsoup.parse(h);
-        assertEquals("<!doctype html><html><head></head><body>OneTwoThree<link>FourFive Six Seven </body></html>",
+        assertEquals("<!doctype html><html><head></head><body>OneTwoThree<link>FourFive Six Seven</body></html>",
             TextUtil.stripNewlines(doc.html()));
     }
 
@@ -599,7 +599,7 @@ public class HtmlParserTest {
     @Test public void testHgroup() {
         // jsoup used to not allow hgroup in h{n}, but that's not in spec, and browsers are OK
         Document doc = Jsoup.parse("<h1>Hello <h2>There <hgroup><h1>Another<h2>headline</hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup>");
-        assertEquals("<h1>Hello </h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup><hgroup><h1>More</h1><p>stuff</p></hgroup></h2>", TextUtil.stripNewlines(doc.body().html()));
+        assertEquals("<h1>Hello</h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup><hgroup><h1>More</h1><p>stuff</p></hgroup></h2>", TextUtil.stripNewlines(doc.body().html()));
     }
 
     @Test public void testRelaxedTags() {
@@ -611,7 +611,7 @@ public class HtmlParserTest {
         // h* tags (h1 .. h9) in browsers can handle any internal content other than other h*. which is not per any
         // spec, which defines them as containing phrasing content only. so, reality over theory.
         Document doc = Jsoup.parse("<h1>Hello <div>There</div> now</h1> <h2>More <h3>Content</h3></h2>");
-        assertEquals("<h1>Hello <div>There</div> now</h1><h2>More </h2><h3>Content</h3>", TextUtil.stripNewlines(doc.body().html()));
+        assertEquals("<h1>Hello <div>There</div> now</h1><h2>More</h2><h3>Content</h3>", TextUtil.stripNewlines(doc.body().html()));
     }
 
     @Test public void testSpanContents() {
@@ -1175,7 +1175,8 @@ public void testInvalidTableContents() throws IOException {
         Document doc = Parser.htmlParser()
             .settings(preserveCase)
             .parseInput(html, "");
-        assertEquals("<A>ONE </A><A>Two</A>", StringUtil.normaliseWhitespace(doc.body().html()));
+        //assertEquals("<A>ONE </A><A>Two</A>", StringUtil.normaliseWhitespace(doc.body().html()));
+        assertEquals("<A>ONE </A><A>Two</A>", doc.body().html());
     }
 
     @Test public void normalizesDiscordantTags() {
@@ -1246,7 +1247,7 @@ public void testInvalidTableContents() throws IOException {
         File in = ParseTest.getFile("/htmltests/comments.html");
         Document doc = Jsoup.parse(in, "UTF-8");
 
-        assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --> <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"> <!-- now --> <head> <!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1&gt; (There is a UTF8 hidden BOM at the top of this file.) </body> </html>",
+        assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --> <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"><!-- now --> <head><!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1&gt; (There is a UTF8 hidden BOM at the top of this file.) </body> </html>",
             StringUtil.normaliseWhitespace(doc.html()));
 
         assertEquals("A Certain Kind of Test", doc.head().select("title").text());
@@ -1486,7 +1487,7 @@ private boolean didAddElements(String input) {
         String html = "<a>\n<b>\n<div>\n<a>test</a>\n</div>\n</b>\n</a>";
         Document doc = Jsoup.parse(html);
         assertNotNull(doc);
-        assertEquals("<a> <b> </b></a><b><div><a></a><a>test</a></div> </b>", TextUtil.stripNewlines(doc.body().html()));
+        assertEquals("<a> <b> </b></a><b><div><a> </a><a>test</a></div> </b>", TextUtil.stripNewlines(doc.body().html()));
     }
 
     @Test public void tagsMustStartWithAscii() {

diff --git a/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java b/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java
@@ -75,9 +75,9 @@ public void nestedAnchorElements01() {
         String s = Jsoup.parse(html).toString();
         assertEquals("<html>\n" +
             " <head></head>\n" +
-            " <body> <a href=\"#1\"> </a>\n" +
+            " <body><a href=\"#1\"> </a>\n" +
             "  <div>\n" +
-            "   <a href=\"#1\"></a><a href=\"#2\">child</a>\n" +
+            "   <a href=\"#1\"> </a><a href=\"#2\">child</a>\n" +
             "  </div>\n" +
             " </body>\n" +
             "</html>", s);
@@ -99,11 +99,11 @@ public void nestedAnchorElements02() {
         String s = Jsoup.parse(html).toString();
         assertEquals("<html>\n" +
             " <head></head>\n" +
-            " <body> <a href=\"#1\"> </a>\n" +
+            " <body><a href=\"#1\"> </a>\n" +
             "  <div>\n" +
-            "   <a href=\"#1\"></a>\n" +
+            "   <a href=\"#1\"> </a>\n" +
             "   <div>\n" +
-            "    <a href=\"#1\"></a><a href=\"#2\">child</a>\n" +
+            "    <a href=\"#1\"> </a><a href=\"#2\">child</a>\n" +
             "   </div>\n" +
             "  </div>\n" +
             " </body>\n" +

diff --git a/src/test/java/org/jsoup/select/TraversorTest.java b/src/test/java/org/jsoup/select/TraversorTest.java
@@ -95,7 +95,7 @@ public FilterResult tail(Node node, int depth) {
                 return ("b".equals(node.nodeName())) ? FilterResult.REMOVE : FilterResult.CONTINUE;
             }
         }, doc.select("div"));
-        assertEquals("<div></div>\n<div>\n There be \n</div>", doc.select("body").html());
+        assertEquals("<div></div>\n<div>\n There be\n</div>", doc.select("body").html());
     }
 
     @Test