Skip to content

Commit

Permalink
Trim leading and trailing spaces in blocks when appropriate
Browse files Browse the repository at this point in the history
Improved whitespace and indenting behaviour when pretty printing HTML. This corrects the indenting level of textnodes (that were indented in the original source), and improves the round-trip parse/output of pretty-printed HTML.

Trims leading and trailing whitespace for block elements when that whitespace is not part of the text flow.

As an implementation note: the pretty-printing code is getting too dispersed in the various node methods. Particularly, some aspects are not symmetric (e.g. elements and textnode indent, or head and tail). It would be a useful refactoring to make a pretty-print class and contain all this logic, and make the traversor use that and include a little more state (if last indented, if this is meaningful whitespace, etc).

Fixes #1798
  • Loading branch information
jhy committed Jun 24, 2022
1 parent 67b48dd commit fc41ec9
Show file tree
Hide file tree
Showing 11 changed files with 86 additions and 47 deletions.
6 changes: 5 additions & 1 deletion CHANGES
Expand Up @@ -12,9 +12,13 @@ jsoup changelog
a null if there is no match, will throw an IllegalArgumentException. This is useful if you want to simply abort
processing if an expected match is not found.

* Improvement: when pretty-printing HTML, doctypes are emitted on a newline if there is a preceeding comment.
* Improvement: when pretty-printing HTML, doctypes are emitted on a newline if there is a preceding comment.
<https://github.com/jhy/jsoup/pull/1664>

* Improvement: when pretty-printing, trim the leading and trailing spaces of textnodes in block tags when possible,
so that they are indented correctly.
<https://github.com/jhy/jsoup/issues/1798>

* Bugfix: when using the readToByteBuffer method, such as in Connection.Response.body(), if the document has not
already been parsed and must be read fully, and there is any maximum buffer size being applied, only the default
internal buffer size is read.
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/nodes/Attribute.java
Expand Up @@ -142,7 +142,7 @@ static void htmlNoValidate(String key, @Nullable String val, Appendable accum, D
accum.append(key);
if (!shouldCollapseAttribute(key, val, out)) {
accum.append("=\"");
Entities.escape(accum, Attributes.checkNotNull(val) , out, true, false, false);
Entities.escape(accum, Attributes.checkNotNull(val) , out, true, false, false, false);
accum.append('"');
}
}
Expand Down
17 changes: 13 additions & 4 deletions src/main/java/org/jsoup/nodes/Entities.java
Expand Up @@ -142,7 +142,7 @@ public static String escape(String string, OutputSettings out) {
return "";
StringBuilder accum = StringUtil.borrowBuilder();
try {
escape(accum, string, out, false, false, false);
escape(accum, string, out, false, false, false, false);
} catch (IOException e) {
throw new SerializationException(e); // doesn't happen
}
Expand All @@ -160,9 +160,9 @@ public static String escape(String string) {
return escape(string, DefaultOutput);
}

// this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations
// this method does a lot, but other breakups cause rescanning and stringbuilder generations
static void escape(Appendable accum, String string, OutputSettings out,
boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) throws IOException {
boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException {

boolean lastWasWhite = false;
boolean reachedNonWhite = false;
Expand All @@ -172,19 +172,28 @@ static void escape(Appendable accum, String string, OutputSettings out,
final int length = string.length();

int codePoint;
boolean skipped = false;
for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
codePoint = string.codePointAt(offset);

if (normaliseWhite) {
if (StringUtil.isWhitespace(codePoint)) {
if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite)
if (stripLeadingWhite && !reachedNonWhite) continue;
if (lastWasWhite) continue;
if (trimTrailing) {
skipped = true;
continue;
}
accum.append(' ');
lastWasWhite = true;
continue;
} else {
lastWasWhite = false;
reachedNonWhite = true;
if (skipped) {
accum.append(' '); // wasn't the end, so need to place a normalized space
skipped = false;
}
}
}
// surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
Expand Down
36 changes: 16 additions & 20 deletions src/main/java/org/jsoup/nodes/TextNode.java
Expand Up @@ -83,31 +83,27 @@ public TextNode splitText(int offset) {
void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
final boolean prettyPrint = out.prettyPrint();
final Element parent = parentNode instanceof Element ? ((Element) parentNode) : null;
final boolean blank = isBlank();
final boolean normaliseWhite = prettyPrint && !Element.preserveWhitespace(parentNode);

// if this text is just whitespace, and the next node will cause an indent, skip this text:
if (normaliseWhite && blank) {
boolean canSkip = false;
boolean trimLeading = false;
boolean trimTrailing = false;
if (normaliseWhite) {
trimLeading = (siblingIndex == 0 && parent != null && parent.tag().isBlock()) ||
parentNode instanceof Document;
trimTrailing = nextSibling() == null && parent != null && parent.tag().isBlock();

// if this text is just whitespace, and the next node will cause an indent, skip this text:
Node next = this.nextSibling();
if (next instanceof Element) {
Element nextEl = (Element) next;
canSkip = nextEl.shouldIndent(out);
} else if (next == null && parent != null) { // we are the last child, check parent
canSkip = parent.shouldIndent(out);
} else if (next instanceof TextNode && (((TextNode) next).isBlank())) {
// sometimes get a run of textnodes from parser if nodes are re-parented
canSkip = true;
}
if (canSkip)
return;
}
boolean couldSkip = (next instanceof Element && ((Element) next).shouldIndent(out)) // next will indent
|| (next instanceof TextNode && (((TextNode) next).isBlank())); // next is blank text, from re-parenting
if (couldSkip && isBlank()) return;

if (prettyPrint && ((siblingIndex == 0 && parent != null && parent.tag().formatAsBlock() && !blank) || (out.outline() && siblingNodes().size() > 0 && !blank)))
indent(accum, depth, out);
if ((siblingIndex == 0 && parent != null && parent.tag().formatAsBlock() && !isBlank()) ||
(out.outline() && siblingNodes().size() > 0 && !isBlank()))
indent(accum, depth, out);
}

final boolean stripWhite = prettyPrint && parentNode instanceof Document;
Entities.escape(accum, coreValue(), out, false, normaliseWhite, stripWhite);
Entities.escape(accum, coreValue(), out, false, normaliseWhite, trimLeading, trimTrailing);
}

void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {}
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/nodes/XmlDeclaration.java
Expand Up @@ -60,7 +60,7 @@ private void getWholeDeclaration(Appendable accum, Document.OutputSettings out)
accum.append(key);
if (!val.isEmpty()) {
accum.append("=\"");
Entities.escape(accum, val, out, true, false, false);
Entities.escape(accum, val, out, true, false, false, false);
accum.append('"');
}
}
Expand Down
6 changes: 3 additions & 3 deletions src/test/java/org/jsoup/nodes/DocumentTest.java
Expand Up @@ -55,15 +55,15 @@ public class DocumentTest {
@Test public void testOutputEncoding() {
Document doc = Jsoup.parse("<p title=π>π & < > </p>");
// default is utf-8
assertEquals("<p title=\"π\">π &amp; &lt; &gt; </p>", doc.body().html());
assertEquals("<p title=\"π\">π &amp; &lt; &gt;</p>", doc.body().html());
assertEquals("UTF-8", doc.outputSettings().charset().name());

doc.outputSettings().charset("ascii");
assertEquals(Entities.EscapeMode.base, doc.outputSettings().escapeMode());
assertEquals("<p title=\"&#x3c0;\">&#x3c0; &amp; &lt; &gt; </p>", doc.body().html());
assertEquals("<p title=\"&#x3c0;\">&#x3c0; &amp; &lt; &gt;</p>", doc.body().html());

doc.outputSettings().escapeMode(Entities.EscapeMode.extended);
assertEquals("<p title=\"&pi;\">&pi; &amp; &lt; &gt; </p>", doc.body().html());
assertEquals("<p title=\"&pi;\">&pi; &amp; &lt; &gt;</p>", doc.body().html());
}

@Test public void testXhtmlReferences() {
Expand Down
29 changes: 29 additions & 0 deletions src/test/java/org/jsoup/nodes/ElementTest.java
Expand Up @@ -15,6 +15,7 @@
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
Expand Down Expand Up @@ -625,6 +626,7 @@ public void testAddNewText() {
Document doc = Jsoup.parse("<div id=1><p>Hello</p></div>");
Element div = doc.getElementById("1");
div.appendText(" there & now >");
assertEquals ("Hello there & now >", div.text());
assertEquals("<p>Hello</p> there &amp; now &gt;", TextUtil.stripNewlines(div.html()));
}

Expand Down Expand Up @@ -2305,4 +2307,31 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) {
assertEquals("<!--\n comment \n -->\n<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc5.html());
assertEquals("<!--\n comment \n -->\n<!doctype html>\n<html>\n <head></head>\n <body></body>\n</html>", doc6.html());
}

@Test void textnodeInBlockIndent() {
String html ="<div>\n{{ msg }} \n </div>\n<div>\n{{ msg }} \n </div>";
Document doc = Jsoup.parse(html);
assertEquals("<div>\n {{ msg }}\n</div>\n<div>\n {{ msg }}\n</div>", doc.body().html());
}

@Test void stripTrailing() {
String html = "<p> This <span>is </span>fine. </p>";
Document doc = Jsoup.parse(html);
assertEquals("<p>This <span>is </span>fine.</p>", doc.body().html());
}

@Test void elementIndentAndSpaceTrims() {
String html = "<body><div> <p> One Two </p> <a> Hello </a><p>\nSome text \n</p>\n </div>";
Document doc = Jsoup.parse(html);
assertEquals("<div>\n" +
" <p>One Two</p> <a> Hello </a>\n" +
" <p>Some text</p>\n" +
"</div>", doc.body().html());
}

@Test void divAInlineable() {
String html = "<body><div> <a>Text</a>";
Document doc = Jsoup.parse(html);
assertEquals("<div><a>Text</a>\n</div>", doc.body().html());
}
}
19 changes: 10 additions & 9 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Expand Up @@ -158,7 +158,7 @@ public class HtmlParserTest {

@Test public void testSpaceAfterTag() {
Document doc = Jsoup.parse("<div > <a name=\"top\"></a ><p id=1 >Hello</p></div>");
assertEquals("<div> <a name=\"top\"></a><p id=\"1\">Hello</p></div>", TextUtil.stripNewlines(doc.body().html()));
assertEquals("<div><a name=\"top\"></a><p id=\"1\">Hello</p></div>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void createsDocumentStructure() {
Expand Down Expand Up @@ -279,7 +279,7 @@ public class HtmlParserTest {
@Test public void handlesWhatWgExpensesTableExample() {
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#examples-0
Document doc = Jsoup.parse("<table> <colgroup> <col> <colgroup> <col> <col> <col> <thead> <tr> <th> <th>2008 <th>2007 <th>2006 <tbody> <tr> <th scope=rowgroup> Research and development <td> $ 1,109 <td> $ 782 <td> $ 712 <tr> <th scope=row> Percentage of net sales <td> 3.4% <td> 3.3% <td> 3.7% <tbody> <tr> <th scope=rowgroup> Selling, general, and administrative <td> $ 3,761 <td> $ 2,963 <td> $ 2,433 <tr> <th scope=row> Percentage of net sales <td> 11.6% <td> 12.3% <td> 12.6% </table>");
assertEquals("<table><colgroup><col></colgroup><colgroup><col><col><col></colgroup><thead><tr><th></th><th>2008 </th><th>2007 </th><th>2006 </th></tr></thead><tbody><tr><th scope=\"rowgroup\"> Research and development </th><td> $ 1,109 </td><td> $ 782 </td><td> $ 712 </td></tr><tr><th scope=\"row\"> Percentage of net sales </th><td> 3.4% </td><td> 3.3% </td><td> 3.7% </td></tr></tbody><tbody><tr><th scope=\"rowgroup\"> Selling, general, and administrative </th><td> $ 3,761 </td><td> $ 2,963 </td><td> $ 2,433 </td></tr><tr><th scope=\"row\"> Percentage of net sales </th><td> 11.6% </td><td> 12.3% </td><td> 12.6% </td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
assertEquals("<table><colgroup><col></colgroup><colgroup><col><col><col></colgroup><thead><tr><th></th><th>2008</th><th>2007</th><th>2006</th></tr></thead><tbody><tr><th scope=\"rowgroup\">Research and development</th><td>$ 1,109</td><td>$ 782</td><td>$ 712</td></tr><tr><th scope=\"row\">Percentage of net sales</th><td>3.4%</td><td>3.3%</td><td>3.7%</td></tr></tbody><tbody><tr><th scope=\"rowgroup\">Selling, general, and administrative</th><td>$ 3,761</td><td>$ 2,963</td><td>$ 2,433</td></tr><tr><th scope=\"row\">Percentage of net sales</th><td>11.6%</td><td>12.3%</td><td>12.6%</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void handlesTbodyTable() {
Expand All @@ -294,7 +294,7 @@ public class HtmlParserTest {

@Test public void noTableDirectInTable() {
Document doc = Jsoup.parse("<table> <td>One <td><table><td>Two</table> <table><td>Three");
assertEquals("<table><tbody><tr><td>One </td><td><table><tbody><tr><td>Two</td></tr></tbody></table><table><tbody><tr><td>Three</td></tr></tbody></table></td></tr></tbody></table>",
assertEquals("<table><tbody><tr><td>One</td><td><table><tbody><tr><td>Two</td></tr></tbody></table><table><tbody><tr><td>Three</td></tr></tbody></table></td></tr></tbody></table>",
TextUtil.stripNewlines(doc.body().html()));
}

Expand Down Expand Up @@ -568,7 +568,7 @@ public class HtmlParserTest {
@Test public void normalisesDocument() {
String h = "<!doctype html>One<html>Two<head>Three<link></head>Four<body>Five </body>Six </html>Seven ";
Document doc = Jsoup.parse(h);
assertEquals("<!doctype html><html><head></head><body>OneTwoThree<link>FourFive Six Seven </body></html>",
assertEquals("<!doctype html><html><head></head><body>OneTwoThree<link>FourFive Six Seven</body></html>",
TextUtil.stripNewlines(doc.html()));
}

Expand Down Expand Up @@ -599,7 +599,7 @@ public class HtmlParserTest {
@Test public void testHgroup() {
// jsoup used to not allow hgroup in h{n}, but that's not in spec, and browsers are OK
Document doc = Jsoup.parse("<h1>Hello <h2>There <hgroup><h1>Another<h2>headline</hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup>");
assertEquals("<h1>Hello </h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup><hgroup><h1>More</h1><p>stuff</p></hgroup></h2>", TextUtil.stripNewlines(doc.body().html()));
assertEquals("<h1>Hello</h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup><hgroup><h1>More</h1><p>stuff</p></hgroup></h2>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void testRelaxedTags() {
Expand All @@ -611,7 +611,7 @@ public class HtmlParserTest {
// h* tags (h1 .. h9) in browsers can handle any internal content other than other h*. which is not per any
// spec, which defines them as containing phrasing content only. so, reality over theory.
Document doc = Jsoup.parse("<h1>Hello <div>There</div> now</h1> <h2>More <h3>Content</h3></h2>");
assertEquals("<h1>Hello <div>There</div> now</h1><h2>More </h2><h3>Content</h3>", TextUtil.stripNewlines(doc.body().html()));
assertEquals("<h1>Hello <div>There</div> now</h1><h2>More</h2><h3>Content</h3>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void testSpanContents() {
Expand Down Expand Up @@ -1175,7 +1175,8 @@ public void testInvalidTableContents() throws IOException {
Document doc = Parser.htmlParser()
.settings(preserveCase)
.parseInput(html, "");
assertEquals("<A>ONE </A><A>Two</A>", StringUtil.normaliseWhitespace(doc.body().html()));
//assertEquals("<A>ONE </A><A>Two</A>", StringUtil.normaliseWhitespace(doc.body().html()));
assertEquals("<A>ONE </A><A>Two</A>", doc.body().html());
}

@Test public void normalizesDiscordantTags() {
Expand Down Expand Up @@ -1246,7 +1247,7 @@ public void testInvalidTableContents() throws IOException {
File in = ParseTest.getFile("/htmltests/comments.html");
Document doc = Jsoup.parse(in, "UTF-8");

assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --> <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"> <!-- now --> <head> <!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1&gt; (There is a UTF8 hidden BOM at the top of this file.) </body> </html>",
assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --> <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"><!-- now --> <head><!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1&gt; (There is a UTF8 hidden BOM at the top of this file.) </body> </html>",
StringUtil.normaliseWhitespace(doc.html()));

assertEquals("A Certain Kind of Test", doc.head().select("title").text());
Expand Down Expand Up @@ -1486,7 +1487,7 @@ private boolean didAddElements(String input) {
String html = "<a>\n<b>\n<div>\n<a>test</a>\n</div>\n</b>\n</a>";
Document doc = Jsoup.parse(html);
assertNotNull(doc);
assertEquals("<a> <b> </b></a><b><div><a></a><a>test</a></div> </b>", TextUtil.stripNewlines(doc.body().html()));
assertEquals("<a> <b> </b></a><b><div><a> </a><a>test</a></div> </b>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void tagsMustStartWithAscii() {
Expand Down
10 changes: 5 additions & 5 deletions src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java
Expand Up @@ -75,9 +75,9 @@ public void nestedAnchorElements01() {
String s = Jsoup.parse(html).toString();
assertEquals("<html>\n" +
" <head></head>\n" +
" <body> <a href=\"#1\"> </a>\n" +
" <body><a href=\"#1\"> </a>\n" +
" <div>\n" +
" <a href=\"#1\"></a><a href=\"#2\">child</a>\n" +
" <a href=\"#1\"> </a><a href=\"#2\">child</a>\n" +
" </div>\n" +
" </body>\n" +
"</html>", s);
Expand All @@ -99,11 +99,11 @@ public void nestedAnchorElements02() {
String s = Jsoup.parse(html).toString();
assertEquals("<html>\n" +
" <head></head>\n" +
" <body> <a href=\"#1\"> </a>\n" +
" <body><a href=\"#1\"> </a>\n" +
" <div>\n" +
" <a href=\"#1\"></a>\n" +
" <a href=\"#1\"> </a>\n" +
" <div>\n" +
" <a href=\"#1\"></a><a href=\"#2\">child</a>\n" +
" <a href=\"#1\"> </a><a href=\"#2\">child</a>\n" +
" </div>\n" +
" </div>\n" +
" </body>\n" +
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/org/jsoup/select/TraversorTest.java
Expand Up @@ -95,7 +95,7 @@ public FilterResult tail(Node node, int depth) {
return ("b".equals(node.nodeName())) ? FilterResult.REMOVE : FilterResult.CONTINUE;
}
}, doc.select("div"));
assertEquals("<div></div>\n<div>\n There be \n</div>", doc.select("body").html());
assertEquals("<div></div>\n<div>\n There be\n</div>", doc.select("body").html());
}

@Test
Expand Down

0 comments on commit fc41ec9

Please sign in to comment.