Skip to content

Commit

Permalink
Improved newline and whitespace normalization
Browse files Browse the repository at this point in the history
Fixes #1787

In TextNode, if this is a blank, check the next Element if it will indent. If so, can skip. Previously would check if the parent el would indent, which is wrong for inline elements.

Also made empty tags (like <img>) not indent, but inline.

And fixed up how whitespace is normalized at the end of an element, and after the body tag.
  • Loading branch information
jhy committed Jun 17, 2022
1 parent 7c21b2e commit e714ef1
Show file tree
Hide file tree
Showing 13 changed files with 97 additions and 52 deletions.
5 changes: 5 additions & 0 deletions CHANGES
Expand Up @@ -21,6 +21,11 @@ jsoup changelog
what should have been preformatted output to instead be a run of text.
<https://github.com/jhy/jsoup/issues/1776>

* Bugfix: when pretty-print serializing HTML, newlines separating phrasing content (e.g. a <span> tag within a <p> tag
would be incorrectly skipped, instead of normalized to a space. Additionally, improved space normalization between
other end of line occurences, and whitespace handling after a closing </body>
<https://github.com/jhy/jsoup/issues/1787>

*** Release 1.15.1 [2022-May-15]
* Change: removed previously deprecated methods and classes (including org.jsoup.safety.Whitelist; use
org.jsoup.safety.Safelist instead).
Expand Down
1 change: 0 additions & 1 deletion src/main/java/org/jsoup/nodes/Element.java
Expand Up @@ -1842,7 +1842,6 @@ private boolean isFormatAsBlock(Document.OutputSettings out) {

private boolean isInlineable(Document.OutputSettings out) {
return tag().isInline()
&& !tag().isEmpty()
&& (parent() == null || parent().isBlock())
&& previousSibling() != null
&& !out.outline();
Expand Down
27 changes: 20 additions & 7 deletions src/main/java/org/jsoup/nodes/TextNode.java
@@ -1,7 +1,7 @@
package org.jsoup.nodes;

import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.internal.StringUtil;

import java.io.IOException;

Expand Down Expand Up @@ -80,17 +80,30 @@ public TextNode splitText(int offset) {
return tailNode;
}

void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
final boolean prettyPrint = out.prettyPrint();
final Element parent = parentNode instanceof Element ? ((Element) parentNode) : null;
final boolean parentIndent = parent != null && parent.shouldIndent(out);
final boolean blank = isBlank();
final boolean normaliseWhite = prettyPrint && !Element.preserveWhitespace(parentNode);

if (normaliseWhite && parentIndent && StringUtil.startsWithNewline(coreValue()) && blank) // we are skippable whitespace
return;

if (prettyPrint && ((siblingIndex == 0 && parent != null && parent.tag().formatAsBlock() && !blank) || (out.outline() && siblingNodes().size()>0 && !blank) ))
// if this text is just whitespace, and the next node will cause an indent, skip this text:
if (normaliseWhite && blank) {
boolean canSkip = false;
Node next = this.nextSibling();
if (next instanceof Element) {
Element nextEl = (Element) next;
canSkip = nextEl.shouldIndent(out);
} else if (next == null && parent != null) { // we are the last child, check parent
canSkip = parent.shouldIndent(out);
} else if (next instanceof TextNode && (((TextNode) next).isBlank())) {
// sometimes get a run of textnodes from parser if nodes are re-parented
canSkip = true;
}
if (canSkip)
return;
}

if (prettyPrint && ((siblingIndex == 0 && parent != null && parent.tag().formatAsBlock() && !blank) || (out.outline() && siblingNodes().size() > 0 && !blank)))
indent(accum, depth, out);

final boolean stripWhite = prettyPrint && parentNode instanceof Document;
Expand Down
8 changes: 8 additions & 0 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Expand Up @@ -558,6 +558,14 @@ boolean resetInsertionMode() {
return state != origState;
}

/** Places the body back onto the stack and moves to InBody, for cases in AfterBody / AfterAfterBody when more content comes */
void resetBody() {
if (!onStack("body")) {
stack.add(doc.body());
}
transition(HtmlTreeBuilderState.InBody);
}

// todo: tidy up in specific scope methods
private String[] specificScopeTarget = {null};

Expand Down
15 changes: 4 additions & 11 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
Expand Up @@ -718,6 +718,7 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) {
return false;
} else {
// todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html
anyOtherEndTag(t, tb);
tb.transition(AfterBody);
}
break;
Expand Down Expand Up @@ -1597,13 +1598,14 @@ boolean process(Token t, HtmlTreeBuilder tb) {
tb.error(this);
return false;
} else {
if (tb.onStack("html")) tb.popStackToClose("html");
tb.transition(AfterAfterBody);
}
} else if (t.isEOF()) {
// chillax! we're done
} else {
tb.error(this);
tb.transition(InBody);
tb.resetBody();
return tb.process(t);
}
return true;
Expand Down Expand Up @@ -1688,21 +1690,12 @@ boolean process(Token t, HtmlTreeBuilder tb) {
} else if (t.isDoctype() || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) {
return tb.process(t, InBody);
} else if (isWhitespace(t)) {
// allows space after </html>, and put the body back on stack to allow subsequent tags if any
// todo - might be better for </body> and </html> to close them, allow trailing space, and then reparent
// that space into body if other tags get re-added. but that's overkill for now
Element html = tb.popStackToClose("html");
tb.insert(t.asCharacter());
if (html != null) {
tb.stack.add(html);
Element body = html.selectFirst("body");
if (body != null) tb.stack.add(body);
}
}else if (t.isEOF()) {
// nice work chuck
} else {
tb.error(this);
tb.transition(InBody);
tb.resetBody();
return tb.process(t);
}
return true;
Expand Down
14 changes: 10 additions & 4 deletions src/test/java/org/jsoup/nodes/DocumentTest.java
Expand Up @@ -109,11 +109,17 @@ public class DocumentTest {
Document doc = Jsoup.parse("<title>Hello</title> <p>One<p>Two");
Document clone = doc.clone();

assertEquals("<html><head><title>Hello</title> </head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(clone.html()));
assertEquals("<html><head><title>Hello</title></head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(clone.html()));
clone.title("Hello there");
clone.select("p").first().text("One more").attr("id", "1");
assertEquals("<html><head><title>Hello there</title> </head><body><p id=\"1\">One more</p><p>Two</p></body></html>", TextUtil.stripNewlines(clone.html()));
assertEquals("<html><head><title>Hello</title> </head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(doc.html()));
clone.expectFirst("p").text("One more").attr("id", "1");
assertEquals("<html><head><title>Hello there</title></head><body><p id=\"1\">One more</p><p>Two</p></body></html>", TextUtil.stripNewlines(clone.html()));
assertEquals("<html><head><title>Hello</title></head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(doc.html()));
}

@Test void testBasicIndent() {
Document doc = Jsoup.parse("<title>Hello</title> <p>One<p>Two");
String expect = "<html>\n <head>\n <title>Hello</title>\n </head>\n <body>\n <p>One</p>\n <p>Two</p>\n </body>\n</html>";
assertEquals(expect, doc.html());
}

@Test public void testClonesDeclarations() {
Expand Down
17 changes: 16 additions & 1 deletion src/test/java/org/jsoup/nodes/ElementTest.java
Expand Up @@ -536,7 +536,7 @@ public void testContainerOutput() {
Document doc = Jsoup.parse("<title>Hello there</title> <div><p>Hello</p><p>there</p></div> <div>Another</div>");
assertEquals("<title>Hello there</title>", doc.select("title").first().outerHtml());
assertEquals("<div>\n <p>Hello</p>\n <p>there</p>\n</div>", doc.select("div").first().outerHtml());
assertEquals("<div>\n <p>Hello</p>\n <p>there</p>\n</div> \n<div>\n Another\n</div>", doc.select("body").first().html());
assertEquals("<div>\n <p>Hello</p>\n <p>there</p>\n</div>\n<div>\n Another\n</div>", doc.select("body").first().html());
}

@Test
Expand Down Expand Up @@ -2269,4 +2269,19 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) {
}
assertTrue(threw);
}

@Test void spanRunsMaintainSpace() {
// https://github.com/jhy/jsoup/issues/1787
Document doc = Jsoup.parse("<p><span>One</span>\n<span>Two</span>\n<span>Three</span></p>");
String text = "One Two Three";
Element body = doc.body();
assertEquals(text, body.text());

Element p = doc.expectFirst("p");
String html = p.html();
p.html(html);
assertEquals(text, body.text());

assertEquals("<p><span>One</span> <span>Two</span> <span>Three</span></p>", body.html());
}
}
23 changes: 11 additions & 12 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Expand Up @@ -273,13 +273,13 @@ public class HtmlParserTest {

@Test public void handlesNestedImplicitTable() {
Document doc = Jsoup.parse("<table><td>1</td></tr> <td>2</td></tr> <td> <table><td>3</td> <td>4</td></table> <tr><td>5</table>");
assertEquals("<table><tbody><tr><td>1</td></tr> <tr><td>2</td></tr> <tr><td> <table><tbody><tr><td>3</td> <td>4</td></tr></tbody></table> </td></tr><tr><td>5</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
assertEquals("<table><tbody><tr><td>1</td></tr><tr><td>2</td></tr><tr><td><table><tbody><tr><td>3</td><td>4</td></tr></tbody></table></td></tr><tr><td>5</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void handlesWhatWgExpensesTableExample() {
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#examples-0
Document doc = Jsoup.parse("<table> <colgroup> <col> <colgroup> <col> <col> <col> <thead> <tr> <th> <th>2008 <th>2007 <th>2006 <tbody> <tr> <th scope=rowgroup> Research and development <td> $ 1,109 <td> $ 782 <td> $ 712 <tr> <th scope=row> Percentage of net sales <td> 3.4% <td> 3.3% <td> 3.7% <tbody> <tr> <th scope=rowgroup> Selling, general, and administrative <td> $ 3,761 <td> $ 2,963 <td> $ 2,433 <tr> <th scope=row> Percentage of net sales <td> 11.6% <td> 12.3% <td> 12.6% </table>");
assertEquals("<table> <colgroup> <col> </colgroup><colgroup> <col> <col> <col> </colgroup><thead> <tr> <th> </th><th>2008 </th><th>2007 </th><th>2006 </th></tr></thead><tbody> <tr> <th scope=\"rowgroup\"> Research and development </th><td> $ 1,109 </td><td> $ 782 </td><td> $ 712 </td></tr><tr> <th scope=\"row\"> Percentage of net sales </th><td> 3.4% </td><td> 3.3% </td><td> 3.7% </td></tr></tbody><tbody> <tr> <th scope=\"rowgroup\"> Selling, general, and administrative </th><td> $ 3,761 </td><td> $ 2,963 </td><td> $ 2,433 </td></tr><tr> <th scope=\"row\"> Percentage of net sales </th><td> 11.6% </td><td> 12.3% </td><td> 12.6% </td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
assertEquals("<table><colgroup><col></colgroup><colgroup><col><col><col></colgroup><thead><tr><th></th><th>2008 </th><th>2007 </th><th>2006 </th></tr></thead><tbody><tr><th scope=\"rowgroup\"> Research and development </th><td> $ 1,109 </td><td> $ 782 </td><td> $ 712 </td></tr><tr><th scope=\"row\"> Percentage of net sales </th><td> 3.4% </td><td> 3.3% </td><td> 3.7% </td></tr></tbody><tbody><tr><th scope=\"rowgroup\"> Selling, general, and administrative </th><td> $ 3,761 </td><td> $ 2,963 </td><td> $ 2,433 </td></tr><tr><th scope=\"row\"> Percentage of net sales </th><td> 11.6% </td><td> 12.3% </td><td> 12.6% </td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void handlesTbodyTable() {
Expand All @@ -294,7 +294,7 @@ public class HtmlParserTest {

@Test public void noTableDirectInTable() {
Document doc = Jsoup.parse("<table> <td>One <td><table><td>Two</table> <table><td>Three");
assertEquals("<table> <tbody><tr><td>One </td><td><table><tbody><tr><td>Two</td></tr></tbody></table> <table><tbody><tr><td>Three</td></tr></tbody></table></td></tr></tbody></table>",
assertEquals("<table><tbody><tr><td>One </td><td><table><tbody><tr><td>Two</td></tr></tbody></table><table><tbody><tr><td>Three</td></tr></tbody></table></td></tr></tbody></table>",
TextUtil.stripNewlines(doc.body().html()));
}

Expand Down Expand Up @@ -472,7 +472,7 @@ public class HtmlParserTest {
// if a known tag, allow self closing outside of spec, but force an end tag. unknown tags can be self closing.
String h = "<div id='1' /><script src='/foo' /><div id=2><img /><img></div><a id=3 /><i /><foo /><foo>One</foo> <hr /> hr text <hr> hr text two";
Document doc = Jsoup.parse(h);
assertEquals("<div id=\"1\"></div><script src=\"/foo\"></script><div id=\"2\"><img><img></div><a id=\"3\"></a><i></i><foo /><foo>One</foo> <hr> hr text <hr> hr text two", TextUtil.stripNewlines(doc.body().html()));
assertEquals("<div id=\"1\"></div><script src=\"/foo\"></script><div id=\"2\"><img><img></div><a id=\"3\"></a><i></i><foo /><foo>One</foo><hr> hr text <hr> hr text two", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void handlesKnownEmptyNoFrames() {
Expand Down Expand Up @@ -599,7 +599,7 @@ public class HtmlParserTest {
@Test public void testHgroup() {
// jsoup used to not allow hgroup in h{n}, but that's not in spec, and browsers are OK
Document doc = Jsoup.parse("<h1>Hello <h2>There <hgroup><h1>Another<h2>headline</hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup>");
assertEquals("<h1>Hello </h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup></h2>", TextUtil.stripNewlines(doc.body().html()));
assertEquals("<h1>Hello </h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup><hgroup><h1>More</h1><p>stuff</p></hgroup></h2>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void testRelaxedTags() {
Expand All @@ -611,7 +611,7 @@ public class HtmlParserTest {
// h* tags (h1 .. h9) in browsers can handle any internal content other than other h*. which is not per any
// spec, which defines them as containing phrasing content only. so, reality over theory.
Document doc = Jsoup.parse("<h1>Hello <div>There</div> now</h1> <h2>More <h3>Content</h3></h2>");
assertEquals("<h1>Hello <div>There</div> now</h1> <h2>More </h2><h3>Content</h3>", TextUtil.stripNewlines(doc.body().html()));
assertEquals("<h1>Hello <div>There</div> now</h1><h2>More </h2><h3>Content</h3>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void testSpanContents() {
Expand Down Expand Up @@ -720,7 +720,7 @@ public class HtmlParserTest {
// and the <i> inside the table and does not leak out.
String h = "<p><b>One</p> <table><tr><td><p><i>Three<p>Four</i></td></tr></table> <p>Five</p>";
Document doc = Jsoup.parse(h);
String want = "<p><b>One</b></p><b> \n" +
String want = "<p><b>One</b></p><b>\n" +
" <table>\n" +
" <tbody>\n" +
" <tr>\n" +
Expand Down Expand Up @@ -1246,7 +1246,7 @@ public void testInvalidTableContents() throws IOException {
File in = ParseTest.getFile("/htmltests/comments.html");
Document doc = Jsoup.parse(in, "UTF-8");

assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --><!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"><!-- now --> <head><!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1&gt; (There is a UTF8 hidden BOM at the top of this file.) </body> </html>",
assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --><!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"> <!-- now --> <head> <!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1&gt; (There is a UTF8 hidden BOM at the top of this file.) </body> </html>",
StringUtil.normaliseWhitespace(doc.html()));

assertEquals("A Certain Kind of Test", doc.head().select("title").text());
Expand Down Expand Up @@ -1399,15 +1399,14 @@ public void testUNewlines() {
String html = "\n<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n</body>\n</html>\n";
Document doc = Jsoup.parse(html);
doc.outputSettings().prettyPrint(false);
assertEquals("<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n\n</body></html>\n", doc.outerHtml());
assertEquals("<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n</body>\n</html>\n", doc.outerHtml());
}

@Test public void handleContentAfterBody() {
String html = "<body>One</body> <p>Hello!</p></html> <p>There</p>";
// todo - ideally would move that space afer /html to the body when the There <p> is seen
Document doc = Jsoup.parse(html);
doc.outputSettings().prettyPrint(false);
assertEquals("<html><head></head><body>One <p>Hello!</p><p>There</p></body></html> ", doc.outerHtml());
assertEquals("<html><head></head><body>One<p>Hello!</p><p>There</p></body> </html> ", doc.outerHtml());
}

@Test public void preservesTabs() {
Expand Down Expand Up @@ -1487,7 +1486,7 @@ private boolean didAddElements(String input) {
String html = "<a>\n<b>\n<div>\n<a>test</a>\n</div>\n</b>\n</a>";
Document doc = Jsoup.parse(html);
assertNotNull(doc);
assertEquals("<a><b> </b></a><b><div><a></a><a>test</a></div> </b>", TextUtil.stripNewlines(doc.body().html()));
assertEquals("<a> <b> </b></a><b><div><a></a><a>test</a></div> </b>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void tagsMustStartWithAscii() {
Expand Down
4 changes: 2 additions & 2 deletions src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java
Expand Up @@ -75,7 +75,7 @@ public void nestedAnchorElements01() {
String s = Jsoup.parse(html).toString();
assertEquals("<html>\n" +
" <head></head>\n" +
" <body><a href=\"#1\"> </a>\n" +
" <body> <a href=\"#1\"> </a>\n" +
" <div>\n" +
" <a href=\"#1\"></a><a href=\"#2\">child</a>\n" +
" </div>\n" +
Expand All @@ -99,7 +99,7 @@ public void nestedAnchorElements02() {
String s = Jsoup.parse(html).toString();
assertEquals("<html>\n" +
" <head></head>\n" +
" <body><a href=\"#1\"> </a>\n" +
" <body> <a href=\"#1\"> </a>\n" +
" <div>\n" +
" <a href=\"#1\"></a>\n" +
" <div>\n" +
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
Expand Up @@ -94,7 +94,7 @@ public void testSupplyParserToDataStream() throws IOException, URISyntaxExceptio
public void testDoesNotForceSelfClosingKnownTags() {
// html will force "<br>one</br>" to logically "<br />One<br />". XML should be stay "<br>one</br> -- don't recognise tag.
Document htmlDoc = Jsoup.parse("<br>one</br>");
assertEquals("<br>one\n<br>", htmlDoc.body().html());
assertEquals("<br>one<br>", htmlDoc.body().html());

Document xmlDoc = Jsoup.parse("<br>one</br>", "", Parser.xmlParser());
assertEquals("<br>one</br>", xmlDoc.html());
Expand Down

0 comments on commit e714ef1

Please sign in to comment.