From fc41ec98eaf2a9f803602c4d2dea6703edbdf4c7 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Fri, 24 Jun 2022 16:32:10 +1000 Subject: [PATCH] Trim leading and trailing spaces in blocks when appropriate Improved whitespace and indenting behaviour when pretty printing HTML. This corrects the indenting level of textnodes (that were indented in the original source), and improves the round-trip parse/output of pretty-printed HTML. Trims leading and trailing whitespace for block elements when that whitespace is not part of the text flow. As an implementation note: the pretty-printing code is getting too dispersed in the various node methods. Particularly, some aspects are not symmetric (e.g. elements and textnode indent, or head and tail). It would be a useful refactoring to make a pretty-print class and contain all this logic, and make the traversor use that and include a little more state (if last indented, if this is meaningful whitespace, etc). Fixes #1798 --- CHANGES | 6 +++- src/main/java/org/jsoup/nodes/Attribute.java | 2 +- src/main/java/org/jsoup/nodes/Entities.java | 17 ++++++--- src/main/java/org/jsoup/nodes/TextNode.java | 36 +++++++++---------- .../java/org/jsoup/nodes/XmlDeclaration.java | 2 +- .../java/org/jsoup/nodes/DocumentTest.java | 6 ++-- .../java/org/jsoup/nodes/ElementTest.java | 29 +++++++++++++++ .../java/org/jsoup/parser/HtmlParserTest.java | 19 +++++----- .../parser/HtmlTreeBuilderStateTest.java | 10 +++--- .../java/org/jsoup/select/TraversorTest.java | 2 +- src/test/resources/htmltests/medium.html | 4 +-- 11 files changed, 86 insertions(+), 47 deletions(-) diff --git a/CHANGES b/CHANGES index 28d545582e..33ed77eb49 100644 --- a/CHANGES +++ b/CHANGES @@ -12,9 +12,13 @@ jsoup changelog a null if there is no match, will throw an IllegalArgumentException. This is useful if you want to simply abort processing if an expected match is not found. - * Improvement: when pretty-printing HTML, doctypes are emitted on a newline if there is a preceeding comment. + * Improvement: when pretty-printing HTML, doctypes are emitted on a newline if there is a preceding comment. + * Improvement: when pretty-printing, trim the leading and trailing spaces of textnodes in block tags when possible, + so that they are indented correctly. + + * Bugfix: when using the readToByteBuffer method, such as in Connection.Response.body(), if the document has not already been parsed and must be read fully, and there is any maximum buffer size being applied, only the default internal buffer size is read. diff --git a/src/main/java/org/jsoup/nodes/Attribute.java b/src/main/java/org/jsoup/nodes/Attribute.java index 4106bd8a0f..56a34b49bf 100644 --- a/src/main/java/org/jsoup/nodes/Attribute.java +++ b/src/main/java/org/jsoup/nodes/Attribute.java @@ -142,7 +142,7 @@ static void htmlNoValidate(String key, @Nullable String val, Appendable accum, D accum.append(key); if (!shouldCollapseAttribute(key, val, out)) { accum.append("=\""); - Entities.escape(accum, Attributes.checkNotNull(val) , out, true, false, false); + Entities.escape(accum, Attributes.checkNotNull(val) , out, true, false, false, false); accum.append('"'); } } diff --git a/src/main/java/org/jsoup/nodes/Entities.java b/src/main/java/org/jsoup/nodes/Entities.java index 732bf34d1e..d330005d79 100644 --- a/src/main/java/org/jsoup/nodes/Entities.java +++ b/src/main/java/org/jsoup/nodes/Entities.java @@ -142,7 +142,7 @@ public static String escape(String string, OutputSettings out) { return ""; StringBuilder accum = StringUtil.borrowBuilder(); try { - escape(accum, string, out, false, false, false); + escape(accum, string, out, false, false, false, false); } catch (IOException e) { throw new SerializationException(e); // doesn't happen } @@ -160,9 +160,9 @@ public static String escape(String string) { return escape(string, DefaultOutput); } - // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations + // this method does a lot, but other breakups cause rescanning and stringbuilder generations static void escape(Appendable accum, String string, OutputSettings out, - boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) throws IOException { + boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException { boolean lastWasWhite = false; boolean reachedNonWhite = false; @@ -172,19 +172,28 @@ static void escape(Appendable accum, String string, OutputSettings out, final int length = string.length(); int codePoint; + boolean skipped = false; for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) { codePoint = string.codePointAt(offset); if (normaliseWhite) { if (StringUtil.isWhitespace(codePoint)) { - if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite) + if (stripLeadingWhite && !reachedNonWhite) continue; + if (lastWasWhite) continue; + if (trimTrailing) { + skipped = true; continue; + } accum.append(' '); lastWasWhite = true; continue; } else { lastWasWhite = false; reachedNonWhite = true; + if (skipped) { + accum.append(' '); // wasn't the end, so need to place a normalized space + skipped = false; + } } } // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]): diff --git a/src/main/java/org/jsoup/nodes/TextNode.java b/src/main/java/org/jsoup/nodes/TextNode.java index e32a0dbf13..4dbd116818 100644 --- a/src/main/java/org/jsoup/nodes/TextNode.java +++ b/src/main/java/org/jsoup/nodes/TextNode.java @@ -83,31 +83,27 @@ public TextNode splitText(int offset) { void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException { final boolean prettyPrint = out.prettyPrint(); final Element parent = parentNode instanceof Element ? ((Element) parentNode) : null; - final boolean blank = isBlank(); final boolean normaliseWhite = prettyPrint && !Element.preserveWhitespace(parentNode); - // if this text is just whitespace, and the next node will cause an indent, skip this text: - if (normaliseWhite && blank) { - boolean canSkip = false; + boolean trimLeading = false; + boolean trimTrailing = false; + if (normaliseWhite) { + trimLeading = (siblingIndex == 0 && parent != null && parent.tag().isBlock()) || + parentNode instanceof Document; + trimTrailing = nextSibling() == null && parent != null && parent.tag().isBlock(); + + // if this text is just whitespace, and the next node will cause an indent, skip this text: Node next = this.nextSibling(); - if (next instanceof Element) { - Element nextEl = (Element) next; - canSkip = nextEl.shouldIndent(out); - } else if (next == null && parent != null) { // we are the last child, check parent - canSkip = parent.shouldIndent(out); - } else if (next instanceof TextNode && (((TextNode) next).isBlank())) { - // sometimes get a run of textnodes from parser if nodes are re-parented - canSkip = true; - } - if (canSkip) - return; - } + boolean couldSkip = (next instanceof Element && ((Element) next).shouldIndent(out)) // next will indent + || (next instanceof TextNode && (((TextNode) next).isBlank())); // next is blank text, from re-parenting + if (couldSkip && isBlank()) return; - if (prettyPrint && ((siblingIndex == 0 && parent != null && parent.tag().formatAsBlock() && !blank) || (out.outline() && siblingNodes().size() > 0 && !blank))) - indent(accum, depth, out); + if ((siblingIndex == 0 && parent != null && parent.tag().formatAsBlock() && !isBlank()) || + (out.outline() && siblingNodes().size() > 0 && !isBlank())) + indent(accum, depth, out); + } - final boolean stripWhite = prettyPrint && parentNode instanceof Document; - Entities.escape(accum, coreValue(), out, false, normaliseWhite, stripWhite); + Entities.escape(accum, coreValue(), out, false, normaliseWhite, trimLeading, trimTrailing); } void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {} diff --git a/src/main/java/org/jsoup/nodes/XmlDeclaration.java b/src/main/java/org/jsoup/nodes/XmlDeclaration.java index 6ab9b16241..ca22eb7ff7 100644 --- a/src/main/java/org/jsoup/nodes/XmlDeclaration.java +++ b/src/main/java/org/jsoup/nodes/XmlDeclaration.java @@ -60,7 +60,7 @@ private void getWholeDeclaration(Appendable accum, Document.OutputSettings out) accum.append(key); if (!val.isEmpty()) { accum.append("=\""); - Entities.escape(accum, val, out, true, false, false); + Entities.escape(accum, val, out, true, false, false, false); accum.append('"'); } } diff --git a/src/test/java/org/jsoup/nodes/DocumentTest.java b/src/test/java/org/jsoup/nodes/DocumentTest.java index a490b8ea16..77a140aad7 100644 --- a/src/test/java/org/jsoup/nodes/DocumentTest.java +++ b/src/test/java/org/jsoup/nodes/DocumentTest.java @@ -55,15 +55,15 @@ public class DocumentTest { @Test public void testOutputEncoding() { Document doc = Jsoup.parse("

π & < >

"); // default is utf-8 - assertEquals("

π & < >

", doc.body().html()); + assertEquals("

π & < >

", doc.body().html()); assertEquals("UTF-8", doc.outputSettings().charset().name()); doc.outputSettings().charset("ascii"); assertEquals(Entities.EscapeMode.base, doc.outputSettings().escapeMode()); - assertEquals("

π & < >

", doc.body().html()); + assertEquals("

π & < >

", doc.body().html()); doc.outputSettings().escapeMode(Entities.EscapeMode.extended); - assertEquals("

π & < >

", doc.body().html()); + assertEquals("

π & < >

", doc.body().html()); } @Test public void testXhtmlReferences() { diff --git a/src/test/java/org/jsoup/nodes/ElementTest.java b/src/test/java/org/jsoup/nodes/ElementTest.java index b2701e8441..1f92eb11d1 100644 --- a/src/test/java/org/jsoup/nodes/ElementTest.java +++ b/src/test/java/org/jsoup/nodes/ElementTest.java @@ -15,6 +15,7 @@ import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; +import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; @@ -625,6 +626,7 @@ public void testAddNewText() { Document doc = Jsoup.parse("

Hello

"); Element div = doc.getElementById("1"); div.appendText(" there & now >"); + assertEquals ("Hello there & now >", div.text()); assertEquals("

Hello

there & now >", TextUtil.stripNewlines(div.html())); } @@ -2305,4 +2307,31 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) { assertEquals("\n\n\n \n \n", doc5.html()); assertEquals("\n\n\n \n \n", doc6.html()); } + + @Test void textnodeInBlockIndent() { + String html ="
\n{{ msg }} \n
\n
\n{{ msg }} \n
"; + Document doc = Jsoup.parse(html); + assertEquals("
\n {{ msg }}\n
\n
\n {{ msg }}\n
", doc.body().html()); + } + + @Test void stripTrailing() { + String html = "

This is fine.

"; + Document doc = Jsoup.parse(html); + assertEquals("

This is fine.

", doc.body().html()); + } + + @Test void elementIndentAndSpaceTrims() { + String html = "

One Two

Hello

\nSome text \n

\n
"; + Document doc = Jsoup.parse(html); + assertEquals("
\n" + + "

One Two

Hello \n" + + "

Some text

\n" + + "
", doc.body().html()); + } + + @Test void divAInlineable() { + String html = "
Text"; + Document doc = Jsoup.parse(html); + assertEquals("
Text\n
", doc.body().html()); + } } \ No newline at end of file diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index dd1ecc36e4..48663a6371 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -158,7 +158,7 @@ public class HtmlParserTest { @Test public void testSpaceAfterTag() { Document doc = Jsoup.parse("

Hello

"); - assertEquals("

Hello

", TextUtil.stripNewlines(doc.body().html())); + assertEquals("

Hello

", TextUtil.stripNewlines(doc.body().html())); } @Test public void createsDocumentStructure() { @@ -279,7 +279,7 @@ public class HtmlParserTest { @Test public void handlesWhatWgExpensesTableExample() { // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#examples-0 Document doc = Jsoup.parse("
2008 2007 2006
Research and development $ 1,109 $ 782 $ 712
Percentage of net sales 3.4% 3.3% 3.7%
Selling, general, and administrative $ 3,761 $ 2,963 $ 2,433
Percentage of net sales 11.6% 12.3% 12.6%
"); - assertEquals("
2008 2007 2006
Research and development $ 1,109 $ 782 $ 712
Percentage of net sales 3.4% 3.3% 3.7%
Selling, general, and administrative $ 3,761 $ 2,963 $ 2,433
Percentage of net sales 11.6% 12.3% 12.6%
", TextUtil.stripNewlines(doc.body().html())); + assertEquals("
200820072006
Research and development$ 1,109$ 782$ 712
Percentage of net sales3.4%3.3%3.7%
Selling, general, and administrative$ 3,761$ 2,963$ 2,433
Percentage of net sales11.6%12.3%12.6%
", TextUtil.stripNewlines(doc.body().html())); } @Test public void handlesTbodyTable() { @@ -294,7 +294,7 @@ public class HtmlParserTest { @Test public void noTableDirectInTable() { Document doc = Jsoup.parse("
One
Two
Three"); - assertEquals("
One
Two
Three
", + assertEquals("
One
Two
Three
", TextUtil.stripNewlines(doc.body().html())); } @@ -568,7 +568,7 @@ public class HtmlParserTest { @Test public void normalisesDocument() { String h = "OneTwoThreeFourFive Six Seven "; Document doc = Jsoup.parse(h); - assertEquals("OneTwoThreeFourFive Six Seven ", + assertEquals("OneTwoThreeFourFive Six Seven", TextUtil.stripNewlines(doc.html())); } @@ -599,7 +599,7 @@ public class HtmlParserTest { @Test public void testHgroup() { // jsoup used to not allow hgroup in h{n}, but that's not in spec, and browsers are OK Document doc = Jsoup.parse("

Hello

There

Another

headline

More

stuff

"); - assertEquals("

Hello

There

Another

headline

More

stuff

", TextUtil.stripNewlines(doc.body().html())); + assertEquals("

Hello

There

Another

headline

More

stuff

", TextUtil.stripNewlines(doc.body().html())); } @Test public void testRelaxedTags() { @@ -611,7 +611,7 @@ public class HtmlParserTest { // h* tags (h1 .. h9) in browsers can handle any internal content other than other h*. which is not per any // spec, which defines them as containing phrasing content only. so, reality over theory. Document doc = Jsoup.parse("

Hello
There
now

More

Content

"); - assertEquals("

Hello
There
now

More

Content

", TextUtil.stripNewlines(doc.body().html())); + assertEquals("

Hello
There
now

More

Content

", TextUtil.stripNewlines(doc.body().html())); } @Test public void testSpanContents() { @@ -1175,7 +1175,8 @@ public void testInvalidTableContents() throws IOException { Document doc = Parser.htmlParser() .settings(preserveCase) .parseInput(html, ""); - assertEquals("ONE Two", StringUtil.normaliseWhitespace(doc.body().html())); + //assertEquals("ONE Two", StringUtil.normaliseWhitespace(doc.body().html())); + assertEquals("ONE Two", doc.body().html()); } @Test public void normalizesDiscordantTags() { @@ -1246,7 +1247,7 @@ public void testInvalidTableContents() throws IOException { File in = ParseTest.getFile("/htmltests/comments.html"); Document doc = Jsoup.parse(in, "UTF-8"); - assertEquals(" A Certain Kind of Test

Hello

h1> (There is a UTF8 hidden BOM at the top of this file.) ", + assertEquals(" A Certain Kind of Test

Hello

h1> (There is a UTF8 hidden BOM at the top of this file.) ", StringUtil.normaliseWhitespace(doc.html())); assertEquals("A Certain Kind of Test", doc.head().select("title").text()); @@ -1486,7 +1487,7 @@ private boolean didAddElements(String input) { String html = "\n\n\n\n"; Document doc = Jsoup.parse(html); assertNotNull(doc); - assertEquals(" ", TextUtil.stripNewlines(doc.body().html())); + assertEquals(" ", TextUtil.stripNewlines(doc.body().html())); } @Test public void tagsMustStartWithAscii() { diff --git a/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java b/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java index c5dbbfc169..f05ee26897 100644 --- a/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java +++ b/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java @@ -75,9 +75,9 @@ public void nestedAnchorElements01() { String s = Jsoup.parse(html).toString(); assertEquals("\n" + " \n" + - " \n" + + " \n" + "
\n" + - " child\n" + + " child\n" + "
\n" + " \n" + "", s); @@ -99,11 +99,11 @@ public void nestedAnchorElements02() { String s = Jsoup.parse(html).toString(); assertEquals("\n" + " \n" + - " \n" + + " \n" + "
\n" + - " \n" + + " \n" + "
\n" + - " child\n" + + " child\n" + "
\n" + "
\n" + " \n" + diff --git a/src/test/java/org/jsoup/select/TraversorTest.java b/src/test/java/org/jsoup/select/TraversorTest.java index 8d0667e648..11a5167d61 100644 --- a/src/test/java/org/jsoup/select/TraversorTest.java +++ b/src/test/java/org/jsoup/select/TraversorTest.java @@ -95,7 +95,7 @@ public FilterResult tail(Node node, int depth) { return ("b".equals(node.nodeName())) ? FilterResult.REMOVE : FilterResult.CONTINUE; } }, doc.select("div")); - assertEquals("
\n
\n There be \n
", doc.select("body").html()); + assertEquals("
\n
\n There be\n
", doc.select("body").html()); } @Test diff --git a/src/test/resources/htmltests/medium.html b/src/test/resources/htmltests/medium.html index 1f90df242e..728961cb52 100644 --- a/src/test/resources/htmltests/medium.html +++ b/src/test/resources/htmltests/medium.html @@ -1,10 +1,10 @@ - Large HTML + Medium HTML

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ante dapibus diam. Sed nisi. Nulla quis sem at nibh elementum imperdiet. Duis sagittis ipsum. Praesent mauris. Fusce nec tellus sed augue semper porta. Mauris massa. Vestibulum lacinia arcu eget nulla. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Curabitur sodales ligula in libero.

-

Sed dignissim lacinia nunc. Curabitur tortor. Pellentesque nibh. Aenean quam. In scelerisque sem at dolor. Maecenas mattis. Sed convallis tristique sem. Proin ut ligula vel nunc egestas porttitor. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Morbi lectus risus, iaculis vel, suscipit quis, luctus non, massa. Fusce ac turpis quis ligula lacinia aliquet. Mauris ipsum. Aenean quam. Nulla metus metus, ullamcorper vel, tincidunt sed, euismod in, nibh.

+

Sed dignissim lacinia nunc. Curabitur tortor. Pellentesque nibh. Aenean quam. In scelerisque sem at dolor. Maecenas mattis. Sed convallis tristique sem. Proin ut ligula vel nunc egestas porttitor. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Morbi lectus risus, iaculis vel, suscipit quis, luctus non, massa. Fusce ac turpis quis ligula lacinia aliquet. Mauris ipsum. Aenean quam. Nulla metus metus, ullamcorper vel, tincidunt sed, euismod in, nibh.

Quisque volutpat condimentum velit. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Nam nec ante. Sed lacinia, urna non tincidunt mattis, tortor neque adipiscing diam, a cursus ipsum ante quis turpis. Nulla facilisi. Ut fringilla. Suspendisse potenti. Nunc feugiat mi a tellus consequat imperdiet. Vestibulum sapien. Proin quam. Sed lacinia, urna non tincidunt mattis, tortor neque adipiscing diam, a cursus ipsum ante quis turpis. Etiam ultrices. Suspendisse in justo eu magna luctus suscipit. Sed lectus. Integer euismod lacus luctus magna.

Quisque cursus, metus vitae pharetra auctor, sem massa mattis sem, at interdum magna augue eget diam. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Morbi lacinia molestie dui. Praesent blandit dolor. Sed non quam. In vel mi sit amet augue congue elementum. Morbi in ipsum sit amet pede facilisis laoreet. Suspendisse in justo eu magna luctus suscipit. Donec lacus nunc, viverra nec, blandit vel, egestas et, augue. Vestibulum tincidunt malesuada tellus. Ut ultrices ultrices enim. Curabitur sit amet mauris. Morbi in dui quis est pulvinar ullamcorper. Nulla facilisi.

Integer lacinia sollicitudin massa. Quisque cursus, metus vitae pharetra auctor, sem massa mattis sem, at interdum magna augue eget diam. Cras metus. Sed aliquet risus a tortor. Integer id quam. Morbi mi. Morbi in dui quis est pulvinar ullamcorper. Quisque nisl felis, venenatis tristique, dignissim in, ultrices sit amet, augue. Proin sodales libero eget ante. Nulla quam. Aenean laoreet. Vestibulum nisi lectus, commodo ac, facilisis ac, ultricies eu, pede. Ut orci risus, accumsan porttitor, cursus quis, aliquet eget, justo. Sed pretium blandit orci. Quisque nisl felis, venenatis tristique, dignissim in, ultrices sit amet, augue. Ut eu diam at pede suscipit sodales. Aenean lectus elit, fermentum non, convallis id, sagittis at, neque.