diff --git a/CHANGES b/CHANGES index 073e7e5762..e52bf92734 100644 --- a/CHANGES +++ b/CHANGES @@ -12,6 +12,11 @@ jsoup changelog useful when elements can only be distinguished by e.g. specific case, or leading whitespace, etc. + * Improvement: added Element#wholeOwnText() to retrieve the original (non-normalized) ownText of an Element. Also + added the :containsWholeOwnText(text) selector, to match against that. BR elements are now treated as newlines + in the wholeText methods. + + * Improvement: when evaluating an XPath query against a context element, the complete document is now visible to the query, vs only the context element's sub-tree. This enables support for queries outside (parent or sibling) the element, e.g. ancestor-or-self::*. diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index 8d30a10dbb..f96582f490 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -1285,19 +1285,44 @@ public String wholeText() { final StringBuilder accum = StringUtil.borrowBuilder(); NodeTraversor.traverse(new NodeVisitor() { public void head(Node node, int depth) { - if (node instanceof TextNode) { - TextNode textNode = (TextNode) node; - accum.append(textNode.getWholeText()); - } + appendWholeText(node, accum); } - public void tail(Node node, int depth) { - } + public void tail(Node node, int depth) {} }, this); return StringUtil.releaseBuilder(accum); } + private static void appendWholeText(Node node, StringBuilder accum) { + if (node instanceof TextNode) { + accum.append(((TextNode) node).getWholeText()); + } else if (node instanceof Element) { + appendNewlineIfBr((Element) node, accum); + } + } + + /** + Get the (unencoded) text of this element, not including any child elements, including any newlines and spaces + present in the original. 
+ + @return unencoded, un-normalized text that is a direct child of this Element + @see #text() + @see #wholeText() + @see #ownText() + @since 1.15.1 + */ + public String wholeOwnText() { + final StringBuilder accum = StringUtil.borrowBuilder(); + final int size = childNodeSize(); + for (int i = 0; i < size; i++) { + Node node = childNodes.get(i); + appendWholeText(node, accum); + } + + return StringUtil.releaseBuilder(accum); + } + /** * Gets the (normalized) text owned by this element only; does not get the combined text of all children. *

@@ -1336,11 +1361,18 @@ private static void appendNormalisedText(StringBuilder accum, TextNode textNode) StringUtil.appendNormalisedWhitespace(accum, text, TextNode.lastCharIsWhitespace(accum)); } + /** For normalized text, treat a br element as a space, if there is not already a space. */ private static void appendWhitespaceIfBr(Element element, StringBuilder accum) { if (element.tag.normalName().equals("br") && !TextNode.lastCharIsWhitespace(accum)) accum.append(" "); } + /** For WholeText, treat a br element as a newline. */ + private static void appendNewlineIfBr(Element element, StringBuilder accum) { + if (element.tag.normalName().equals("br")) + accum.append("\n"); + } + static boolean preserveWhitespace(@Nullable Node node) { // looks only at this element and five levels up, to prevent recursion & needless stack searches if (node instanceof Element) { diff --git a/src/main/java/org/jsoup/select/Evaluator.java b/src/main/java/org/jsoup/select/Evaluator.java index 8c319f33bf..a73b7c6893 100644 --- a/src/main/java/org/jsoup/select/Evaluator.java +++ b/src/main/java/org/jsoup/select/Evaluator.java @@ -704,6 +704,29 @@ public String toString() { } } + /** + * Evaluator for matching Element (but not its descendants) wholeText. Neither the input nor the element text is + * normalized. :containsWholeOwnText() + * @since 1.15.1. 
+ */ + public static final class ContainsWholeOwnText extends Evaluator { + private final String searchText; + + public ContainsWholeOwnText(String searchText) { + this.searchText = searchText; + } + + @Override + public boolean matches(Element root, Element element) { + return element.wholeOwnText().contains(searchText); + } + + @Override + public String toString() { + return String.format(":containsWholeOwnText(%s)", searchText); + } + } + /** * Evaluator for matching Element (and its descendants) data */ diff --git a/src/main/java/org/jsoup/select/QueryParser.java b/src/main/java/org/jsoup/select/QueryParser.java index 4611624151..5139493c29 100644 --- a/src/main/java/org/jsoup/select/QueryParser.java +++ b/src/main/java/org/jsoup/select/QueryParser.java @@ -182,6 +182,8 @@ else if (tq.matches(":containsOwn(")) contains(true); else if (tq.matches(":containsWholeText(")) containsWholeText(); + else if (tq.matches(":containsWholeOwnText(")) + containsWholeOwnText(); else if (tq.matches(":containsData(")) containsData(); else if (tq.matches(":matches(")) @@ -376,6 +378,13 @@ private void containsWholeText() { evals.add(new Evaluator.ContainsWholeText(searchText)); } + private void containsWholeOwnText() { + tq.consume(":containsWholeOwnText"); + String searchText = TokenQueue.unescape(tq.chompBalanced('(', ')')); + Validate.notEmpty(searchText, ":containsWholeOwnText(text) query must not be empty"); + evals.add(new Evaluator.ContainsWholeOwnText(searchText)); + } + // pseudo selector :containsData(data) private void containsData() { tq.consume(":containsData"); diff --git a/src/main/java/org/jsoup/select/Selector.java b/src/main/java/org/jsoup/select/Selector.java index 8e86bf8370..4361cc8b6f 100644 --- a/src/main/java/org/jsoup/select/Selector.java +++ b/src/main/java/org/jsoup/select/Selector.java @@ -53,7 +53,8 @@ * :contains(text)elements that contains the specified text. The search is case insensitive. 
The text may appear in the found element, or any of its descendants. The text is whitespace normalized.

To find content that includes parentheses, escape those with a {@code \}.

p:contains(jsoup) finds p elements containing the text "jsoup".

{@code p:contains(hello \(there\) finds p elements containing the text "Hello (There)"}

* :containsOwn(text)elements that directly contain the specified text. The search is case insensitive. The text must appear in the found element, not any of its descendants.p:containsOwn(jsoup) finds p elements with own text "jsoup". * :containsData(data)elements that contains the specified data. The contents of {@code script} and {@code style} elements, and {@code comment} nodes (etc) are considered data nodes, not text nodes. The search is case insensitive. The data may appear in the found element, or any of its descendants.script:contains(jsoup) finds script elements containing the data "jsoup". - * :containsWholeText(text)elements that contains the specified non-normalized text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, or any of its descendants.

To find content that includes parentheses, escape those with a {@code \}.

p:containsWholeText(jsoup\nThe Java HTML Parser) finds p elements containing the text "jsoup\nThe Java HTML Parser" (and not other variations of whitespace or casing, as :contains() would.

+ * :containsWholeText(text)elements that contains the specified non-normalized text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, or any of its descendants.

To find content that includes parentheses, escape those with a {@code \}.

p:containsWholeText(jsoup\nThe Java HTML Parser) finds p elements containing the text "jsoup\nThe Java HTML Parser" (and not other variations of whitespace or casing, as :contains() would). Note that {@code br} elements are presented as a newline.

+ * :containsWholeOwnText(text)elements that directly contain the specified non-normalized text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, but not in its descendants.

To find content that includes parentheses, escape those with a {@code \}.

p:containsWholeOwnText(jsoup\nThe Java HTML Parser) finds p elements directly containing the text "jsoup\nThe Java HTML Parser" (and not other variations of whitespace or casing, as :contains() would). Note that {@code br} elements are presented as a newline.

* :matches(regex)elements containing whitespace normalized text that matches the specified regular expression. The text may appear in the found element, or any of its descendants.td:matches(\\d+) finds table cells containing digits. div:matches((?i)login) finds divs containing the text, case insensitively. * :matchesOwn(regex)elements whose own text matches the specified regular expression. The text must appear in the found element, not any of its descendants.td:matchesOwn(\\d+) finds table cells directly containing digits. div:matchesOwn((?i)login) finds divs containing the text, case insensitively. * The above may be combined in any order and with other selectors.light:contains(name):eq(0) diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index 08accbbd35..5214f1cbaf 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -1429,6 +1429,15 @@ public void testUNewlines() { assertEquals(html, doc.body().html()); // disabling pretty-printing - round-trips the tab throughout, as no normalization occurs } + @Test void wholeTextTreatsBRasNewline() { + String html = "
\nOne
Two

Three
Four

"; + Document doc = Jsoup.parse(html); + Element div = doc.selectFirst("div"); + assertNotNull(div); + assertEquals("\nOne\nTwo Three\nFour", div.wholeText()); + assertEquals("\nOne\nTwo ", div.wholeOwnText()); + } + @Test public void canDetectAutomaticallyAddedElements() { String bare = ""; String full = "Check

One

"; diff --git a/src/test/java/org/jsoup/select/SelectorTest.java b/src/test/java/org/jsoup/select/SelectorTest.java index 8adff3a45e..e1cf6ddaa3 100644 --- a/src/test/java/org/jsoup/select/SelectorTest.java +++ b/src/test/java/org/jsoup/select/SelectorTest.java @@ -637,6 +637,26 @@ public void testPseudoContains(Locale locale) { assertEquals(". ", blanks.first().wholeText()); } + @Test void containsWholeOwnText() { + Document doc = Jsoup.parse("

jsoup\n The HTML Parser

jsoup The HTML Parser

"); + Elements ps = doc.select("p"); + + Elements es1 = doc.select("p:containsWholeOwnText( jsoup\n The Parser)"); + Elements es2 = doc.select("p:containsWholeOwnText(jsoup The HTML Parser\n)"); + assertEquals(1, es1.size()); + assertEquals(1, es2.size()); + assertEquals(ps.get(0), es1.first()); + assertEquals(ps.get(1), es2.first()); + + assertEquals(0, doc.select("div:containsWholeOwnText(jsoup the html parser)").size()); + assertEquals(0, doc.select("div:containsWholeOwnText(jsoup\n the parser)").size()); + + doc = Jsoup.parse("

.

"); + Elements blanks = doc.select("p:containsWholeOwnText( )"); + assertEquals(1, blanks.size()); + assertEquals(". ", blanks.first().wholeText()); + } + @MultiLocaleTest public void containsOwn(Locale locale) { Locale.setDefault(locale);