From adba4e80be13fb7a8d0ddfb31258b628bba61b99 Mon Sep 17 00:00:00 2001 From: jhy Date: Thu, 7 Oct 2021 22:08:31 +1100 Subject: [PATCH] Added :containsWholeText Part of #1636 --- CHANGES | 4 ++++ src/main/java/org/jsoup/select/Evaluator.java | 23 +++++++++++++++++++ .../java/org/jsoup/select/QueryParser.java | 9 ++++++++ src/main/java/org/jsoup/select/Selector.java | 1 + .../java/org/jsoup/select/SelectorTest.java | 20 ++++++++++++++++ 5 files changed, 57 insertions(+) diff --git a/CHANGES b/CHANGES index 61dd021e01..b108fed3fc 100644 --- a/CHANGES +++ b/CHANGES @@ -8,6 +8,10 @@ jsoup changelog input document is using the HTML syntax. (Previously, would always coerce using the more restrictive XML syntax.) + * Improvement: added the :containsWholeText(text) selector, to match against non-normalized Element text. That can be + useful when elements can only be distinguished by e.g. specific case, or leading whitespace, etc. + + *** Release 1.14.3 [2021-Sep-30] * Improvement: added native XPath support in Element#selectXpath(String) diff --git a/src/main/java/org/jsoup/select/Evaluator.java b/src/main/java/org/jsoup/select/Evaluator.java index 52cd432099..8c319f33bf 100644 --- a/src/main/java/org/jsoup/select/Evaluator.java +++ b/src/main/java/org/jsoup/select/Evaluator.java @@ -681,6 +681,29 @@ public String toString() { } } + /** + * Evaluator for matching Element (and its descendants) wholeText. Neither the input nor the element text is + * normalized. :containsWholeText() + * @since 1.15.1. 
+ */ + public static final class ContainsWholeText extends Evaluator { + private final String searchText; + + public ContainsWholeText(String searchText) { + this.searchText = searchText; + } + + @Override + public boolean matches(Element root, Element element) { + return element.wholeText().contains(searchText); + } + + @Override + public String toString() { + return String.format(":containsWholeText(%s)", searchText); + } + } + /** * Evaluator for matching Element (and its descendants) data */ diff --git a/src/main/java/org/jsoup/select/QueryParser.java b/src/main/java/org/jsoup/select/QueryParser.java index bed123b494..46849a52fa 100644 --- a/src/main/java/org/jsoup/select/QueryParser.java +++ b/src/main/java/org/jsoup/select/QueryParser.java @@ -180,6 +180,8 @@ else if (tq.matches(":contains(")) contains(false); else if (tq.matches(":containsOwn(")) contains(true); + else if (tq.matches(":containsWholeText(")) + containsWholeText(); else if (tq.matches(":containsData(")) containsData(); else if (tq.matches(":matches(")) @@ -367,6 +369,13 @@ private void contains(boolean own) { evals.add(new Evaluator.ContainsText(searchText)); } + private void containsWholeText() { + tq.consume(":containsWholeText"); + String searchText = TokenQueue.unescape(tq.chompBalanced('(', ')')); + Validate.notEmpty(searchText, ":containsWholeText(text) query must not be empty"); + evals.add(new Evaluator.ContainsWholeText(searchText)); + } + // pseudo selector :containsData(data) private void containsData() { tq.consume(":containsData"); diff --git a/src/main/java/org/jsoup/select/Selector.java b/src/main/java/org/jsoup/select/Selector.java index 511efbafc2..dd86e091ac 100644 --- a/src/main/java/org/jsoup/select/Selector.java +++ b/src/main/java/org/jsoup/select/Selector.java @@ -53,6 +53,7 @@ * :contains(text)elements that contains the specified text. The search is case insensitive. The text may appear in the found element, or any of its descendants. The text is whitespace normalized.

To find content that includes parentheses, escape those with a {@code \}.

p:contains(jsoup) finds p elements containing the text "jsoup".

{@code p:contains(hello \(there\) finds p elements containing the text "Hello (There)"}

* :containsOwn(text)elements that directly contain the specified text. The search is case insensitive. The text must appear in the found element, not any of its descendants.p:containsOwn(jsoup) finds p elements with own text "jsoup". * :containsData(data)elements that contains the specified data. The contents of {@code script} and {@code style} elements, and {@code comment} nodes (etc) are considered data nodes, not text nodes. The search is case insensitive. The data may appear in the found element, or any of its descendants.script:contains(jsoup) finds script elements containing the data "jsoup". + * :containsWholeText(text)elements that contain the specified non-normalized text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, or any of its descendants.

To find content that includes parentheses, escape those with a {@code \}.

p:containsWholeText(jsoup\nThe Java HTML Parser) finds p elements containing the text "jsoup\nThe Java HTML Parser" (and not other variations of whitespace or casing, as :contains() would).

* :matches(regex)elements containing whitespace normalized text that matches the specified regular expression. The text may appear in the found element, or any of its descendants.td:matches(\\d+) finds table cells containing digits. div:matches((?i)login) finds divs containing the text, case insensitively. * :matchesOwn(regex)elements whose own text matches the specified regular expression. The text must appear in the found element, not any of its descendants.td:matchesOwn(\\d+) finds table cells directly containing digits. div:matchesOwn((?i)login) finds divs containing the text, case insensitively. * The above may be combined in any order and with other selectors.light:contains(name):eq(0) diff --git a/src/test/java/org/jsoup/select/SelectorTest.java b/src/test/java/org/jsoup/select/SelectorTest.java index bedb41440b..14d50431ca 100644 --- a/src/test/java/org/jsoup/select/SelectorTest.java +++ b/src/test/java/org/jsoup/select/SelectorTest.java @@ -617,6 +617,26 @@ public void testPseudoContains(Locale locale) { assertEquals("2", ps2.first().id()); } + @Test void containsWholeText() { + Document doc = Jsoup.parse("

jsoup\n The HTML Parser

jsoup The HTML Parser

"); + Elements ps = doc.select("p"); + + Elements es1 = doc.select("p:containsWholeText( jsoup\n The HTML Parser)"); + Elements es2 = doc.select("p:containsWholeText(jsoup The HTML Parser)"); + assertEquals(1, es1.size()); + assertEquals(1, es2.size()); + assertEquals(ps.get(0), es1.first()); + assertEquals(ps.get(1), es2.first()); + + assertEquals(0, doc.select("div:containsWholeText(jsoup the html parser)").size()); + assertEquals(0, doc.select("div:containsWholeText(jsoup\n the html parser)").size()); + + doc = Jsoup.parse("

.

"); + Elements blanks = doc.select("p:containsWholeText( )"); + assertEquals(1, blanks.size()); + assertEquals(". ", blanks.first().wholeText()); + } + @MultiLocaleTest public void containsOwn(Locale locale) { Locale.setDefault(locale);