Skip to content

Commit

Permalink
Added :containsWholeText
Browse files Browse the repository at this point in the history
Part of #1636
  • Loading branch information
jhy committed Oct 7, 2021
1 parent f16f71d commit adba4e8
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Expand Up @@ -8,6 +8,10 @@ jsoup changelog
input document is using the HTML syntax. (Previously, would always coerce using the more restrictive XML syntax.)
<https://github.com/jhy/jsoup/pull/1648>

* Improvement: added the :containsWholeText(text) selector, to match against non-normalized Element text. This is
useful when elements can only be distinguished by, for example, specific casing or leading whitespace.
<https://github.com/jhy/jsoup/issues/1636>

*** Release 1.14.3 [2021-Sep-30]
* Improvement: added native XPath support in Element#selectXpath(String)
<https://github.com/jhy/jsoup/pull/1629>
Expand Down
23 changes: 23 additions & 0 deletions src/main/java/org/jsoup/select/Evaluator.java
Expand Up @@ -681,6 +681,29 @@ public String toString() {
}
}

/**
 * Evaluator backing the <code>:containsWholeText()</code> selector. An element matches when its whole text
 * (which, per the original documentation, includes the text of its descendants) contains the search text.
 * The comparison is made as-is: neither the search text nor the element text is normalized.
 * @since 1.15.1
 */
public static final class ContainsWholeText extends Evaluator {
    private final String searchText;

    /**
     * @param searchText the exact text to look for; it is stored and compared without modification
     */
    public ContainsWholeText(String searchText) {
        this.searchText = searchText;
    }

    /**
     * @param root the root of the evaluation context (not consulted by this evaluator)
     * @param element the candidate element
     * @return true if the candidate's whole text contains the search text
     */
    @Override
    public boolean matches(Element root, Element element) {
        final String candidateText = element.wholeText();
        return candidateText.contains(searchText);
    }

    /** Renders this evaluator in selector syntax. */
    @Override
    public String toString() {
        return String.format(":containsWholeText(%s)", searchText);
    }
}

/**
* Evaluator for matching Element (and its descendants) data
*/
Expand Down
9 changes: 9 additions & 0 deletions src/main/java/org/jsoup/select/QueryParser.java
Expand Up @@ -180,6 +180,8 @@ else if (tq.matches(":contains("))
contains(false);
else if (tq.matches(":containsOwn("))
contains(true);
else if (tq.matches(":containsWholeText("))
containsWholeText();
else if (tq.matches(":containsData("))
containsData();
else if (tq.matches(":matches("))
Expand Down Expand Up @@ -367,6 +369,13 @@ private void contains(boolean own) {
evals.add(new Evaluator.ContainsText(searchText));
}

// pseudo selector :containsWholeText(text)
private void containsWholeText() {
    // step past the selector name, leaving the queue at the opening parenthesis
    tq.consume(":containsWholeText");

    // take the balanced (...) argument, then unescape it
    final String rawArgument = tq.chompBalanced('(', ')');
    final String searchText = TokenQueue.unescape(rawArgument);

    // the argument may be whitespace, but must not be empty
    Validate.notEmpty(searchText, ":containsWholeText(text) query must not be empty");
    evals.add(new Evaluator.ContainsWholeText(searchText));
}

// pseudo selector :containsData(data)
private void containsData() {
tq.consume(":containsData");
Expand Down
1 change: 1 addition & 0 deletions src/main/java/org/jsoup/select/Selector.java
Expand Up @@ -53,6 +53,7 @@
* <tr><td><code>:contains(<em>text</em>)</code></td><td>elements that contain the specified text. The search is case insensitive. The text may appear in the found element, or any of its descendants. The text is whitespace normalized. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:contains(jsoup)</code> finds p elements containing the text "jsoup".<p>{@code p:contains(hello \(there\))} finds p elements containing the text "Hello (There)".</p></td></tr>
* <tr><td><code>:containsOwn(<em>text</em>)</code></td><td>elements that directly contain the specified text. The search is case insensitive. The text must appear in the found element, not any of its descendants.</td><td><code>p:containsOwn(jsoup)</code> finds p elements with own text "jsoup".</td></tr>
* <tr><td><code>:containsData(<em>data</em>)</code></td><td>elements that contain the specified <em>data</em>. The contents of {@code script} and {@code style} elements, and {@code comment} nodes (etc) are considered data nodes, not text nodes. The search is case insensitive. The data may appear in the found element, or any of its descendants.</td><td><code>script:containsData(jsoup)</code> finds script elements containing the data "jsoup".</td></tr>
* <tr><td><code>:containsWholeText(<em>text</em>)</code></td><td>elements that contain the specified <b>non-normalized</b> text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, or any of its descendants. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:containsWholeText(jsoup\nThe Java HTML Parser)</code> finds p elements containing the text <code>"jsoup\nThe Java HTML Parser"</code> (and not other variations of whitespace or casing, as <code>:contains()</code> would).</td></tr>
* <tr><td><code>:matches(<em>regex</em>)</code></td><td>elements containing <b>whitespace normalized</b> text that matches the specified regular expression. The text may appear in the found element, or any of its descendants.</td><td><code>td:matches(\\d+)</code> finds table cells containing digits. <code>div:matches((?i)login)</code> finds divs containing the text, case insensitively.</td></tr>
* <tr><td><code>:matchesOwn(<em>regex</em>)</code></td><td>elements whose own text matches the specified regular expression. The text must appear in the found element, not any of its descendants.</td><td><code>td:matchesOwn(\\d+)</code> finds table cells directly containing digits. <code>div:matchesOwn((?i)login)</code> finds divs containing the text, case insensitively.</td></tr>
* <tr><td></td><td>The above may be combined in any order and with other selectors</td><td><code>.light:contains(name):eq(0)</code></td></tr>
Expand Down
20 changes: 20 additions & 0 deletions src/test/java/org/jsoup/select/SelectorTest.java
Expand Up @@ -617,6 +617,26 @@ public void testPseudoContains(Locale locale) {
assertEquals("2", ps2.first().id());
}

@Test void containsWholeText() {
    Document doc = Jsoup.parse("<div><p> jsoup\n The <i>HTML</i> Parser</p><p>jsoup The HTML Parser</div>");
    Elements paragraphs = doc.select("p");

    // a query carrying the leading space and newline selects only the first paragraph
    Elements withNewline = doc.select("p:containsWholeText( jsoup\n The HTML Parser)");
    assertEquals(1, withNewline.size());
    assertEquals(paragraphs.get(0), withNewline.first());

    // the single-line query selects only the second paragraph
    Elements singleLine = doc.select("p:containsWholeText(jsoup The HTML Parser)");
    assertEquals(1, singleLine.size());
    assertEquals(paragraphs.get(1), singleLine.first());

    // matching is case sensitive, so lower-cased queries find nothing
    assertEquals(0, doc.select("div:containsWholeText(jsoup the html parser)").size());
    assertEquals(0, doc.select("div:containsWholeText(jsoup\n the html parser)").size());

    // a whitespace-only query is accepted and matched literally
    // NOTE(review): whitespace inside these literals is significant; confirm the exact spacing
    // against the repository copy, since a rendered page may have collapsed repeated spaces.
    doc = Jsoup.parse("<div><p></p><p> </p><p>. </p>");
    Elements blankMatches = doc.select("p:containsWholeText( )");
    assertEquals(1, blankMatches.size());
    assertEquals(". ", blankMatches.first().wholeText());
}

@MultiLocaleTest
public void containsOwn(Locale locale) {
Locale.setDefault(locale);
Expand Down

0 comments on commit adba4e8

Please sign in to comment.