Skip to content

Commit

Permalink
Added selectors for matchesWholeText and matchesWholeOwnText
Browse files Browse the repository at this point in the history
Fixes #1636
  • Loading branch information
jhy committed Dec 28, 2021
1 parent ab1d80b commit 4535a57
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 7 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Expand Up @@ -17,6 +17,10 @@ jsoup changelog
in the wholeText methods.
<https://github.com/jhy/jsoup/issues/1636>

* Improvement: added the :matchesWholeText(regex) and :matchesWholeOwnText(regex) selectors, to match against whole
(non-normalized, case sensitive) element text and own text, respectively.
<https://github.com/jhy/jsoup/issues/1636>

* Improvement: when evaluating an XPath query against a context element, the complete document is now visible to the
query, vs only the context element's sub-tree. This enables support for queries outside (parent or sibling) the
element, e.g. ancestor-or-self::*.
Expand Down
44 changes: 44 additions & 0 deletions src/main/java/org/jsoup/select/Evaluator.java
Expand Up @@ -813,6 +813,50 @@ public String toString() {
}
}

/**
 * Evaluator that checks whether the whole text of an Element (as returned by {@code Element.wholeText()})
 * contains a match for the supplied regular expression. Backs the {@code :matchesWholeText(regex)} selector.
 */
public static final class MatchesWholeText extends Evaluator {
    private final Pattern pattern;

    /**
     * @param pattern the compiled regex to search for within the element's whole text
     */
    public MatchesWholeText(Pattern pattern) {
        this.pattern = pattern;
    }

    /**
     * @return true if the pattern is found anywhere within the element's whole text
     */
    @Override
    public boolean matches(Element root, Element element) {
        final String wholeText = element.wholeText();
        // find(), not matches(): a partial hit is enough unless the regex anchors itself with ^ and $
        return pattern.matcher(wholeText).find();
    }

    @Override
    public String toString() {
        return String.format(":matchesWholeText(%s)", pattern);
    }
}

/**
 * Evaluator that checks whether the whole own text of an Element (as returned by
 * {@code Element.wholeOwnText()}) contains a match for the supplied regular expression. Backs the
 * {@code :matchesWholeOwnText(regex)} selector.
 */
public static final class MatchesWholeOwnText extends Evaluator {
    private final Pattern pattern;

    /**
     * @param pattern the compiled regex to search for within the element's whole own text
     */
    public MatchesWholeOwnText(Pattern pattern) {
        this.pattern = pattern;
    }

    /**
     * @return true if the pattern is found anywhere within the element's whole own text
     */
    @Override
    public boolean matches(Element root, Element element) {
        final String ownText = element.wholeOwnText();
        // find(), not matches(): a partial hit is enough unless the regex anchors itself with ^ and $
        return pattern.matcher(ownText).find();
    }

    @Override
    public String toString() {
        return String.format(":matchesWholeOwnText(%s)", pattern);
    }
}

public static final class MatchText extends Evaluator {

@Override
Expand Down
28 changes: 22 additions & 6 deletions src/main/java/org/jsoup/select/QueryParser.java
Expand Up @@ -190,6 +190,10 @@ else if (tq.matches(":matches("))
matches(false);
else if (tq.matches(":matchesOwn("))
matches(true);
else if (tq.matches(":matchesWholeText("))
matchesWholeText(false);
else if (tq.matches(":matchesWholeOwnText("))
matchesWholeText(true);
else if (tq.matches(":not("))
not();
else if (tq.matchChomp(":nth-child("))
Expand Down Expand Up @@ -391,14 +395,26 @@ private void containsData() {

// :matches(regex), :matchesOwn(regex)
/**
 * Parses a {@code :matches(regex)} or {@code :matchesOwn(regex)} selector from the token queue and adds the
 * corresponding evaluator.
 *
 * @param own true to parse {@code :matchesOwn} (adds {@code Evaluator.MatchesOwn}); false to parse
 *            {@code :matches} (adds {@code Evaluator.Matches})
 */
private void matches(boolean own) {
    String query = own ? ":matchesOwn" : ":matches";
    // consume the selector name exactly once; the parenthesised regex is read next
    tq.consume(query);
    String regex = tq.chompBalanced('(', ')'); // don't unescape, as regex bits will be escaped
    // the message names the selector actually being parsed, not always ":matches"
    Validate.notEmpty(regex, query + "(regex) query must not be empty");

    // compile once, then register exactly one evaluator
    Pattern pattern = Pattern.compile(regex);
    evals.add(own
        ? new Evaluator.MatchesOwn(pattern)
        : new Evaluator.Matches(pattern));
}

// :matchesWholeText(regex), :matchesWholeOwnText(regex)
/**
 * Parses a {@code :matchesWholeText(regex)} or {@code :matchesWholeOwnText(regex)} selector from the token
 * queue and adds the corresponding evaluator.
 *
 * @param own true to parse {@code :matchesWholeOwnText} (adds {@code Evaluator.MatchesWholeOwnText});
 *            false to parse {@code :matchesWholeText} (adds {@code Evaluator.MatchesWholeText})
 */
private void matchesWholeText(boolean own) {
    final String query;
    if (own) {
        query = ":matchesWholeOwnText";
    } else {
        query = ":matchesWholeText";
    }
    tq.consume(query);

    // don't unescape, as regex bits will be escaped
    final String regex = tq.chompBalanced('(', ')');
    Validate.notEmpty(regex, query + "(regex) query must not be empty");

    final Pattern compiled = Pattern.compile(regex);
    if (own) {
        evals.add(new Evaluator.MatchesWholeOwnText(compiled));
    } else {
        evals.add(new Evaluator.MatchesWholeText(compiled));
    }
}

// :not(selector)
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/org/jsoup/select/Selector.java
Expand Up @@ -56,7 +56,8 @@
* <tr><td><code>:containsWholeText(<em>text</em>)</code></td><td>elements that contains the specified <b>non-normalized</b> text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, or any of its descendants. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:containsWholeText(jsoup\nThe Java HTML Parser)</code> finds p elements containing the text <code>"jsoup\nThe Java HTML Parser"</code> (and not other variations of whitespace or casing, as <code>:contains()</code> would. Note that {@code br} elements are presented as a newline.</p></td></tr>
* <tr><td><code>:containsWholeOwnText(<em>text</em>)</code></td><td>elements that <b>directly</b> contain the specified <b>non-normalized</b> text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, but not in its descendants. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:containsWholeOwnText(jsoup\nThe Java HTML Parser)</code> finds p elements directly containing the text <code>"jsoup\nThe Java HTML Parser"</code> (and not other variations of whitespace or casing, as <code>:contains()</code> would. Note that {@code br} elements are presented as a newline.</p></td></tr>
* <tr><td><code>:matches(<em>regex</em>)</code></td><td>elements containing <b>whitespace normalized</b> text that matches the specified regular expression. The text may appear in the found element, or any of its descendants.</td><td><code>td:matches(\\d+)</code> finds table cells containing digits. <code>div:matches((?i)login)</code> finds divs containing the text, case insensitively.</td></tr>
* <tr><td><code>:matchesOwn(<em>regex</em>)</code></td><td>elements whose own text matches the specified regular expression. The text must appear in the found element, not any of its descendants.</td><td><code>td:matchesOwn(\\d+)</code> finds table cells directly containing digits. <code>div:matchesOwn((?i)login)</code> finds divs containing the text, case insensitively.</td></tr>
* <tr><td><code>:matchesWholeText(<em>regex</em>)</code></td><td>elements containing <b>non-normalized</b> whole text that matches the specified regular expression. The text may appear in the found element, or any of its descendants.</td><td><code>td:matchesWholeText(\\s{2,})</code> finds table cells containing a run of at least two whitespace characters.</td></tr>
* <tr><td><code>:matchesWholeOwnText(<em>regex</em>)</code></td><td>elements whose own <b>non-normalized</b> whole text matches the specified regular expression. The text must appear in the found element, not any of its descendants.</td><td><code>td:matchesWholeOwnText(\n\\d+)</code> finds table cells directly containing digits following a newline.</td></tr>
* <tr><td></td><td>The above may be combined in any order and with other selectors</td><td><code>.light:contains(name):eq(0)</code></td></tr>
* <tr><td><code>:matchText</code></td><td>treats text nodes as elements, and so allows you to match against and select text nodes.<p><b>Note</b> that using this selector will modify the DOM, so you may want to {@code clone} your document before using.</td><td>{@code p:matchText:firstChild} with input {@code <p>One<br />Two</p>} will return one {@link org.jsoup.nodes.PseudoTextElement} with text "{@code One}".</td></tr>
* <tr><td colspan="3"><h3>Structural pseudo selectors</h3></td></tr>
Expand Down
37 changes: 37 additions & 0 deletions src/test/java/org/jsoup/select/SelectorTest.java
Expand Up @@ -710,6 +710,43 @@ public void containsOwn(Locale locale) {
assertEquals(0, doc.select("p:matchesOwn(there)").size());
}

/**
 * Exercises the :matchesWholeText(regex) selector against three paragraphs: one with mixed text and a child
 * element, one holding a single space, and one that is empty.
 */
@Test public void matchesWholeText() {
    final String html = "<p id=1>Hello <b>there</b>\n now</p><p id=2> </p><p id=3></p>";
    final Document doc = Jsoup.parse(html);

    // the embedded (?i) flag makes the regex case insensitive; the newline must match literally
    final Elements withText = doc.select("p:matchesWholeText((?i)hello there\n now)");
    assertEquals(1, withText.size());
    assertEquals("1", withText.first().id());

    // without the flag the match is case sensitive
    final int exactCase = doc.select("p:matchesWholeText(there\n now)").size();
    final int wrongCase = doc.select("p:matchesWholeText(There\n now)").size();
    assertEquals(1, exactCase);
    assertEquals(0, wrongCase);

    // whitespace-only content is matched as-is
    final Elements whitespaceOnly = doc.select("p:matchesWholeText(^\\s+$)");
    assertEquals(1, whitespaceOnly.size());
    assertEquals("2", whitespaceOnly.first().id());

    // an empty element matches the empty-string regex
    final Elements empty = doc.select("p:matchesWholeText(^$)");
    assertEquals(1, empty.size());
    assertEquals("3", empty.first().id());
}

/**
 * Exercises the :matchesWholeOwnText(regex) selector: only text directly inside the element is considered,
 * so text inside child elements must not contribute to a match.
 */
@Test public void matchesWholeOwnText() {
    final String html = "<p id=1>Hello <b>there</b>\n now</p><p id=2> </p><p id=3><i>Text</i></p>";
    final Document doc = Jsoup.parse(html);

    // the own text of p#1 skips the <b> child's "there"
    final Elements ownText = doc.select("p:matchesWholeOwnText((?i)hello \n now)");
    assertEquals(1, ownText.size());
    assertEquals("1", ownText.first().id());

    // "there" lives in a descendant, so it is not part of the own text
    final int viaDescendant = doc.select("p:matchesWholeOwnText(there\n now)").size();
    assertEquals(0, viaDescendant);

    // whitespace-only own text is matched as-is
    final Elements whitespaceOnly = doc.select("p:matchesWholeOwnText(^\\s+$)");
    assertEquals(1, whitespaceOnly.size());
    assertEquals("2", whitespaceOnly.first().id());

    // p#3 has only a child element, so the empty-string regex matches its own text
    final Elements noOwnText = doc.select("p:matchesWholeOwnText(^$)");
    assertEquals(1, noOwnText.size());
    assertEquals("3", noOwnText.first().id());
}

@Test public void testRelaxedTags() {
Document doc = Jsoup.parse("<abc_def id=1>Hello</abc_def> <abc-def id=2>There</abc-def>");

Expand Down

0 comments on commit 4535a57

Please sign in to comment.