Added selectors for matchesWholeText and matchesWholeOwnText

Fixes #1636
jhy · Dec 28, 2021 · 4535a57 · 4535a57
1 parent ab1d80b
commit 4535a57
Show file tree

Hide file tree

Showing 5 changed files with 109 additions and 7 deletions.
diff --git a/CHANGES b/CHANGES
@@ -17,6 +17,10 @@ jsoup changelog
     in the wholeText methods.
     <https://github.com/jhy/jsoup/issues/1636>
 
+  * Improvement: added the :matchesWholeText(regex) and :matchesWholeOwnText(regex) selectors, to match against whole
+    (non-normalized, case sensitive) element text and own text, respectively.
+    <https://github.com/jhy/jsoup/issues/1636>
+
   * Improvement: when evaluating an XPath query against a context element, the complete document is now visible to the
     query, vs only the context element's sub-tree. This enables support for queries outside (parent or sibling) the
     element, e.g. ancestor-or-self::*.

diff --git a/src/main/java/org/jsoup/select/Evaluator.java b/src/main/java/org/jsoup/select/Evaluator.java
@@ -813,6 +813,50 @@ public String toString() {
         }
     }
 
+    /**
+     * Evaluator that matches an Element when the regex is found anywhere in its (and its descendants') non-normalized whole text, i.e. {@code element.wholeText()}.
+     */
+    public static final class MatchesWholeText extends Evaluator {
+        private final Pattern pattern; // compiled regex supplied by the caller
+
+        public MatchesWholeText(Pattern pattern) {
+            this.pattern = pattern;
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) { // root is not consulted by this evaluator
+            Matcher m = pattern.matcher(element.wholeText());
+            return m.find(); // find(), not matches(): a partial hit suffices unless the regex is anchored with ^ and $
+        }
+
+        @Override
+        public String toString() {
+            return String.format(":matchesWholeText(%s)", pattern); // rendered in selector syntax
+        }
+    }
+
+    /**
+     * Evaluator that matches an Element when the regex is found anywhere in its own non-normalized whole text, i.e. {@code element.wholeOwnText()}.
+     */
+    public static final class MatchesWholeOwnText extends Evaluator {
+        private final Pattern pattern; // compiled regex supplied by the caller
+
+        public MatchesWholeOwnText(Pattern pattern) {
+            this.pattern = pattern;
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) { // root is not consulted by this evaluator
+            Matcher m = pattern.matcher(element.wholeOwnText());
+            return m.find(); // find(), not matches(): a partial hit suffices unless the regex is anchored with ^ and $
+        }
+
+        @Override
+        public String toString() {
+            return String.format(":matchesWholeOwnText(%s)", pattern); // rendered in selector syntax
+        }
+    }
+
     public static final class MatchText extends Evaluator {
 
         @Override

diff --git a/src/main/java/org/jsoup/select/QueryParser.java b/src/main/java/org/jsoup/select/QueryParser.java
@@ -190,6 +190,10 @@ else if (tq.matches(":matches("))
             matches(false);
         else if (tq.matches(":matchesOwn("))
             matches(true);
+        else if (tq.matches(":matchesWholeText("))
+            matchesWholeText(false);
+        else if (tq.matches(":matchesWholeOwnText("))
+            matchesWholeText(true);
         else if (tq.matches(":not("))
             not();
 		else if (tq.matchChomp(":nth-child("))
@@ -391,14 +395,26 @@ private void containsData() {
 
     // :matches(regex), matchesOwn(regex)
     private void matches(boolean own) {
-        tq.consume(own ? ":matchesOwn" : ":matches");
+        String query = own ? ":matchesOwn" : ":matches";
+        tq.consume(query);
         String regex = tq.chompBalanced('(', ')'); // don't unescape, as regex bits will be escaped
-        Validate.notEmpty(regex, ":matches(regex) query must not be empty");
+        Validate.notEmpty(regex, query + "(regex) query must not be empty");
 
-        if (own)
-            evals.add(new Evaluator.MatchesOwn(Pattern.compile(regex)));
-        else
-            evals.add(new Evaluator.Matches(Pattern.compile(regex)));
+        evals.add(own
+            ? new Evaluator.MatchesOwn(Pattern.compile(regex))
+            : new Evaluator.Matches(Pattern.compile(regex)));
+    }
+
+    // :matchesWholeText(regex), :matchesWholeOwnText(regex) -- own=true selects the own-text variant
+    private void matchesWholeText(boolean own) {
+        String query = own ? ":matchesWholeOwnText" : ":matchesWholeText";
+        tq.consume(query); // consume the selector name; the parenthesised regex follows
+        String regex = tq.chompBalanced('(', ')'); // don't unescape, as regex bits will be escaped
+        Validate.notEmpty(regex, query + "(regex) query must not be empty");
+
+        evals.add(own
+            ? new Evaluator.MatchesWholeOwnText(Pattern.compile(regex))
+            : new Evaluator.MatchesWholeText(Pattern.compile(regex)));
     }
 
     // :not(selector)

diff --git a/src/main/java/org/jsoup/select/Selector.java b/src/main/java/org/jsoup/select/Selector.java
@@ -56,7 +56,8 @@
  * <tr><td><code>:containsWholeText(<em>text</em>)</code></td><td>elements that contains the specified <b>non-normalized</b> text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, or any of its descendants. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:containsWholeText(jsoup\nThe Java HTML Parser)</code> finds p elements containing the text <code>"jsoup\nThe Java HTML Parser"</code> (and not other variations of whitespace or casing, as <code>:contains()</code> would. Note that {@code br} elements are presented as a newline.</p></td></tr>
  * <tr><td><code>:containsWholeOwnText(<em>text</em>)</code></td><td>elements that <b>directly</b> contain the specified <b>non-normalized</b> text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, but not in its descendants. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:containsWholeOwnText(jsoup\nThe Java HTML Parser)</code> finds p elements directly containing the text <code>"jsoup\nThe Java HTML Parser"</code> (and not other variations of whitespace or casing, as <code>:contains()</code> would. Note that {@code br} elements are presented as a newline.</p></td></tr>
  * <tr><td><code>:matches(<em>regex</em>)</code></td><td>elements containing <b>whitespace normalized</b> text that matches the specified regular expression. The text may appear in the found element, or any of its descendants.</td><td><code>td:matches(\\d+)</code> finds table cells containing digits. <code>div:matches((?i)login)</code> finds divs containing the text, case insensitively.</td></tr>
- * <tr><td><code>:matchesOwn(<em>regex</em>)</code></td><td>elements whose own text matches the specified regular expression. The text must appear in the found element, not any of its descendants.</td><td><code>td:matchesOwn(\\d+)</code> finds table cells directly containing digits. <code>div:matchesOwn((?i)login)</code> finds divs containing the text, case insensitively.</td></tr>
+ * <tr><td><code>:matchesWholeText(<em>regex</em>)</code></td><td>elements containing <b>non-normalized</b> whole text that matches the specified regular expression. The text may appear in the found element, or any of its descendants.</td><td><code>td:matchesWholeText(\\s{2,})</code> finds table cells containing a run of at least two whitespace characters.</td></tr>
+ * <tr><td><code>:matchesWholeOwnText(<em>regex</em>)</code></td><td>elements whose own <b>non-normalized</b> whole text matches the specified regular expression. The text must appear in the found element, not any of its descendants.</td><td><code>td:matchesWholeOwnText(\n\\d+)</code> finds table cells directly containing digits following a newline.</td></tr>
  * <tr><td></td><td>The above may be combined in any order and with other selectors</td><td><code>.light:contains(name):eq(0)</code></td></tr>
  * <tr><td><code>:matchText</code></td><td>treats text nodes as elements, and so allows you to match against and select text nodes.<p><b>Note</b> that using this selector will modify the DOM, so you may want to {@code clone} your document before using.</td><td>{@code p:matchText:firstChild} with input {@code <p>One<br />Two</p>} will return one {@link org.jsoup.nodes.PseudoTextElement} with text "{@code One}".</td></tr>
  * <tr><td colspan="3"><h3>Structural pseudo selectors</h3></td></tr>

diff --git a/src/test/java/org/jsoup/select/SelectorTest.java b/src/test/java/org/jsoup/select/SelectorTest.java
@@ -710,6 +710,43 @@ public void containsOwn(Locale locale) {
         assertEquals(0, doc.select("p:matchesOwn(there)").size());
     }
 
+    @Test public void matchesWholeText() {
+        Document doc = Jsoup.parse("<p id=1>Hello <b>there</b>\n now</p><p id=2> </p><p id=3></p>");
+
+        Elements p1 = doc.select("p:matchesWholeText((?i)hello there\n now)"); // (?i) makes the regex case insensitive; descendant <b> text is included
+        assertEquals(1, p1.size());
+        assertEquals("1", p1.first().id());
+
+        assertEquals(1, doc.select("p:matchesWholeText(there\n now)").size()); // partial match within the whole text
+        assertEquals(0, doc.select("p:matchesWholeText(There\n now)").size()); // capital T: case sensitive without (?i)
+
+        Elements p2 = doc.select("p:matchesWholeText(^\\s+$)"); // only p#2 consists solely of whitespace
+        assertEquals(1, p2.size());
+        assertEquals("2", p2.first().id());
+
+        Elements p3 = doc.select("p:matchesWholeText(^$)"); // only p#3 has empty text
+        assertEquals(1, p3.size());
+        assertEquals("3", p3.first().id());
+    }
+
+    @Test public void matchesWholeOwnText() {
+        Document doc = Jsoup.parse("<p id=1>Hello <b>there</b>\n now</p><p id=2> </p><p id=3><i>Text</i></p>");
+
+        Elements p1 = doc.select("p:matchesWholeOwnText((?i)hello \n now)"); // own text skips <b>there</b>, leaving "Hello \n now"
+        assertEquals(1, p1.size());
+        assertEquals("1", p1.first().id());
+
+        assertEquals(0, doc.select("p:matchesWholeOwnText(there\n now)").size()); // "there" sits in the child <b>, so it is not own text
+
+        Elements p2 = doc.select("p:matchesWholeOwnText(^\\s+$)"); // p#2's own text is a single space
+        assertEquals(1, p2.size());
+        assertEquals("2", p2.first().id());
+
+        Elements p3 = doc.select("p:matchesWholeOwnText(^$)"); // p#3's only text is inside <i>, so its own text is empty
+        assertEquals(1, p3.size());
+        assertEquals("3", p3.first().id());
+    }
+
     @Test public void testRelaxedTags() {
         Document doc = Jsoup.parse("<abc_def id=1>Hello</abc_def> <abc-def id=2>There</abc-def>");