Added the :containsWholeOwnText selector

And the corresponding Element#wholeOwnText() method. For #1636
jhy · Dec 28, 2021 · 027c70c · 027c70c
1 parent 99f9258
commit 027c70c
Show file tree

Hide file tree

Showing 7 changed files with 106 additions and 7 deletions.
diff --git a/CHANGES b/CHANGES
@@ -12,6 +12,11 @@ jsoup changelog
     useful when elements can only be distinguished by e.g. specific case, or leading whitespace, etc.
     <https://github.com/jhy/jsoup/issues/1636>
 
+  * Improvement: added Element#wholeOwnText() to retrieve the original (non-normalized) ownText of an Element. Also
+    added the :containsWholeOwnText(text) selector, to match against that. BR elements are now treated as newlines
+    in the wholeText methods.
+    <https://github.com/jhy/jsoup/issues/1636>
+
   * Improvement: when evaluating an XPath query against a context element, the complete document is now visible to the
     query, vs only the context element's sub-tree. This enables support for queries outside (parent or sibling) the
     element, e.g. ancestor-or-self::*.

diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java
@@ -1285,19 +1285,44 @@ public String wholeText() {
         final StringBuilder accum = StringUtil.borrowBuilder();
         NodeTraversor.traverse(new NodeVisitor() {
             public void head(Node node, int depth) {
-                if (node instanceof TextNode) {
-                    TextNode textNode = (TextNode) node;
-                    accum.append(textNode.getWholeText());
-                }
+                appendWholeText(node, accum);
             }
 
-            public void tail(Node node, int depth) {
-            }
+            public void tail(Node node, int depth) {}
         }, this);
 
         return StringUtil.releaseBuilder(accum);
     }
 
+    /**
+     Appends the un-normalized text contribution of a single node: a TextNode adds its whole text, a br Element adds a
+     newline, and any other node adds nothing.
+     */
+    private static void appendWholeText(Node node, StringBuilder accum) {
+        if (node instanceof TextNode) {
+            TextNode textNode = (TextNode) node;
+            accum.append(textNode.getWholeText());
+            return;
+        }
+        if (node instanceof Element)
+            appendNewlineIfBr((Element) node, accum);
+    }
+
+    /**
+     Get the raw (un-normalized, unencoded) text held directly by this element. Only this element's immediate child
+     nodes are considered; text inside child elements is <b>not included</b>. Newlines and spaces are kept as they
+     appeared in the original, and a direct child {@code br} element contributes a newline.
+
+     @return unencoded, un-normalized text that is a direct child of this Element
+     @see #text()
+     @see #wholeText()
+     @see #ownText()
+     @since 1.15.1
+     */
+    public String wholeOwnText() {
+        final StringBuilder accum = StringUtil.borrowBuilder();
+        // only the immediate children are visited, so descendant text never contributes
+        for (int i = 0, count = childNodeSize(); i < count; i++)
+            appendWholeText(childNodes.get(i), accum);
+
+        return StringUtil.releaseBuilder(accum);
+    }
+
     /**
      * Gets the (normalized) text owned by this element only; does not get the combined text of all children.
      * <p>
@@ -1336,11 +1361,18 @@ private static void appendNormalisedText(StringBuilder accum, TextNode textNode)
             StringUtil.appendNormalisedWhitespace(accum, text, TextNode.lastCharIsWhitespace(accum));
     }
 
+    /** For normalized text, treat a br element as a single space, unless the accumulated text already ends in whitespace. */
     private static void appendWhitespaceIfBr(Element element, StringBuilder accum) {
         if (element.tag.normalName().equals("br") && !TextNode.lastCharIsWhitespace(accum))
             accum.append(" ");
     }
 
+    /** For whole (un-normalized) text, a br element is rendered as a newline; any other element adds nothing. */
+    private static void appendNewlineIfBr(Element element, StringBuilder accum) {
+        if (!element.tag.normalName().equals("br"))
+            return;
+        accum.append("\n");
+    }
+
     static boolean preserveWhitespace(@Nullable Node node) {
         // looks only at this element and five levels up, to prevent recursion & needless stack searches
         if (node instanceof Element) {

diff --git a/src/main/java/org/jsoup/select/Evaluator.java b/src/main/java/org/jsoup/select/Evaluator.java
@@ -704,6 +704,29 @@ public String toString() {
         }
     }
 
+    /**
+     * Evaluator for <code>:containsWholeOwnText()</code>: matches against an Element's own whole text (the text of the
+     * element itself, and <b>not</b> that of its descendants). Neither the search text nor the element text is
+     * normalized, and the comparison is case sensitive.
+     * @since 1.15.1
+     */
+    public static final class ContainsWholeOwnText extends Evaluator {
+        private final String searchText;
+
+        public ContainsWholeOwnText(String searchText) {
+            this.searchText = searchText;
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            final String ownText = element.wholeOwnText();
+            return ownText.contains(searchText);
+        }
+
+        @Override
+        public String toString() {
+            return ":containsWholeOwnText(" + searchText + ")";
+        }
+    }
+
     /**
      * Evaluator for matching Element (and its descendants) data
      */

diff --git a/src/main/java/org/jsoup/select/QueryParser.java b/src/main/java/org/jsoup/select/QueryParser.java
@@ -182,6 +182,8 @@ else if (tq.matches(":containsOwn("))
             contains(true);
         else if (tq.matches(":containsWholeText("))
             containsWholeText();
+        else if (tq.matches(":containsWholeOwnText("))
+            containsWholeOwnText();
         else if (tq.matches(":containsData("))
             containsData();
         else if (tq.matches(":matches("))
@@ -376,6 +378,13 @@ private void containsWholeText() {
         evals.add(new Evaluator.ContainsWholeText(searchText));
     }
 
+    // pseudo selector :containsWholeOwnText(text) - the argument is unescaped, but otherwise used exactly as written
+    private void containsWholeOwnText() {
+        tq.consume(":containsWholeOwnText");
+        final String escaped = tq.chompBalanced('(', ')');
+        final String searchText = TokenQueue.unescape(escaped);
+        Validate.notEmpty(searchText, ":containsWholeOwnText(text) query must not be empty");
+        evals.add(new Evaluator.ContainsWholeOwnText(searchText));
+    }
+
     // pseudo selector :containsData(data)
     private void containsData() {
         tq.consume(":containsData");

diff --git a/src/main/java/org/jsoup/select/Selector.java b/src/main/java/org/jsoup/select/Selector.java
@@ -53,7 +53,8 @@
  * <tr><td><code>:contains(<em>text</em>)</code></td><td>elements that contains the specified text. The search is case insensitive. The text may appear in the found element, or any of its descendants. The text is whitespace normalized. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:contains(jsoup)</code> finds p elements containing the text "jsoup".<p>{@code p:contains(hello \(there\) finds p elements containing the text "Hello (There)"}</p></td></tr>
  * <tr><td><code>:containsOwn(<em>text</em>)</code></td><td>elements that directly contain the specified text. The search is case insensitive. The text must appear in the found element, not any of its descendants.</td><td><code>p:containsOwn(jsoup)</code> finds p elements with own text "jsoup".</td></tr>
  * <tr><td><code>:containsData(<em>data</em>)</code></td><td>elements that contains the specified <em>data</em>. The contents of {@code script} and {@code style} elements, and {@code comment} nodes (etc) are considered data nodes, not text nodes. The search is case insensitive. The data may appear in the found element, or any of its descendants.</td><td><code>script:contains(jsoup)</code> finds script elements containing the data "jsoup".</td></tr>
- * <tr><td><code>:containsWholeText(<em>text</em>)</code></td><td>elements that contains the specified <b>non-normalized</b> text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, or any of its descendants. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:containsWholeText(jsoup\nThe Java HTML Parser)</code> finds p elements containing the text <code>"jsoup\nThe Java HTML Parser"</code> (and not other variations of whitespace or casing, as <code>:contains()</code> would.</p></td></tr>
+ * <tr><td><code>:containsWholeText(<em>text</em>)</code></td><td>elements that contains the specified <b>non-normalized</b> text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, or any of its descendants. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:containsWholeText(jsoup\nThe Java HTML Parser)</code> finds p elements containing the text <code>"jsoup\nThe Java HTML Parser"</code> (and not other variations of whitespace or casing, as <code>:contains()</code> would). <p>Note that {@code br} elements are presented as a newline.</p></td></tr>
+ * <tr><td><code>:containsWholeOwnText(<em>text</em>)</code></td><td>elements that <b>directly</b> contain the specified <b>non-normalized</b> text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text must appear in the found element, not any of its descendants. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:containsWholeOwnText(jsoup\nThe Java HTML Parser)</code> finds p elements directly containing the text <code>"jsoup\nThe Java HTML Parser"</code> (and not other variations of whitespace or casing, as <code>:containsOwn()</code> would). <p>Note that {@code br} elements are presented as a newline.</p></td></tr>
  * <tr><td><code>:matches(<em>regex</em>)</code></td><td>elements containing <b>whitespace normalized</b> text that matches the specified regular expression. The text may appear in the found element, or any of its descendants.</td><td><code>td:matches(\\d+)</code> finds table cells containing digits. <code>div:matches((?i)login)</code> finds divs containing the text, case insensitively.</td></tr>
  * <tr><td><code>:matchesOwn(<em>regex</em>)</code></td><td>elements whose own text matches the specified regular expression. The text must appear in the found element, not any of its descendants.</td><td><code>td:matchesOwn(\\d+)</code> finds table cells directly containing digits. <code>div:matchesOwn((?i)login)</code> finds divs containing the text, case insensitively.</td></tr>
  * <tr><td></td><td>The above may be combined in any order and with other selectors</td><td><code>.light:contains(name):eq(0)</code></td></tr>

diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -1429,6 +1429,15 @@ public void testUNewlines() {
         assertEquals(html, doc.body().html()); // disabling pretty-printing - round-trips the tab throughout, as no normalization occurs
     }
 
+    @Test void wholeTextTreatsBRasNewline() {
+        // every br, whether a direct child or nested in the p, must surface as "\n";
+        // the own-text variant only sees the div's direct children, so it stops before the nested p
+        Document parsed = Jsoup.parse("<div>\nOne<br>Two <p>Three<br>Four</div>");
+        Element container = parsed.selectFirst("div");
+        assertNotNull(container);
+
+        assertEquals("\nOne\nTwo Three\nFour", container.wholeText());
+        assertEquals("\nOne\nTwo ", container.wholeOwnText());
+    }
+
     @Test public void canDetectAutomaticallyAddedElements() {
         String bare = "<script>One</script>";
         String full = "<html><head><title>Check</title></head><body><p>One</p></body></html>";

diff --git a/src/test/java/org/jsoup/select/SelectorTest.java b/src/test/java/org/jsoup/select/SelectorTest.java
@@ -637,6 +637,26 @@ public void testPseudoContains(Locale locale) {
         assertEquals(".  ", blanks.first().wholeText());
     }
 
+    @Test void containsWholeOwnText() {
+        // first p owns " jsoup\n The " + " Parser" (the text inside the i element is excluded);
+        // second p owns "jsoup The HTML Parser" followed by a br, which is presented as a newline
+        Document doc = Jsoup.parse("<div><p> jsoup\n The <i>HTML</i> Parser</p><p>jsoup The HTML Parser<br></div>");
+        Elements ps = doc.select("p");
+
+        Elements es1 = doc.select("p:containsWholeOwnText( jsoup\n The  Parser)");
+        Elements es2 = doc.select("p:containsWholeOwnText(jsoup The HTML Parser\n)");
+        assertEquals(1, es1.size());
+        assertEquals(1, es2.size());
+        assertEquals(ps.get(0), es1.first());
+        assertEquals(ps.get(1), es2.first());
+
+        // the div has no text of its own, so nothing can match against it
+        assertEquals(0, doc.select("div:containsWholeOwnText(jsoup the html parser)").size());
+        assertEquals(0, doc.select("div:containsWholeOwnText(jsoup\n the  parser)").size());
+        // text that exists only in descendants must not match the ancestor, even with the exact casing
+        assertEquals(0, doc.select("div:containsWholeOwnText(jsoup The HTML Parser)").size());
+        // matching is case sensitive on the elements that do own the text
+        assertEquals(0, doc.select("p:containsWholeOwnText(jsoup the html parser)").size());
+        assertEquals(0, doc.select("p:containsWholeOwnText(jsoup\n the  parser)").size());
+
+        // whitespace in the query is significant: only the p that owns two consecutive spaces matches
+        doc = Jsoup.parse("<div><p></p><p> </p><p>.  </p>");
+        Elements blanks = doc.select("p:containsWholeOwnText(  )");
+        assertEquals(1, blanks.size());
+        assertEquals(".  ", blanks.first().wholeOwnText());
+    }
+
     @MultiLocaleTest
     public void containsOwn(Locale locale) {
         Locale.setDefault(locale);