Skip to content

Commit

Permalink
Added the :containsWholeOwnText selector
Browse files Browse the repository at this point in the history
And the corresponding Element#wholeOwnText() method.

For #1636
  • Loading branch information
jhy committed Dec 28, 2021
1 parent 99f9258 commit 027c70c
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 7 deletions.
5 changes: 5 additions & 0 deletions CHANGES
Expand Up @@ -12,6 +12,11 @@ jsoup changelog
useful when elements can only be distinguished by e.g. specific case, or leading whitespace, etc.
<https://github.com/jhy/jsoup/issues/1636>

* Improvement: added Element#wholeOwnText() to retrieve the original (non-normalized) ownText of an Element. Also
added the :containsWholeOwnText(text) selector, to match against that. BR elements are now treated as newlines
in the wholeText methods.
<https://github.com/jhy/jsoup/issues/1636>

* Improvement: when evaluating an XPath query against a context element, the complete document is now visible to the
query, vs only the context element's sub-tree. This enables support for queries outside (parent or sibling) the
element, e.g. ancestor-or-self::*.
Expand Down
44 changes: 38 additions & 6 deletions src/main/java/org/jsoup/nodes/Element.java
Expand Up @@ -1285,19 +1285,44 @@ public String wholeText() {
final StringBuilder accum = StringUtil.borrowBuilder();
NodeTraversor.traverse(new NodeVisitor() {
public void head(Node node, int depth) {
if (node instanceof TextNode) {
TextNode textNode = (TextNode) node;
accum.append(textNode.getWholeText());
}
appendWholeText(node, accum);
}

public void tail(Node node, int depth) {
}
public void tail(Node node, int depth) {}
}, this);

return StringUtil.releaseBuilder(accum);
}

/**
 Appends the un-normalized text contribution of a single node to the accumulator: the whole text of a TextNode, or
 whatever {@link #appendNewlineIfBr} adds for an Element. Any other node type contributes nothing.
 */
private static void appendWholeText(Node node, StringBuilder accum) {
    if (node instanceof TextNode) {
        final TextNode text = (TextNode) node;
        accum.append(text.getWholeText());
        return;
    }
    if (node instanceof Element) {
        final Element el = (Element) node;
        appendNewlineIfBr(el, accum);
    }
}

/**
 Gets the raw (unencoded, un-normalized) text held directly by this element. Text belonging to child elements is
 <b>not included</b>; spaces and newlines are kept as they appear in the original, and each directly contained
 {@code br} element contributes a newline.
 @return unencoded, un-normalized text that is a direct child of this Element
 @see #text()
 @see #wholeText()
 @see #ownText()
 @since 1.15.1
 */
public String wholeOwnText() {
    final StringBuilder text = StringUtil.borrowBuilder();
    final int childCount = childNodeSize();
    int index = 0;
    // only the direct children are visited; there is no descent into child elements
    while (index < childCount) {
        appendWholeText(childNodes.get(index), text);
        index++;
    }
    return StringUtil.releaseBuilder(text);
}

/**
* Gets the (normalized) text owned by this element only; does not get the combined text of all children.
* <p>
Expand Down Expand Up @@ -1336,11 +1361,18 @@ private static void appendNormalisedText(StringBuilder accum, TextNode textNode)
StringUtil.appendNormalisedWhitespace(accum, text, TextNode.lastCharIsWhitespace(accum));
}

/**
 For normalized text: a br element is represented by a single space, which is only added when the accumulated text
 does not already end in whitespace.
 */
private static void appendWhitespaceIfBr(Element element, StringBuilder accum) {
    final boolean isBr = element.tag.normalName().equals("br");
    if (!isBr) {
        return;
    }
    if (TextNode.lastCharIsWhitespace(accum)) {
        return; // already separated; don't double up
    }
    accum.append(" ");
}

/** For whole (un-normalized) text: a br element is represented by a newline, appended unconditionally. */
private static void appendNewlineIfBr(Element element, StringBuilder accum) {
    final String tagName = element.tag.normalName();
    if (!tagName.equals("br")) {
        return;
    }
    accum.append("\n");
}

static boolean preserveWhitespace(@Nullable Node node) {
// looks only at this element and five levels up, to prevent recursion & needless stack searches
if (node instanceof Element) {
Expand Down
23 changes: 23 additions & 0 deletions src/main/java/org/jsoup/select/Evaluator.java
Expand Up @@ -704,6 +704,29 @@ public String toString() {
}
}

/**
 * Evaluator that matches an Element when its own whole text (the text of its descendants is <b>not</b> considered)
 * contains the search text. Neither the search text nor the element text is normalized, so the comparison is
 * sensitive to case and to exact whitespace. Used for <code>:containsWholeOwnText()</code>
 * @since 1.15.1
 */
public static final class ContainsWholeOwnText extends Evaluator {
    private final String searchText;

    public ContainsWholeOwnText(String searchText) {
        this.searchText = searchText;
    }

    @Override
    public boolean matches(Element root, Element element) {
        final String ownText = element.wholeOwnText();
        return ownText.contains(searchText);
    }

    @Override
    public String toString() {
        return ":containsWholeOwnText(" + searchText + ")";
    }
}

/**
* Evaluator for matching Element (and its descendants) data
*/
Expand Down
9 changes: 9 additions & 0 deletions src/main/java/org/jsoup/select/QueryParser.java
Expand Up @@ -182,6 +182,8 @@ else if (tq.matches(":containsOwn("))
contains(true);
else if (tq.matches(":containsWholeText("))
containsWholeText();
else if (tq.matches(":containsWholeOwnText("))
containsWholeOwnText();
else if (tq.matches(":containsData("))
containsData();
else if (tq.matches(":matches("))
Expand Down Expand Up @@ -376,6 +378,13 @@ private void containsWholeText() {
evals.add(new Evaluator.ContainsWholeText(searchText));
}

// pseudo selector :containsWholeOwnText(text)
private void containsWholeOwnText() {
    tq.consume(":containsWholeOwnText");
    // take the balanced (...) argument, then unescape it
    final String rawArgument = tq.chompBalanced('(', ')');
    final String searchText = TokenQueue.unescape(rawArgument);
    Validate.notEmpty(searchText, ":containsWholeOwnText(text) query must not be empty");
    evals.add(new Evaluator.ContainsWholeOwnText(searchText));
}

// pseudo selector :containsData(data)
private void containsData() {
tq.consume(":containsData");
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/org/jsoup/select/Selector.java
Expand Up @@ -53,7 +53,8 @@
* <tr><td><code>:contains(<em>text</em>)</code></td><td>elements that contain the specified text. The search is case insensitive. The text may appear in the found element, or any of its descendants. The text is whitespace normalized. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:contains(jsoup)</code> finds p elements containing the text "jsoup".<p>{@code p:contains(hello \(there\))} finds p elements containing the text "Hello (There)"</p></td></tr>
* <tr><td><code>:containsOwn(<em>text</em>)</code></td><td>elements that directly contain the specified text. The search is case insensitive. The text must appear in the found element, not any of its descendants.</td><td><code>p:containsOwn(jsoup)</code> finds p elements with own text "jsoup".</td></tr>
* <tr><td><code>:containsData(<em>data</em>)</code></td><td>elements that contain the specified <em>data</em>. The contents of {@code script} and {@code style} elements, and {@code comment} nodes (etc) are considered data nodes, not text nodes. The search is case insensitive. The data may appear in the found element, or any of its descendants.</td><td><code>script:containsData(jsoup)</code> finds script elements containing the data "jsoup".</td></tr>
* <tr><td><code>:containsWholeText(<em>text</em>)</code></td><td>elements that contains the specified <b>non-normalized</b> text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, or any of its descendants. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:containsWholeText(jsoup\nThe Java HTML Parser)</code> finds p elements containing the text <code>"jsoup\nThe Java HTML Parser"</code> (and not other variations of whitespace or casing, as <code>:contains()</code> would.</p></td></tr>
* <tr><td><code>:containsWholeText(<em>text</em>)</code></td><td>elements that contain the specified <b>non-normalized</b> text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, or any of its descendants. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:containsWholeText(jsoup\nThe Java HTML Parser)</code> finds p elements containing the text <code>"jsoup\nThe Java HTML Parser"</code> (and not other variations of whitespace or casing, as <code>:contains()</code> would). Note that {@code br} elements are presented as a newline.</td></tr>
* <tr><td><code>:containsWholeOwnText(<em>text</em>)</code></td><td>elements that <b>directly</b> contain the specified <b>non-normalized</b> text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, but not in its descendants. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:containsWholeOwnText(jsoup\nThe Java HTML Parser)</code> finds p elements directly containing the text <code>"jsoup\nThe Java HTML Parser"</code> (and not other variations of whitespace or casing, as <code>:contains()</code> would). Note that {@code br} elements are presented as a newline.</td></tr>
* <tr><td><code>:matches(<em>regex</em>)</code></td><td>elements containing <b>whitespace normalized</b> text that matches the specified regular expression. The text may appear in the found element, or any of its descendants.</td><td><code>td:matches(\\d+)</code> finds table cells containing digits. <code>div:matches((?i)login)</code> finds divs containing the text, case insensitively.</td></tr>
* <tr><td><code>:matchesOwn(<em>regex</em>)</code></td><td>elements whose own text matches the specified regular expression. The text must appear in the found element, not any of its descendants.</td><td><code>td:matchesOwn(\\d+)</code> finds table cells directly containing digits. <code>div:matchesOwn((?i)login)</code> finds divs containing the text, case insensitively.</td></tr>
* <tr><td></td><td>The above may be combined in any order and with other selectors</td><td><code>.light:contains(name):eq(0)</code></td></tr>
Expand Down
9 changes: 9 additions & 0 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Expand Up @@ -1429,6 +1429,15 @@ public void testUNewlines() {
assertEquals(html, doc.body().html()); // disabling pretty-printing - round-trips the tab throughout, as no normalization occurs
}

@Test void wholeTextTreatsBRasNewline() {
    Document doc = Jsoup.parse("<div>\nOne<br>Two <p>Three<br>Four</div>");
    Element div = doc.selectFirst("div");
    assertNotNull(div);

    // wholeText() spans descendants: the nested p's text is included, and each br reads as "\n"
    final String expectedWhole = "\nOne\nTwo Three\nFour";
    assertEquals(expectedWhole, div.wholeText());

    // wholeOwnText() stops at the div's direct children: the p's content is left out
    final String expectedOwn = "\nOne\nTwo ";
    assertEquals(expectedOwn, div.wholeOwnText());
}

@Test public void canDetectAutomaticallyAddedElements() {
String bare = "<script>One</script>";
String full = "<html><head><title>Check</title></head><body><p>One</p></body></html>";
Expand Down
20 changes: 20 additions & 0 deletions src/test/java/org/jsoup/select/SelectorTest.java
Expand Up @@ -637,6 +637,26 @@ public void testPseudoContains(Locale locale) {
assertEquals(". ", blanks.first().wholeText());
}

@Test void containsWholeOwnText() {
    Document doc = Jsoup.parse("<div><p> jsoup\n The <i>HTML</i> Parser</p><p>jsoup The HTML Parser<br></div>");
    Elements ps = doc.select("p");

    // The first p's own text is " jsoup\n The " followed by " Parser": leaving out the <i> element puts two
    // spaces side by side, and because nothing is normalized the query has to carry both of them.
    Elements es1 = doc.select("p:containsWholeOwnText( jsoup\n The  Parser)");
    // The trailing br in the second p is presented as a newline.
    Elements es2 = doc.select("p:containsWholeOwnText(jsoup The HTML Parser\n)");
    assertEquals(1, es1.size());
    assertEquals(1, es2.size());
    assertEquals(ps.get(0), es1.first());
    assertEquals(ps.get(1), es2.first());

    // the div has no text of its own, so nothing can match there
    assertEquals(0, doc.select("div:containsWholeOwnText(jsoup the html parser)").size());
    assertEquals(0, doc.select("div:containsWholeOwnText(jsoup\n the parser)").size());
    // the match is case sensitive, so lower-cased variants must not match the p elements either
    assertEquals(0, doc.select("p:containsWholeOwnText(jsoup the html parser)").size());
    assertEquals(0, doc.select("p:containsWholeOwnText(jsoup\n the parser)").size());

    // Two spaces: the single-space p must not match, only the p whose text ends in two spaces.
    doc = Jsoup.parse("<div><p></p><p> </p><p>.  </p>");
    Elements blanks = doc.select("p:containsWholeOwnText(  )");
    assertEquals(1, blanks.size());
    assertEquals(".  ", blanks.first().wholeText());
}

@MultiLocaleTest
public void containsOwn(Locale locale) {
Locale.setDefault(locale);
Expand Down

0 comments on commit 027c70c

Please sign in to comment.