Skip to content

Commit

Permalink
When evaluating XPath on a context node, use the entire w3c document
Browse files Browse the repository at this point in the history
The context element is tracked in the W3C document's user data. This allows queries to be evaluated against the complete document, rather than just a sub-tree.

Fixes #1652
  • Loading branch information
jhy committed Oct 19, 2021
1 parent 7f28cb0 commit 1e4d127
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 15 deletions.
5 changes: 5 additions & 0 deletions CHANGES
Expand Up @@ -12,6 +12,11 @@ jsoup changelog
useful when elements can only be distinguished by e.g. specific case, or leading whitespace, etc.
<https://github.com/jhy/jsoup/issues/1636>

* Improvement: when evaluating an XPath query against a context element, the complete document is now visible to the
query, rather than only the context element's sub-tree. This enables queries that reach outside the element (e.g. to
its parents or siblings), such as ancestor-or-self::*.
<https://github.com/jhy/jsoup/issues/1652>

*** Release 1.14.3 [2021-Sep-30]
* Improvement: added native XPath support in Element#selectXpath(String)
<https://github.com/jhy/jsoup/pull/1629>
Expand Down
64 changes: 53 additions & 11 deletions src/main/java/org/jsoup/helper/W3CDom.java
Expand Up @@ -49,6 +49,9 @@
public class W3CDom {
/** For W3C Documents created by this class, this property is set on each node to link back to the original jsoup node. */
public static final String SourceProperty = "jsoupSource";
private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc
private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context


/**
To get support for XPath versions &gt; 1, set this property to the classname of an alternate XPathFactory
Expand Down Expand Up @@ -161,31 +164,34 @@ public Document fromJsoup(org.jsoup.nodes.Document in) {
}

/**
* Convert a jsoup Element to a W3C Document. The created nodes will link back to the original
* Convert a jsoup DOM to a W3C Document. The created nodes will link back to the original
* jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
* flow to the other).
* flow to the other). The input Element is used as a context node, but the whole surrounding jsoup Document is
* converted. (If you just want a subtree converted, use {@link #convert(org.jsoup.nodes.Element, Document)}.)
*
* @param in jsoup element or doc
* @return a W3C DOM Document representing the jsoup Document or Element contents.
* @see #sourceNodes(NodeList, Class)
* @see #contextNode(Document)
*/
public Document fromJsoup(org.jsoup.nodes.Element in) {
Validate.notNull(in);
DocumentBuilder builder;
try {
builder = factory.newDocumentBuilder();
DOMImplementation impl = builder.getDOMImplementation();
Document out;

out = builder.newDocument();
Document out = builder.newDocument();
org.jsoup.nodes.Document inDoc = in.ownerDocument();
org.jsoup.nodes.DocumentType doctype = inDoc != null ? inDoc.documentType() : null;
if (doctype != null) {
org.w3c.dom.DocumentType documentType = impl.createDocumentType(doctype.name(), doctype.publicId(), doctype.systemId());
out.appendChild(documentType);
}
out.setXmlStandalone(true);

convert(in, out);
// if in is Document, use the root element, not the wrapping document, as the context:
org.jsoup.nodes.Element context = (in instanceof org.jsoup.nodes.Document) ? in.child(0) : in;
out.setUserData(ContextProperty, context, null);
convert(inDoc != null ? inDoc : in, out);
return out;
} catch (ParserConfigurationException e) {
throw new IllegalStateException(e);
Expand Down Expand Up @@ -226,9 +232,25 @@ public void convert(org.jsoup.nodes.Element in, Document out) {
NodeTraversor.traverse(builder, rootEl);
}

/**
Runs an XPath query using the supplied W3C document itself as the context node.
@param xpath the XPath query to evaluate
@param doc the W3C document to run the query against
@return the nodes matched by the query
*/
public NodeList selectXpath(String xpath, Document doc) {
    // Widen to Node so that the context-node overload performs the validation and evaluation.
    final Node contextNode = doc;
    return selectXpath(xpath, contextNode);
}

/**
Evaluate an XPath query against the supplied context node, and return the results.
@param xpath an XPath query
@param contextNode the context node to evaluate against
@return the matching nodes
*/
public NodeList selectXpath(String xpath, Node contextNode) {
Validate.notEmpty(xpath);
Validate.notNull(doc);
Validate.notNull(contextNode);

NodeList nodeList;
try {
Expand All @@ -239,14 +261,21 @@ public NodeList selectXpath(String xpath, Document doc) {
XPathFactory.newInstance();

XPathExpression expression = xPathFactory.newXPath().compile(xpath);
nodeList = (NodeList) expression.evaluate(doc, XPathConstants.NODESET); // love the strong typing here /s
nodeList = (NodeList) expression.evaluate(contextNode, XPathConstants.NODESET); // love the strong typing here /s
Validate.notNull(nodeList);
} catch (XPathExpressionException | XPathFactoryConfigurationException e) {
throw new Selector.SelectorParseException("Could not evaluate XPath query [%s]: %s", xpath, e.getMessage());
}
return nodeList;
}

/**
Retrieves the original jsoup DOM nodes from a nodelist created by this convertor.
@param nodeList the W3C nodes to get the original jsoup nodes from
@param nodeType the jsoup node type to retrieve (e.g. Element, DataNode, etc)
@param <T> node type
@return a list of the original nodes
*/
public <T extends org.jsoup.nodes.Node> List<T> sourceNodes(NodeList nodeList, Class<T> nodeType) {
Validate.notNull(nodeList);
Validate.notNull(nodeType);
Expand All @@ -262,6 +291,15 @@ public <T extends org.jsoup.nodes.Node> List<T> sourceNodes(NodeList nodeList, C
return nodes;
}

/**
For a Document created by {@link #fromJsoup(org.jsoup.nodes.Element)}, retrieves the W3C context node.
@param wDoc Document created by this class; must not be null
@return the corresponding W3C Node to the jsoup Element that was used as the creating context, or {@code null} if no
context node has been recorded on the document
*/
public Node contextNode(Document wDoc) {
    // Validate the argument up front, as the other public methods of this class do, rather than failing with a bare NPE.
    Validate.notNull(wDoc);
    return (Node) wDoc.getUserData(ContextNodeProperty);
}

/**
* Serialize a W3C document to a String. The output format will be XML or HTML depending on the content of the doc.
*
Expand All @@ -284,11 +322,13 @@ protected static class W3CBuilder implements NodeVisitor {
private final Stack<HashMap<String, String>> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn
private Node dest;
private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available.
@Nullable private final org.jsoup.nodes.Element contextElement;

public W3CBuilder(Document doc) {
this.doc = doc;
this.namespacesStack.push(new HashMap<>());
this.dest = doc;
namespacesStack.push(new HashMap<>());
dest = doc;
contextElement = (org.jsoup.nodes.Element) doc.getUserData(ContextProperty); // Track the context jsoup Element, so we can save the corresponding w3c element
}

public void head(org.jsoup.nodes.Node source, int depth) {
Expand All @@ -310,6 +350,8 @@ public void head(org.jsoup.nodes.Node source, int depth) {
doc.createElementNS(namespace, tagName);
copyAttributes(sourceEl, el);
append(el, sourceEl);
if (sourceEl == contextElement)
doc.setUserData(ContextNodeProperty, el, null);
dest = el; // descend
} catch (DOMException e) {
append(doc.createTextNode("<" + tagName + ">"), sourceEl);
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/org/jsoup/nodes/NodeUtils.java
Expand Up @@ -44,7 +44,8 @@ static <T extends Node> List<T> selectXpath(String xpath, Element el, Class<T> n

W3CDom w3c = new W3CDom();
org.w3c.dom.Document wDoc = w3c.fromJsoup(el);
NodeList nodeList = w3c.selectXpath(xpath, wDoc);
org.w3c.dom.Node contextNode = w3c.contextNode(wDoc);
NodeList nodeList = w3c.selectXpath(xpath, contextNode);
return w3c.sourceNodes(nodeList, nodeType);
}
}
2 changes: 1 addition & 1 deletion src/test/java/org/jsoup/helper/W3CDomTest.java
Expand Up @@ -328,7 +328,7 @@ private void assertEqualsIgnoreCase(String want, String have) {
Element jDiv = jdoc.selectFirst("div");
assertNotNull(jDiv);
Document doc = w3CDom.fromJsoup(jDiv);
Node div = doc.getFirstChild();
Node div = w3CDom.contextNode(doc);

assertEquals("div", div.getLocalName());
assertEquals(jDiv, div.getUserData(W3CDom.SourceProperty));
Expand Down
31 changes: 29 additions & 2 deletions src/test/java/org/jsoup/select/XpathTest.java
Expand Up @@ -43,13 +43,15 @@ public void supportsXpath() {

Element div = doc.selectFirst("div");
assertNotNull(div);
Element w3cDiv = div.selectXpath(".").first(); // self
assertSame(div, w3cDiv);

Elements els = div.selectXpath("/div/p");
Elements els = div.selectXpath("p");
assertEquals(1, els.size());
assertEquals("One", els.get(0).text());
assertEquals("p", els.get(0).tagName());

assertEquals(0, div.selectXpath("//body").size());
assertEquals(1, div.selectXpath("//body").size()); // the whole document is visible on the div context
assertEquals(1, doc.selectXpath("//body").size());
}

Expand Down Expand Up @@ -146,6 +148,31 @@ private static Stream<Arguments> provideEvaluators() {
assertEquals("/bar", hrefs.get(1));
}

@Test void selectOutsideOfElementTree() {
    // A query run on an element must be able to reach its siblings, which live outside its own sub-tree.
    Document parsed = Jsoup.parse("<p>One<p>Two<p>Three");
    Elements paragraphs = parsed.selectXpath("//p");
    assertEquals(3, paragraphs.size());

    Element firstPara = paragraphs.get(0);
    assertEquals("One", firstPara.text());

    // Use the first paragraph as the context node and walk the sibling axis.
    Elements following = firstPara.selectXpath("following-sibling::p");
    assertEquals(2, following.size());
    Element second = following.get(0);
    Element third = following.get(1);
    assertEquals("Two", second.text());
    assertEquals("Three", third.text());
}

@Test void selectAncestorsOnContextElement() {
    // https://github.com/jhy/jsoup/issues/1652
    // The ancestor axis must climb out of the context element: the result starts at html and ends at the p itself.
    Document parsed = Jsoup.parse("<div><p>Hello");
    Element para = parsed.selectFirst("p");
    assertNotNull(para);

    Elements lineage = para.selectXpath("ancestor-or-self::*");
    assertEquals(4, lineage.size());
    Element outermost = lineage.get(0);
    assertEquals("html", outermost.tagName());
    Element innermost = lineage.get(3);
    assertEquals("p", innermost.tagName());
}

@Test
public void canSupplyAlternateFactoryImpl() {
// previously we had a test to load Saxon and do an XPath 2.0 query. But we know Saxon works and so that's
Expand Down

0 comments on commit 1e4d127

Please sign in to comment.