When evaluating XPath on a context node, use the entire w3c document

The context element is tracked in the W3C document's user data. This allows queries to evaluate against the complete document, rather than just a sub-tree. Fixes #1652
jhy · Oct 19, 2021 · 1e4d127 · 1e4d127
1 parent 7f28cb0
commit 1e4d127
Show file tree

Hide file tree

Showing 5 changed files with 90 additions and 15 deletions.
diff --git a/CHANGES b/CHANGES
@@ -12,6 +12,11 @@ jsoup changelog
     useful when elements can only be distinguished by e.g. specific case, or leading whitespace, etc.
     <https://github.com/jhy/jsoup/issues/1636>
 
+  * Improvement: when evaluating an XPath query against a context element, the complete document is now visible to the
+    query, rather than only the context element's sub-tree. This enables queries that reach outside the element (to
+    its ancestors or siblings), e.g. ancestor-or-self::*.
+    <https://github.com/jhy/jsoup/issues/1652>
+
 *** Release 1.14.3 [2021-Sep-30]
   * Improvement: added native XPath support in Element#selectXpath(String)
     <https://github.com/jhy/jsoup/pull/1629>

diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java
@@ -49,6 +49,9 @@
 public class W3CDom {
     /** For W3C Documents created by this class, this property is set on each node to link back to the original jsoup node. */
     public static final String SourceProperty = "jsoupSource";
+    private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc
+    private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context
+
 
     /**
      To get support for XPath versions &gt; 1, set this property to the classname of an alternate XPathFactory
@@ -161,31 +164,34 @@ public Document fromJsoup(org.jsoup.nodes.Document in) {
     }
 
     /**
-     * Convert a jsoup Element to a W3C Document. The created nodes will link back to the original
+     * Convert a jsoup DOM to a W3C Document. The created nodes will link back to the original
      * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
-     * flow to the other).
+     * flow to the other). The input Element is used as a context node, but the whole surrounding jsoup Document is
+     * converted. (If you just want a subtree converted, use {@link #convert(org.jsoup.nodes.Element, Document)}.)
      *
      * @param in jsoup element or doc
      * @return a W3C DOM Document representing the jsoup Document or Element contents.
+     * @see #sourceNodes(NodeList, Class)
+     * @see #contextNode(Document)
      */
     public Document fromJsoup(org.jsoup.nodes.Element in) {
         Validate.notNull(in);
         DocumentBuilder builder;
         try {
             builder = factory.newDocumentBuilder();
             DOMImplementation impl = builder.getDOMImplementation();
-            Document out;
-
-            out = builder.newDocument();
+            Document out = builder.newDocument();
             org.jsoup.nodes.Document inDoc = in.ownerDocument();
             org.jsoup.nodes.DocumentType doctype = inDoc != null ? inDoc.documentType() : null;
             if (doctype != null) {
                 org.w3c.dom.DocumentType documentType = impl.createDocumentType(doctype.name(), doctype.publicId(), doctype.systemId());
                 out.appendChild(documentType);
             }
             out.setXmlStandalone(true);
-
-            convert(in, out);
+            // if in is Document, use the root element, not the wrapping document, as the context:
+            org.jsoup.nodes.Element context = (in instanceof org.jsoup.nodes.Document) ? in.child(0) : in;
+            out.setUserData(ContextProperty, context, null);
+            convert(inDoc != null ? inDoc : in, out);
             return out;
         } catch (ParserConfigurationException e) {
             throw new IllegalStateException(e);
@@ -226,9 +232,25 @@ public void convert(org.jsoup.nodes.Element in, Document out) {
         NodeTraversor.traverse(builder, rootEl);
     }
 
+    /**
+     Evaluate an XPath query against the supplied document (used as the context node), and return the results.
+     @param xpath an XPath query
+     @param doc the document to evaluate against
+     @return the matching nodes
+     */
     public NodeList selectXpath(String xpath, Document doc) {
+        return selectXpath(xpath, (Node) doc);
+    }
+
+    /**
+     Evaluate an XPath query against the supplied context node, and return the results.
+     @param xpath an XPath query
+     @param contextNode the context node to evaluate against
+     @return the matching nodes
+     */
+    public NodeList selectXpath(String xpath, Node contextNode) {
         Validate.notEmpty(xpath);
-        Validate.notNull(doc);
+        Validate.notNull(contextNode);
 
         NodeList nodeList;
         try {
@@ -239,14 +261,21 @@ public NodeList selectXpath(String xpath, Document doc) {
                 XPathFactory.newInstance();
 
             XPathExpression expression = xPathFactory.newXPath().compile(xpath);
-            nodeList = (NodeList) expression.evaluate(doc, XPathConstants.NODESET); // love the strong typing here /s
+            nodeList = (NodeList) expression.evaluate(contextNode, XPathConstants.NODESET); // love the strong typing here /s
             Validate.notNull(nodeList);
         } catch (XPathExpressionException | XPathFactoryConfigurationException e) {
             throw new Selector.SelectorParseException("Could not evaluate XPath query [%s]: %s", xpath, e.getMessage());
         }
         return nodeList;
     }
 
+    /**
+     Retrieves the original jsoup DOM nodes from a nodelist created by this convertor.
+     @param nodeList the W3C nodes to get the original jsoup nodes from
+     @param nodeType the jsoup node type to retrieve (e.g. Element, DataNode, etc)
+     @param <T> node type
+     @return a list of the original nodes
+     */
     public <T extends org.jsoup.nodes.Node> List<T> sourceNodes(NodeList nodeList, Class<T> nodeType) {
         Validate.notNull(nodeList);
         Validate.notNull(nodeType);
@@ -262,6 +291,15 @@ public <T extends org.jsoup.nodes.Node> List<T> sourceNodes(NodeList nodeList, C
         return nodes;
     }
 
+    /**
+     For a Document created by {@link #fromJsoup(org.jsoup.nodes.Element)}, retrieves the W3C context node.
+     @param wDoc Document created by this class
+     @return the W3C Node corresponding to the jsoup Element used as the creating context; null if none was recorded
+     */
+    public Node contextNode(Document wDoc) {
+        return (Node) wDoc.getUserData(ContextNodeProperty);
+    }
+
     /**
      * Serialize a W3C document to a String. The output format will be XML or HTML depending on the content of the doc.
      *
@@ -284,11 +322,13 @@ protected static class W3CBuilder implements NodeVisitor {
         private final Stack<HashMap<String, String>> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn
         private Node dest;
         private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available.
+        @Nullable private final org.jsoup.nodes.Element contextElement;
 
         public W3CBuilder(Document doc) {
             this.doc = doc;
-            this.namespacesStack.push(new HashMap<>());
-            this.dest = doc;
+            namespacesStack.push(new HashMap<>());
+            dest = doc;
+            contextElement = (org.jsoup.nodes.Element) doc.getUserData(ContextProperty); // Track the context jsoup Element, so we can save the corresponding w3c element
         }
 
         public void head(org.jsoup.nodes.Node source, int depth) {
@@ -310,6 +350,8 @@ public void head(org.jsoup.nodes.Node source, int depth) {
                         doc.createElementNS(namespace, tagName);
                     copyAttributes(sourceEl, el);
                     append(el, sourceEl);
+                    if (sourceEl == contextElement)
+                        doc.setUserData(ContextNodeProperty, el, null);
                     dest = el; // descend
                 } catch (DOMException e) {
                     append(doc.createTextNode("<" + tagName + ">"), sourceEl);

diff --git a/src/main/java/org/jsoup/nodes/NodeUtils.java b/src/main/java/org/jsoup/nodes/NodeUtils.java
@@ -44,7 +44,8 @@ static <T extends Node> List<T> selectXpath(String xpath, Element el, Class<T> n
 
         W3CDom w3c = new W3CDom();
         org.w3c.dom.Document wDoc = w3c.fromJsoup(el);
-        NodeList nodeList = w3c.selectXpath(xpath, wDoc);
+        org.w3c.dom.Node contextNode = w3c.contextNode(wDoc);
+        NodeList nodeList = w3c.selectXpath(xpath, contextNode);
         return w3c.sourceNodes(nodeList, nodeType);
     }
 }
diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java
@@ -328,7 +328,7 @@ private void assertEqualsIgnoreCase(String want, String have) {
         Element jDiv = jdoc.selectFirst("div");
         assertNotNull(jDiv);
         Document doc = w3CDom.fromJsoup(jDiv);
-        Node div = doc.getFirstChild();
+        Node div = w3CDom.contextNode(doc);
 
         assertEquals("div", div.getLocalName());
         assertEquals(jDiv, div.getUserData(W3CDom.SourceProperty));

diff --git a/src/test/java/org/jsoup/select/XpathTest.java b/src/test/java/org/jsoup/select/XpathTest.java
@@ -43,13 +43,15 @@ public void supportsXpath() {
 
         Element div = doc.selectFirst("div");
         assertNotNull(div);
+        Element w3cDiv = div.selectXpath(".").first(); // self
+        assertSame(div, w3cDiv);
 
-        Elements els = div.selectXpath("/div/p");
+        Elements els = div.selectXpath("p");
         assertEquals(1, els.size());
         assertEquals("One", els.get(0).text());
         assertEquals("p", els.get(0).tagName());
 
-        assertEquals(0, div.selectXpath("//body").size());
+        assertEquals(1, div.selectXpath("//body").size()); // the whole document is visible on the div context
         assertEquals(1, doc.selectXpath("//body").size());
     }
 
@@ -146,6 +148,31 @@ private static Stream<Arguments> provideEvaluators() {
         assertEquals("/bar", hrefs.get(1));
     }
 
+    @Test void selectOutsideOfElementTree() {
+        Document doc = Jsoup.parse("<p>One<p>Two<p>Three");
+        Elements ps = doc.selectXpath("//p");
+        assertEquals(3, ps.size());
+
+        Element p1 = ps.get(0);
+        assertEquals("One", p1.text());
+
+        Elements sibs = p1.selectXpath("following-sibling::p"); // p1 is the context; its siblings lie outside p1's own sub-tree
+        assertEquals(2, sibs.size());
+        assertEquals("Two", sibs.get(0).text());
+        assertEquals("Three", sibs.get(1).text());
+    }
+
+    @Test void selectAncestorsOnContextElement() {
+        // https://github.com/jhy/jsoup/issues/1652 - ancestors of the context element must be visible to the query
+        Document doc = Jsoup.parse("<div><p>Hello");
+        Element p = doc.selectFirst("p");
+        assertNotNull(p);
+        Elements chain = p.selectXpath("ancestor-or-self::*");
+        assertEquals(4, chain.size()); // expected chain: html, body, div, p
+        assertEquals("html", chain.get(0).tagName());
+        assertEquals("p", chain.get(3).tagName());
+    }
+
     @Test
     public void canSupplyAlternateFactoryImpl() {
         // previously we had a test to load Saxon and do an XPath 2.0 query. But we know Saxon works and so that's