Skip to content

Commit

Permalink
feat: support wildcard namespaces in xpath queries
Browse files Browse the repository at this point in the history
This is almost as fast as a standard child-axis search, and much
faster than the builtin or using local-name():

  //span
       18.923k (± 8.9%) i/s -     93.906k in   5.010792s
  //*[local-name()='span']
        1.849k (± 2.8%) i/s -      9.261k in   5.011560s
  //*[nokogiri-builtin:local-name-is('span')]
        3.191k (± 2.4%) i/s -     16.150k in   5.064798s
  //*:span
       18.016k (± 4.6%) i/s -     89.900k in   5.003444s

  Comparison:
  //span:
      18922.5 i/s
  //*:span:
      18016.5 i/s - same-ish: difference falls within error
  //*[nokogiri-builtin:local-name-is('span')]:
       3190.6 i/s - 5.93x  (± 0.00) slower
  //*[local-name()='span']:
       1849.4 i/s - 10.23x  (± 0.00) slower
  • Loading branch information
flavorjones committed Dec 25, 2021
1 parent 1397529 commit 15fb707
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 3 deletions.
8 changes: 7 additions & 1 deletion lib/nokogiri/css/xpath_visitor.rb
Expand Up @@ -7,6 +7,8 @@ module CSS
# class allows for changing some of the behaviors related to builtin xpath functions and quirks
# of HTML5.
class XPathVisitor
WILDCARD_NAMESPACES = Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch") # :nodoc:

# Enum to direct XPathVisitor when to use Nokogiri builtin XPath functions.
module BuiltinsConfig
# Never use Nokogiri builtin functions, always generate vanilla XPath 1.0 queries. This is
Expand Down Expand Up @@ -254,7 +256,11 @@ def visit_element_name(node)
# HTML5 has namespaces that should be ignored in CSS queries
# https://github.com/sparklemotion/nokogiri/issues/2376
if @builtins == BuiltinsConfig::ALWAYS || (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?)
"*[nokogiri-builtin:local-name-is('#{node.value.first}')]"
if WILDCARD_NAMESPACES
"*:#{node.value.first}"
else
"*[nokogiri-builtin:local-name-is('#{node.value.first}')]"
end
else
"*[local-name()='#{node.value.first}']"
end
Expand Down
77 changes: 77 additions & 0 deletions patches/libxml2/0009-allow-wildcard-namespaces.patch
@@ -0,0 +1,77 @@
From 96e31fa0a94daa42aae174f2eb3f42e1ba7d5720 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Fri, 24 Dec 2021 19:08:01 -0500
Subject: [PATCH] attempt to hack in wildcard namespaces to xpath

I'm not confident this is a bulletproof patch.
---
xpath.c | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/xpath.c b/xpath.c
index 1aa2f1a..dd28d4d 100644
--- a/xpath.c
+++ b/xpath.c
@@ -146,6 +146,9 @@
#define XPATH_MAX_RECURSION_DEPTH 5000
#endif

+#define WILDCARD_PREFIX "*"
+#define IS_WILDCARD_PREFIX(p) xmlStrEqual((xmlChar*)WILDCARD_PREFIX, prefix)
+
/*
* TODO:
* There are a few spots where some tests are done which depend upon ascii
@@ -11073,12 +11076,15 @@ xmlXPathCompNodeTest(xmlXPathParserContextPtr ctxt, xmlXPathTestVal *test,
SKIP_BLANKS;

if ((name == NULL) && (CUR == '*')) {
- /*
- * All elements
- */
NEXT;
- *test = NODE_TEST_ALL;
- return(NULL);
+ if (CUR != ':') {
+ /*
+ * All elements
+ */
+ *test = NODE_TEST_ALL;
+ return(NULL);
+ }
+ name = xmlCharStrdup(WILDCARD_PREFIX);
}

if (name == NULL)
@@ -11327,6 +11333,10 @@ xmlXPathCompStep(xmlXPathParserContextPtr ctxt) {
}
#endif
if (CUR == '*') {
+ if (NXT(1) == ':') {
+ NEXT;
+ name = xmlCharStrdup(WILDCARD_PREFIX);
+ }
axis = AXIS_CHILD;
} else {
if (name == NULL)
@@ -12030,7 +12040,7 @@ xmlXPathNodeCollectAndTest(xmlXPathParserContextPtr ctxt,
/*
* Setup namespaces.
*/
- if (prefix != NULL) {
+ if (prefix != NULL && !IS_WILDCARD_PREFIX(prefix)) {
URI = xmlXPathNsLookup(xpctxt, prefix);
if (URI == NULL) {
xmlXPathReleaseObject(xpctxt, obj);
@@ -12369,6 +12379,8 @@ xmlXPathNodeCollectAndTest(xmlXPathParserContextPtr ctxt,
{
XP_TEST_HIT
}
+ } else if (IS_WILDCARD_PREFIX(prefix)) {
+ XP_TEST_HIT
} else {
if ((cur->ns != NULL) &&
(xmlStrEqual(URI, cur->ns->href)))
--
2.31.0

41 changes: 39 additions & 2 deletions test/css/test_xpath_visitor.rb
Expand Up @@ -565,6 +565,34 @@ def visit_pseudo_class_aaron(node)
end
end

#
# HTML5 documents have namespaces, and gumbo attaches namespaces to the relevant elements; but
# CSS selectors should not require namespaces. See #2376 for the discussion around this design
# decision, along with some of the relevant benchmarks and call stack analyses.
#
# (HTML5 today is only supported by CRuby/gumbo/libxml2 and so we'll ignore JRuby support for
# now.)
#
# The way to implement this CSS search using standard XPath 1.0 queries is to check for a match
# with `local-name()`. However, this is about ~10x slower than a standard search along the
# `child` axis.
#
# I've written a builtin function in C named `nokogiri-builtin:local-name-is()` which is a bit
# faster, but still ~7x slower than a standard search.
#
# Finally, I've patched libxml2 to support wildcard namespaces, and this is ~1.1x slower but
# only available with the packaged libxml2.
#
# In any case, the logic for the html5 builtins here goes:
#
# if ALWAYS or (OPTIMAL and libxml2)
# if we've patched libxml2 with wildcard support
# use wildard namespacing
# else
# use `nokogiri-builtin:local-name-is()`
# else
# use `local-name()`
#
describe "doctype:html5" do
let(:visitor) do
Nokogiri::CSS::XPathVisitor.new(
Expand All @@ -575,8 +603,13 @@ def visit_pseudo_class_aaron(node)

describe "builtins:always" do
let(:builtins) { Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS }

it "matches on the element's local-name, ignoring namespaces" do
assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo"))
if Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch")
assert_xpath("//*:foo", parser.parse("foo"))
else
assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo"))
end
end

it "avoids the wildcard when using namespaces" do
Expand All @@ -595,7 +628,11 @@ def visit_pseudo_class_aaron(node)
let(:builtins) { Nokogiri::CSS::XPathVisitor::BuiltinsConfig::OPTIMAL }
it "matches on the element's local-name, ignoring namespaces" do
if Nokogiri.uses_libxml?
assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo"))
if Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch")
assert_xpath("//*:foo", parser.parse("foo"))
else
assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo"))
end
else
assert_xpath("//*[local-name()='foo']", parser.parse("foo"))
end
Expand Down
21 changes: 21 additions & 0 deletions test/xml/test_xpath.rb
Expand Up @@ -699,6 +699,27 @@ def collision(nodes)
end
end
end

describe "XPath wildcard namespaces" do
let(:xml) { <<~XML }
<root xmlns:ns1="http://nokogiri.org/ns1" xmlns:ns2="http://nokogiri.org/ns2">
<ns1:child>ns1 child</ns1:child>
<ns2:child>ns2 child</ns2:child>
<child>plain child</child>
</root>
XML

let(:doc) { Nokogiri::XML::Document.parse(xml) }

it "allows namespace wildcards" do
skip_unless_libxml2_patch("0009-allow-wildcard-namespaces.patch")

assert_equal(1, doc.xpath("//n:child", { "n" => "http://nokogiri.org/ns1" }).length)
assert_equal(3, doc.xpath("//*:child").length)
assert_equal(1, doc.xpath("//self::n:child", { "n" => "http://nokogiri.org/ns1" }).length)
assert_equal(3, doc.xpath("//self::*:child").length)
end
end
end
end
end

0 comments on commit 15fb707

Please sign in to comment.