From b52dde7ddbe3b6dcc5a6dd6219160ef7b71f9014 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Fri, 24 Dec 2021 20:02:57 -0500 Subject: [PATCH] feat: support wildcard namespaces in xpath queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is almost as fast as a standard child-axis search, and much faster than the builtin or using local-name(): //span 18.923k (± 8.9%) i/s - 93.906k in 5.010792s //*[local-name()='span'] 1.849k (± 2.8%) i/s - 9.261k in 5.011560s //*[nokogiri-builtin:local-name-is('span')] 3.191k (± 2.4%) i/s - 16.150k in 5.064798s //*:span 18.016k (± 4.6%) i/s - 89.900k in 5.003444s Comparison: //span: 18922.5 i/s //*:span: 18016.5 i/s - same-ish: difference falls within error //*[nokogiri-builtin:local-name-is('span')]: 3190.6 i/s - 5.93x (± 0.00) slower //*[local-name()='span']: 1849.4 i/s - 10.23x (± 0.00) slower --- lib/nokogiri/css/xpath_visitor.rb | 8 +- .../0009-allow-wildcard-namespaces.patch | 77 +++++++++++++++++++ test/css/test_xpath_visitor.rb | 41 +++++++++- test/xml/test_xpath.rb | 21 +++++ 4 files changed, 144 insertions(+), 3 deletions(-) create mode 100644 patches/libxml2/0009-allow-wildcard-namespaces.patch diff --git a/lib/nokogiri/css/xpath_visitor.rb b/lib/nokogiri/css/xpath_visitor.rb index c20231b17f..ef7a22b59a 100644 --- a/lib/nokogiri/css/xpath_visitor.rb +++ b/lib/nokogiri/css/xpath_visitor.rb @@ -7,6 +7,8 @@ module CSS # class allows for changing some of the behaviors related to builtin xpath functions and quirks # of HTML5. class XPathVisitor + WILDCARD_NAMESPACES = Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch") # :nodoc: + # Enum to direct XPathVisitor when to use Nokogiri builtin XPath functions. module BuiltinsConfig # Never use Nokogiri builtin functions, always generate vanilla XPath 1.0 queries. This is @@ -254,7 +256,11 @@ def visit_element_name(node) # HTML5 has namespaces that should be ignored in CSS queries # https://github.com/sparklemotion/nokogiri/issues/2376 if @builtins == BuiltinsConfig::ALWAYS || (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?) - "*[nokogiri-builtin:local-name-is('#{node.value.first}')]" + if WILDCARD_NAMESPACES + "*:#{node.value.first}" + else + "*[nokogiri-builtin:local-name-is('#{node.value.first}')]" + end else "*[local-name()='#{node.value.first}']" end diff --git a/patches/libxml2/0009-allow-wildcard-namespaces.patch b/patches/libxml2/0009-allow-wildcard-namespaces.patch new file mode 100644 index 0000000000..bd3144acd0 --- /dev/null +++ b/patches/libxml2/0009-allow-wildcard-namespaces.patch @@ -0,0 +1,77 @@ +From 74c95ec5932c737d4fcb06b8646b0017364ada14 Mon Sep 17 00:00:00 2001 +From: Mike Dalessio +Date: Fri, 24 Dec 2021 19:08:01 -0500 +Subject: [PATCH] attempt to hack in wildcard namespaces to xpath + +I'm not confident this is a bulletproof patch. +--- + xpath.c | 24 ++++++++++++++++++------ + 1 file changed, 18 insertions(+), 6 deletions(-) + +diff --git a/xpath.c b/xpath.c +index 1aa2f1a..c7f0885 100644 +--- a/xpath.c ++++ b/xpath.c +@@ -146,6 +146,9 @@ + #define XPATH_MAX_RECURSION_DEPTH 5000 + #endif + ++#define WILDCARD_PREFIX "*" ++#define IS_WILDCARD_PREFIX(p) xmlStrEqual((xmlChar*)WILDCARD_PREFIX, p) ++ + /* + * TODO: + * There are a few spots where some tests are done which depend upon ascii +@@ -11073,12 +11076,15 @@ xmlXPathCompNodeTest(xmlXPathParserContextPtr ctxt, xmlXPathTestVal *test, + SKIP_BLANKS; + + if ((name == NULL) && (CUR == '*')) { +- /* +- * All elements +- */ + NEXT; +- *test = NODE_TEST_ALL; +- return(NULL); ++ if (CUR != ':') { ++ /* ++ * All elements ++ */ ++ *test = NODE_TEST_ALL; ++ return(NULL); ++ } ++ name = xmlCharStrdup(WILDCARD_PREFIX); + } + + if (name == NULL) +@@ -11327,6 +11333,10 @@ xmlXPathCompStep(xmlXPathParserContextPtr ctxt) { + } + #endif + if (CUR == '*') { ++ if (NXT(1) == ':') { ++ NEXT; ++ name = xmlCharStrdup(WILDCARD_PREFIX); ++ } + axis = AXIS_CHILD; + } else { + if (name == NULL) +@@ -12030,7 +12040,7 @@ xmlXPathNodeCollectAndTest(xmlXPathParserContextPtr ctxt, + /* + * Setup namespaces. + */ +- if (prefix != NULL) { ++ if (prefix != NULL && !IS_WILDCARD_PREFIX(prefix)) { + URI = xmlXPathNsLookup(xpctxt, prefix); + if (URI == NULL) { + xmlXPathReleaseObject(xpctxt, obj); +@@ -12369,6 +12379,8 @@ xmlXPathNodeCollectAndTest(xmlXPathParserContextPtr ctxt, + { + XP_TEST_HIT + } ++ } else if (IS_WILDCARD_PREFIX(prefix)) { ++ XP_TEST_HIT + } else { + if ((cur->ns != NULL) && + (xmlStrEqual(URI, cur->ns->href))) +-- +2.31.0 + diff --git a/test/css/test_xpath_visitor.rb b/test/css/test_xpath_visitor.rb index 1b2532d927..751afb47e8 100644 --- a/test/css/test_xpath_visitor.rb +++ b/test/css/test_xpath_visitor.rb @@ -565,6 +565,34 @@ def visit_pseudo_class_aaron(node) end end + # + # HTML5 documents have namespaces, and gumbo attaches namespaces to the relevant elements; but + # CSS selectors should not require namespaces. See #2376 for the discussion around this design + # decision, along with some of the relevant benchmarks and call stack analyses. + # + # (HTML5 today is only supported by CRuby/gumbo/libxml2 and so we'll ignore JRuby support for + # now.) + # + # The way to implement this CSS search using standard XPath 1.0 queries is to check for a match + # with `local-name()`. However, this is about ~10x slower than a standard search along the + # `child` axis. + # + # I've written a builtin function in C named `nokogiri-builtin:local-name-is()` which is a bit + # faster, but still ~7x slower than a standard search. + # + # Finally, I've patched libxml2 to support wildcard namespaces, and this is ~1.1x slower but + # only available with the packaged libxml2. + # + # In any case, the logic for the html5 builtins here goes: + # + # if ALWAYS or (OPTIMAL and libxml2) + # if we've patched libxml2 with wildcard support + # use wildard namespacing + # else + # use `nokogiri-builtin:local-name-is()` + # else + # use `local-name()` + # describe "doctype:html5" do let(:visitor) do Nokogiri::CSS::XPathVisitor.new( @@ -575,8 +603,13 @@ def visit_pseudo_class_aaron(node) describe "builtins:always" do let(:builtins) { Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS } + it "matches on the element's local-name, ignoring namespaces" do - assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo")) + if Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch") + assert_xpath("//*:foo", parser.parse("foo")) + else + assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo")) + end end it "avoids the wildcard when using namespaces" do @@ -595,7 +628,11 @@ def visit_pseudo_class_aaron(node) let(:builtins) { Nokogiri::CSS::XPathVisitor::BuiltinsConfig::OPTIMAL } it "matches on the element's local-name, ignoring namespaces" do if Nokogiri.uses_libxml? - assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo")) + if Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch") + assert_xpath("//*:foo", parser.parse("foo")) + else + assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo")) + end else assert_xpath("//*[local-name()='foo']", parser.parse("foo")) end diff --git a/test/xml/test_xpath.rb b/test/xml/test_xpath.rb index 4b3a5ccc2c..5c6589cdd5 100644 --- a/test/xml/test_xpath.rb +++ b/test/xml/test_xpath.rb @@ -699,6 +699,27 @@ def collision(nodes) end end end + + describe "XPath wildcard namespaces" do + let(:xml) { <<~XML } + + ns1 child + ns2 child + plain child + + XML + + let(:doc) { Nokogiri::XML::Document.parse(xml) } + + it "allows namespace wildcards" do + skip_unless_libxml2_patch("0009-allow-wildcard-namespaces.patch") + + assert_equal(1, doc.xpath("//n:child", { "n" => "http://nokogiri.org/ns1" }).length) + assert_equal(3, doc.xpath("//*:child").length) + assert_equal(1, doc.xpath("//self::n:child", { "n" => "http://nokogiri.org/ns1" }).length) + assert_equal(3, doc.xpath("//self::*:child").length) + end + end end end end