From 685403a6fbdd59898e45e1a760674580ed0fb9b2 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Fri, 24 Dec 2021 20:02:35 -0500 Subject: [PATCH 01/13] test: comment on custom xpath functions --- test/xml/test_xpath.rb | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/test/xml/test_xpath.rb b/test/xml/test_xpath.rb index bd7ef21fe4..baf631984c 100644 --- a/test/xml/test_xpath.rb +++ b/test/xml/test_xpath.rb @@ -5,19 +5,13 @@ module Nokogiri module XML class TestXPath < Nokogiri::TestCase - # ** WHY ALL THOSE _if Nokogiri.uses_libxml?_ ** - # Hi, my dear readers, # - # After reading these tests you may be wondering why all those ugly - # if Nokogiri.uses_libxml? sparsed over the whole document. Well, let - # me explain it. While using XPath in Java, you need the extension - # functions to be in a namespace. This is not required by XPath, afaik, - # but it is an usual convention though. + # Note that many of these tests vary for jruby because custom xpath functions in JRuby require + # a namespace, and libxml2 (and the original implementation of Nokogiri) do not. # - # Yours truly, + # Ideally we should change this to always require a namespace. + # See https://github.com/sparklemotion/nokogiri/issues/2147 # - # The guy whose headaches belong to Nokogiri JRuby impl. - def setup super From b24efcca08cc93881c29c95a31c2068002b591ba Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sat, 11 Dec 2021 14:38:46 -0500 Subject: [PATCH 02/13] CSS::Node#to_xpath arguments are now required This makes it explicit that visitor must be injected. Although this is technically a breaking change, CSS::Node is essentially an internal API and I would be extremely surprised if it was being used directly by anyone. This commit also ensures that CSS::Node, CSS::Parser, and CSS::Tokenizer are omitted from API documentation (to reflect their status as an internal-only and unstable API). --- CHANGELOG.md | 1 + lib/nokogiri/css.rb | 1 + lib/nokogiri/css/node.rb | 3 ++- lib/nokogiri/css/parser.rb | 5 +++-- lib/nokogiri/css/parser.y | 1 + lib/nokogiri/css/parser_extras.rb | 1 + lib/nokogiri/css/tokenizer.rb | 3 ++- lib/nokogiri/css/tokenizer.rex | 3 ++- rakelib/rdoc.rake | 2 -- test/css/test_xpath_visitor.rb | 2 +- 10 files changed, 14 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e3eca3b345..010f926756 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,6 +61,7 @@ This release ends support for: ### Deprecated * Passing a `Nokogiri::XML::Node` as the second parameter to `Node.new` is deprecated and will generate a warning. This will become an error in a future version of Nokogiri. [[#975](https://github.com/sparklemotion/nokogiri/issues/975)] +* `Nokogiri::CSS::Parser`, `Nokogiri::CSS::Tokenizer`, and `Nokogiri::CSS::Node` are now internal-only APIs that are no longer documented, and should not be considered stable. With the introduction of `XPathVisitor` injection into `Nokogiri::CSS.xpath_for` there should be no reason to rely on these internal APIs. ## 1.12.5 / 2021-09-27 diff --git a/lib/nokogiri/css.rb b/lib/nokogiri/css.rb index 933d5f50a9..a78ea66094 100644 --- a/lib/nokogiri/css.rb +++ b/lib/nokogiri/css.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true module Nokogiri + # Methods to translate CSS queries into XPath queries module CSS class << self ### diff --git a/lib/nokogiri/css/node.rb b/lib/nokogiri/css/node.rb index f5d499af53..df7b079157 100644 --- a/lib/nokogiri/css/node.rb +++ b/lib/nokogiri/css/node.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true +# :stopdoc: module Nokogiri module CSS class Node @@ -23,7 +24,7 @@ def accept(visitor) ### # Convert this CSS node to xpath with +prefix+ using +visitor+ - def to_xpath(prefix = "//", visitor = XPathVisitor.new) + def to_xpath(prefix, visitor) prefix = "." if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil? prefix + visitor.accept(self) end diff --git a/lib/nokogiri/css/parser.rb b/lib/nokogiri/css/parser.rb index 30938e23cb..1749a8eee0 100644 --- a/lib/nokogiri/css/parser.rb +++ b/lib/nokogiri/css/parser.rb @@ -1,13 +1,14 @@ # frozen_string_literal: true # # DO NOT MODIFY!!!! -# This file is automatically generated by Racc 1.5.2 +# This file is automatically generated by Racc 1.6.0 # from Racc grammar file "". # require 'racc/parser.rb' +# :stopdoc: require_relative "parser_extras" module Nokogiri @@ -247,7 +248,7 @@ def unescape_css_string(str) "." => 27, "*" => 28, "|" => 29, - ":" => 30, } + ":" => 30 } racc_nt_base = 31 diff --git a/lib/nokogiri/css/parser.y b/lib/nokogiri/css/parser.y index 9ed97c8559..98667ebf99 100644 --- a/lib/nokogiri/css/parser.y +++ b/lib/nokogiri/css/parser.y @@ -253,6 +253,7 @@ end ---- header +# :stopdoc: require_relative "parser_extras" ---- inner diff --git a/lib/nokogiri/css/parser_extras.rb b/lib/nokogiri/css/parser_extras.rb index d5ace766c9..ae01259358 100644 --- a/lib/nokogiri/css/parser_extras.rb +++ b/lib/nokogiri/css/parser_extras.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true +# :stopdoc: require "thread" module Nokogiri diff --git a/lib/nokogiri/css/tokenizer.rb b/lib/nokogiri/css/tokenizer.rb index 1aa9953b01..0c44fab8a8 100644 --- a/lib/nokogiri/css/tokenizer.rb +++ b/lib/nokogiri/css/tokenizer.rb @@ -5,9 +5,10 @@ # from lexical definition file "lib/nokogiri/css/tokenizer.rex". #++ +# :stopdoc: module Nokogiri module CSS -class Tokenizer # :nodoc: +class Tokenizer require 'strscan' class ScanError < StandardError ; end diff --git a/lib/nokogiri/css/tokenizer.rex b/lib/nokogiri/css/tokenizer.rex index a29c2e4255..84a5c146b4 100644 --- a/lib/nokogiri/css/tokenizer.rex +++ b/lib/nokogiri/css/tokenizer.rex @@ -1,6 +1,7 @@ +# :stopdoc: module Nokogiri module CSS -class Tokenizer # :nodoc: +class Tokenizer macro nl \n|\r\n|\r|\f diff --git a/rakelib/rdoc.rake b/rakelib/rdoc.rake index 62977e5a8b..f059771e33 100644 --- a/rakelib/rdoc.rake +++ b/rakelib/rdoc.rake @@ -5,8 +5,6 @@ RDoc::Task.new(rdoc: "rdoc", clobber_rdoc: "rdoc:clean", rerdoc: "rdoc:force") d rdoc.rdoc_dir = ENV["RDOC_DIR"] || "html" rdoc.rdoc_files .include("README.md", "lib/**/*.rb", "ext/**/*.c") - .exclude("lib/nokogiri/css/tokenizer.rb") - .exclude("lib/nokogiri/css/parser.rb") .exclude("ext/nokogiri/test_global_handlers.c") rdoc.options << "--embed-mixins" rdoc.options << "--main=README.md" diff --git a/test/css/test_xpath_visitor.rb b/test/css/test_xpath_visitor.rb index 74345902d2..321e8f7ad9 100644 --- a/test/css/test_xpath_visitor.rb +++ b/test/css/test_xpath_visitor.rb @@ -17,7 +17,7 @@ class TestNokogiri < Nokogiri::TestCase def assert_xpath(expecteds, asts) expecteds = [expecteds].flatten expecteds.zip(asts).each do |expected, actual| - assert_equal(expected, actual.to_xpath) + assert_equal(expected, actual.to_xpath("//", Nokogiri::CSS::XPathVisitor.new)) end end From 4b6bfa9f1b43d5183fec3125199bbc1f3d38dff4 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 12 Dec 2021 10:10:31 -0500 Subject: [PATCH 03/13] prefactor: allow injection of an XPathVisitor to CSS.xpath_for Note that the method signature of internal-only method CSS::Parser.xpath_for has changed to prefer required parameters rather than an options hash with assumed defaults. Also note that I've introduced some XPath constants for XPath query prefixes, which we should start using. --- CHANGELOG.md | 1 + lib/nokogiri/css.rb | 44 ++++++++++++++++++++++++++----- lib/nokogiri/css/parser_extras.rb | 19 ++++++------- lib/nokogiri/xml/xpath.rb | 11 ++++++++ test/css/test_css.rb | 37 ++++++++++++++++++++++++++ test/test_css_cache.rb | 19 ++++++------- 6 files changed, 104 insertions(+), 27 deletions(-) create mode 100644 test/css/test_css.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 010f926756..4eace95225 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ This release ends support for: ### Improved * `{XML,HTML4}::DocumentFragment` constructors all now take an optional parse options parameter or block (similar to Document constructors). [[#1692](https://github.com/sparklemotion/nokogiri/issues/1692)] (Thanks, [@JackMc](https://github.com/JackMc)!) +* `Nokogiri::CSS.xpath_for` allows an `XPathVisitor` to be injected, for finer-grained control over how CSS queries are translated into XPath. * [CRuby] `XML::Reader#encoding` will return the encoding detected by the parser when it's not passed to the constructor. [[#980](https://github.com/sparklemotion/nokogiri/issues/980)] * [CRuby] Handle abruptly-closed HTML comments as recommended by WHATWG. (Thanks to [tehryanx](https://hackerone.com/tehryanx?type=user) for reporting!) * [CRuby] `Node#line` is no longer capped at 65535. libxml v2.9.0 and later support a new parse option, exposed as `Nokogiri::XML::ParseOptions::PARSE_BIG_LINES`, which is turned on by default in `ParseOptions::DEFAULT_{XML,XSLT,HTML,SCHEMA}` (Note that JRuby already supported large line numbers.) [[#1764](https://github.com/sparklemotion/nokogiri/issues/1764), [#1493](https://github.com/sparklemotion/nokogiri/issues/1493), [#1617](https://github.com/sparklemotion/nokogiri/issues/1617), [#1505](https://github.com/sparklemotion/nokogiri/issues/1505), [#1003](https://github.com/sparklemotion/nokogiri/issues/1003), [#533](https://github.com/sparklemotion/nokogiri/issues/533)] diff --git a/lib/nokogiri/css.rb b/lib/nokogiri/css.rb index a78ea66094..fe4be39997 100644 --- a/lib/nokogiri/css.rb +++ b/lib/nokogiri/css.rb @@ -1,19 +1,49 @@ +# coding: utf-8 # frozen_string_literal: true module Nokogiri - # Methods to translate CSS queries into XPath queries + # Translate a CSS selector into an XPath 1.0 query module CSS class << self - ### - # Parse this CSS selector in +selector+. Returns an AST. - def parse(selector) + # TODO: Deprecate this method ahead of 2.0 and delete it in 2.0. + # It is not used by Nokogiri and shouldn't be part of the public API. + def parse(selector) # :nodoc: Parser.new.parse(selector) end - ### - # Get the XPath for +selector+. + # :call-seq: + # xpath_for(selector) → String + # xpath_for(selector [, prefix:] [, visitor:] [, ns:]) → String + # + # Translate a CSS selector to the equivalent XPath query. + # + # 💡 Note that translated queries are cached. + # + # [Parameters] + # - +selector+ (String) The CSS selector to be translated into XPath + # + # - +prefix:+ (String) + # + # The XPath prefix for the query, see Nokogiri::XML::XPath for some options. Default is + # +XML::XPath::GLOBAL_SEARCH_PREFIX+. + # + # - +visitor:+ (Nokogiri::CSS::XPathVisitor) + # + # The visitor class to use to transform the AST into XPath. Default is + # +Nokogiri::CSS::XPathVisitor.new+. + # + # - +ns:+ (Hash) + # + # The namespaces that are referenced in the query, if any. This is a hash where the keys are + # the namespace prefix and the values are the namespace URIs. Default is an empty Hash. + # + # [Returns] (String) The equivalent XPath query for +selector+ + # def xpath_for(selector, options = {}) - Parser.new(options[:ns] || {}).xpath_for(selector, options) + prefix = options.fetch(:prefix, Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX) + visitor = options.fetch(:visitor) { Nokogiri::CSS::XPathVisitor.new } + ns = options.fetch(:ns, {}) + Parser.new(ns).xpath_for(selector, prefix, visitor) end end end diff --git a/lib/nokogiri/css/parser_extras.rb b/lib/nokogiri/css/parser_extras.rb index ae01259358..a8a546a40a 100644 --- a/lib/nokogiri/css/parser_extras.rb +++ b/lib/nokogiri/css/parser_extras.rb @@ -72,17 +72,10 @@ def next_token end # Get the xpath for +string+ using +options+ - def xpath_for(string, options = {}) - key = "#{string}#{options[:ns]}#{options[:prefix]}" - v = self.class[key] - return v if v - - args = [ - options[:prefix] || "//", - options[:visitor] || XPathVisitor.new, - ] - self.class[key] = parse(string).map do |ast| - ast.to_xpath(*args) + def xpath_for(string, prefix, visitor) + key = cache_key(string, prefix) + self.class[key] ||= parse(string).map do |ast| + ast.to_xpath(prefix, visitor) end end @@ -91,6 +84,10 @@ def on_error(error_token_id, error_value, value_stack) after = value_stack.compact.last raise SyntaxError, "unexpected '#{error_value}' after '#{after}'" end + + def cache_key(query, prefix) + [query, prefix, @namespaces] + end end end end diff --git a/lib/nokogiri/xml/xpath.rb b/lib/nokogiri/xml/xpath.rb index 4c6f4dbf11..3b0e5bcd94 100644 --- a/lib/nokogiri/xml/xpath.rb +++ b/lib/nokogiri/xml/xpath.rb @@ -3,6 +3,17 @@ module Nokogiri module XML module XPath + # The XPath search prefix to search globally, +//+ + GLOBAL_SEARCH_PREFIX = "//" + + # The XPath search prefix to search direct descendants of the root element, +/+ + ROOT_SEARCH_PREFIX = "/" + + # The XPath search prefix to search direct descendants of the current element, +./+ + CURRENT_SEARCH_PREFIX = "./" + + # The XPath search prefix to search anywhere in the current element's subtree, +.//+ + SUBTREE_SEARCH_PREFIX = ".//" end end end diff --git a/test/css/test_css.rb b/test/css/test_css.rb new file mode 100644 index 0000000000..7e96cd3388 --- /dev/null +++ b/test/css/test_css.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +require "helper" + +class TestNokogiriCSS < Nokogiri::TestCase + describe ".xpath_for" do + it "accepts just a selector" do + assert_equal(["//foo"], Nokogiri::CSS.xpath_for("foo")) + end + + it "accepts a CSS::XPathVisitor" do + Nokogiri::CSS::Parser.without_cache do + mock_visitor = Minitest::Mock.new + mock_visitor.expect(:accept, "injected-value", [Nokogiri::CSS::Node]) + + result = Nokogiri::CSS.xpath_for("foo", visitor: mock_visitor) + + mock_visitor.verify + assert_equal(["//injected-value"], result) + end + end + + it "accepts an options hash" do + assert_equal( + ["./foo"], + Nokogiri::CSS.xpath_for("foo", { prefix: "./", visitor: Nokogiri::CSS::XPathVisitor.new }), + ) + end + + it "accepts keyword arguments" do + assert_equal( + ["./foo"], + Nokogiri::CSS.xpath_for("foo", prefix: "./", visitor: Nokogiri::CSS::XPathVisitor.new), + ) + end + end +end diff --git a/test/test_css_cache.rb b/test/test_css_cache.rb index fe4f15c7f7..d3e9353cd2 100644 --- a/test/test_css_cache.rb +++ b/test/test_css_cache.rb @@ -39,8 +39,8 @@ def teardown Nokogiri::CSS.xpath_for(@css) Nokogiri::CSS.xpath_for(@css) - Nokogiri::CSS::Parser.new.xpath_for(@css) - Nokogiri::CSS::Parser.new.xpath_for(@css) + Nokogiri::CSS::Parser.new.xpath_for(@css, "//", Nokogiri::CSS::XPathVisitor.new) + Nokogiri::CSS::Parser.new.xpath_for(@css, "//", Nokogiri::CSS::XPathVisitor.new) if cache_setting assert_equal(1, Nokogiri::CSS::Parser.class_eval { @cache.count }) @@ -66,11 +66,12 @@ def test_enabled_cache_is_used css = ".foo .bar .baz" cache = Nokogiri::CSS::Parser.instance_variable_get("@cache") - assert_nil(cache[css]) + assert_empty(cache) Nokogiri::CSS.xpath_for(css) - assert(cache[css]) + refute_empty(cache) + key = cache.keys.first - cache[css] = "this is an injected value" + cache[key] = "this is an injected value" assert_equal("this is an injected value", Nokogiri::CSS.xpath_for(css)) end @@ -81,9 +82,9 @@ def test_disabled_cache_is_not_used css = ".foo .bar .baz" cache = Nokogiri::CSS::Parser.instance_variable_get("@cache") - assert_nil(cache[css]) + assert_empty(cache) Nokogiri::CSS.xpath_for(css) - assert_nil(cache[css]) + assert_empty(cache) end def test_without_cache_avoids_cache @@ -93,11 +94,11 @@ def test_without_cache_avoids_cache css = ".foo .bar .baz" cache = Nokogiri::CSS::Parser.instance_variable_get("@cache") - assert_nil(cache[css]) + assert_empty(cache) Nokogiri::CSS::Parser.without_cache do Nokogiri::CSS.xpath_for(css) end - assert_nil(cache[css]) + assert_empty(cache) end def test_without_cache_resets_cache_value From b3ffffb3f64a11c27aab4684e5298f2ffc40daa2 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 12 Dec 2021 11:55:47 -0500 Subject: [PATCH 04/13] fix: CSS cache includes XPathVisitor configuration which prevents incorrect cache results being returned in cases where different visitor configurations are used. We also deprecate XPathVisitorAlwaysUseBuiltins and XPathVisitorOptimallyUseBuiltins in favor of a simple XPathVisitor with constructor arguments. --- CHANGELOG.md | 4 +- lib/nokogiri/css/parser_extras.rb | 10 +-- lib/nokogiri/css/xpath_visitor.rb | 46 ++++++++---- lib/nokogiri/xml/searchable.rb | 2 +- test/css/test_xpath_visitor.rb | 115 +++++++++++++++++++----------- test/test_css_cache.rb | 14 ++++ 6 files changed, 129 insertions(+), 62 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4eace95225..754268b805 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,7 +46,8 @@ This release ends support for: ### Fixed -* XML::Builder blocks restore context properly when exceptions are raised. [[#2372](https://github.com/sparklemotion/nokogiri/issues/2372)] (Thanks, [@ric2b](https://github.com/ric2b) and [@rinthedev](https://github.com/rinthedev)!) +* `XML::Builder` blocks restore context properly when exceptions are raised. [[#2372](https://github.com/sparklemotion/nokogiri/issues/2372)] (Thanks, [@ric2b](https://github.com/ric2b) and [@rinthedev](https://github.com/rinthedev)!) +* The `Nokogiri::CSS::Parser` cache now uses the `XPathVisitor` configuration as part of the cache key, preventing incorrect cache results from being returned when multiple `XPathVisitor` options are being used. * Error recovery from in-context parsing (e.g., `Node#parse`) now always uses the correct `DocumentFragment` class. Previously `Nokogiri::HTML4::DocumentFragment` was always used, even for XML documents. [[#1158](https://github.com/sparklemotion/nokogiri/issues/1158)] * `DocumentFragment#>` now works properly, matching a CSS selector against only the fragment roots. [[#1857](https://github.com/sparklemotion/nokogiri/issues/1857)] * `XML::DocumentFragment#errors` now correctly contains any parsing errors encountered. Previously this was always empty. (Note that `HTML::DocumentFragment#errors` already did this.) @@ -63,6 +64,7 @@ This release ends support for: * Passing a `Nokogiri::XML::Node` as the second parameter to `Node.new` is deprecated and will generate a warning. This will become an error in a future version of Nokogiri. [[#975](https://github.com/sparklemotion/nokogiri/issues/975)] * `Nokogiri::CSS::Parser`, `Nokogiri::CSS::Tokenizer`, and `Nokogiri::CSS::Node` are now internal-only APIs that are no longer documented, and should not be considered stable. With the introduction of `XPathVisitor` injection into `Nokogiri::CSS.xpath_for` there should be no reason to rely on these internal APIs. +* CSS-to-XPath utility classes `Nokogiri::CSS::XPathVisitorAlwaysUseBuiltins` and `XPathVisitorOptimallyUseBuiltins` are deprecated. Prefer `Nokogiri::CSS::XPathVisitor` with appropriate constructor arguments. These classes will be removed in a future version of Nokogiri. ## 1.12.5 / 2021-09-27 diff --git a/lib/nokogiri/css/parser_extras.rb b/lib/nokogiri/css/parser_extras.rb index a8a546a40a..7a93c11ebf 100644 --- a/lib/nokogiri/css/parser_extras.rb +++ b/lib/nokogiri/css/parser_extras.rb @@ -24,7 +24,7 @@ def set_cache(value) # rubocop:disable Naming/AccessorMethodName # Get the css selector in +string+ from the cache def [](string) - return unless cache_on? + return nil unless cache_on? @mutex.synchronize { @cache[string] } end @@ -73,7 +73,7 @@ def next_token # Get the xpath for +string+ using +options+ def xpath_for(string, prefix, visitor) - key = cache_key(string, prefix) + key = cache_key(string, prefix, visitor) self.class[key] ||= parse(string).map do |ast| ast.to_xpath(prefix, visitor) end @@ -85,8 +85,10 @@ def on_error(error_token_id, error_value, value_stack) raise SyntaxError, "unexpected '#{error_value}' after '#{after}'" end - def cache_key(query, prefix) - [query, prefix, @namespaces] + def cache_key(query, prefix, visitor) + if self.class.cache_on? + [query, prefix, @namespaces, visitor.config] + end end end end diff --git a/lib/nokogiri/css/xpath_visitor.rb b/lib/nokogiri/css/xpath_visitor.rb index 6f4cde83df..99e41e5154 100644 --- a/lib/nokogiri/css/xpath_visitor.rb +++ b/lib/nokogiri/css/xpath_visitor.rb @@ -3,6 +3,24 @@ module Nokogiri module CSS class XPathVisitor # :nodoc: + def initialize(builtins: :never) + @builtins = builtins + + if builtins == :always || (builtins == :optimal && Nokogiri.uses_libxml?) + class << self + alias_method :css_class, :css_class_builtin + end + else + class << self + alias_method :css_class, :css_class_standard + end + end + end + + def config + { builtins: @builtins } + end + def visit_function(node) msg = :"visit_function_#{node.value.first.gsub(/[(]/, "")}" return send(msg, node) if respond_to?(msg) @@ -240,23 +258,25 @@ def css_class_standard(hay, needle) def css_class_builtin(hay, needle) "nokogiri-builtin:css-class(#{hay},'#{needle}')" end - - alias_method :css_class, :css_class_standard end - class XPathVisitorAlwaysUseBuiltins < XPathVisitor # :nodoc: - private - - alias_method :css_class, :css_class_builtin + module XPathVisitorAlwaysUseBuiltins # :nodoc: + def self.new + warn( + "Nokogiri::CSS::XPathVisitorAlwaysUseBuiltins is deprecated and will be removed in a future version of Nokogiri", + { uplevel: 1 }, + ) + XPathVisitor.new(builtins: :always) + end end - class XPathVisitorOptimallyUseBuiltins < XPathVisitor # :nodoc: - private - - if Nokogiri.uses_libxml? - alias_method :css_class, :css_class_builtin - else - alias_method :css_class, :css_class_standard + module XPathVisitorOptimallyUseBuiltins # :nodoc: + def self.new + warn( + "Nokogiri::CSS::XPathVisitorOptimallyUseBuiltins is deprecated and will be removed in a future version of Nokogiri", + { uplevel: 1 }, + ) + XPathVisitor.new(builtins: :optimal) end end end diff --git a/lib/nokogiri/xml/searchable.rb b/lib/nokogiri/xml/searchable.rb index 9de99ca49a..332a2b2051 100644 --- a/lib/nokogiri/xml/searchable.rb +++ b/lib/nokogiri/xml/searchable.rb @@ -227,7 +227,7 @@ def css_rules_to_xpath(rules, ns) end def xpath_query_from_css_rule(rule, ns) - visitor = Nokogiri::CSS::XPathVisitorOptimallyUseBuiltins.new + visitor = Nokogiri::CSS::XPathVisitor.new(builtins: :optimal) self.class::IMPLIED_XPATH_CONTEXTS.map do |implied_xpath_context| CSS.xpath_for(rule.to_s, { prefix: implied_xpath_context, ns: ns, visitor: visitor, }) diff --git a/test/css/test_xpath_visitor.rb b/test/css/test_xpath_visitor.rb index 321e8f7ad9..d23edfa453 100644 --- a/test/css/test_xpath_visitor.rb +++ b/test/css/test_xpath_visitor.rb @@ -14,13 +14,19 @@ class TestNokogiri < Nokogiri::TestCase }) end + let(:visitor) { Nokogiri::CSS::XPathVisitor.new } + def assert_xpath(expecteds, asts) expecteds = [expecteds].flatten expecteds.zip(asts).each do |expected, actual| - assert_equal(expected, actual.to_xpath("//", Nokogiri::CSS::XPathVisitor.new)) + assert_equal(expected, actual.to_xpath("//", visitor)) end end + it "exposes its configuration" do + assert_equal({ builtins: :never }, visitor.config) + end + it "raises an exception on single quote" do assert_raises(Nokogiri::CSS::SyntaxError) { parser.parse("'") } end @@ -407,58 +413,63 @@ def visit_pseudo_class_aaron(node) # TODO: should we make this work? # assert_xpath ['//x/y', '//y/z'], parser.parse('x > y | y > z') end - end - describe Nokogiri::CSS::XPathVisitorAlwaysUseBuiltins do - let(:parser) { Nokogiri::CSS::Parser.new } + describe "builtins:always" do + let(:visitor) { Nokogiri::CSS::XPathVisitor.new(builtins: :always) } - def assert_xpath(expecteds, asts) - expecteds = [expecteds].flatten - expecteds.zip(asts).each do |expected, actual| - assert_equal(expected, actual.to_xpath("//", Nokogiri::CSS::XPathVisitorAlwaysUseBuiltins.new)) + it "exposes its configuration" do + assert_equal({ builtins: :always }, visitor.config) end - end - it ". class" do - assert_xpath("//*[nokogiri-builtin:css-class(@class,'awesome')]", - parser.parse(".awesome")) - assert_xpath("//foo[nokogiri-builtin:css-class(@class,'awesome')]", - parser.parse("foo.awesome")) - assert_xpath("//foo//*[nokogiri-builtin:css-class(@class,'awesome')]", - parser.parse("foo .awesome")) - assert_xpath("//foo//*[nokogiri-builtin:css-class(@class,'awe.some')]", - parser.parse("foo .awe\\.some")) - assert_xpath("//*[nokogiri-builtin:css-class(@class,'a') and nokogiri-builtin:css-class(@class,'b')]", - parser.parse(".a.b")) - end + it ". class" do + assert_xpath("//*[nokogiri-builtin:css-class(@class,'awesome')]", + parser.parse(".awesome")) + assert_xpath("//foo[nokogiri-builtin:css-class(@class,'awesome')]", + parser.parse("foo.awesome")) + assert_xpath("//foo//*[nokogiri-builtin:css-class(@class,'awesome')]", + parser.parse("foo .awesome")) + assert_xpath("//foo//*[nokogiri-builtin:css-class(@class,'awe.some')]", + parser.parse("foo .awe\\.some")) + assert_xpath("//*[nokogiri-builtin:css-class(@class,'a') and nokogiri-builtin:css-class(@class,'b')]", + parser.parse(".a.b")) + end - it "~=" do - assert_xpath("//a[nokogiri-builtin:css-class(@class,'bar')]", - parser.parse("a[@class~='bar']")) - assert_xpath("//a[nokogiri-builtin:css-class(@class,'bar')]", - parser.parse("a[@class ~= 'bar']")) - assert_xpath("//a[nokogiri-builtin:css-class(@class,'bar')]", - parser.parse("a[@class~=bar]")) - assert_xpath("//a[nokogiri-builtin:css-class(@class,'bar')]", - parser.parse("a[@class~=\"bar\"]")) - assert_xpath("//a[nokogiri-builtin:css-class(@data-words,'bar')]", - parser.parse("a[data-words~=\"bar\"]")) - assert_xpath("//a[nokogiri-builtin:css-class(@data-words,'bar')]", - parser.parse("a[@data-words~=\"bar\"]")) + it "~=" do + assert_xpath("//a[nokogiri-builtin:css-class(@class,'bar')]", + parser.parse("a[@class~='bar']")) + assert_xpath("//a[nokogiri-builtin:css-class(@class,'bar')]", + parser.parse("a[@class ~= 'bar']")) + assert_xpath("//a[nokogiri-builtin:css-class(@class,'bar')]", + parser.parse("a[@class~=bar]")) + assert_xpath("//a[nokogiri-builtin:css-class(@class,'bar')]", + parser.parse("a[@class~=\"bar\"]")) + assert_xpath("//a[nokogiri-builtin:css-class(@data-words,'bar')]", + parser.parse("a[data-words~=\"bar\"]")) + assert_xpath("//a[nokogiri-builtin:css-class(@data-words,'bar')]", + parser.parse("a[@data-words~=\"bar\"]")) + end + + describe "XPathVisitorAlwaysUseBuiltins" do + let(:visitor) { Nokogiri::CSS::XPathVisitorAlwaysUseBuiltins.new } + + it "supports deprecated class" do + assert_output("", /XPathVisitorAlwaysUseBuiltins is deprecated/) { visitor } + assert_instance_of(Nokogiri::CSS::XPathVisitor, visitor) + assert_equal({ builtins: :always }, visitor.config) + + assert_xpath("//*[nokogiri-builtin:css-class(@class,'awesome')]", + parser.parse(".awesome")) + end + end end - end - describe Nokogiri::CSS::XPathVisitorOptimallyUseBuiltins do - let(:parser) { Nokogiri::CSS::Parser.new } + describe "builtins:optimal" do + let(:visitor) { Nokogiri::CSS::XPathVisitor.new(builtins: :optimal) } - def assert_xpath(expecteds, asts) - expecteds = [expecteds].flatten - expecteds.zip(asts).each do |expected, actual| - assert_equal(expected, actual.to_xpath("//", Nokogiri::CSS::XPathVisitorOptimallyUseBuiltins.new)) + it "exposes its configuration" do + assert_equal({ builtins: :optimal }, visitor.config) end - end - describe "css class lookup" do # # only use the builtin css-class method if we're in libxml2, because incredibly the native # impl is slower in JRuby: @@ -518,6 +529,24 @@ def assert_xpath(expecteds, asts) parser.parse("a[@class~='bar']")) end end + + describe "XPathVisitorOptimallyUseBuiltins" do + let(:visitor) { Nokogiri::CSS::XPathVisitorOptimallyUseBuiltins.new } + + it "supports deprecated class" do + assert_output("", /XPathVisitorOptimallyUseBuiltins is deprecated/) { visitor } + assert_instance_of(Nokogiri::CSS::XPathVisitor, visitor) + assert_equal({ builtins: :optimal }, visitor.config) + + if Nokogiri.uses_libxml? + assert_xpath("//*[nokogiri-builtin:css-class(@class,'awesome')]", + parser.parse(".awesome")) + else + assert_xpath("//*[contains(concat(' ',normalize-space(@class),' '),' awesome ')]", + parser.parse(".awesome")) + end + end + end end end end diff --git a/test/test_css_cache.rb b/test/test_css_cache.rb index d3e9353cd2..9020226393 100644 --- a/test/test_css_cache.rb +++ b/test/test_css_cache.rb @@ -121,6 +121,20 @@ def test_without_cache_resets_cache_value_even_after_exception assert(Nokogiri::CSS::Parser.cache_on?) end + def test_cache_key_on_ns_prefix_and_visitor_config + Nokogiri::CSS::Parser.clear_cache + Nokogiri::CSS::Parser.set_cache(true) + + cache = Nokogiri::CSS::Parser.instance_variable_get("@cache") + assert_empty(cache) + + Nokogiri::CSS.xpath_for("foo") + Nokogiri::CSS.xpath_for("foo", prefix: ".//") + Nokogiri::CSS.xpath_for("foo", prefix: ".//", ns: { "example" => "http://example.com/" }) + Nokogiri::CSS.xpath_for("foo", prefix: ".//", ns: { "example" => "http://example.com/" }, visitor: Nokogiri::CSS::XPathVisitor.new(builtins: :always)) + assert_equal(4, cache.length) + end + def test_race_condition # https://github.com/sparklemotion/nokogiri/issues/1935 threads = [] From e005149d2fa29bec4b5fb71f6fe50b50bb67acec Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 13 Dec 2021 08:49:40 -0500 Subject: [PATCH 05/13] prefactor: CSS parser distinguishes attr names from element names which will allow us to more easily special-case element names in an upcoming commit. --- lib/nokogiri/css/parser.rb | 4 ++-- lib/nokogiri/css/parser.y | 4 ++-- lib/nokogiri/css/xpath_visitor.rb | 4 ++++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/nokogiri/css/parser.rb b/lib/nokogiri/css/parser.rb index 1749a8eee0..c6cabab39b 100644 --- a/lib/nokogiri/css/parser.rb +++ b/lib/nokogiri/css/parser.rb @@ -486,7 +486,7 @@ def _reduce_27(val, _values, result) end def _reduce_28(val, _values, result) - result = Node.new(:ELEMENT_NAME, + result = Node.new(:ATTRIB_NAME, [[val.first, val.last].compact.join(':')] ) @@ -496,7 +496,7 @@ def _reduce_28(val, _values, result) def _reduce_29(val, _values, result) # Default namespace is not applied to attributes. # So we don't add prefix "xmlns:" as in namespaced_ident. - result = Node.new(:ELEMENT_NAME, [val.first]) + result = Node.new(:ATTRIB_NAME, [val.first]) result end diff --git a/lib/nokogiri/css/parser.y b/lib/nokogiri/css/parser.y index 98667ebf99..c47f8e6d69 100644 --- a/lib/nokogiri/css/parser.y +++ b/lib/nokogiri/css/parser.y @@ -96,14 +96,14 @@ rule ; attrib_name : namespace '|' IDENT { - result = Node.new(:ELEMENT_NAME, + result = Node.new(:ATTRIB_NAME, [[val.first, val.last].compact.join(':')] ) } | IDENT { # Default namespace is not applied to attributes. # So we don't add prefix "xmlns:" as in namespaced_ident. - result = Node.new(:ELEMENT_NAME, [val.first]) + result = Node.new(:ATTRIB_NAME, [val.first]) } ; function diff --git a/lib/nokogiri/css/xpath_visitor.rb b/lib/nokogiri/css/xpath_visitor.rb index 99e41e5154..3cd33c1a2d 100644 --- a/lib/nokogiri/css/xpath_visitor.rb +++ b/lib/nokogiri/css/xpath_visitor.rb @@ -197,6 +197,10 @@ def visit_element_name(node) node.value.first end + def visit_attrib_name(node) + node.value.first + end + def accept(node) node.accept(self) end From f935e5ad041ecf64feae0447fc222ee6e4a48717 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Tue, 14 Dec 2021 18:24:25 -0500 Subject: [PATCH 06/13] introduce enum CSS::XPathVisitor::BuiltinsConfig and refactor the XPathVisitor#css_class to not rely on aliasing during object initialization, which feels funny to me --- lib/nokogiri/css/xpath_visitor.rb | 36 +++++++++++++++---------------- test/css/test_xpath_visitor.rb | 11 ++++++++-- test/test_css_cache.rb | 4 ++-- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/lib/nokogiri/css/xpath_visitor.rb b/lib/nokogiri/css/xpath_visitor.rb index 3cd33c1a2d..5a50e08c53 100644 --- a/lib/nokogiri/css/xpath_visitor.rb +++ b/lib/nokogiri/css/xpath_visitor.rb @@ -3,18 +3,18 @@ module Nokogiri module CSS class XPathVisitor # :nodoc: - def initialize(builtins: :never) - @builtins = builtins + module BuiltinsConfig + NEVER = :never + ALWAYS = :always + OPTIMAL = :optimal + VALUES = [NEVER, ALWAYS, OPTIMAL] + end - if builtins == :always || (builtins == :optimal && Nokogiri.uses_libxml?) - class << self - alias_method :css_class, :css_class_builtin - end - else - class << self - alias_method :css_class, :css_class_standard - end + def initialize(builtins: BuiltinsConfig::NEVER) + unless BuiltinsConfig::VALUES.include?(builtins) + raise(ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter") end + @builtins = builtins end def config @@ -253,14 +253,14 @@ def is_of_type_pseudo_class?(node) # rubocop:disable Naming/PredicateName end end - # use only ordinary xpath functions - def css_class_standard(hay, needle) - "contains(concat(' ',normalize-space(#{hay}),' '),' #{needle} ')" - end - - # use the builtin implementation - def css_class_builtin(hay, needle) - "nokogiri-builtin:css-class(#{hay},'#{needle}')" + def css_class(hay, needle) + if @builtins == BuiltinsConfig::ALWAYS || (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?) + # use the builtin implementation + "nokogiri-builtin:css-class(#{hay},'#{needle}')" + else + # use only ordinary xpath functions + "contains(concat(' ',normalize-space(#{hay}),' '),' #{needle} ')" + end end end diff --git a/test/css/test_xpath_visitor.rb b/test/css/test_xpath_visitor.rb index d23edfa453..e775b7357a 100644 --- a/test/css/test_xpath_visitor.rb +++ b/test/css/test_xpath_visitor.rb @@ -23,6 +23,13 @@ def assert_xpath(expecteds, asts) end end + it "accepts some config parameters" do + refute_nil(Nokogiri::CSS::XPathVisitor.new(builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::NEVER)) + refute_nil(Nokogiri::CSS::XPathVisitor.new(builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS)) + refute_nil(Nokogiri::CSS::XPathVisitor.new(builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::OPTIMAL)) + assert_raises(ArgumentError) { Nokogiri::CSS::XPathVisitor.new(builtins: :not_valid) } + end + it "exposes its configuration" do assert_equal({ builtins: :never }, visitor.config) end @@ -415,7 +422,7 @@ def visit_pseudo_class_aaron(node) end describe "builtins:always" do - let(:visitor) { Nokogiri::CSS::XPathVisitor.new(builtins: :always) } + let(:visitor) { Nokogiri::CSS::XPathVisitor.new(builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS) } it "exposes its configuration" do assert_equal({ builtins: :always }, visitor.config) @@ -464,7 +471,7 @@ def visit_pseudo_class_aaron(node) end describe "builtins:optimal" do - let(:visitor) { Nokogiri::CSS::XPathVisitor.new(builtins: :optimal) } + let(:visitor) { Nokogiri::CSS::XPathVisitor.new(builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::OPTIMAL) } it "exposes its configuration" do assert_equal({ builtins: :optimal }, visitor.config) diff --git a/test/test_css_cache.rb b/test/test_css_cache.rb index 9020226393..1d627d271c 100644 --- a/test/test_css_cache.rb +++ b/test/test_css_cache.rb @@ -131,8 +131,8 @@ def test_cache_key_on_ns_prefix_and_visitor_config Nokogiri::CSS.xpath_for("foo") Nokogiri::CSS.xpath_for("foo", prefix: ".//") Nokogiri::CSS.xpath_for("foo", prefix: ".//", ns: { "example" => "http://example.com/" }) - Nokogiri::CSS.xpath_for("foo", prefix: ".//", ns: { "example" => "http://example.com/" }, visitor: Nokogiri::CSS::XPathVisitor.new(builtins: :always)) - assert_equal(4, cache.length) + Nokogiri::CSS.xpath_for("foo", prefix: ".//", ns: { "example" => "http://example.com/" }, + visitor: Nokogiri::CSS::XPathVisitor.new(builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS)) end def test_race_condition From 32f9ab04b34a7f670c757fd1a2ccea64fe438e94 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 13 Dec 2021 17:48:51 -0500 Subject: [PATCH 07/13] test: CSS integration tests run on XML, HTML4, and HTML5 documents we're about to introduce variations in how CSS selectors are translated into XPath and I want to ensure we have adequate test coverage. --- test/css/test_css_integration.rb | 482 ++++++++++++++++--------------- 1 file changed, 244 insertions(+), 238 deletions(-) diff --git a/test/css/test_css_integration.rb b/test/css/test_css_integration.rb index 56b192eeb2..c7f2f26e31 100644 --- a/test/css/test_css_integration.rb +++ b/test/css/test_css_integration.rb @@ -2,233 +2,245 @@ require "helper" -module Nokogiri - module CSS - class TestCssIntegration < Nokogiri::TestCase - def setup - super - doc = <<~EOF - - - - - - - - - - - - - - - - -
row1
row2
row3
row4
row5
row6
row7
row8
row9
row10
row11
row12
row13
row14
-
- bold1 - italic1 - bold2 - emphasis1 - italic2 -

para1

- bold3 -
-
- italic3 - emphasis2 - italic4 - emphasis3 - italic5 - italic6 - italic7 -
-
-

para2

-

para3

-
-
-

para4

-
- -
-

-

header1

-

-
-
-

header2

-

header3

-
-
-

header4

-
- -

-

- - EOF - @parser = Nokogiri.HTML(doc) - - @nested = Nokogiri.HTML(<<~EOF) - -
- bold -

para

-
- -
- bold - ... - ... -

para

-
+class TestNokogiriCssIntegration < Nokogiri::TestCase + let(:subject) do + subject_class.parse(<<~HTML) + + + + + + + + + + + + + + + + +
row1
row2
row3
row4
row5
row6
row7
row8
row9
row10
row11
row12
row13
row14
+
+ bold1 + italic1 + bold2 + emphasis1 + italic2 +

para1

+ bold3 +
+
+ italic3 + emphasis2 + italic4 + emphasis3 + italic5 + italic6 + italic7 +
+
+

para2

+

para3

+
+
+

para4

+
+ +
+

+

header1

+

+
+
+

header2

+

header3

+
+
+

header4

+
+ +

+

+ + HTML + end -
-
- bold -

para

-
+ let(:nested) do + subject_class.parse(<<~HTML) + +
+ bold +

para

+
+ +
+ bold + ... + ... +

para

+
+ +
+
+ bold +

para

+
+ +
+ bold + ... + ... +

para

+
+
+ +
+ bold +
+ +
+

para

+
+ HTML + end -
- bold - ... - ... -

para

-
-
+ def assert_result_rows(intarray, result, word = "row") + assert_equal(intarray.size, result.size, + "unexpected number of rows returned: '#{result.inner_text}'") + assert_equal(intarray.map { |j| "#{word}#{j}" }.join(" "), result.inner_text.strip, + result.inner_text) + end -
- bold -
+ doctypes = [Nokogiri::XML::Document, Nokogiri::HTML4::Document] + doctypes << Nokogiri::HTML5::Document if defined?(Nokogiri::HTML5::Document) -
-

para

-
- EOF - end + doctypes.each do |doctype| + describe doctype do + let(:subject_class) { doctype } - def test_even - assert_result_rows([2, 4, 6, 8, 10, 12, 14], @parser.search("table//tr:nth(even)")) + it "selects even" do + assert_result_rows([2, 4, 6, 8, 10, 12, 14], subject.search("table//tr:nth(even)")) end - def test_odd - assert_result_rows([1, 3, 5, 7, 9, 11, 13], @parser.search("table//tr:nth(odd)")) + it "selects odd" do + assert_result_rows([1, 3, 5, 7, 9, 11, 13], subject.search("table//tr:nth(odd)")) end - def test_n - assert_result_rows((1..14).to_a, @parser.search("table//tr:nth(n)")) + it "selects n" do + assert_result_rows((1..14).to_a, subject.search("table//tr:nth(n)")) end - def test_2n - assert_equal(@parser.search("table//tr:nth(even)").inner_text, @parser.search("table//tr:nth(2n)").inner_text) + it "selects 2n" do + assert_equal(subject.search("table//tr:nth(even)").inner_text, subject.search("table//tr:nth(2n)").inner_text) end - def test_2np1 - assert_equal(@parser.search("table//tr:nth(odd)").inner_text, @parser.search("table//tr:nth(2n+1)").inner_text) + it "selects 2np1" do + assert_equal(subject.search("table//tr:nth(odd)").inner_text, subject.search("table//tr:nth(2n+1)").inner_text) end - def test_4np3 - assert_result_rows([3, 7, 11], @parser.search("table//tr:nth(4n+3)")) + it "selects 4np3" do + assert_result_rows([3, 7, 11], subject.search("table//tr:nth(4n+3)")) end - def test_3np4 - assert_result_rows([4, 7, 10, 13], @parser.search("table//tr:nth(3n+4)")) + it "selects 3np4" do + assert_result_rows([4, 7, 10, 13], subject.search("table//tr:nth(3n+4)")) end - def test_mnp3 - assert_result_rows([1, 2, 3], @parser.search("table//tr:nth(-n+3)")) + it "selects mnp3" do + assert_result_rows([1, 2, 3], subject.search("table//tr:nth(-n+3)")) end - def test_4nm1 - assert_result_rows([3, 7, 11], @parser.search("table//tr:nth(4n-1)")) + it "selects 4nm1" do + assert_result_rows([3, 7, 11], subject.search("table//tr:nth(4n-1)")) end - def test_np3 - assert_result_rows([3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], @parser.search("table//tr:nth(n+3)")) + it "selects np3" do + assert_result_rows([3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], subject.search("table//tr:nth(n+3)")) end - def test_first - assert_result_rows([1], @parser.search("table//tr:first")) - assert_result_rows([1], @parser.search("table//tr:first()")) + it "selects first" do + assert_result_rows([1], subject.search("table//tr:first")) + assert_result_rows([1], subject.search("table//tr:first()")) end - def test_last - assert_result_rows([14], @parser.search("table//tr:last")) - assert_result_rows([14], @parser.search("table//tr:last()")) + it "selects last" do + assert_result_rows([14], subject.search("table//tr:last")) + assert_result_rows([14], subject.search("table//tr:last()")) end - def test_first_child - assert_result_rows([1], @parser.search("div/b:first-child"), "bold") - assert_result_rows([1], @parser.search("table//tr:first-child")) - assert_result_rows([2, 4], @parser.search("div/h1.c:first-child"), "header") + it "selects first_child" do + assert_result_rows([1], subject.search("div/b:first-child"), "bold") + assert_result_rows([1], subject.search("table//tr:first-child")) + assert_result_rows([2, 4], subject.search("div/h1.c:first-child"), "header") end - def test_last_child - assert_result_rows([3], @parser.search("div/b:last-child"), "bold") - assert_result_rows([14], @parser.search("table//tr:last-child")) - assert_result_rows([3, 4], @parser.search("div/h1.c:last-child"), "header") + it "selects last_child" do + assert_result_rows([3], subject.search("div/b:last-child"), "bold") + assert_result_rows([14], subject.search("table//tr:last-child")) + assert_result_rows([3, 4], subject.search("div/h1.c:last-child"), "header") end - def test_nth_child - assert_result_rows([2], @parser.search("div/b:nth-child(3)"), "bold") - assert_result_rows([5], @parser.search("table//tr:nth-child(5)")) - assert_result_rows([1, 3], @parser.search("div/h1.c:nth-child(2)"), "header") - assert_result_rows([3, 4], @parser.search("div/i.b:nth-child(2n+1)"), "italic") + it "selects nth_child" do + assert_result_rows([2], subject.search("div/b:nth-child(3)"), "bold") + assert_result_rows([5], subject.search("table//tr:nth-child(5)")) + assert_result_rows([1, 3], subject.search("div/h1.c:nth-child(2)"), "header") + assert_result_rows([3, 4], subject.search("div/i.b:nth-child(2n+1)"), "italic") end - def test_first_of_type - assert_result_rows([1], @parser.search("table//tr:first-of-type")) - assert_result_rows([1], @parser.search("div/b:first-of-type"), "bold") - assert_result_rows([2], @parser.search("div/b.a:first-of-type"), "bold") - assert_result_rows([3], @parser.search("div/i.b:first-of-type"), "italic") + it "selects first_of_type" do + assert_result_rows([1], subject.search("table//tr:first-of-type")) + assert_result_rows([1], subject.search("div/b:first-of-type"), "bold") + assert_result_rows([2], subject.search("div/b.a:first-of-type"), "bold") + assert_result_rows([3], subject.search("div/i.b:first-of-type"), "italic") end - def test_last_of_type - assert_result_rows([14], @parser.search("table//tr:last-of-type")) - assert_result_rows([3], @parser.search("div/b:last-of-type"), "bold") - assert_result_rows([2, 7], @parser.search("div/i:last-of-type"), "italic") - assert_result_rows([2, 6, 7], @parser.search("div i:last-of-type"), "italic") - assert_result_rows([4], @parser.search("div/i.b:last-of-type"), "italic") + it "selects last_of_type" do + assert_result_rows([14], subject.search("table//tr:last-of-type")) + assert_result_rows([3], subject.search("div/b:last-of-type"), "bold") + assert_result_rows([2, 7], subject.search("div/i:last-of-type"), "italic") + assert_result_rows([2, 6, 7], subject.search("div i:last-of-type"), "italic") + assert_result_rows([4], subject.search("div/i.b:last-of-type"), "italic") end - def test_nth_of_type - assert_result_rows([1], @parser.search("div/b:nth-of-type(1)"), "bold") - assert_result_rows([2], @parser.search("div/b:nth-of-type(2)"), "bold") - assert_result_rows([2], @parser.search("div/.a:nth-of-type(1)"), "bold") - assert_result_rows([2, 4, 7], @parser.search("div i:nth-of-type(2n)"), "italic") - assert_result_rows([1, 3, 5, 6], @parser.search("div i:nth-of-type(2n+1)"), "italic") - assert_result_rows([1], @parser.search("div .a:nth-of-type(2n)"), "emphasis") - assert_result_rows([2, 3], @parser.search("div .a:nth-of-type(2n+1)"), "bold") + it "selects nth_of_type" do + assert_result_rows([1], subject.search("div/b:nth-of-type(1)"), "bold") + assert_result_rows([2], subject.search("div/b:nth-of-type(2)"), "bold") + assert_result_rows([2], subject.search("div/.a:nth-of-type(1)"), "bold") + assert_result_rows([2, 4, 7], subject.search("div i:nth-of-type(2n)"), "italic") + assert_result_rows([1, 3, 5, 6], subject.search("div i:nth-of-type(2n+1)"), "italic") + assert_result_rows([1], subject.search("div .a:nth-of-type(2n)"), "emphasis") + assert_result_rows([2, 3], subject.search("div .a:nth-of-type(2n+1)"), "bold") end - def test_nth_last_of_type - assert_result_rows([14], @parser.search("table//tr:nth-last-of-type(1)")) - assert_result_rows([12], @parser.search("table//tr:nth-last-of-type(3)")) - assert_result_rows([2, 6, 7], @parser.search("div i:nth-last-of-type(1)"), "italic") - assert_result_rows([1, 5], @parser.search("div i:nth-last-of-type(2)"), "italic") - assert_result_rows([4], @parser.search("div/i.b:nth-last-of-type(1)"), "italic") - assert_result_rows([3], @parser.search("div/i.b:nth-last-of-type(2)"), "italic") + it "selects nth_last_of_type" do + assert_result_rows([14], subject.search("table//tr:nth-last-of-type(1)")) + assert_result_rows([12], subject.search("table//tr:nth-last-of-type(3)")) + assert_result_rows([2, 6, 7], subject.search("div i:nth-last-of-type(1)"), "italic") + assert_result_rows([1, 5], subject.search("div i:nth-last-of-type(2)"), "italic") + assert_result_rows([4], subject.search("div/i.b:nth-last-of-type(1)"), "italic") + assert_result_rows([3], subject.search("div/i.b:nth-last-of-type(2)"), "italic") end - def test_only_of_type - assert_result_rows([1, 4], @parser.search("div/p:only-of-type"), "para") - assert_result_rows([5], @parser.search("div/i.c:only-of-type"), "italic") + it "selects only_of_type" do + assert_result_rows([1, 4], subject.search("div/p:only-of-type"), "para") + assert_result_rows([5], subject.search("div/i.c:only-of-type"), "italic") end - def test_only_child - assert_result_rows([4], @parser.search("div/p:only-child"), "para") - assert_result_rows([4], @parser.search("div/h1.c:only-child"), "header") + it "selects only_child" do + assert_result_rows([4], subject.search("div/p:only-child"), "para") + assert_result_rows([4], subject.search("div/h1.c:only-child"), "header") end - def test_empty - result = @parser.search("p:empty") + it "selects empty" do + result = subject.search("p:empty") assert_equal(1, result.size, "unexpected number of rows returned: '#{result.inner_text}'") assert_equal("empty", result.first["class"]) end - def test_parent - result = @parser.search("p:parent") + it "selects parent" do + result = subject.search("p:parent") assert_equal(5, result.size) 0.upto(3) do |j| assert_equal("para#{j + 1} ", result[j].inner_text) @@ -236,125 +248,119 @@ def test_parent assert_equal("not-empty", result[4]["class"]) end - def test_siblings - doc = <<~EOF + it "selects siblings" do + html = <<~HTML

p1

p2

p3

p4

p5

- EOF - parser = Nokogiri.HTML(doc) - assert_equal(2, parser.search("#3 ~ p").size) - assert_equal("p4 p5 ", parser.search("#3 ~ p").inner_text) - assert_equal(0, parser.search("#5 ~ p").size) + HTML + doc = subject_class.parse(html) + assert_equal(2, doc.search("#3 ~ p").size) + assert_equal("p4 p5 ", doc.search("#3 ~ p").inner_text) + assert_equal(0, doc.search("#5 ~ p").size) - assert_equal(1, parser.search("#3 + p").size) - assert_equal("p4 ", parser.search("#3 + p").inner_text) - assert_equal(0, parser.search("#5 + p").size) + assert_equal(1, doc.search("#3 + p").size) + assert_equal("p4 ", doc.search("#3 + p").inner_text) + assert_equal(0, doc.search("#5 + p").size) end - def test_has_a - result = @nested.css("div:has(b)") + it "selects has_a" do + result = nested.css("div:has(b)") expected = [ - @nested.at_css(".unnested.direct"), - @nested.at_css(".unnested.indirect"), - @nested.at_css(".nested-parent"), - @nested.at_css(".nested-child.direct"), - @nested.at_css(".nested-child.indirect"), - @nested.at_css(".has-bold"), + nested.at_css(".unnested.direct"), + nested.at_css(".unnested.indirect"), + nested.at_css(".nested-parent"), + nested.at_css(".nested-child.direct"), + nested.at_css(".nested-child.indirect"), + nested.at_css(".has-bold"), ] assert_equal(expected, result.to_a) end - def test_has_a_gt_b - result = @nested.css("body *:has(div > b)") + it "selects has_a_gt_b" do + result = nested.css("body *:has(div > b)") expected = [ - @nested.at_css(".nested-parent"), + nested.at_css(".nested-parent"), ] assert_equal(expected, result.to_a) end - def test_has_gt_b - result = @nested.css("body *:has(> b)") + it "selects has_gt_b" do + result = nested.css("body *:has(> b)") expected = [ - @nested.at_css(".unnested.direct"), - @nested.at_css(".unnested.indirect"), - @nested.at_css(".nested-child.direct"), - @nested.at_css(".nested-child.indirect"), - @nested.at_css(".has-bold"), + nested.at_css(".unnested.direct"), + nested.at_css(".unnested.indirect"), + nested.at_css(".nested-child.direct"), + nested.at_css(".nested-child.indirect"), + nested.at_css(".has-bold"), ] assert_equal(expected, result.to_a) end - def test_has_a_plus_b - result = @nested.css("div:has(b + p)") + it "selects has_a_plus_b" do + result = nested.css("div:has(b + p)") expected = [ - @nested.at_css(".unnested.direct"), - @nested.at_css(".nested-parent"), - @nested.at_css(".nested-child.direct"), + nested.at_css(".unnested.direct"), + nested.at_css(".nested-parent"), + nested.at_css(".nested-child.direct"), ] assert_equal(expected, result.to_a) end - def test_has_plus_b - result = @nested.css("b:has(+ p)") + it "selects has_plus_b" do + result = nested.css("b:has(+ p)") expected = [ - @nested.at_css(".unnested.direct b"), - @nested.at_css(".nested-child.direct b"), + nested.at_css(".unnested.direct b"), + nested.at_css(".nested-child.direct b"), ] assert_equal(expected, result.to_a) end - def test_has_a_tilde_b - result = @nested.css("div:has(b ~ p)") + it "selects has_a_tilde_b" do + result = nested.css("div:has(b ~ p)") expected = [ - @nested.at_css(".unnested.direct"), - @nested.at_css(".unnested.indirect"), - @nested.at_css(".nested-parent"), - @nested.at_css(".nested-child.direct"), - @nested.at_css(".nested-child.indirect"), + nested.at_css(".unnested.direct"), + nested.at_css(".unnested.indirect"), + nested.at_css(".nested-parent"), + nested.at_css(".nested-child.direct"), + nested.at_css(".nested-child.indirect"), ] assert_equal(expected, result.to_a) end - def test_has_tilde_b - result = @nested.css("b:has(~ p)") + it "selects has_tilde_b" do + result = nested.css("b:has(~ p)") expected = [ - @nested.at_css(".unnested.direct b"), - @nested.at_css(".unnested.indirect b"), - @nested.at_css(".nested-child.direct b"), - @nested.at_css(".nested-child.indirect b"), + nested.at_css(".unnested.direct b"), + nested.at_css(".unnested.indirect b"), + nested.at_css(".nested-child.direct b"), + nested.at_css(".nested-child.indirect b"), ].flatten assert_equal(expected, result.to_a) end - def test_class_attr_selector - doc = Nokogiri::HTML::Document.parse(<<~EOF) + it "selects class_attr_selector" do + doc = subject_class.parse(<<~HTML)
space-delimited
tab-delimited
newline-delimited
carriage-return-delimited
- EOF + HTML result = doc.css("div[class~='asdf']") assert_equal(4, result.length) + result = doc.css("div[@class~='asdf']") + assert_equal(4, result.length) + expected = doc.css("div") assert_equal(expected, result) end - - private - - def assert_result_rows(intarray, result, word = "row") - assert_equal(intarray.size, result.size, - "unexpected number of rows returned: '#{result.inner_text}'") - assert_equal(intarray.map { |j| "#{word}#{j}" }.join(" "), result.inner_text.strip, - result.inner_text) - end end end end From 85ac956d8d4bc8756b7be8f47bc61c27abc90eaf Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Tue, 14 Dec 2021 18:29:15 -0500 Subject: [PATCH 08/13] fix: CSS searches in HTML5 documents should not require namespaces In HTML5, foreign elements have namespaces; but those namespaces should not be considered for the purposes of CSS searches. Unfortunately, this implementation's use of local-name() is ~10x slower than using the normal inline element name matching, which subsequent commits will explore. --- CHANGELOG.md | 1 + lib/nokogiri/css/xpath_visitor.rb | 24 ++++++- lib/nokogiri/html4/document.rb | 11 ++++ lib/nokogiri/html5/document.rb | 10 +++ lib/nokogiri/xml/document.rb | 10 +++ lib/nokogiri/xml/searchable.rb | 5 +- test/css/test_xpath_visitor.rb | 27 ++++++-- test/test_css_cache.rb | 4 ++ test/xml/test_xpath.rb | 105 ++++++++++++++++++++++++++++++ 9 files changed, 188 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 754268b805..e4d3b86481 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,6 +46,7 @@ This release ends support for: ### Fixed +* CSS queries on HTML5 documents now correctly match foreign elements (SVG, MathML) when namespaces are not specified in the query. [[#2376](https://github.com/sparklemotion/nokogiri/issues/2376)] * `XML::Builder` blocks restore context properly when exceptions are raised. [[#2372](https://github.com/sparklemotion/nokogiri/issues/2372)] (Thanks, [@ric2b](https://github.com/ric2b) and [@rinthedev](https://github.com/rinthedev)!) * The `Nokogiri::CSS::Parser` cache now uses the `XPathVisitor` configuration as part of the cache key, preventing incorrect cache results from being returned when multiple `XPathVisitor` options are being used. * Error recovery from in-context parsing (e.g., `Node#parse`) now always uses the correct `DocumentFragment` class. Previously `Nokogiri::HTML4::DocumentFragment` was always used, even for XML documents. [[#1158](https://github.com/sparklemotion/nokogiri/issues/1158)] diff --git a/lib/nokogiri/css/xpath_visitor.rb b/lib/nokogiri/css/xpath_visitor.rb index 5a50e08c53..9bdc241593 100644 --- a/lib/nokogiri/css/xpath_visitor.rb +++ b/lib/nokogiri/css/xpath_visitor.rb @@ -10,15 +10,27 @@ module BuiltinsConfig VALUES = [NEVER, ALWAYS, OPTIMAL] end - def initialize(builtins: BuiltinsConfig::NEVER) + module DoctypeConfig + XML = :xml + HTML4 = :html4 + HTML5 = :html5 + VALUES = [XML, HTML4, HTML5] + end + + def initialize(builtins: BuiltinsConfig::NEVER, doctype: DoctypeConfig::XML) unless BuiltinsConfig::VALUES.include?(builtins) raise(ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter") end + unless DoctypeConfig::VALUES.include?(doctype) + raise(ArgumentError, "Invalid values #{doctype.inspect} for doctype: keyword parameter") + end + @builtins = builtins + @doctype = doctype end def config - { builtins: @builtins } + { builtins: @builtins, doctype: @doctype } end def visit_function(node) @@ -194,7 +206,13 @@ def visit_conditional_selector(node) end def visit_element_name(node) - node.value.first + if @doctype == DoctypeConfig::HTML5 && node.value.first != "*" + # HTML5 has namespaces that should be ignored in CSS queries + # https://github.com/sparklemotion/nokogiri/issues/2376 + "*[local-name()='#{node.value.first}']" + else + node.value.first + end end def visit_attrib_name(node) diff --git a/lib/nokogiri/html4/document.rb b/lib/nokogiri/html4/document.rb index 40ff91740e..378145f9aa 100644 --- a/lib/nokogiri/html4/document.rb +++ b/lib/nokogiri/html4/document.rb @@ -1,3 +1,4 @@ +# coding: utf-8 # frozen_string_literal: true require "pathname" @@ -148,6 +149,16 @@ def fragment(tags = nil) DocumentFragment.new(self, tags, root) end + # :call-seq: + # xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig + # + # [Returns] The document type which determines CSS-to-XPath translation. + # + # See XPathVisitor for more information. + def xpath_doctype + Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4 + end + class << self ### # Parse HTML. +string_or_io+ may be a String, or any object that diff --git a/lib/nokogiri/html5/document.rb b/lib/nokogiri/html5/document.rb index 563b8fbab7..a6260c0995 100644 --- a/lib/nokogiri/html5/document.rb +++ b/lib/nokogiri/html5/document.rb @@ -62,6 +62,16 @@ def to_xml(options = {}, &block) XML::Node.instance_method(:to_xml).bind(self).call(options, &block) end + # :call-seq: + # xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig + # + # [Returns] The document type which determines CSS-to-XPath translation. + # + # See XPathVisitor for more information. + def xpath_doctype + Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5 + end + private def self.do_parse(string_or_io, url, encoding, options) diff --git a/lib/nokogiri/xml/document.rb b/lib/nokogiri/xml/document.rb index 24d7720d26..941b96d8ad 100644 --- a/lib/nokogiri/xml/document.rb +++ b/lib/nokogiri/xml/document.rb @@ -390,6 +390,16 @@ def add_child(node_or_tags) end alias_method :<<, :add_child + # :call-seq: + # xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig + # + # [Returns] The document type which determines CSS-to-XPath translation. + # + # See XPathVisitor for more information. + def xpath_doctype + Nokogiri::CSS::XPathVisitor::DoctypeConfig::XML + end + private def self.empty_doc?(string_or_io) diff --git a/lib/nokogiri/xml/searchable.rb b/lib/nokogiri/xml/searchable.rb index 332a2b2051..b99dc83b80 100644 --- a/lib/nokogiri/xml/searchable.rb +++ b/lib/nokogiri/xml/searchable.rb @@ -227,7 +227,10 @@ def css_rules_to_xpath(rules, ns) end def xpath_query_from_css_rule(rule, ns) - visitor = Nokogiri::CSS::XPathVisitor.new(builtins: :optimal) + visitor = Nokogiri::CSS::XPathVisitor.new( + builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::OPTIMAL, + doctype: document.xpath_doctype, + ) self.class::IMPLIED_XPATH_CONTEXTS.map do |implied_xpath_context| CSS.xpath_for(rule.to_s, { prefix: implied_xpath_context, ns: ns, visitor: visitor, }) diff --git a/test/css/test_xpath_visitor.rb b/test/css/test_xpath_visitor.rb index e775b7357a..8f738ea1d9 100644 --- a/test/css/test_xpath_visitor.rb +++ b/test/css/test_xpath_visitor.rb @@ -28,10 +28,19 @@ def assert_xpath(expecteds, asts) refute_nil(Nokogiri::CSS::XPathVisitor.new(builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS)) refute_nil(Nokogiri::CSS::XPathVisitor.new(builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::OPTIMAL)) assert_raises(ArgumentError) { Nokogiri::CSS::XPathVisitor.new(builtins: :not_valid) } + + refute_nil(Nokogiri::CSS::XPathVisitor.new(doctype: Nokogiri::CSS::XPathVisitor::DoctypeConfig::XML)) + refute_nil(Nokogiri::CSS::XPathVisitor.new(doctype: Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4)) + refute_nil(Nokogiri::CSS::XPathVisitor.new(doctype: Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5)) + assert_raises(ArgumentError) { Nokogiri::CSS::XPathVisitor.new(doctype: :not_valid) } end it "exposes its configuration" do - assert_equal({ builtins: :never }, visitor.config) + expected = { + builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::NEVER, + doctype: Nokogiri::CSS::XPathVisitor::DoctypeConfig::XML, + } + assert_equal(expected, visitor.config) end it "raises an exception on single quote" do @@ -425,7 +434,7 @@ def visit_pseudo_class_aaron(node) let(:visitor) { Nokogiri::CSS::XPathVisitor.new(builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS) } it "exposes its configuration" do - assert_equal({ builtins: :always }, visitor.config) + assert_equal({ builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS, doctype: Nokogiri::CSS::XPathVisitor::DoctypeConfig::XML }, visitor.config) end it ". class" do @@ -462,7 +471,7 @@ def visit_pseudo_class_aaron(node) it "supports deprecated class" do assert_output("", /XPathVisitorAlwaysUseBuiltins is deprecated/) { visitor } assert_instance_of(Nokogiri::CSS::XPathVisitor, visitor) - assert_equal({ builtins: :always }, visitor.config) + assert_equal({ builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS, doctype: Nokogiri::CSS::XPathVisitor::DoctypeConfig::XML }, visitor.config) assert_xpath("//*[nokogiri-builtin:css-class(@class,'awesome')]", parser.parse(".awesome")) @@ -474,7 +483,7 @@ def visit_pseudo_class_aaron(node) let(:visitor) { Nokogiri::CSS::XPathVisitor.new(builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::OPTIMAL) } it "exposes its configuration" do - assert_equal({ builtins: :optimal }, visitor.config) + assert_equal({ builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::OPTIMAL, doctype: Nokogiri::CSS::XPathVisitor::DoctypeConfig::XML }, visitor.config) end # @@ -543,7 +552,7 @@ def visit_pseudo_class_aaron(node) it "supports deprecated class" do assert_output("", /XPathVisitorOptimallyUseBuiltins is deprecated/) { visitor } assert_instance_of(Nokogiri::CSS::XPathVisitor, visitor) - assert_equal({ builtins: :optimal }, visitor.config) + assert_equal({ builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::OPTIMAL, doctype: Nokogiri::CSS::XPathVisitor::DoctypeConfig::XML }, visitor.config) if Nokogiri.uses_libxml? assert_xpath("//*[nokogiri-builtin:css-class(@class,'awesome')]", @@ -555,5 +564,13 @@ def visit_pseudo_class_aaron(node) end end end + + describe "doctype:html5" do + let(:visitor) { Nokogiri::CSS::XPathVisitor.new(doctype: Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5) } + + it "matches on the element's local-name, ignoring namespaces" do + assert_xpath("//*[local-name()='foo']", parser.parse("foo")) + end + end end end diff --git a/test/test_css_cache.rb b/test/test_css_cache.rb index 1d627d271c..14a471011d 100644 --- a/test/test_css_cache.rb +++ b/test/test_css_cache.rb @@ -133,6 +133,10 @@ def test_cache_key_on_ns_prefix_and_visitor_config Nokogiri::CSS.xpath_for("foo", prefix: ".//", ns: { "example" => "http://example.com/" }) Nokogiri::CSS.xpath_for("foo", prefix: ".//", ns: { "example" => "http://example.com/" }, visitor: Nokogiri::CSS::XPathVisitor.new(builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS)) + Nokogiri::CSS.xpath_for("foo", prefix: ".//", ns: { "example" => "http://example.com/" }, + visitor: Nokogiri::CSS::XPathVisitor.new(builtins: Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS, + doctype: Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5)) + assert_equal(5, cache.length) end def test_race_condition diff --git a/test/xml/test_xpath.rb b/test/xml/test_xpath.rb index baf631984c..3ee7dd51b4 100644 --- a/test/xml/test_xpath.rb +++ b/test/xml/test_xpath.rb @@ -600,6 +600,111 @@ def collision(nodes) assert_equal(1, @handler.things.length) end end + + describe "Document#xpath_doctype" do + it "Nokogiri::XML::Document" do + assert_equal( + Nokogiri::CSS::XPathVisitor::DoctypeConfig::XML, + Nokogiri::XML::Document.parse("").xpath_doctype, + ) + assert_equal( + Nokogiri::CSS::XPathVisitor::DoctypeConfig::XML, + Nokogiri::XML::DocumentFragment.parse("").document.xpath_doctype, + ) + end + + it "Nokogiri::HTML4::Document" do + assert_equal( + Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4, + Nokogiri::HTML4::Document.parse("").xpath_doctype, + ) + assert_equal( + Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4, + Nokogiri::HTML4::DocumentFragment.parse("").document.xpath_doctype, + ) + end + + it "Nokogiri::HTML5::Document" do + skip("HTML5 is not supported") unless defined?(Nokogiri::HTML5) + assert_equal( + Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5, + Nokogiri::HTML5::Document.parse("").xpath_doctype, + ) + assert_equal( + Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5, + Nokogiri::HTML5::DocumentFragment.parse("").document.xpath_doctype, + ) + end + end + + describe "HTML5 foreign elements" do + # https://github.com/sparklemotion/nokogiri/issues/2376 + let(:html) { <<~HTML } + + + + + + svg test + + +
+ + + + SVG + +
+ + + HTML + + let(:ns) { { "nsfoo" => "http://www.w3.org/2000/svg" } } + + describe "in an XML doc" do + let(:doc) { Nokogiri::XML::Document.parse(html) } + + it "requires namespace in XPath queries" do + assert_empty(doc.xpath("//svg")) + refute_empty(doc.xpath("//nsfoo:svg", ns)) + end + + it "requires namespace in CSS queries" do + assert_empty(doc.css("svg")) + refute_empty(doc.css("nsfoo|svg", ns)) + end + end + + describe "in an HTML4 doc" do + let(:doc) { Nokogiri::HTML4::Document.parse(html) } + + it "omits namespace in XPath queries" do + refute_empty(doc.xpath("//svg")) + assert_empty(doc.xpath("//nsfoo:svg", ns)) + end + + it "omits namespace in CSS queries" do + refute_empty(doc.css("svg")) + assert_empty(doc.css("nsfoo|svg", ns)) + end + end + + describe "in an HTML5 doc" do + let(:doc) { Nokogiri::HTML5::Document.parse(html) } + + it "requires namespace in XPath queries" do + skip("HTML5 is not supported") unless defined?(Nokogiri::HTML5) + assert_empty(doc.xpath("//svg")) + refute_empty(doc.xpath("//nsfoo:svg", ns)) + end + + it "omits namespace in CSS queries" do + skip("HTML5 is not supported") unless defined?(Nokogiri::HTML5) + refute_empty(doc.css("svg")) + assert_empty(doc.css("nsfoo|svg", ns)) + end + end + end end end end From d0303815a0f3e13bb6e99a24ff0c470d49ce0c4e Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Wed, 15 Dec 2021 10:18:01 -0500 Subject: [PATCH 09/13] doc: document XPathVisitor Also tweak :nodoc: directives to avoid accidentally excluding important modules (like Nokogiri::CSS) and avoid including code that's not usually relevant to the user API (like Nokogiri::XML:PP, Nokogiri::CSS:Tokenizer, and Nokogiri::HTML4::Document::EncodingReader) --- lib/nokogiri/css/node.rb | 3 +- lib/nokogiri/css/parser.rb | 9 +++++- lib/nokogiri/css/parser.y | 9 +++++- lib/nokogiri/css/parser_extras.rb | 3 +- lib/nokogiri/css/tokenizer.rb | 2 +- lib/nokogiri/css/tokenizer.rex | 2 +- lib/nokogiri/css/xpath_visitor.rb | 43 ++++++++++++++++++++++++++- lib/nokogiri/html4/document.rb | 7 +++-- lib/nokogiri/xml/pp/character_data.rb | 5 ++-- lib/nokogiri/xml/pp/node.rb | 5 ++-- lib/xsd/xmlparser/nokogiri.rb | 4 +-- 11 files changed, 74 insertions(+), 18 deletions(-) diff --git a/lib/nokogiri/css/node.rb b/lib/nokogiri/css/node.rb index df7b079157..e60c4a8dd9 100644 --- a/lib/nokogiri/css/node.rb +++ b/lib/nokogiri/css/node.rb @@ -1,9 +1,8 @@ # frozen_string_literal: true -# :stopdoc: module Nokogiri module CSS - class Node + class Node # :nodoc: ALLOW_COMBINATOR_ON_SELF = [:DIRECT_ADJACENT_SELECTOR, :FOLLOWING_SELECTOR, :CHILD_SELECTOR] # Get the type of this node diff --git a/lib/nokogiri/css/parser.rb b/lib/nokogiri/css/parser.rb index c6cabab39b..5cdc80d44b 100644 --- a/lib/nokogiri/css/parser.rb +++ b/lib/nokogiri/css/parser.rb @@ -8,9 +8,16 @@ require 'racc/parser.rb' -# :stopdoc: require_relative "parser_extras" +module Nokogiri + module CSS + # :nodoc: all + class Parser < Racc::Parser + end + end +end + module Nokogiri module CSS class Parser < Racc::Parser diff --git a/lib/nokogiri/css/parser.y b/lib/nokogiri/css/parser.y index c47f8e6d69..d5c4af1f6c 100644 --- a/lib/nokogiri/css/parser.y +++ b/lib/nokogiri/css/parser.y @@ -253,9 +253,16 @@ end ---- header -# :stopdoc: require_relative "parser_extras" +module Nokogiri + module CSS + # :nodoc: all + class Parser < Racc::Parser + end + end +end + ---- inner def unescape_css_identifier(identifier) diff --git a/lib/nokogiri/css/parser_extras.rb b/lib/nokogiri/css/parser_extras.rb index 7a93c11ebf..587256ec82 100644 --- a/lib/nokogiri/css/parser_extras.rb +++ b/lib/nokogiri/css/parser_extras.rb @@ -1,11 +1,10 @@ # frozen_string_literal: true -# :stopdoc: require "thread" module Nokogiri module CSS - class Parser < Racc::Parser + class Parser < Racc::Parser # :nodoc: CACHE_SWITCH_NAME = :nokogiri_css_parser_cache_is_off @cache = {} diff --git a/lib/nokogiri/css/tokenizer.rb b/lib/nokogiri/css/tokenizer.rb index 0c44fab8a8..d173bc2c38 100644 --- a/lib/nokogiri/css/tokenizer.rb +++ b/lib/nokogiri/css/tokenizer.rb @@ -5,9 +5,9 @@ # from lexical definition file "lib/nokogiri/css/tokenizer.rex". #++ -# :stopdoc: module Nokogiri module CSS +# :nodoc: all class Tokenizer require 'strscan' diff --git a/lib/nokogiri/css/tokenizer.rex b/lib/nokogiri/css/tokenizer.rex index 84a5c146b4..7db1d0e156 100644 --- a/lib/nokogiri/css/tokenizer.rex +++ b/lib/nokogiri/css/tokenizer.rex @@ -1,6 +1,6 @@ -# :stopdoc: module Nokogiri module CSS +# :nodoc: all class Tokenizer macro diff --git a/lib/nokogiri/css/xpath_visitor.rb b/lib/nokogiri/css/xpath_visitor.rb index 9bdc241593..a447b015b4 100644 --- a/lib/nokogiri/css/xpath_visitor.rb +++ b/lib/nokogiri/css/xpath_visitor.rb @@ -1,22 +1,57 @@ +# coding: utf-8 # frozen_string_literal: true module Nokogiri module CSS - class XPathVisitor # :nodoc: + # When translating CSS selectors to XPath queries with Nokogiri::CSS.xpath_for, the XPathVisitor + # class allows for changing some of the behaviors related to builtin xpath functions and quirks + # of HTML5. + class XPathVisitor + # Enum to direct XPathVisitor when to use Nokogiri builtin XPath functions. module BuiltinsConfig + # Never use Nokogiri builtin functions, always generate vanilla XPath 1.0 queries. This is + # the default when calling Nokogiri::CSS.xpath_for directly. NEVER = :never + + # Always use Nokogiri builtin functions whenever possible. This is probably only useful for testing. ALWAYS = :always + + # Only use Nokogiri builtin functions when they will be faster than vanilla XPath. This is + # the behavior chosen when searching for CSS selectors on a Nokogiri document, fragment, or + # node. OPTIMAL = :optimal + + # :nodoc: array of values for validation VALUES = [NEVER, ALWAYS, OPTIMAL] end + # Enum to direct XPathVisitor when to tweak the XPath query to suit the nature of the document + # being searched. Note that searches for CSS selectors from a Nokogiri document, fragment, or + # node will choose the correct option automatically. module DoctypeConfig + # The document being searched is an XML document. This is the default. XML = :xml + + # The document being searched is an HTML4 document. HTML4 = :html4 + + # The document being searched is an HTML5 document. HTML5 = :html5 + + # :nodoc: array of values for validation VALUES = [XML, HTML4, HTML5] end + # :call-seq: + # new() → XPathVisitor + # new(builtins:, doctype:) → XPathVisitor + # + # [Parameters] + # - +builtins:+ (BuiltinsConfig) Determine when to use Nokogiri's built-in xpath functions for performance improvements. + # - +doctype:+ (DoctypeConfig) Make document-type-specific accommodations for CSS queries. + # + # [Returns] XPathVisitor + # def initialize(builtins: BuiltinsConfig::NEVER, doctype: DoctypeConfig::XML) unless BuiltinsConfig::VALUES.include?(builtins) raise(ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter") @@ -29,10 +64,16 @@ def initialize(builtins: BuiltinsConfig::NEVER, doctype: DoctypeConfig::XML) @doctype = doctype end + # :call-seq: config() → Hash + # + # [Returns] + # a Hash representing the configuration of the XPathVisitor, suitable for use as + # part of the CSS cache key. def config { builtins: @builtins, doctype: @doctype } end + # :stopdoc: def visit_function(node) msg = :"visit_function_#{node.value.first.gsub(/[(]/, "")}" return send(msg, node) if respond_to?(msg) diff --git a/lib/nokogiri/html4/document.rb b/lib/nokogiri/html4/document.rb index 378145f9aa..2529fe867a 100644 --- a/lib/nokogiri/html4/document.rb +++ b/lib/nokogiri/html4/document.rb @@ -220,7 +220,7 @@ def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions:: end end - class EncodingFound < StandardError # :nodoc: + class EncodingFound < StandardError # :nodoc: all attr_reader :found_encoding def initialize(encoding) @@ -229,8 +229,9 @@ def initialize(encoding) end end - class EncodingReader # :nodoc: - class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc: + # :nodoc: all + class EncodingReader + class SAXHandler < Nokogiri::XML::SAX::Document attr_reader :encoding def initialize diff --git a/lib/nokogiri/xml/pp/character_data.rb b/lib/nokogiri/xml/pp/character_data.rb index 57491c5465..8a02101005 100644 --- a/lib/nokogiri/xml/pp/character_data.rb +++ b/lib/nokogiri/xml/pp/character_data.rb @@ -2,16 +2,17 @@ module Nokogiri module XML + # :nodoc: all module PP module CharacterData - def pretty_print(pp) # :nodoc: + def pretty_print(pp) nice_name = self.class.name.split("::").last pp.group(2, "#(#{nice_name} ", ")") do pp.pp(text) end end - def inspect # :nodoc: + def inspect "#<#{self.class.name}:#{format("0x%x", object_id)} #{text.inspect}>" end end diff --git a/lib/nokogiri/xml/pp/node.rb b/lib/nokogiri/xml/pp/node.rb index 5cef94fc60..d6bd275f26 100644 --- a/lib/nokogiri/xml/pp/node.rb +++ b/lib/nokogiri/xml/pp/node.rb @@ -2,9 +2,10 @@ module Nokogiri module XML + # :nodoc: all module PP module Node - def inspect # :nodoc: + def inspect attributes = inspect_attributes.reject do |x| attribute = send(x) !attribute || (attribute.respond_to?(:empty?) && attribute.empty?) @@ -16,7 +17,7 @@ def inspect # :nodoc: "#<#{self.class.name}:#{format("0x%x", object_id)} #{attributes}>" end - def pretty_print(pp) # :nodoc: + def pretty_print(pp) nice_name = self.class.name.split("::").last pp.group(2, "#(#{nice_name}:#{format("0x%x", object_id)} {", "})") do pp.breakable diff --git a/lib/xsd/xmlparser/nokogiri.rb b/lib/xsd/xmlparser/nokogiri.rb index 55c5ee4e8d..f75f963ebe 100644 --- a/lib/xsd/xmlparser/nokogiri.rb +++ b/lib/xsd/xmlparser/nokogiri.rb @@ -2,8 +2,8 @@ require "nokogiri" -module XSD # :nodoc: - module XMLParser # :nodoc: +module XSD + module XMLParser ### # Nokogiri XML parser for soap4r. # From 42068de11400dcaff29498edbdff2b11554658bf Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Wed, 15 Dec 2021 10:21:59 -0500 Subject: [PATCH 10/13] doc: document XML::Notation and other small changes to doc strings --- ext/nokogiri/xml_dtd.c | 4 ++-- lib/nokogiri/css.rb | 4 ++-- lib/nokogiri/version/info.rb | 7 ++----- lib/nokogiri/xml/notation.rb | 11 +++++++++++ 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/ext/nokogiri/xml_dtd.c b/ext/nokogiri/xml_dtd.c index f9cb71fea6..15908d3399 100644 --- a/ext/nokogiri/xml_dtd.c +++ b/ext/nokogiri/xml_dtd.c @@ -57,9 +57,9 @@ entities(VALUE self) /* * call-seq: - * notations + * notations() → Hash * - * Get a hash of the notations for this DTD. + * [Returns] All the notations for this DTD in a Hash of Notation +name+ to Notation. */ static VALUE notations(VALUE self) diff --git a/lib/nokogiri/css.rb b/lib/nokogiri/css.rb index fe4be39997..4e8ffc17d6 100644 --- a/lib/nokogiri/css.rb +++ b/lib/nokogiri/css.rb @@ -17,8 +17,6 @@ def parse(selector) # :nodoc: # # Translate a CSS selector to the equivalent XPath query. # - # 💡 Note that translated queries are cached. - # # [Parameters] # - +selector+ (String) The CSS selector to be translated into XPath # @@ -39,6 +37,8 @@ def parse(selector) # :nodoc: # # [Returns] (String) The equivalent XPath query for +selector+ # + # 💡 Note that translated queries are cached for performance concerns. + # def xpath_for(selector, options = {}) prefix = options.fetch(:prefix, Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX) visitor = options.fetch(:visitor) { Nokogiri::CSS::XPathVisitor.new } diff --git a/lib/nokogiri/version/info.rb b/lib/nokogiri/version/info.rb index 2b18557bd4..5da3fb4a96 100644 --- a/lib/nokogiri/version/info.rb +++ b/lib/nokogiri/version/info.rb @@ -201,12 +201,9 @@ def self.jruby? # :nodoc: VersionInfo.instance.jruby? end - # Ensure constants used in this file are loaded - see #1896 - if Nokogiri.jruby? - require_relative "../jruby/dependencies" - end + require_relative "../jruby/dependencies" if Nokogiri.jruby? require_relative "../extension" - # More complete version information about libxml + # Detailed version info about Nokogiri and the installed extension dependencies. VERSION_INFO = VersionInfo.instance.to_hash end diff --git a/lib/nokogiri/xml/notation.rb b/lib/nokogiri/xml/notation.rb index f79369bd09..1032403893 100644 --- a/lib/nokogiri/xml/notation.rb +++ b/lib/nokogiri/xml/notation.rb @@ -2,7 +2,18 @@ module Nokogiri module XML + # Struct representing an {XML Schema Notation}[https://www.w3.org/TR/xml/#Notations] class Notation < Struct.new(:name, :public_id, :system_id) + # dead comment to ensure rdoc processing + + # :attr: name (String) + # The name for the element. + + # :attr: public_id (String) + # The URI corresponding to the public identifier + + # :attr: system_id (String,nil) + # The URI corresponding to the system identifier end end end From 2096ef9faa3d6208308a344d76dfbf47ba4c99c0 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Wed, 15 Dec 2021 17:12:13 -0500 Subject: [PATCH 11/13] feat: introduce builtin xpath function local-name-is --- ext/nokogiri/xml_xpath_context.c | 22 +++++++++++++++++++ lib/nokogiri/css/xpath_visitor.rb | 9 +++++++- test/css/test_xpath_visitor.rb | 36 ++++++++++++++++++++++++++++--- test/xml/test_xpath.rb | 8 ++----- 4 files changed, 65 insertions(+), 10 deletions(-) diff --git a/ext/nokogiri/xml_xpath_context.c b/ext/nokogiri/xml_xpath_context.c index 93dbc4413a..c481900e85 100644 --- a/ext/nokogiri/xml_xpath_context.c +++ b/ext/nokogiri/xml_xpath_context.c @@ -86,6 +86,26 @@ xpath_builtin_css_class(xmlXPathParserContextPtr ctxt, int nargs) xmlXPathFreeObject(needle); } + +/* xmlXPathFunction to select nodes whose local name matches, for HTML5 CSS queries that should ignore namespaces */ +static void +xpath_builtin_local_name_is(xmlXPathParserContextPtr ctxt, int nargs) +{ + xmlXPathObjectPtr element_name; + + assert(ctxt->context->node); + + CHECK_ARITY(1); + CAST_TO_STRING; + CHECK_TYPE(XPATH_STRING); + element_name = valuePop(ctxt); + + valuePush(ctxt, xmlXPathNewBoolean(xmlStrEqual(ctxt->context->node->name, element_name->stringval))); + + xmlXPathFreeObject(element_name); +} + + /* * call-seq: * register_ns(prefix, uri) @@ -361,6 +381,8 @@ new (VALUE klass, VALUE nodeobj) xmlXPathRegisterNs(ctx, NOKOGIRI_BUILTIN_PREFIX, NOKOGIRI_BUILTIN_URI); xmlXPathRegisterFuncNS(ctx, (const xmlChar *)"css-class", NOKOGIRI_BUILTIN_URI, xpath_builtin_css_class); + xmlXPathRegisterFuncNS(ctx, (const xmlChar *)"local-name-is", NOKOGIRI_BUILTIN_URI, + xpath_builtin_local_name_is); self = Data_Wrap_Struct(klass, 0, deallocate, ctx); return self; diff --git a/lib/nokogiri/css/xpath_visitor.rb b/lib/nokogiri/css/xpath_visitor.rb index a447b015b4..c20231b17f 100644 --- a/lib/nokogiri/css/xpath_visitor.rb +++ b/lib/nokogiri/css/xpath_visitor.rb @@ -248,9 +248,16 @@ def visit_conditional_selector(node) def visit_element_name(node) if @doctype == DoctypeConfig::HTML5 && node.value.first != "*" + # if there is already a namespace, use it as normal + return node.value.first if node.value.first.include?(":") + # HTML5 has namespaces that should be ignored in CSS queries # https://github.com/sparklemotion/nokogiri/issues/2376 - "*[local-name()='#{node.value.first}']" + if @builtins == BuiltinsConfig::ALWAYS || (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?) + "*[nokogiri-builtin:local-name-is('#{node.value.first}')]" + else + "*[local-name()='#{node.value.first}']" + end else node.value.first end diff --git a/test/css/test_xpath_visitor.rb b/test/css/test_xpath_visitor.rb index 8f738ea1d9..1b2532d927 100644 --- a/test/css/test_xpath_visitor.rb +++ b/test/css/test_xpath_visitor.rb @@ -566,10 +566,40 @@ def visit_pseudo_class_aaron(node) end describe "doctype:html5" do - let(:visitor) { Nokogiri::CSS::XPathVisitor.new(doctype: Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5) } + let(:visitor) do + Nokogiri::CSS::XPathVisitor.new( + doctype: Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5, + builtins: builtins, + ) + end + + describe "builtins:always" do + let(:builtins) { Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS } + it "matches on the element's local-name, ignoring namespaces" do + assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo")) + end - it "matches on the element's local-name, ignoring namespaces" do - assert_xpath("//*[local-name()='foo']", parser.parse("foo")) + it "avoids the wildcard when using namespaces" do + assert_xpath("//ns1:foo", parser.parse("ns1|foo")) + end + end + + describe "builtins:never" do + let(:builtins) { Nokogiri::CSS::XPathVisitor::BuiltinsConfig::NEVER } + it "matches on the element's local-name, ignoring namespaces" do + assert_xpath("//*[local-name()='foo']", parser.parse("foo")) + end + end + + describe "builtins:optimal" do + let(:builtins) { Nokogiri::CSS::XPathVisitor::BuiltinsConfig::OPTIMAL } + it "matches on the element's local-name, ignoring namespaces" do + if Nokogiri.uses_libxml? + assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo")) + else + assert_xpath("//*[local-name()='foo']", parser.parse("foo")) + end + end end end end diff --git a/test/xml/test_xpath.rb b/test/xml/test_xpath.rb index 3ee7dd51b4..85844c3a29 100644 --- a/test/xml/test_xpath.rb +++ b/test/xml/test_xpath.rb @@ -642,11 +642,6 @@ def collision(nodes) let(:html) { <<~HTML } - - - - svg test -
@@ -701,7 +696,8 @@ def collision(nodes) it "omits namespace in CSS queries" do skip("HTML5 is not supported") unless defined?(Nokogiri::HTML5) refute_empty(doc.css("svg")) - assert_empty(doc.css("nsfoo|svg", ns)) + refute_empty(doc.css("nsfoo|svg", ns)) # if they specify the valid ns, use it + assert_empty(doc.css("nsbar|svg", { "nsbar" => "http://example.com/nsbar" })) end end end From 0fd4de42a90ba190f0f3def44de72523a292c3b0 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Fri, 24 Dec 2021 19:56:06 -0500 Subject: [PATCH 12/13] dev: introduce some helpers for introspecting on libxml2 patches --- lib/nokogiri/version/info.rb | 18 +++++++++++++++--- test/helper.rb | 10 ++++++++++ test/html4/test_attributes_properly_escaped.rb | 12 +++--------- test/html4/test_comments.rb | 4 ++-- test/test_version.rb | 3 +++ test/xml/sax/test_parser.rb | 3 +-- test/xml/test_xpath.rb | 4 +--- 7 files changed, 35 insertions(+), 19 deletions(-) diff --git a/lib/nokogiri/version/info.rb b/lib/nokogiri/version/info.rb index 5da3fb4a96..bd5a735ce5 100644 --- a/lib/nokogiri/version/info.rb +++ b/lib/nokogiri/version/info.rb @@ -187,20 +187,32 @@ def to_markdown end end - def self.uses_libxml?(requirement = nil) # :nodoc: + # :nodoc: + def self.uses_libxml?(requirement = nil) return false unless VersionInfo.instance.libxml2? return true unless requirement Gem::Requirement.new(requirement).satisfied_by?(VersionInfo.instance.loaded_libxml_version) end - def self.uses_gumbo? # :nodoc: + # :nodoc: + def self.uses_gumbo? uses_libxml? # TODO: replace with Gumbo functionality end - def self.jruby? # :nodoc: + # :nodoc: + def self.jruby? VersionInfo.instance.jruby? end + # :nodoc: + def self.libxml2_patches + if VersionInfo.instance.libxml2_using_packaged? + Nokogiri::VERSION_INFO["libxml"]["patches"] + else + [] + end + end + require_relative "../jruby/dependencies" if Nokogiri.jruby? require_relative "../extension" diff --git a/test/helper.rb b/test/helper.rb index 0ec60b86c4..8f296e687f 100644 --- a/test/helper.rb +++ b/test/helper.rb @@ -157,6 +157,16 @@ def skip_unless_libxml2(msg = "this test should only run with libxml2") skip(msg) unless Nokogiri.uses_libxml? end + def skip_unless_libxml2_patch(patch_name) + patch_dir = File.join(__dir__, "..", "patches", "libxml2") + if File.directory?(patch_dir) && !File.exist?(File.join(patch_dir, patch_name)) + raise("checking for nonexistent patch file #{patch_name.inspect}") + end + unless Nokogiri.libxml2_patches.include?(patch_name) + skip("this test needs libxml2 patched with #{patch_name}") + end + end + def skip_unless_jruby(msg = "this test should only run with jruby") skip(msg) unless Nokogiri.jruby? end diff --git a/test/html4/test_attributes_properly_escaped.rb b/test/html4/test_attributes_properly_escaped.rb index 349b3991a6..43fdc20f25 100755 --- a/test/html4/test_attributes_properly_escaped.rb +++ b/test/html4/test_attributes_properly_escaped.rb @@ -6,9 +6,7 @@ module Nokogiri module HTML class TestAttributesProperlyEscaped < Nokogiri::TestCase def test_attribute_macros_are_escaped - if Nokogiri.uses_libxml? && !Nokogiri::VERSION_INFO["libxml"]["patches"]&.include?("0001-Remove-script-macro-support.patch") - skip("libxml2 has not been patched to be safe against attribute macros") - end + skip_unless_libxml2_patch("0001-Remove-script-macro-support.patch") if Nokogiri.uses_libxml? html = "

}\">

" document = Nokogiri::HTML::Document.new @@ -18,9 +16,7 @@ def test_attribute_macros_are_escaped end def test_libxml_escapes_server_side_includes - if Nokogiri.uses_libxml? && !Nokogiri::VERSION_INFO["libxml"]["patches"]&.include?("0002-Update-entities-to-remove-handling-of-ssi.patch") - skip("libxml2 has not been patched to be safe against SSI") - end + skip_unless_libxml2_patch("0002-Update-entities-to-remove-handling-of-ssi.patch") if Nokogiri.uses_libxml? original_html = %(

) document = Nokogiri::HTML::Document.new @@ -30,9 +26,7 @@ def test_libxml_escapes_server_side_includes end def test_libxml_escapes_server_side_includes_without_nested_quotes - if Nokogiri.uses_libxml? && !Nokogiri::VERSION_INFO["libxml"]["patches"]&.include?("0002-Update-entities-to-remove-handling-of-ssi.patch") - skip("libxml2 has not been patched to be safe against SSI") - end + skip_unless_libxml2_patch("0002-Update-entities-to-remove-handling-of-ssi.patch") if Nokogiri.uses_libxml? original_html = %(

) document = Nokogiri::HTML::Document.new diff --git a/test/html4/test_comments.rb b/test/html4/test_comments.rb index f5f56aeae9..39ab13a070 100644 --- a/test/html4/test_comments.rb +++ b/test/html4/test_comments.rb @@ -23,7 +23,7 @@ class TestComment < Nokogiri::TestCase let(:html) { "
" } if Nokogiri.uses_libxml? - if Nokogiri::VersionInfo.instance.libxml2_using_packaged? && Nokogiri::VERSION_INFO["libxml"]["patches"]&.include?("0008-htmlParseComment-handle-abruptly-closed-comments.patch") + if Nokogiri.libxml2_patches.include?("0008-htmlParseComment-handle-abruptly-closed-comments.patch") it "behaves as if the comment is closed correctly" do # COMPLIANT assert_equal 1, subject.children.length assert subject.children.first.comment? @@ -54,7 +54,7 @@ class TestComment < Nokogiri::TestCase let(:html) { "
" } if Nokogiri.uses_libxml? - if Nokogiri::VersionInfo.instance.libxml2_using_packaged? && Nokogiri::VERSION_INFO["libxml"]["patches"]&.include?("0008-htmlParseComment-handle-abruptly-closed-comments.patch") + if Nokogiri.libxml2_patches.include?("0008-htmlParseComment-handle-abruptly-closed-comments.patch") it "behaves as if the comment is closed correctly" do # COMPLIANT assert_equal 1, subject.children.length assert subject.children.first.comment? diff --git a/test/test_version.rb b/test/test_version.rb index edd2918527..b1872d88bc 100644 --- a/test/test_version.rb +++ b/test/test_version.rb @@ -33,6 +33,7 @@ def test_version_info_for_xerces_and_nekohtml skip_unless_jruby("xerces/nekohtml is only used for JRuby") assert_equal(Nokogiri::XERCES_VERSION, version_info["other_libraries"]["xerces"]) assert_equal(Nokogiri::NEKO_VERSION, version_info["other_libraries"]["nekohtml"]) + assert_empty(Nokogiri.libxml2_patches) end def test_version_info_for_libxml @@ -41,12 +42,14 @@ def test_version_info_for_libxml if Nokogiri::VersionInfo.instance.libxml2_using_packaged? assert_equal("packaged", version_info["libxml"]["source"]) assert(version_info["libxml"]["patches"]) + assert_equal(Nokogiri.libxml2_patches, version_info["libxml"]["patches"]) assert_equal(Nokogiri::VersionInfo.instance.libxml2_precompiled?, version_info["libxml"]["precompiled"]) end if Nokogiri::VersionInfo.instance.libxml2_using_system? assert_equal("system", version_info["libxml"]["source"]) refute(version_info["libxml"].key?("precompiled")) refute(version_info["libxml"].key?("patches")) + assert_empty(Nokogiri.libxml2_patches) end assert_equal(Nokogiri::LIBXML_COMPILED_VERSION, version_info["libxml"]["compiled"]) diff --git a/test/xml/sax/test_parser.rb b/test/xml/sax/test_parser.rb index 9d8ed279a7..db7b11d2f6 100644 --- a/test/xml/sax/test_parser.rb +++ b/test/xml/sax/test_parser.rb @@ -396,8 +396,7 @@ def call_parse_io_with_encoding(encoding) end it :test_large_cdata_is_handled do - # see #2132 and https://gitlab.gnome.org/GNOME/libxml2/-/issues/200 - skip("Upstream libxml2 <= 2.9.10 needs to be patched") if Nokogiri::VersionInfo.instance.libxml2_using_system? + skip("see #2132 and https://gitlab.gnome.org/GNOME/libxml2/-/issues/200") if Nokogiri.uses_libxml?("<=2.9.10") template = <<~EOF diff --git a/test/xml/test_xpath.rb b/test/xml/test_xpath.rb index 85844c3a29..4b3a5ccc2c 100644 --- a/test/xml/test_xpath.rb +++ b/test/xml/test_xpath.rb @@ -477,9 +477,7 @@ def test_xpath_syntax_error_should_not_display_line_and_column_if_both_are_zero end def test_huge_xpath_query - if Nokogiri.uses_libxml?("~>2.9.11") && !Nokogiri::VERSION_INFO["libxml"]["patches"]&.include?("0007-Fix-XPath-recursion-limit.patch") - skip("libxml2 under test is broken with respect to xpath query recusion depth") - end + skip_unless_libxml2_patch("0007-Fix-XPath-recursion-limit.patch") if Nokogiri.uses_libxml?("~>2.9.11") # real world example # from https://github.com/sparklemotion/nokogiri/issues/2257 From d1a710ef162652309f26bd8af0ce41c0ee196c01 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Fri, 24 Dec 2021 20:02:57 -0500 Subject: [PATCH 13/13] feat: support wildcard namespaces in xpath queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is almost as fast as a standard child-axis search, and much faster than the builtin or using local-name(): //span 18.923k (± 8.9%) i/s - 93.906k in 5.010792s //*[local-name()='span'] 1.849k (± 2.8%) i/s - 9.261k in 5.011560s //*[nokogiri-builtin:local-name-is('span')] 3.191k (± 2.4%) i/s - 16.150k in 5.064798s //*:span 18.016k (± 4.6%) i/s - 89.900k in 5.003444s Comparison: //span: 18922.5 i/s //*:span: 18016.5 i/s - same-ish: difference falls within error //*[nokogiri-builtin:local-name-is('span')]: 3190.6 i/s - 5.93x (± 0.00) slower //*[local-name()='span']: 1849.4 i/s - 10.23x (± 0.00) slower --- lib/nokogiri/css/xpath_visitor.rb | 8 +- .../0009-allow-wildcard-namespaces.patch | 77 +++++++++++++++++++ test/css/test_xpath_visitor.rb | 41 +++++++++- test/xml/test_xpath.rb | 21 +++++ 4 files changed, 144 insertions(+), 3 deletions(-) create mode 100644 patches/libxml2/0009-allow-wildcard-namespaces.patch diff --git a/lib/nokogiri/css/xpath_visitor.rb b/lib/nokogiri/css/xpath_visitor.rb index c20231b17f..ef7a22b59a 100644 --- a/lib/nokogiri/css/xpath_visitor.rb +++ b/lib/nokogiri/css/xpath_visitor.rb @@ -7,6 +7,8 @@ module CSS # class allows for changing some of the behaviors related to builtin xpath functions and quirks # of HTML5. class XPathVisitor + WILDCARD_NAMESPACES = Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch") # :nodoc: + # Enum to direct XPathVisitor when to use Nokogiri builtin XPath functions. module BuiltinsConfig # Never use Nokogiri builtin functions, always generate vanilla XPath 1.0 queries. This is @@ -254,7 +256,11 @@ def visit_element_name(node) # HTML5 has namespaces that should be ignored in CSS queries # https://github.com/sparklemotion/nokogiri/issues/2376 if @builtins == BuiltinsConfig::ALWAYS || (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?) - "*[nokogiri-builtin:local-name-is('#{node.value.first}')]" + if WILDCARD_NAMESPACES + "*:#{node.value.first}" + else + "*[nokogiri-builtin:local-name-is('#{node.value.first}')]" + end else "*[local-name()='#{node.value.first}']" end diff --git a/patches/libxml2/0009-allow-wildcard-namespaces.patch b/patches/libxml2/0009-allow-wildcard-namespaces.patch new file mode 100644 index 0000000000..bd3144acd0 --- /dev/null +++ b/patches/libxml2/0009-allow-wildcard-namespaces.patch @@ -0,0 +1,77 @@ +From 74c95ec5932c737d4fcb06b8646b0017364ada14 Mon Sep 17 00:00:00 2001 +From: Mike Dalessio +Date: Fri, 24 Dec 2021 19:08:01 -0500 +Subject: [PATCH] attempt to hack in wildcard namespaces to xpath + +I'm not confident this is a bulletproof patch. +--- + xpath.c | 24 ++++++++++++++++++------ + 1 file changed, 18 insertions(+), 6 deletions(-) + +diff --git a/xpath.c b/xpath.c +index 1aa2f1a..c7f0885 100644 +--- a/xpath.c ++++ b/xpath.c +@@ -146,6 +146,9 @@ + #define XPATH_MAX_RECURSION_DEPTH 5000 + #endif + ++#define WILDCARD_PREFIX "*" ++#define IS_WILDCARD_PREFIX(p) xmlStrEqual((xmlChar*)WILDCARD_PREFIX, p) ++ + /* + * TODO: + * There are a few spots where some tests are done which depend upon ascii +@@ -11073,12 +11076,15 @@ xmlXPathCompNodeTest(xmlXPathParserContextPtr ctxt, xmlXPathTestVal *test, + SKIP_BLANKS; + + if ((name == NULL) && (CUR == '*')) { +- /* +- * All elements +- */ + NEXT; +- *test = NODE_TEST_ALL; +- return(NULL); ++ if (CUR != ':') { ++ /* ++ * All elements ++ */ ++ *test = NODE_TEST_ALL; ++ return(NULL); ++ } ++ name = xmlCharStrdup(WILDCARD_PREFIX); + } + + if (name == NULL) +@@ -11327,6 +11333,10 @@ xmlXPathCompStep(xmlXPathParserContextPtr ctxt) { + } + #endif + if (CUR == '*') { ++ if (NXT(1) == ':') { ++ NEXT; ++ name = xmlCharStrdup(WILDCARD_PREFIX); ++ } + axis = AXIS_CHILD; + } else { + if (name == NULL) +@@ -12030,7 +12040,7 @@ xmlXPathNodeCollectAndTest(xmlXPathParserContextPtr ctxt, + /* + * Setup namespaces. + */ +- if (prefix != NULL) { ++ if (prefix != NULL && !IS_WILDCARD_PREFIX(prefix)) { + URI = xmlXPathNsLookup(xpctxt, prefix); + if (URI == NULL) { + xmlXPathReleaseObject(xpctxt, obj); +@@ -12369,6 +12379,8 @@ xmlXPathNodeCollectAndTest(xmlXPathParserContextPtr ctxt, + { + XP_TEST_HIT + } ++ } else if (IS_WILDCARD_PREFIX(prefix)) { ++ XP_TEST_HIT + } else { + if ((cur->ns != NULL) && + (xmlStrEqual(URI, cur->ns->href))) +-- +2.31.0 + diff --git a/test/css/test_xpath_visitor.rb b/test/css/test_xpath_visitor.rb index 1b2532d927..751afb47e8 100644 --- a/test/css/test_xpath_visitor.rb +++ b/test/css/test_xpath_visitor.rb @@ -565,6 +565,34 @@ def visit_pseudo_class_aaron(node) end end + # + # HTML5 documents have namespaces, and gumbo attaches namespaces to the relevant elements; but + # CSS selectors should not require namespaces. See #2376 for the discussion around this design + # decision, along with some of the relevant benchmarks and call stack analyses. + # + # (HTML5 today is only supported by CRuby/gumbo/libxml2 and so we'll ignore JRuby support for + # now.) + # + # The way to implement this CSS search using standard XPath 1.0 queries is to check for a match + # with `local-name()`. However, this is about ~10x slower than a standard search along the + # `child` axis. + # + # I've written a builtin function in C named `nokogiri-builtin:local-name-is()` which is a bit + # faster, but still ~7x slower than a standard search. + # + # Finally, I've patched libxml2 to support wildcard namespaces, and this is ~1.1x slower but + # only available with the packaged libxml2. + # + # In any case, the logic for the html5 builtins here goes: + # + # if ALWAYS or (OPTIMAL and libxml2) + # if we've patched libxml2 with wildcard support + # use wildard namespacing + # else + # use `nokogiri-builtin:local-name-is()` + # else + # use `local-name()` + # describe "doctype:html5" do let(:visitor) do Nokogiri::CSS::XPathVisitor.new( @@ -575,8 +603,13 @@ def visit_pseudo_class_aaron(node) describe "builtins:always" do let(:builtins) { Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS } + it "matches on the element's local-name, ignoring namespaces" do - assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo")) + if Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch") + assert_xpath("//*:foo", parser.parse("foo")) + else + assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo")) + end end it "avoids the wildcard when using namespaces" do @@ -595,7 +628,11 @@ def visit_pseudo_class_aaron(node) let(:builtins) { Nokogiri::CSS::XPathVisitor::BuiltinsConfig::OPTIMAL } it "matches on the element's local-name, ignoring namespaces" do if Nokogiri.uses_libxml? - assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo")) + if Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch") + assert_xpath("//*:foo", parser.parse("foo")) + else + assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo")) + end else assert_xpath("//*[local-name()='foo']", parser.parse("foo")) end diff --git a/test/xml/test_xpath.rb b/test/xml/test_xpath.rb index 4b3a5ccc2c..5c6589cdd5 100644 --- a/test/xml/test_xpath.rb +++ b/test/xml/test_xpath.rb @@ -699,6 +699,27 @@ def collision(nodes) end end end + + describe "XPath wildcard namespaces" do + let(:xml) { <<~XML } + + ns1 child + ns2 child + plain child + + XML + + let(:doc) { Nokogiri::XML::Document.parse(xml) } + + it "allows namespace wildcards" do + skip_unless_libxml2_patch("0009-allow-wildcard-namespaces.patch") + + assert_equal(1, doc.xpath("//n:child", { "n" => "http://nokogiri.org/ns1" }).length) + assert_equal(3, doc.xpath("//*:child").length) + assert_equal(1, doc.xpath("//self::n:child", { "n" => "http://nokogiri.org/ns1" }).length) + assert_equal(3, doc.xpath("//self::*:child").length) + end + end end end end