Skip to content

Commit

Permalink
Merge pull request #2422 from sparklemotion/2419-css-at-attribute-syntax
Browse files Browse the repository at this point in the history
fix: regression with XPath attributes in CSS selectors
  • Loading branch information
flavorjones committed Jan 13, 2022
2 parents 3240a07 + 538e11d commit 9b8fd5a
Show file tree
Hide file tree
Showing 10 changed files with 1,096 additions and 959 deletions.
691 changes: 351 additions & 340 deletions lib/nokogiri/css/parser.rb

Large diffs are not rendered by default.

485 changes: 241 additions & 244 deletions lib/nokogiri/css/parser.y

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions lib/nokogiri/css/tokenizer.rb
Expand Up @@ -63,10 +63,10 @@ def _next_token
when (text = @ss.scan(/has\([\s]*/))
action { [:HAS, text] }

when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
action { [:FUNCTION, text] }

when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
action { [:IDENT, text] }

when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
Expand Down
2 changes: 1 addition & 1 deletion lib/nokogiri/css/tokenizer.rex
Expand Up @@ -13,7 +13,7 @@ macro
escape {unicode}|\\[^\n\r\f0-9A-Fa-f]
nmchar [_A-Za-z0-9-]|{nonascii}|{escape}
nmstart [_A-Za-z]|{nonascii}|{escape}
ident [-@]?({nmstart})({nmchar})*
ident -?({nmstart})({nmchar})*
name ({nmchar})+
string1 "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
string2 '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
Expand Down
34 changes: 16 additions & 18 deletions lib/nokogiri/css/xpath_visitor.rb
Expand Up @@ -128,8 +128,11 @@ def visit_function(node)
is_direct = node.value[1].value[0].nil? # e.g. "has(> a)", "has(~ a)", "has(+ a)"
".#{"//" unless is_direct}#{node.value[1].accept(self)}"
else
# non-standard. this looks like a function call.
args = ["."] + node.value[1..-1]
# xpath function call, let's marshal those arguments
args = ["."]
args += node.value[1..-1].map do |n|
n.is_a?(Nokogiri::CSS::Node) ? n.accept(self) : n
end
"#{node.value.first}#{args.join(",")})"
end
end
Expand All @@ -149,17 +152,8 @@ def visit_id(node)
end

def visit_attribute_condition(node)
attribute = if (node.value.first.type == :FUNCTION) || (node.value.first.value.first =~ /::/)
""
else
"@"
end
attribute += node.value.first.accept(self)

# non-standard. attributes starting with '@'
attribute.gsub!(/^@@/, "@")

return attribute unless node.value.length == 3
attribute = node.value.first.accept(self)
return attribute if node.value.length == 1

value = node.value.last
value = "'#{value}'" unless /^['"]/.match?(value)
Expand Down Expand Up @@ -249,10 +243,7 @@ def visit_conditional_selector(node)
end

def visit_element_name(node)
if @doctype == DoctypeConfig::HTML5 && node.value.first != "*"
# if there is already a namespace, use it as normal
return node.value.first if node.value.first.include?(":")

if @doctype == DoctypeConfig::HTML5 && html5_element_name_needs_namespace_handling(node)
# HTML5 has namespaces that should be ignored in CSS queries
# https://github.com/sparklemotion/nokogiri/issues/2376
if @builtins == BuiltinsConfig::ALWAYS || (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?)
Expand All @@ -270,7 +261,7 @@ def visit_element_name(node)
end

def visit_attrib_name(node)
node.value.first
"@#{node.value.first}"
end

def accept(node)
Expand All @@ -279,6 +270,13 @@ def accept(node)

private

# Reports whether the element name held by +node+ should go through the HTML5
# namespace handling in visit_element_name, or be used as written.
#
# node - an object whose +value.first+ is the element name string.
#
# Returns true only when the name is neither the wildcard "*" nor already
# namespace-prefixed (i.e. contains a ":").
def html5_element_name_needs_namespace_handling(node)
  element_name = node.value.first

  # the wildcard selector "*" is used as normal
  return false if element_name == "*"

  # a name that already has a namespace (a prefixed QName) is used as normal
  !element_name.include?(":")
end

def nth(node, options = {})
raise ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}" unless node.value.size == 4

Expand Down
91 changes: 51 additions & 40 deletions lib/nokogiri/xml/searchable.rb
Expand Up @@ -6,7 +6,7 @@ module XML
#
# The Searchable module declares the interface used for searching your DOM.
#
# It implements the public methods `search`, `css`, and `xpath`,
# It implements the public methods #search, #css, and #xpath,
# as well as allowing specific implementations to specialize some
# of the important behaviors.
#
Expand All @@ -30,25 +30,22 @@ module Searchable
# node.search('.//bike:tire', {'bike' => 'http://schwinn.com/'})
# node.search('bike|tire', {'bike' => 'http://schwinn.com/'})
#
# For XPath queries, a hash of variable bindings may also be
# appended to the namespace bindings. For example:
# For XPath queries, a hash of variable bindings may also be appended to the namespace
# bindings. For example:
#
# node.search('.//address[@domestic=$value]', nil, {:value => 'Yes'})
#
# Custom XPath functions and CSS pseudo-selectors may also be
# defined. To define custom functions create a class and
# implement the function you want to define. The first argument
# to the method will be the current matching NodeSet. Any other
# arguments are ones that you pass in. Note that this class may
# appear anywhere in the argument list. For example:
#
# node.search('.//title[regex(., "\w+")]', 'div.employee:regex("[0-9]+")'
# Class.new {
# def regex node_set, regex
# node_set.find_all { |node| node['some_attribute'] =~ /#{regex}/ }
# end
# }.new
# )
# 💡 Custom XPath functions and CSS pseudo-selectors may also be defined. To define custom
# functions create a class and implement the function you want to define. The first argument
# to the method will be the current matching NodeSet. Any other arguments are ones that you
# pass in. Note that this class may appear anywhere in the argument list. For example:
#
# handler = Class.new {
# def regex node_set, regex
# node_set.find_all { |node| node['some_attribute'] =~ /#{regex}/ }
# end
# }.new
# node.search('.//title[regex(., "\w+")]', 'div.employee:regex("[0-9]+")', handler)
#
# See Searchable#xpath and Searchable#css for further usage help.
def search(*args)
Expand Down Expand Up @@ -92,25 +89,40 @@ def at(*args)
#
# node.css('bike|tire', {'bike' => 'http://schwinn.com/'})
#
# Custom CSS pseudo classes may also be defined. To define
# custom pseudo classes, create a class and implement the custom
# pseudo class you want defined. The first argument to the
# method will be the current matching NodeSet. Any other
# arguments are ones that you pass in. For example:
# 💡 Custom CSS pseudo classes may also be defined which are mapped to a custom XPath
# function. To define custom pseudo classes, create a class and implement the custom pseudo
# class you want defined. The first argument to the method will be the matching context
# NodeSet. Any other arguments are ones that you pass in. For example:
#
# node.css('title:regex("\w+")', Class.new {
# def regex node_set, regex
# handler = Class.new {
# def regex(node_set, regex)
# node_set.find_all { |node| node['some_attribute'] =~ /#{regex}/ }
# end
# }.new)
# }.new
# node.css('title:regex("\w+")', handler)
#
# 💡 Some XPath syntax is supported in CSS queries. For example, to query for an attribute:
#
# Note that the CSS query string is case-sensitive with regards
# to your document type. That is, if you're looking for "H1" in
# an HTML document, you'll never find anything, since HTML tags
# will match only lowercase CSS queries. However, "H1" might be
# found in an XML document, where tags names are case-sensitive
# (e.g., "H1" is distinct from "h1").
# node.css('img > @href') # returns all +href+ attributes on an +img+ element
# node.css('img / @href') # same
#
# # ⚠ this returns +class+ attributes from all +div+ elements AND THEIR CHILDREN!
# node.css('div @class')
#
# 💡 Array-like syntax is supported in CSS queries as an alternative to using +:nth-child()+.
#
# ⚠ NOTE that indices are 1-based like +:nth-child+ and not 0-based like Ruby Arrays. For
# example:
#
# # equivalent to 'li:nth-child(2)'
# node.css('li[2]') # retrieve the second li element in a list
#
# ⚠ NOTE that the CSS query string is case-sensitive with regards to your document type. HTML
# tags will match only lowercase CSS queries, so if you search for "H1" in an HTML document,
# you'll never find anything. However, "H1" might be found in an XML document, where tag
# names are case-sensitive (e.g., "H1" is distinct from "h1").
def css(*args)
rules, handler, ns, _ = extract_params(args)

Expand Down Expand Up @@ -147,18 +159,17 @@ def at_css(*args)
#
# node.xpath('.//address[@domestic=$value]', nil, {:value => 'Yes'})
#
# Custom XPath functions may also be defined. To define custom
# functions create a class and implement the function you want
# to define. The first argument to the method will be the
# current matching NodeSet. Any other arguments are ones that
# you pass in. Note that this class may appear anywhere in the
# argument list. For example:
# 💡 Custom XPath functions may also be defined. To define custom functions create a class and
# implement the function you want to define. The first argument to the method will be the
# current matching NodeSet. Any other arguments are ones that you pass in. Note that this
# class may appear anywhere in the argument list. For example:
#
# node.xpath('.//title[regex(., "\w+")]', Class.new {
# def regex node_set, regex
# handler = Class.new {
# def regex(node_set, regex)
# node_set.find_all { |node| node['some_attribute'] =~ /#{regex}/ }
# end
# }.new)
# }.new
# node.xpath('.//title[regex(., "\w+")]', handler)
#
def xpath(*args)
paths, handler, ns, binds = extract_params(args)
Expand Down

0 comments on commit 9b8fd5a

Please sign in to comment.