Skip to content

Commit

Permalink
fix: regression with XPath attributes in CSS selectors
Browse files Browse the repository at this point in the history
This commit removes "@" from the IDENT token so that we can create a
new grammar rule in the parser for XPath attributes.

Fixes #2419
  • Loading branch information
flavorjones committed Jan 12, 2022
1 parent f52f45f commit 516902a
Show file tree
Hide file tree
Showing 8 changed files with 373 additions and 247 deletions.
516 changes: 278 additions & 238 deletions lib/nokogiri/css/parser.rb

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions lib/nokogiri/css/parser.y
Expand Up @@ -21,6 +21,14 @@ rule
| SLASH { result = :CHILD_SELECTOR }
;

xpath_attribute_name:
'@' IDENT { result = val.join }
;

xpath_attribute:
xpath_attribute_name { result = Node.new(:ATTRIB_NAME, [val[0]]) }
;

simple_selector:
element_name hcap_0toN {
result = if val[1].nil?
Expand All @@ -41,6 +49,7 @@ rule
[Node.new(:ELEMENT_NAME, ['*']), val[0]]
)
}
| xpath_attribute
;

prefixless_combinator_selector:
Expand Down Expand Up @@ -115,6 +124,7 @@ rule
# So we don't add prefix "xmlns:" as in namespaced_ident.
result = Node.new(:ATTRIB_NAME, [val[0]])
}
| xpath_attribute
;
function:
Expand All @@ -139,6 +149,7 @@ rule
NUMBER COMMA expr { result = [val[0], val[2]] }
| STRING COMMA expr { result = [val[0], val[2]] }
| IDENT COMMA expr { result = [val[0], val[2]] }
| xpath_attribute_name COMMA expr { result = [val[0], val[2]] }
| NUMBER
| STRING
| IDENT {
Expand All @@ -153,6 +164,7 @@ rule
result = val
end
}
| xpath_attribute_name
;

nth:
Expand Down
4 changes: 2 additions & 2 deletions lib/nokogiri/css/tokenizer.rb
Expand Up @@ -63,10 +63,10 @@ def _next_token
when (text = @ss.scan(/has\([\s]*/))
action { [:HAS, text] }

when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
action { [:FUNCTION, text] }

when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
action { [:IDENT, text] }

when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
Expand Down
2 changes: 1 addition & 1 deletion lib/nokogiri/css/tokenizer.rex
Expand Up @@ -13,7 +13,7 @@ macro
escape {unicode}|\\[^\n\r\f0-9A-Fa-f]
nmchar [_A-Za-z0-9-]|{nonascii}|{escape}
nmstart [_A-Za-z]|{nonascii}|{escape}
ident [-@]?({nmstart})({nmchar})*
ident -?({nmstart})({nmchar})*
name ({nmchar})+
string1 "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
string2 '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
Expand Down
42 changes: 42 additions & 0 deletions test/css/test_css_integration.rb
Expand Up @@ -362,6 +362,48 @@ def assert_result_rows(intarray, result, word = "row")
expected = doc.css("div")
assert_equal(expected, result)
end

it "handles xpath attribute selectors" do
doc = subject_class.parse(<<~HTML)
<html><body>
<div class="first">
<span class="child"></span>
</div>
<div class="second"></div>
<div class="third"></div>
<div class="fourth"></div>
</body></html>
HTML

result = doc.css("div > @class")
assert_equal(["first", "second", "third", "fourth"], result.map(&:to_s))

result = doc.css("div/@class")
assert_equal(["first", "second", "third", "fourth"], result.map(&:to_s))

result = doc.css("div @class")
assert_equal(["first", "child", "second", "third", "fourth"], result.map(&:to_s))
end

it "handles xpath functions" do
doc = subject_class.parse(<<~HTML)
<html><body>
<div>first<span>child</span></div>
<div>second</div>
<div>third</div>
<div>fourth</div>
</body></html>
HTML

result = doc.css("div > text()")
assert_equal(["first", "second", "third", "fourth"], result.map(&:to_s))

result = doc.css("div/text()")
assert_equal(["first", "second", "third", "fourth"], result.map(&:to_s))

result = doc.css("div text()")
assert_equal(["first", "child", "second", "third", "fourth"], result.map(&:to_s))
end
end
end
end
Expand Down
8 changes: 8 additions & 0 deletions test/css/test_parser.rb
Expand Up @@ -37,5 +37,13 @@ class TestNokogiri < Nokogiri::TestCase
[:FUNCTION, ["nth-child("], ["2"]],],], asts.first.to_a
)
end

it "parses xpath attributes" do
ast = parser.parse("a/@href").first
assert_equal(
[:CHILD_SELECTOR, [:ELEMENT_NAME, ["a"]], [:ATTRIB_NAME, ["@href"]]],
ast.to_a
)
end
end
end
24 changes: 18 additions & 6 deletions test/css/test_tokenizer.rb
Expand Up @@ -19,6 +19,14 @@ def setup
@scanner = Nokogiri::CSS::Tokenizer.new
end

def assert_tokens(tokens, scanner)
toks = []
while (tok = @scanner.next_token)
toks << tok
end
assert_equal(tokens, toks)
end

def test_has
@scanner.scan("a:has(b)")
assert_tokens(
Expand Down Expand Up @@ -189,12 +197,16 @@ def test_significant_space
[:RSQUARE, "]"],], @scanner)
end

def assert_tokens(tokens, scanner)
toks = []
while (tok = @scanner.next_token)
toks << tok
end
assert_equal(tokens, toks)
def test_xpath_attributes
@scanner.scan("a/@href")
assert_tokens([[:IDENT, "a"], [:SLASH, "/"], ["@", "@"], [:IDENT, "href"]],
@scanner)
end

def test_xpath_functions
@scanner.scan("a/text()")
assert_tokens([[:IDENT, "a"], [:SLASH, "/"], [:FUNCTION, "text("], [:RPAREN, ")"]],
@scanner)
end
end
end
Expand Down
12 changes: 12 additions & 0 deletions test/css/test_xpath_visitor.rb
Expand Up @@ -371,6 +371,10 @@ def assert_xpath(expecteds, asts)
it "handles text() (non-standard)" do
assert_xpath("//a[child::text()]", parser.parse("a[text()]"))
assert_xpath("//child::text()", parser.parse("text()"))
assert_xpath("//a//child::text()", parser.parse("a text()"))
assert_xpath("//a/child::text()", parser.parse("a / text()"))
assert_xpath("//a/child::text()", parser.parse("a > text()"))
assert_xpath("//a//child::text()", parser.parse("a text()"))
end

it "handles comment() (non-standard)" do
Expand Down Expand Up @@ -615,13 +619,21 @@ def visit_pseudo_class_aaron(node)
it "avoids the wildcard when using namespaces" do
assert_xpath("//ns1:foo", parser.parse("ns1|foo"))
end

it "avoids the wildcard when using attribute selectors" do
assert_xpath("//*:a/@href", parser.parse("a/@href"))
end
end

describe "builtins:never" do
let(:builtins) { Nokogiri::CSS::XPathVisitor::BuiltinsConfig::NEVER }
it "matches on the element's local-name, ignoring namespaces" do
assert_xpath("//*[local-name()='foo']", parser.parse("foo"))
end

it "avoids the wildcard when using attribute selectors" do
assert_xpath("//*[local-name()='a']/@href", parser.parse("a/@href"))
end
end

describe "builtins:optimal" do
Expand Down

0 comments on commit 516902a

Please sign in to comment.