fix: regression with XPath attributes in CSS selectors

This commit removes "@" from the IDENT token so that we can create a new grammar rule in the parser for XPath attributes. Fixes #2419
sparklemotion · Jan 12, 2022 · 516902a · 516902a
1 parent f52f45f
commit 516902a
Show file tree

Hide file tree

Showing 8 changed files with 373 additions and 247 deletions.
diff --git a/lib/nokogiri/css/parser.rb b/lib/nokogiri/css/parser.rb
diff --git a/lib/nokogiri/css/parser.y b/lib/nokogiri/css/parser.y
@@ -21,6 +21,14 @@ rule
   | SLASH { result = :CHILD_SELECTOR }
   ;
 
+  xpath_attribute_name:
+    '@' IDENT { result = val.join }
+  ;
+
+  xpath_attribute:
+    xpath_attribute_name { result = Node.new(:ATTRIB_NAME, [val[0]]) }
+  ;
+
   simple_selector:
     element_name hcap_0toN {
       result =  if val[1].nil?
@@ -41,6 +49,7 @@ rule
         [Node.new(:ELEMENT_NAME, ['*']), val[0]]
       )
     }
+  | xpath_attribute
   ;
 
   prefixless_combinator_selector:
@@ -115,6 +124,7 @@ rule
       # So we don't add prefix "xmlns:" as in namespaced_ident.
       result = Node.new(:ATTRIB_NAME, [val[0]])
     }
+  | xpath_attribute
   ;
 
   function:
@@ -139,6 +149,7 @@ rule
     NUMBER COMMA expr { result = [val[0], val[2]] }
   | STRING COMMA expr { result = [val[0], val[2]] }
   | IDENT COMMA expr { result = [val[0], val[2]] }
+  | xpath_attribute_name COMMA expr { result = [val[0], val[2]] }
   | NUMBER
   | STRING
   | IDENT {
@@ -153,6 +164,7 @@ rule
         result = val
       end
     }
+  | xpath_attribute_name
   ;
 
   nth:

diff --git a/lib/nokogiri/css/tokenizer.rb b/lib/nokogiri/css/tokenizer.rb
@@ -63,10 +63,10 @@ def _next_token
                   when (text = @ss.scan(/has\([\s]*/))
                      action { [:HAS, text] }
 
-                  when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
+                  when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
                      action { [:FUNCTION, text] }
 
-                  when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
+                  when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
                      action { [:IDENT, text] }
 
                   when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))

diff --git a/lib/nokogiri/css/tokenizer.rex b/lib/nokogiri/css/tokenizer.rex
@@ -13,7 +13,7 @@ macro
   escape    {unicode}|\\[^\n\r\f0-9A-Fa-f]
   nmchar    [_A-Za-z0-9-]|{nonascii}|{escape}
   nmstart   [_A-Za-z]|{nonascii}|{escape}
-  ident     [-@]?({nmstart})({nmchar})*
+  ident     -?({nmstart})({nmchar})*
   name      ({nmchar})+
   string1   "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
   string2   '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'

diff --git a/test/css/test_css_integration.rb b/test/css/test_css_integration.rb
@@ -362,6 +362,48 @@ def assert_result_rows(intarray, result, word = "row")
           expected = doc.css("div")
           assert_equal(expected, result)
         end
+
+        it "handles xpath attribute selectors" do
+          doc = subject_class.parse(<<~HTML)
+            <html><body>
+              <div class="first">
+                <span class="child"></span>
+              </div>
+              <div class="second"></div>
+              <div class="third"></div>
+              <div class="fourth"></div>
+            </body></html>
+          HTML
+
+          result = doc.css("div > @class")
+          assert_equal(["first", "second", "third", "fourth"], result.map(&:to_s))
+
+          result = doc.css("div/@class")
+          assert_equal(["first", "second", "third", "fourth"], result.map(&:to_s))
+
+          result = doc.css("div @class")
+          assert_equal(["first", "child", "second", "third", "fourth"], result.map(&:to_s))
+        end
+
+        it "handles xpath functions" do
+          doc = subject_class.parse(<<~HTML)
+            <html><body>
+              <div>first<span>child</span></div>
+              <div>second</div>
+              <div>third</div>
+              <div>fourth</div>
+            </body></html>
+          HTML
+
+          result = doc.css("div > text()")
+          assert_equal(["first", "second", "third", "fourth"], result.map(&:to_s))
+
+          result = doc.css("div/text()")
+          assert_equal(["first", "second", "third", "fourth"], result.map(&:to_s))
+
+          result = doc.css("div text()")
+          assert_equal(["first", "child", "second", "third", "fourth"], result.map(&:to_s))
+        end
       end
     end
   end

diff --git a/test/css/test_parser.rb b/test/css/test_parser.rb
@@ -37,5 +37,13 @@ class TestNokogiri < Nokogiri::TestCase
           [:FUNCTION, ["nth-child("], ["2"]],],], asts.first.to_a
       )
     end
+
+    it "parses xpath attributes" do
+      ast = parser.parse("a/@href").first
+      assert_equal(
+        [:CHILD_SELECTOR, [:ELEMENT_NAME, ["a"]], [:ATTRIB_NAME, ["@href"]]],
+        ast.to_a
+      )
+    end
   end
 end
diff --git a/test/css/test_tokenizer.rb b/test/css/test_tokenizer.rb
@@ -19,6 +19,14 @@ def setup
         @scanner = Nokogiri::CSS::Tokenizer.new
       end
 
+      def assert_tokens(tokens, scanner)
+        toks = []
+        while (tok = @scanner.next_token)
+          toks << tok
+        end
+        assert_equal(tokens, toks)
+      end
+
       def test_has
         @scanner.scan("a:has(b)")
         assert_tokens(
@@ -189,12 +197,16 @@ def test_significant_space
                        [:RSQUARE, "]"],], @scanner)
       end
 
-      def assert_tokens(tokens, scanner)
-        toks = []
-        while (tok = @scanner.next_token)
-          toks << tok
-        end
-        assert_equal(tokens, toks)
+      def test_xpath_attributes
+        @scanner.scan("a/@href")
+        assert_tokens([[:IDENT, "a"], [:SLASH, "/"], ["@", "@"], [:IDENT, "href"]],
+          @scanner)
+      end
+
+      def test_xpath_functions
+        @scanner.scan("a/text()")
+        assert_tokens([[:IDENT, "a"], [:SLASH, "/"], [:FUNCTION, "text("], [:RPAREN, ")"]],
+          @scanner)
       end
     end
   end

diff --git a/test/css/test_xpath_visitor.rb b/test/css/test_xpath_visitor.rb
@@ -371,6 +371,10 @@ def assert_xpath(expecteds, asts)
       it "handles text() (non-standard)" do
         assert_xpath("//a[child::text()]", parser.parse("a[text()]"))
         assert_xpath("//child::text()", parser.parse("text()"))
+        assert_xpath("//a//child::text()", parser.parse("a text()"))
+        assert_xpath("//a/child::text()", parser.parse("a / text()"))
+        assert_xpath("//a/child::text()", parser.parse("a > text()"))
+        assert_xpath("//a//child::text()", parser.parse("a text()"))
       end
 
       it "handles comment() (non-standard)" do
@@ -615,13 +619,21 @@ def visit_pseudo_class_aaron(node)
         it "avoids the wildcard when using namespaces" do
           assert_xpath("//ns1:foo", parser.parse("ns1|foo"))
         end
+
+        it "avoids the wildcard when using attribute selectors" do
+          assert_xpath("//*:a/@href", parser.parse("a/@href"))
+        end
       end
 
       describe "builtins:never" do
         let(:builtins) { Nokogiri::CSS::XPathVisitor::BuiltinsConfig::NEVER }
         it "matches on the element's local-name, ignoring namespaces" do
           assert_xpath("//*[local-name()='foo']", parser.parse("foo"))
         end
+
+        it "avoids the wildcard when using attribute selectors" do
+          assert_xpath("//*[local-name()='a']/@href", parser.parse("a/@href"))
+        end
       end
 
       describe "builtins:optimal" do