-
-
Notifications
You must be signed in to change notification settings - Fork 897
/
xpath_visitor.rb
357 lines (318 loc) · 12.3 KB
/
xpath_visitor.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# coding: utf-8
# frozen_string_literal: true
module Nokogiri
module CSS
# When translating CSS selectors to XPath queries with Nokogiri::CSS.xpath_for, the XPathVisitor
# class allows for changing some of the behaviors related to builtin xpath functions and quirks
# of HTML5.
class XPathVisitor
# True when the patch list reported by Nokogiri.libxml2_patches contains the
# "allow wildcard namespaces" patch. visit_element_name consults this flag before emitting a
# "*:name" node test.
WILDCARD_NAMESPACES = Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch") # :nodoc:
# Enum to direct XPathVisitor when to use Nokogiri builtin XPath functions.
module BuiltinsConfig
  # Never use Nokogiri builtin functions, always generate vanilla XPath 1.0 queries. This is
  # the default when calling Nokogiri::CSS.xpath_for directly.
  NEVER = :never

  # Always use Nokogiri builtin functions whenever possible. This is probably only useful for testing.
  ALWAYS = :always

  # Only use Nokogiri builtin functions when they will be faster than vanilla XPath. This is
  # the behavior chosen when searching for CSS selectors on a Nokogiri document, fragment, or
  # node.
  OPTIMAL = :optimal

  # :nodoc: array of values for validation (frozen so the set of accepted values cannot be
  # altered at runtime)
  VALUES = [NEVER, ALWAYS, OPTIMAL].freeze
end
# Enum to direct XPathVisitor when to tweak the XPath query to suit the nature of the document
# being searched. Note that searches for CSS selectors from a Nokogiri document, fragment, or
# node will choose the correct option automatically.
module DoctypeConfig
  # The document being searched is an XML document. This is the default.
  XML = :xml

  # The document being searched is an HTML4 document.
  HTML4 = :html4

  # The document being searched is an HTML5 document.
  HTML5 = :html5

  # :nodoc: array of values for validation (frozen so the set of accepted values cannot be
  # altered at runtime)
  VALUES = [XML, HTML4, HTML5].freeze
end
# :call-seq:
# new() → XPathVisitor
# new(builtins:, doctype:) → XPathVisitor
#
# [Parameters]
# - +builtins:+ (BuiltinsConfig) Determine when to use Nokogiri's built-in xpath functions for performance improvements.
# - +doctype:+ (DoctypeConfig) Make document-type-specific accommodations for CSS queries.
#
# [Returns] XPathVisitor
#
def initialize(builtins: BuiltinsConfig::NEVER, doctype: DoctypeConfig::XML)
  # Each setting must be one of the symbols listed in its enum's VALUES array; builtins is
  # checked first, so it is the one reported when both are wrong.
  builtins_known = BuiltinsConfig::VALUES.include?(builtins)
  raise ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter" unless builtins_known

  doctype_known = DoctypeConfig::VALUES.include?(doctype)
  raise ArgumentError, "Invalid values #{doctype.inspect} for doctype: keyword parameter" unless doctype_known

  @builtins = builtins
  @doctype = doctype
end
# :call-seq: config() → Hash
#
# [Returns]
# a Hash representing the configuration of the XPathVisitor, suitable for use as
# part of the CSS cache key.
def config
  # A new Hash is built on every call from the two settings stored by initialize.
  settings = {
    builtins: @builtins,
    doctype: @doctype,
  }
  settings
end
# :stopdoc:
# Translate a FUNCTION node into an XPath expression.
#
# node.value.first is the function name including its opening parenthesis (e.g. "eq(");
# node.value[1] is the first argument when there is one. If the visitor responds to a method
# named visit_function_<name> (parenthesis removed), that method is called instead of the
# translations below.
def visit_function(node)
  name = node.value.first
  handler = :"visit_function_#{name.gsub(/[(]/, "")}"
  return send(handler, node) if respond_to?(handler)

  arg = node.value[1]
  # Evaluated only by the nth-style branches: is the argument a parsed an+b node?
  an_plus_b = -> { arg.is_a?(Nokogiri::CSS::Node) && (arg.type == :NTH) }

  case name
  when /^text\(/
    "child::text()"
  when /^self\(/
    "self::#{arg}"
  when /^eq\(/
    "position()=#{arg}"
  when /^(nth|nth-of-type)\(/
    an_plus_b.call ? nth(arg) : "position()=#{arg}"
  when /^nth-child\(/
    an_plus_b.call ? nth(arg, child: true) : "count(preceding-sibling::*)=#{arg.to_i - 1}"
  when /^nth-last-of-type\(/
    if an_plus_b.call
      nth(arg, last: true)
    else
      offset = arg.to_i - 1
      offset == 0 ? "position()=last()" : "position()=last()-#{offset}"
    end
  when /^nth-last-child\(/
    an_plus_b.call ? nth(arg, last: true, child: true) : "count(following-sibling::*)=#{arg.to_i - 1}"
  when /^(first|first-of-type)\(/
    "position()=1"
  when /^(last|last-of-type)\(/
    "position()=last()"
  when /^contains\(/
    "contains(.,#{arg})"
  when /^gt\(/
    "position()>#{arg}"
  when /^only-child\(/
    "last()=1"
  when /^comment\(/
    "comment()"
  when /^has\(/
    # a nil first value marks a leading combinator, e.g. "has(> a)", "has(~ a)", "has(+ a)";
    # only then is the descendant step "//" left out
    prefix = arg.value[0].nil? ? "." : ".//"
    "#{prefix}#{arg.accept(self)}"
  else
    # xpath function call, let's marshal those arguments: the context node goes first,
    # followed by each argument (nodes are translated, anything else is passed through)
    translated = node.value.drop(1).map do |item|
      item.is_a?(Nokogiri::CSS::Node) ? item.accept(self) : item
    end
    "#{name}#{[".", *translated].join(",")})"
  end
end
# Translate a negation: wrap the translated operand in XPath's not().
def visit_not(node)
  operand = node.value.first
  inner = operand.accept(self)
  # A bare element name inside not() would be read as a child step; prefixing it with self::
  # makes it test the context node itself.
  inner = "self::#{inner}" if operand.type == :ELEMENT_NAME
  "not(#{inner})"
end
# Translate an id token into an equality test on the @id attribute.
def visit_id(node)
  # The token is expected to look like "#name"; the text after "#" becomes the id. A token
  # that does not match produces an empty id string.
  id = node.value.first[/^#(.*)$/, 1]
  "@id='#{id}'"
end
# Translate an attribute condition into an XPath test.
#
# node.value holds the attribute node alone (a bare presence test), or the attribute node, an
# operator symbol, and the value to compare against.
def visit_attribute_condition(node)
  attribute = node.value.first.accept(self)
  return attribute if node.value.length == 1

  operator = node.value[1]
  literal = node.value.last
  # unquoted values are wrapped in single quotes so they form an XPath string literal
  literal = "'#{literal}'" unless /^['"]/.match?(literal)

  # quoted values - see test_attribute_value_with_quotes in test/css/test_parser.rb
  quote = literal[0]
  if (quote == literal[-1]) && %q{"'}.include?(quote)
    inner = literal[1..-2]
    if inner.include?(quote)
      # the text contains its own delimiter: rebuild it with concat(), keeping the pieces in
      # double quotes and every embedded double quote in a single-quoted piece
      pieces = inner.split('"', -1)
      literal = "concat(\"#{pieces.join(%q{",'"',"})}\",\"\")"
    end
  end

  case operator
  when :equal then "#{attribute}=#{literal}"
  when :not_equal then "#{attribute}!=#{literal}"
  when :substring_match then "contains(#{attribute},#{literal})"
  when :prefix_match then "starts-with(#{attribute},#{literal})"
  when :dash_match then "#{attribute}=#{literal} or starts-with(#{attribute},concat(#{literal},'-'))"
  when :includes then css_class(attribute, literal[1..-2]) # strip quotes
  when :suffix_match
    "substring(#{attribute},string-length(#{attribute})-string-length(#{literal})+1,string-length(#{literal}))=#{literal}"
  else
    "#{attribute} #{operator} #{literal}"
  end
end
# Translate a pseudo-class into an XPath predicate expression.
#
# If the visitor responds to a method named visit_pseudo_class_<name>, that method is called
# instead of the translations below. Names that are not recognised are emitted as a function
# call on the context node, "<name>(.)".
def visit_pseudo_class(node)
  head = node.value.first
  # a nested FUNCTION node is translated through its own accept call
  return head.accept(self) if head.is_a?(Nokogiri::CSS::Node) && (head.type == :FUNCTION)

  handler = :"visit_pseudo_class_#{head.gsub(/[(]/, "")}"
  return send(handler, node) if respond_to?(handler)

  case head
  when "first", "first-of-type" then "position()=1"
  when "last", "last-of-type" then "position()=last()"
  when "first-child" then "count(preceding-sibling::*)=0"
  when "last-child" then "count(following-sibling::*)=0"
  when "only-child" then "count(preceding-sibling::*)=0 and count(following-sibling::*)=0"
  when "only-of-type" then "last()=1"
  when "empty" then "not(node())"
  when "parent" then "node()"
  when "root" then "not(parent::*)"
  else
    head + "(.)"
  end
end
# Translate a class condition into a class-membership test on the @class attribute.
def visit_class_condition(node)
  class_name = node.value.first
  css_class("@class", class_name)
end
# Join two translated conditions.
#
# The two sides are normally joined with " and ". When the right-hand side is an *-of-type
# pseudo-class they are joined with "][" instead, which ends the current predicate and starts
# a new one (presumably so its position() test runs against the already-filtered node set).
def visit_combinator(node)
  left = node.value.first
  right = node.value.last
  joiner = is_of_type_pseudo_class?(right) ? "][" : " and "
  "#{left&.accept(self)}#{joiner}#{right.accept(self)}"
end
# Define one visitor method per combinator kind (visit_direct_adjacent_selector,
# visit_following_selector, visit_descendant_selector, visit_child_selector). Each renders the
# left-hand selector when there is one, then the XPath step for that combinator, then the
# right-hand selector.
{
  "direct_adjacent_selector" => "/following-sibling::*[1]/self::",
  "following_selector" => "/following-sibling::",
  "descendant_selector" => "//",
  "child_selector" => "/",
}.each do |kind, step|
  # define_method replaces evaluating a source string: the body is ordinary parsed Ruby, and
  # backtraces point at this file rather than at "(eval)".
  define_method(:"visit_#{kind}") do |node|
    left = node.value.first
    "#{left.accept(self) if left}#{step}#{node.value.last.accept(self)}"
  end
end
# Translate a selector with a condition attached: the translated selector followed by the
# translated condition wrapped in a predicate, "selector[condition]".
def visit_conditional_selector(node)
  selector = node.value.first.accept(self)
  condition = node.value.last.accept(self)
  "#{selector}[#{condition}]"
end
# Translate an element name into an XPath node test.
#
# Outside HTML5 mode, and for names that need no namespace handling, the name is returned
# unchanged. Otherwise a namespace-agnostic test is produced; which form is used depends on
# the builtins setting and on WILDCARD_NAMESPACES.
def visit_element_name(node)
  name = node.value.first
  unless @doctype == DoctypeConfig::HTML5 && html5_element_name_needs_namespace_handling(node)
    return name
  end

  # HTML5 has namespaces that should be ignored in CSS queries
  # https://github.com/sparklemotion/nokogiri/issues/2376
  use_builtins = @builtins == BuiltinsConfig::ALWAYS ||
    (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?)

  if !use_builtins
    "*[local-name()='#{name}']"
  elsif WILDCARD_NAMESPACES
    "*:#{name}"
  else
    "*[nokogiri-builtin:local-name-is('#{name}')]"
  end
end
# Translate an attribute name into an XPath attribute reference by prefixing it with "@".
def visit_attrib_name(node)
  attribute_name = node.value.first
  "@#{attribute_name}"
end
# Hand this visitor to +node+ and return whatever the node's own accept method returns.
def accept(node)
node.accept(self)
end
private
# Decide whether visit_element_name should rewrite this element name into a
# namespace-agnostic test. Returns true or false.
def html5_element_name_needs_namespace_handling(node)
  name = node.value.first
  # if this is the wildcard selector "*", use it as normal
  return false if name == "*"

  # if there is already a namespace (i.e., it is a prefixed QName), use it as normal
  !name.include?(":")
end
# Build the XPath predicate for an "an+b" expression.
#
# node    - a node whose value holds exactly four tokens; value[0] is a, value[2] is the "+"
#           or "-" operator and value[3] is b (value[1] is not read here)
# options - :child => true counts sibling elements instead of using position();
#           :last => true counts from the end instead of from the start
#
# Returns the predicate as a String. Raises ArgumentError when node.value does not hold four
# tokens or when the operator is neither "+" nor "-".
def nth(node, options = {})
  unless node.value.size == 4
    raise ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}"
  end

  a, b = read_a_and_positive_b(node.value)
  position = if options[:child]
    options[:last] ? "(count(following-sibling::*)+1)" : "(count(preceding-sibling::*)+1)"
  else
    options[:last] ? "(last()-position()+1)" : "position()"
  end

  # With a step of zero the expression is just the constant b. The general forms below would
  # emit "mod 0", which is NaN in XPath and never equals 0, so "0n+3" would select nothing
  # instead of the third item.
  return "#{position}=#{b}" if a.zero?

  if b.zero?
    "(#{position} mod #{a})=0"
  else
    compare = a < 0 ? "<=" : ">="
    if a.abs == 1
      "#{position}#{compare}#{b}"
    else
      "(#{position}#{compare}#{b}) and (((#{position}-#{b}) mod #{a.abs})=0)"
    end
  end
end

# Read a and b from the tokens of an an+b expression, rewriting "an-b" as an equivalent
# "an+b'" so that nth only has to deal with the "+" form.
#
# Returns [a, b] as Integers. Raises ArgumentError for any operator other than "+" or "-"
# (whitespace around the operator token is ignored).
def read_a_and_positive_b(values)
  raw_op = values[2]
  op = raw_op.to_s.strip
  a = values[0].to_i
  magnitude = values[3].to_i

  b = case op
  when "+"
    magnitude
  when "-"
    # "0n-b" is the constant -b; taking a remainder modulo zero would raise ZeroDivisionError.
    # Otherwise shift -b by a multiple of a so that it has the same sign as a.
    a.zero? ? -magnitude : a - (magnitude % a)
  else
    raise ArgumentError, "expected an+b node to have either + or - as the operator, but is #{raw_op.inspect}"
  end

  [a, b]
end
# Report whether +node+ is one of the *-of-type pseudo-classes (nth-, first-, last-, only-),
# written either as a plain name or as a nested FUNCTION node.
#
# Returns the match offset (truthy) on a match and nil otherwise, including when +node+ is
# not a PSEUDO_CLASS node at all.
def is_of_type_pseudo_class?(node) # rubocop:disable Naming/PredicateName
  return unless node.type == :PSEUDO_CLASS

  head = node.value[0]
  nested_function = head.is_a?(Nokogiri::CSS::Node) && (head.type == :FUNCTION)
  name = nested_function ? head.value[0] : head
  name =~ /(nth|first|last|only)-of-type(\()?/
end
# Build a test that the whitespace-separated list produced by the XPath expression +hay+
# contains the token +needle+.
def css_class(hay, needle)
  use_builtin = @builtins == BuiltinsConfig::ALWAYS ||
    (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?)

  # use only ordinary xpath functions
  return "contains(concat(' ',normalize-space(#{hay}),' '),' #{needle} ')" unless use_builtin

  # use the builtin implementation
  "nokogiri-builtin:css-class(#{hay},'#{needle}')"
end
end
module XPathVisitorAlwaysUseBuiltins # :nodoc:
  # Deprecated shim: emit a deprecation warning, then return an XPathVisitor configured with
  # builtins: :always.
  def self.new
    # uplevel must be passed as a keyword. Wrapped in a braces Hash it is a positional
    # argument on Ruby 3, so Kernel#warn prints the Hash as a second message and the warning
    # is not attributed to the caller.
    warn(
      "Nokogiri::CSS::XPathVisitorAlwaysUseBuiltins is deprecated and will be removed in a future version of Nokogiri",
      uplevel: 1,
    )
    XPathVisitor.new(builtins: :always)
  end
end
module XPathVisitorOptimallyUseBuiltins # :nodoc:
  # Deprecated shim: emit a deprecation warning, then return an XPathVisitor configured with
  # builtins: :optimal.
  def self.new
    # uplevel must be passed as a keyword. Wrapped in a braces Hash it is a positional
    # argument on Ruby 3, so Kernel#warn prints the Hash as a second message and the warning
    # is not attributed to the caller.
    warn(
      "Nokogiri::CSS::XPathVisitorOptimallyUseBuiltins is deprecated and will be removed in a future version of Nokogiri",
      uplevel: 1,
    )
    XPathVisitor.new(builtins: :optimal)
  end
end
end
end