diff --git a/CHANGELOG.md b/CHANGELOG.md index a2e0bfce24..5bf4fcad36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ A related discussion about Trust exists at [#2357](https://github.com/sparklemot ### Fixed * XML::Builder blocks restore context properly when exceptions are raised. [[#2372](https://github.com/sparklemotion/nokogiri/issues/2372)] (Thanks, [@ric2b](https://github.com/ric2b) and [@rinthedev](https://github.com/rinthedev)!) +* Error recovery from in-context parsing (e.g., `Node#parse`) now always uses the correct `DocumentFragment` class. Previously `Nokogiri::HTML4::DocumentFragment` was always used, even for XML documents. [[#1158](https://github.com/sparklemotion/nokogiri/issues/1158)] * [CRuby] Fix memory leak in `Document#canonicalize` when inclusive namespaces are passed in. [[#2345](https://github.com/sparklemotion/nokogiri/issues/2345)] * [CRuby] Fix memory leak in `Document#canonicalize` when an argument type error is raised. [[#2345](https://github.com/sparklemotion/nokogiri/issues/2345)] * [CRuby] Fix memory leak in `EncodingHandler` where iconv handlers were not being cleaned up. [[#2345](https://github.com/sparklemotion/nokogiri/issues/2345)] diff --git a/lib/nokogiri.rb b/lib/nokogiri.rb index a4d22e45b5..76fa9b662f 100644 --- a/lib/nokogiri.rb +++ b/lib/nokogiri.rb @@ -115,6 +115,7 @@ def Nokogiri(*args, &block) end require_relative "nokogiri/version" +require_relative "nokogiri/class_resolver" require_relative "nokogiri/syntax_error" require_relative "nokogiri/xml" require_relative "nokogiri/xslt" diff --git a/lib/nokogiri/class_resolver.rb b/lib/nokogiri/class_resolver.rb new file mode 100644 index 0000000000..e2b21f6a6c --- /dev/null +++ b/lib/nokogiri/class_resolver.rb @@ -0,0 +1,67 @@ +# coding: utf-8 +# frozen_string_literal: true + +require "set" + +module Nokogiri + # + # Some classes in Nokogiri are namespaced as a group, for example + # Document, DocumentFragment, and Builder. + # + # It's sometimes necessary to look up the related class, e.g.: + # + # XML::Builder → XML::Document + # HTML4::Builder → HTML4::Document + # HTML5::Document → HTML5::DocumentFragment + # + # This module is included into those key classes who need to do this. + # + module ClassResolver + # #related_class restricts matching namespaces to those matching this set. + VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML"]) + + # :call-seq: + # related_class(class_name) → Class + # + # Find a class constant within the + # + # Some examples: + # + # Nokogiri::XML::Document.new.related_class("DocumentFragment") + # # => Nokogiri::XML::DocumentFragment + # Nokogiri::HTML4::Document.new.related_class("DocumentFragment") + # # => Nokogiri::HTML4::DocumentFragment + # + # Note this will also work for subclasses that follow the same convention, e.g.: + # + # Loofah::HTML::Document.new.related_class("DocumentFragment") + # # => Loofah::HTML::DocumentFragment + # + # And even if it's a subclass, this will iterate through the superclasses: + # + # class ThisIsATopLevelClass < Nokogiri::HTML4::Builder ; end + # ThisIsATopLevelClass.new.related_class("Document") + # # => Nokogiri::HTML4::Document + # + def related_class(class_name) + klass = nil + inspecting = self.class + + while inspecting + namespace_path = inspecting.name.split("::")[0..-2] + inspecting = inspecting.superclass + + next unless VALID_NAMESPACES.include?(namespace_path.last) + + related_class_name = (namespace_path << class_name).join("::") + klass = begin + Object.const_get(related_class_name) + rescue NameError + nil + end + break if klass + end + klass + end + end +end diff --git a/lib/nokogiri/xml/builder.rb b/lib/nokogiri/xml/builder.rb index 80428f6d4f..5eb8dbe8d8 100644 --- a/lib/nokogiri/xml/builder.rb +++ b/lib/nokogiri/xml/builder.rb @@ -262,6 +262,8 @@ module XML # # class Builder + include Nokogiri::ClassResolver + DEFAULT_DOCUMENT_OPTIONS = { namespace_inheritance: true } # The current Document object being built @@ -307,13 +309,7 @@ def initialize(options = {}, root = nil, &block) @doc = root.document @parent = root else - klassname = "::" + (self.class.name.split("::")[0..-2] + ["Document"]).join("::") - klass = begin - Object.const_get(klassname) - rescue NameError - Nokogiri::XML::Document - end - @parent = @doc = klass.new + @parent = @doc = related_class("Document").new end @context = nil diff --git a/lib/nokogiri/xml/node.rb b/lib/nokogiri/xml/node.rb index 16a2463e1a..acc4cfc858 100644 --- a/lib/nokogiri/xml/node.rb +++ b/lib/nokogiri/xml/node.rb @@ -56,6 +56,7 @@ module XML class Node include Nokogiri::XML::PP::Node include Nokogiri::XML::Searchable + include Nokogiri::ClassResolver include Enumerable # Element node type, see Nokogiri::XML::Node#element? @@ -936,8 +937,7 @@ def matches?(selector) # Create a DocumentFragment containing +tags+ that is relative to _this_ # context node. def fragment(tags) - type = document.html? ? Nokogiri::HTML : Nokogiri::XML - type::DocumentFragment.new(document, tags, self) + document.related_class("DocumentFragment").new(document, tags, self) end ### @@ -988,7 +988,7 @@ def parse(string_or_io, options = nil) node_set = in_context(contents, options.to_i) if node_set.empty? && (document.errors.length > error_count) if options.recover? - fragment = Nokogiri::HTML4::DocumentFragment.parse(contents) + fragment = document.related_class("DocumentFragment").parse(contents) node_set = fragment.children else raise document.errors[error_count] diff --git a/nokogiri.gemspec b/nokogiri.gemspec index 7ce50c1790..9db6356c3d 100644 --- a/nokogiri.gemspec +++ b/nokogiri.gemspec @@ -232,6 +232,7 @@ Gem::Specification.new do |spec| "lib/nekodtd.jar", "lib/nekohtml.jar", "lib/nokogiri.rb", + "lib/nokogiri/class_resolver.rb", "lib/nokogiri/css.rb", "lib/nokogiri/css/node.rb", "lib/nokogiri/css/parser.rb", diff --git a/test/xml/test_builder.rb b/test/xml/test_builder.rb index 89a9ac522a..30932b6f5b 100644 --- a/test/xml/test_builder.rb +++ b/test/xml/test_builder.rb @@ -424,11 +424,14 @@ def test_builder_uses_proper_document_class xml_builder = Nokogiri::XML::Builder.new assert_instance_of(Nokogiri::XML::Document, xml_builder.doc) - html_builder = Nokogiri::HTML::Builder.new - assert_instance_of(Nokogiri::HTML::Document, html_builder.doc) + html_builder = Nokogiri::HTML4::Builder.new + assert_instance_of(Nokogiri::HTML4::Document, html_builder.doc) foo_builder = ThisIsATestBuilder.new assert_instance_of(Nokogiri::XML::Document, foo_builder.doc) + + foo_builder = ThisIsAnotherTestBuilder.new + assert_instance_of(Nokogiri::HTML4::Document, foo_builder.doc) end private @@ -443,3 +446,7 @@ def namespaces_defined_on(node) class ThisIsATestBuilder < Nokogiri::XML::Builder # this exists for the test_builder_uses_proper_document_class and should be empty end + +class ThisIsAnotherTestBuilder < Nokogiri::HTML4::Builder + # this exists for the test_builder_uses_proper_document_class and should be empty +end diff --git a/test/xml/test_node.rb b/test/xml/test_node.rb index b179a6fdaf..94302c766d 100644 --- a/test/xml/test_node.rb +++ b/test/xml/test_node.rb @@ -91,33 +91,45 @@ def test_parse_config_option end def test_node_context_parsing_of_malformed_html_fragment - doc = HTML.parse("
") + doc = HTML4.parse("
") context_node = doc.at_css("div") nodeset = context_node.parse("
") assert_equal(1, doc.errors.length) assert_equal(1, nodeset.length) assert_equal("
", nodeset.to_s) + assert_instance_of(Nokogiri::HTML4::Document, nodeset.document) + assert_instance_of(Nokogiri::HTML4::Document, nodeset.first.document) end def test_node_context_parsing_of_malformed_html_fragment_with_recover_is_corrected - doc = HTML.parse("
") + doc = HTML4.parse("
") context_node = doc.at_css("div") nodeset = context_node.parse("
", &:recover) assert_equal(1, doc.errors.length) assert_equal(1, nodeset.length) assert_equal("
", nodeset.to_s) + assert_instance_of(Nokogiri::HTML4::Document, nodeset.document) + assert_instance_of(Nokogiri::HTML4::Document, nodeset.first.document) end def test_node_context_parsing_of_malformed_html_fragment_without_recover_is_not_corrected - doc = HTML.parse("
") + doc = HTML4.parse("
") context_node = doc.at_css("div") assert_raises(Nokogiri::XML::SyntaxError) do context_node.parse("
", &:strict) end end + def test_node_context_parsing_of_malformed_xml_fragment_uses_the_right_class_to_recover + doc = XML.parse("
") + context_node = doc.at_css("div") + nodeset = context_node.parse("
") @@ -421,6 +433,14 @@ def test_duplicate_node_removes_namespace end end + def test_fragment_creates_appropriate_class + frag = Nokogiri.XML("").at_css("child").fragment("") + assert_instance_of(Nokogiri::XML::DocumentFragment, frag) + + frag = Nokogiri.HTML4("").at_css("child").fragment("") + assert_instance_of(Nokogiri::HTML4::DocumentFragment, frag) + end + def test_fragment_creates_elements apple = xml.fragment("") apple.children.each do |child|