From 71bd4b256687861c12fd2a53cee798415865e54c Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Fri, 24 Dec 2021 15:06:56 -0500 Subject: [PATCH] feat: DocumentFragment constructors all take parse options param Similar to Document constructors, we can now pass in parse options as a parameter or via a config block. Fixes #1692 Fixes #1844 Co-authored-by: Jack McCracken --- CHANGELOG.md | 2 + ext/java/nokogiri/XmlDocumentFragment.java | 7 +- .../internals/NokogiriStrictErrorHandler.java | 2 +- ext/nokogiri/xml_document_fragment.c | 2 - lib/nokogiri/html4.rb | 4 +- lib/nokogiri/html4/document_fragment.rb | 13 +- lib/nokogiri/xml.rb | 4 +- lib/nokogiri/xml/document_fragment.rb | 28 ++-- test/html4/test_document_fragment.rb | 149 +++++++++++++++++ test/xml/test_document_fragment.rb | 150 ++++++++++++++++++ 10 files changed, 333 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5085fd3294..9f7ea16d0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ This release ends support for: * XML::Builder blocks restore context properly when exceptions are raised. [[#2372](https://github.com/sparklemotion/nokogiri/issues/2372)] (Thanks, [@ric2b](https://github.com/ric2b) and [@rinthedev](https://github.com/rinthedev)!) * Error recovery from in-context parsing (e.g., `Node#parse`) now always uses the correct `DocumentFragment` class. Previously `Nokogiri::HTML4::DocumentFragment` was always used, even for XML documents. [[#1158](https://github.com/sparklemotion/nokogiri/issues/1158)] * `DocumentFragment#>` now works properly, matching a CSS selector against only the fragment roots. [[#1857](https://github.com/sparklemotion/nokogiri/issues/1857)] +* `XML::DocumentFragment#errors` now correctly contains any parsing errors encountered. Previously this was always empty. (Note that `HTML::DocumentFragment#errors` did not need to be fixed.) * [CRuby] Fix memory leak in `Document#canonicalize` when inclusive namespaces are passed in. [[#2345](https://github.com/sparklemotion/nokogiri/issues/2345)] * [CRuby] Fix memory leak in `Document#canonicalize` when an argument type error is raised. [[#2345](https://github.com/sparklemotion/nokogiri/issues/2345)] * [CRuby] Fix memory leak in `EncodingHandler` where iconv handlers were not being cleaned up. [[#2345](https://github.com/sparklemotion/nokogiri/issues/2345)] @@ -48,6 +49,7 @@ This release ends support for: ### Improved +* `{XML,HTML4}::DocumentFragment` constructors all now take an optional parse options parameter or block (similar to Document constructors). [[#1692](https://github.com/sparklemotion/nokogiri/issues/1692)] (Thanks, [@JackMc](https://github.com/JackMc)!) * [CRuby] XML::Reader#encoding will return the encoding detected by the parser when it's not passed to the constructor. [[#980](https://github.com/sparklemotion/nokogiri/issues/980)] * [CRuby] Handle abruptly-closed HTML comments as WHATWG recommends for browsers. (Thanks to HackerOne user [tehryanx](https://hackerone.com/tehryanx?type=user) for reporting this!) * [CRuby] `Node#line` is no longer capped at 65535. libxml v2.9.0 and later support a new parse option, exposed as `Nokogiri::XML::ParseOptions::PARSE_BIG_LINES` and set in `ParseOptions::DEFAULT_XML`, `::DEFAULT_XSLT`, `::DEFAULT_HTML`, and `::DEFAULT_SCHEMA`. (Note that JRuby never had this problem.) [[#1764](https://github.com/sparklemotion/nokogiri/issues/1764), [#1493](https://github.com/sparklemotion/nokogiri/issues/1493), [#1617](https://github.com/sparklemotion/nokogiri/issues/1617), [#1505](https://github.com/sparklemotion/nokogiri/issues/1505), [#1003](https://github.com/sparklemotion/nokogiri/issues/1003), [#533](https://github.com/sparklemotion/nokogiri/issues/533)] diff --git a/ext/java/nokogiri/XmlDocumentFragment.java b/ext/java/nokogiri/XmlDocumentFragment.java index 7d8572605f..4281df113e 100644 --- a/ext/java/nokogiri/XmlDocumentFragment.java +++ b/ext/java/nokogiri/XmlDocumentFragment.java @@ -17,6 +17,7 @@ import org.jruby.RubyString; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; +import org.jruby.runtime.Block; import org.jruby.runtime.Helpers; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; @@ -46,9 +47,9 @@ public class XmlDocumentFragment extends XmlNode super(ruby, klazz); } - @JRubyMethod(name = "new", meta = true, required = 1, optional = 2) + @JRubyMethod(name = "new", meta = true, required = 1, optional = 3) public static IRubyObject - rbNew(ThreadContext context, IRubyObject cls, IRubyObject[] args) + rbNew(ThreadContext context, IRubyObject cls, IRubyObject[] args, Block block) { if (args.length < 1) { throw context.runtime.newArgumentError(args.length, 1); @@ -73,7 +74,7 @@ public class XmlDocumentFragment extends XmlNode fragment.setDocument(context, doc); fragment.setNode(context.runtime, doc.getDocument().createDocumentFragment()); - Helpers.invoke(context, fragment, "initialize", args); + Helpers.invoke(context, fragment, "initialize", args, block); return fragment; } diff --git a/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java b/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java index 10cbc6f441..78118d1de7 100644 --- a/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +++ b/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java @@ -57,6 +57,6 @@ public class NokogiriStrictErrorHandler extends NokogiriErrorHandler warning(String domain, String key, XMLParseException e) throws XMLParseException { if (!nowarning) { throw e; } - if (!usesNekoHtml(domain)) { addError(e); } + else { addError(e); } } } diff --git a/ext/nokogiri/xml_document_fragment.c b/ext/nokogiri/xml_document_fragment.c index f509815fda..81639576d7 100644 --- a/ext/nokogiri/xml_document_fragment.c +++ b/ext/nokogiri/xml_document_fragment.c @@ -28,8 +28,6 @@ new (int argc, VALUE *argv, VALUE klass) rb_node = noko_xml_node_wrap(klass, node); rb_obj_call_init(rb_node, argc, argv); - if (rb_block_given_p()) { rb_yield(rb_node); } - return rb_node; } diff --git a/lib/nokogiri/html4.rb b/lib/nokogiri/html4.rb index fe543d6d88..e755391a9c 100644 --- a/lib/nokogiri/html4.rb +++ b/lib/nokogiri/html4.rb @@ -26,8 +26,8 @@ def parse(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT #### # Parse a fragment from +string+ in to a NodeSet. - def fragment(string, encoding = nil) - HTML4::DocumentFragment.parse(string, encoding) + def fragment(string, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) + HTML4::DocumentFragment.parse(string, encoding, options, &block) end end diff --git a/lib/nokogiri/html4/document_fragment.rb b/lib/nokogiri/html4/document_fragment.rb index 838a4f776a..9bfddef5da 100644 --- a/lib/nokogiri/html4/document_fragment.rb +++ b/lib/nokogiri/html4/document_fragment.rb @@ -5,7 +5,7 @@ module HTML4 class DocumentFragment < Nokogiri::XML::DocumentFragment #### # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+ - def self.parse(tags, encoding = nil) + def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) doc = HTML4::Document.new encoding ||= if tags.respond_to?(:encoding) @@ -21,15 +21,18 @@ def self.parse(tags, encoding = nil) doc.encoding = encoding - new(doc, tags) + new(doc, tags, nil, options, &block) end - def initialize(document, tags = nil, ctx = nil) + def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML) return self unless tags + options = Nokogiri::XML::ParseOptions.new(options) if Integer === options + yield options if block_given? + if ctx preexisting_errors = document.errors.dup - node_set = ctx.parse("
#{tags}
") + node_set = ctx.parse("
#{tags}
", options) node_set.first.children.each { |child| child.parent = self } unless node_set.empty? self.errors = document.errors - preexisting_errors else @@ -40,7 +43,7 @@ def initialize(document, tags = nil, ctx = nil) "/html/body/node()" end - temp_doc = HTML4::Document.parse("#{tags}", nil, document.encoding) + temp_doc = HTML4::Document.parse("#{tags}", nil, document.encoding, options) temp_doc.xpath(path).each { |child| child.parent = self } self.errors = temp_doc.errors end diff --git a/lib/nokogiri/xml.rb b/lib/nokogiri/xml.rb index 0b01daefa2..9db8fe1ebf 100644 --- a/lib/nokogiri/xml.rb +++ b/lib/nokogiri/xml.rb @@ -39,8 +39,8 @@ def parse(thing, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML, #### # Parse a fragment from +string+ in to a NodeSet. - def fragment(string) - XML::DocumentFragment.parse(string) + def fragment(string, options = ParseOptions::DEFAULT_XML, &block) + XML::DocumentFragment.parse(string, options, &block) end end end diff --git a/lib/nokogiri/xml/document_fragment.rb b/lib/nokogiri/xml/document_fragment.rb index 36070ac4e4..b5b3e6a667 100644 --- a/lib/nokogiri/xml/document_fragment.rb +++ b/lib/nokogiri/xml/document_fragment.rb @@ -3,26 +3,36 @@ module Nokogiri module XML class DocumentFragment < Nokogiri::XML::Node + #### + # Create a Nokogiri::XML::DocumentFragment from +tags+ + def self.parse(tags, options = ParseOptions::DEFAULT_XML, &block) + new(XML::Document.new, tags, nil, options, &block) + end + ## # Create a new DocumentFragment from +tags+. # # If +ctx+ is present, it is used as a context node for the # subtree created, e.g., namespaces will be resolved relative # to +ctx+. - def initialize(document, tags = nil, ctx = nil) + def initialize(document, tags = nil, ctx = nil, options = ParseOptions::DEFAULT_XML) return self unless tags + options = Nokogiri::XML::ParseOptions.new(options) if Integer === options + yield options if block_given? + children = if ctx # Fix for issue#490 if Nokogiri.jruby? # fix for issue #770 - ctx.parse("#{tags}").children + ctx.parse("#{tags}", options).children else - ctx.parse(tags) + ctx.parse(tags, options) end else - XML::Document.parse("#{tags}") \ - .xpath("/root/node()") + wrapper_doc = XML::Document.parse("#{tags}", nil, nil, options) + self.errors = wrapper_doc.errors + wrapper_doc.xpath("/root/node()") end children.each { |child| child.parent = self } end @@ -125,14 +135,6 @@ def search(*rules) alias_method :serialize, :to_s - class << self - #### - # Create a Nokogiri::XML::DocumentFragment from +tags+ - def parse(tags) - new(XML::Document.new, tags) - end - end - # A list of Nokogiri::XML::SyntaxError found when parsing a document def errors document.errors diff --git a/test/html4/test_document_fragment.rb b/test/html4/test_document_fragment.rb index a659d1445e..d9ba2e9616 100644 --- a/test/html4/test_document_fragment.rb +++ b/test/html4/test_document_fragment.rb @@ -306,6 +306,155 @@ def test_dup_should_create_an_html_document_fragment assert_instance_of(Nokogiri::HTML::DocumentFragment, duplicate) end + describe "parse options" do + let(:html4_default) do + Nokogiri::XML::ParseOptions.new(Nokogiri::XML::ParseOptions::DEFAULT_HTML) + end + + let(:html4_strict) do + Nokogiri::XML::ParseOptions.new(Nokogiri::XML::ParseOptions::DEFAULT_HTML).norecover + end + + let(:input) { "
foofoo
", frag.to_html) + refute_empty(frag.errors) + end + + it "accepts options" do + frag = Nokogiri::HTML4.fragment(input, nil, html4_default) + assert_equal("
foo
", frag.to_html) + refute_empty(frag.errors) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::HTML4.fragment(input, nil, html4_strict) + end + end + + it "takes a config block" do + default_config = nil + Nokogiri::HTML4.fragment(input) do |config| + default_config = config + end + refute(default_config.strict?) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::HTML4.fragment(input) do |config| + config.norecover + end + end + end + end + + describe "HTML4::DocumentFragment.parse" do + it "has sane defaults" do + frag = Nokogiri::HTML4::DocumentFragment.parse(input) + assert_equal("
foo
", frag.to_html) + refute_empty(frag.errors) + end + + it "accepts options" do + frag = Nokogiri::HTML4::DocumentFragment.parse(input, nil, html4_default) + assert_equal("
foo
", frag.to_html) + refute_empty(frag.errors) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::HTML4::DocumentFragment.parse(input, nil, html4_strict) + end + end + + it "takes a config block" do + default_config = nil + Nokogiri::HTML4::DocumentFragment.parse(input) do |config| + default_config = config + end + refute(default_config.strict?) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::HTML4::DocumentFragment.parse(input) do |config| + config.norecover + end + end + end + end + + describe "HTML4::DocumentFragment.new" do + describe "without a context node" do + it "has sane defaults" do + frag = Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input) + assert_equal("
foo
", frag.to_html) + refute_empty(frag.errors) + end + + it "accepts options" do + frag = Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input, nil, html4_default) + assert_equal("
foo
", frag.to_html) + refute_empty(frag.errors) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input, nil, html4_strict) + end + end + + it "takes a config block" do + default_config = nil + Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input) do |config| + default_config = config + end + refute(default_config.strict?) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input) do |config| + config.norecover + end + end + end + end + + describe "with a context node" do + let (:document) { Nokogiri::HTML4::Document.parse("") } + let (:context_node) { document.at_css("context") } + + it "has sane defaults" do + frag = Nokogiri::HTML4::DocumentFragment.new(document, input, context_node) + assert_equal("
foo
", frag.to_html) + refute_empty(frag.errors) + end + + it "accepts options" do + frag = Nokogiri::HTML4::DocumentFragment.new(document, input, context_node, html4_default) + assert_equal("
foo
", frag.to_html) + refute_empty(frag.errors) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::HTML4::DocumentFragment.new(document, input, context_node, html4_strict) + end + end + + it "takes a config block" do + default_config = nil + Nokogiri::HTML4::DocumentFragment.new(document, input, context_node) do |config| + default_config = config + end + refute(default_config.strict?) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::HTML4::DocumentFragment.new(document, input, context_node) do |config| + config.norecover + end + end + end + end + end + end + describe "subclassing" do let(:klass) do Class.new(Nokogiri::HTML::DocumentFragment) do diff --git a/test/xml/test_document_fragment.rb b/test/xml/test_document_fragment.rb index 9069ac516e..175d2f64fe 100644 --- a/test/xml/test_document_fragment.rb +++ b/test/xml/test_document_fragment.rb @@ -332,6 +332,156 @@ def test_for_libxml_in_context_memory_badness_when_encountering_encoding_errors end end + describe "parse options" do + let(:xml_default) do + Nokogiri::XML::ParseOptions.new(Nokogiri::XML::ParseOptions::DEFAULT_XML) + end + + let(:xml_strict) do + Nokogiri::XML::ParseOptions.new(Nokogiri::XML::ParseOptions::DEFAULT_XML).norecover + end + + let(:input) { "foofoo", frag.to_html) + refute_empty(frag.errors) + end + + it "accepts options" do + frag = Nokogiri::XML.fragment(input, xml_default) + assert_equal("foo", frag.to_html) + refute_empty(frag.errors) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::XML.fragment(input, xml_strict) + end + end + + it "takes a config block" do + default_config = nil + Nokogiri::XML.fragment(input) do |config| + default_config = config + end + refute(default_config.strict?) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::XML.fragment(input) do |config| + config.norecover + end + end + end + end + + describe "XML::DocumentFragment.parse" do + it "has sane defaults" do + frag = Nokogiri::XML::DocumentFragment.parse(input) + assert_equal("foo", frag.to_html) + refute_empty(frag.errors) + end + + it "accepts options" do + frag = Nokogiri::XML::DocumentFragment.parse(input, xml_default) + assert_equal("foo", frag.to_html) + refute_empty(frag.errors) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::XML::DocumentFragment.parse(input, xml_strict) + end + end + + it "takes a config block" do + default_config = nil + Nokogiri::XML::DocumentFragment.parse(input) do |config| + default_config = config + end + refute(default_config.strict?) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::XML::DocumentFragment.parse(input) do |config| + config.norecover + end + end + end + end + + describe "XML::DocumentFragment.new" do + describe "without a context node" do + it "has sane defaults" do + frag = Nokogiri::XML::DocumentFragment.new(Nokogiri::XML::Document.new, input) + assert_equal("foo", frag.to_html) + refute_empty(frag.errors) + end + + it "accepts options" do + frag = Nokogiri::XML::DocumentFragment.new(Nokogiri::XML::Document.new, input, nil, xml_default) + assert_equal("foo", frag.to_html) + refute_empty(frag.errors) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::XML::DocumentFragment.new(Nokogiri::XML::Document.new, input, nil, xml_strict) + end + end + + it "takes a config block" do + default_config = nil + Nokogiri::XML::DocumentFragment.new(Nokogiri::XML::Document.new, input) do |config| + default_config = config + end + refute(default_config.strict?) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::XML::DocumentFragment.new(Nokogiri::XML::Document.new, input) do |config| + config.norecover + end + end + end + end + + describe "with a context node" do + let (:document) { Nokogiri::XML::Document.parse("") } + let (:context_node) { document.at_css("context") } + + it "has sane defaults" do + frag = Nokogiri::XML::DocumentFragment.new(document, input, context_node) + assert_equal("foo", frag.to_html) + refute_empty(frag.errors) + end + + it "accepts options" do + frag = Nokogiri::XML::DocumentFragment.new(document, input, context_node, xml_default) + assert_equal("foo", frag.to_html) + refute_empty(frag.errors) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::XML::DocumentFragment.new(document, input, context_node, xml_strict) + end + end + + it "takes a config block" do + default_config = nil + Nokogiri::XML::DocumentFragment.new(document, input, context_node) do |config| + default_config = config + end + refute(default_config.strict?) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::XML::DocumentFragment.new(document, input, context_node) do |config| + config.norecover + end + end + end + end + end + end + describe "subclassing" do let(:klass) do Class.new(Nokogiri::XML::DocumentFragment) do