From 43f9434b18a3269e42a53bc9a3f84ea9196ce188 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 31 Jan 2021 10:16:28 -0500 Subject: [PATCH 1/3] format: test/xml/sax/test_parser.rb Also, use skip instead of avoiding test definition. --- test/xml/sax/test_parser.rb | 304 ++++++++++++++++++------------------ 1 file changed, 148 insertions(+), 156 deletions(-) diff --git a/test/xml/sax/test_parser.rb b/test/xml/sax/test_parser.rb index b259369bdb..328deb88f7 100644 --- a/test/xml/sax/test_parser.rb +++ b/test/xml/sax/test_parser.rb @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# frozen_string_literal: true require "helper" @@ -13,181 +13,175 @@ def setup def test_parser_context_yielded_io doc = Doc.new - parser = XML::SAX::Parser.new doc + parser = XML::SAX::Parser.new(doc) xml = "" block_called = false - parser.parse(StringIO.new(xml)) { |ctx| + parser.parse(StringIO.new(xml)) do |ctx| block_called = true ctx.replace_entities = true - } + end - assert block_called + assert(block_called) - assert_equal [["foo", [["a", "&b"]]]], doc.start_elements + assert_equal([["foo", [["a", "&b"]]]], doc.start_elements) end def test_parser_context_yielded_in_memory doc = Doc.new - parser = XML::SAX::Parser.new doc + parser = XML::SAX::Parser.new(doc) xml = "" block_called = false - parser.parse(xml) { |ctx| + parser.parse(xml) do |ctx| block_called = true ctx.replace_entities = true - } + end - assert block_called + assert(block_called) - assert_equal [["foo", [["a", "&b"]]]], doc.start_elements + assert_equal([["foo", [["a", "&b"]]]], doc.start_elements) end def test_empty_decl parser = XML::SAX::Parser.new(Doc.new) xml = "" - parser.parse xml - assert parser.document.start_document_called, xml - assert_nil parser.document.xmldecls, xml + parser.parse(xml) + assert(parser.document.start_document_called, xml) + assert_nil(parser.document.xmldecls, xml) end def test_xml_decl [ - ['', - ["1.0"]], - ['', - ["1.0", "UTF-8"]], - ['', - ["1.0", "yes"]], - ['', - ["1.0", "no"]], - ['', - ["1.0", "UTF-8", "no"]], - ['', - ["1.0", "ISO-8859-1", "yes"]], + ['', ["1.0"]], + ['', ["1.0", "UTF-8"]], + ['', ["1.0", "yes"]], + ['', ["1.0", "no"]], + ['', ["1.0", "UTF-8", "no"]], + ['', ["1.0", "ISO-8859-1", "yes"]], ].each do |decl, value| parser = XML::SAX::Parser.new(Doc.new) xml = "#{decl}\n" - parser.parse xml - assert parser.document.start_document_called, xml - assert_equal value, parser.document.xmldecls, xml + parser.parse(xml) + assert(parser.document.start_document_called, xml) + assert_equal(value, parser.document.xmldecls, xml) end end def test_parse_empty - assert_raises RuntimeError do + assert_raises(RuntimeError) do @parser.parse("") end end def test_namespace_declaration_order_is_saved - @parser.parse <<-eoxml - - - + @parser.parse(<<~eoxml) + + + eoxml - assert_equal 2, @parser.document.start_elements_namespace.length + assert_equal(2, @parser.document.start_elements_namespace.length) el = @parser.document.start_elements_namespace.first namespaces = el.last - assert_equal ["foo", "http://foo.example.com/"], namespaces.first - assert_equal [nil, "http://example.com/"], namespaces.last + assert_equal(["foo", "http://foo.example.com/"], namespaces.first) + assert_equal([nil, "http://example.com/"], namespaces.last) end def test_bad_document_calls_error_handler @parser.parse("") - assert @parser.document.errors - assert @parser.document.errors.length > 0 + assert(@parser.document.errors) + assert(@parser.document.errors.length > 0) end def test_namespace_are_super_fun_to_parse - @parser.parse <<-eoxml - - - - - - hello world - + @parser.parse(<<~eoxml) + + + + + + hello world + eoxml - assert @parser.document.start_elements_namespace.length > 0 + assert(@parser.document.start_elements_namespace.length > 0) el = @parser.document.start_elements_namespace[1] - assert_equal "a", el.first - assert_equal 1, el[1].length + assert_equal("a", el.first) + assert_equal(1, el[1].length) attribute = el[1].first - assert_equal "bar", attribute.localname - assert_equal "foo", attribute.prefix - assert_equal "hello", attribute.value - assert_equal "http://foo.example.com/", attribute.uri + assert_equal("bar", attribute.localname) + assert_equal("foo", attribute.prefix) + assert_equal("hello", attribute.value) + assert_equal("http://foo.example.com/", attribute.uri) end def test_sax_v1_namespace_attribute_declarations - @parser.parse <<-eoxml - - - - - - hello world - + @parser.parse(<<~eoxml) + + + + + + hello world + eoxml - assert @parser.document.start_elements.length > 0 + assert(@parser.document.start_elements.length > 0) elm = @parser.document.start_elements.first - assert_equal "root", elm.first - assert elm[1].include?(["xmlns:foo", "http://foo.example.com/"]) - assert elm[1].include?(["xmlns", "http://example.com/"]) + assert_equal("root", elm.first) + assert(elm[1].include?(["xmlns:foo", "http://foo.example.com/"])) + assert(elm[1].include?(["xmlns", "http://example.com/"])) end def test_sax_v1_namespace_nodes - @parser.parse <<-eoxml - - - - - - hello world - + @parser.parse(<<~eoxml) + + + + + + hello world + eoxml - assert_equal 5, @parser.document.start_elements.length - assert @parser.document.start_elements.map(&:first).include?("foo:bar") - assert @parser.document.end_elements.map(&:first).include?("foo:bar") + assert_equal(5, @parser.document.start_elements.length) + assert(@parser.document.start_elements.map(&:first).include?("foo:bar")) + assert(@parser.document.end_elements.map(&:first).include?("foo:bar")) end def test_start_is_called_without_namespace - @parser.parse(<<-eoxml) - - - + @parser.parse(<<~eoxml) + + + eoxml - assert_equal ["root", "foo:f", "bar"], - @parser.document.start_elements.map(&:first) + assert_equal(["root", "foo:f", "bar"], + @parser.document.start_elements.map(&:first)) end def test_parser_sets_encoding parser = XML::SAX::Parser.new(Doc.new, "UTF-8") - assert_equal "UTF-8", parser.encoding + assert_equal("UTF-8", parser.encoding) end def test_errors_set_after_parsing_bad_dom doc = Nokogiri::XML("") - assert doc.errors + assert(doc.errors) @parser.parse("") - assert @parser.document.errors - assert @parser.document.errors.length > 0 + assert(@parser.document.errors) + assert(@parser.document.errors.length > 0) doc.errors.each do |error| - assert_equal "UTF-8", error.message.encoding.name + assert_equal("UTF-8", error.message.encoding.name) end # when using JRuby Nokogiri, more errors will be generated as the DOM # parser continue to parse an ill formed document, while the sax parser # will stop at the first error unless Nokogiri.jruby? - assert_equal doc.errors.length, @parser.document.errors.length + assert_equal(doc.errors.length, @parser.document.errors.length) end end @@ -197,76 +191,76 @@ def test_parse_with_memory_argument end def test_parse_with_io_argument - File.open(XML_FILE, "rb") { |f| + File.open(XML_FILE, "rb") do |f| @parser.parse(f) - } + end assert(@parser.document.cdata_blocks.length > 0) end def test_parse_io - call_parse_io_with_encoding "UTF-8" + call_parse_io_with_encoding("UTF-8") end # issue #828 def test_parse_io_lower_case_encoding - call_parse_io_with_encoding "utf-8" + call_parse_io_with_encoding("utf-8") end def call_parse_io_with_encoding(encoding) - File.open(XML_FILE, "rb") { |f| + File.open(XML_FILE, "rb") do |f| @parser.parse_io(f, encoding) - } + end assert(@parser.document.cdata_blocks.length > 0) called = false @parser.document.start_elements.flatten.each do |thing| - assert_equal "UTF-8", thing.encoding.name + assert_equal("UTF-8", thing.encoding.name) called = true end - assert called + assert(called) called = false @parser.document.end_elements.flatten.each do |thing| - assert_equal "UTF-8", thing.encoding.name + assert_equal("UTF-8", thing.encoding.name) called = true end - assert called + assert(called) called = false @parser.document.data.each do |thing| - assert_equal "UTF-8", thing.encoding.name + assert_equal("UTF-8", thing.encoding.name) called = true end - assert called + assert(called) called = false @parser.document.comments.flatten.each do |thing| - assert_equal "UTF-8", thing.encoding.name + assert_equal("UTF-8", thing.encoding.name) called = true end - assert called + assert(called) called = false @parser.document.cdata_blocks.flatten.each do |thing| - assert_equal "UTF-8", thing.encoding.name + assert_equal("UTF-8", thing.encoding.name) called = true end - assert called + assert(called) end def test_parse_file @parser.parse_file(XML_FILE) - assert_raises(ArgumentError) { + assert_raises(ArgumentError) do @parser.parse_file(nil) - } + end - assert_raises(Errno::ENOENT) { + assert_raises(Errno::ENOENT) do @parser.parse_file("") - } - assert_raises(Errno::EISDIR) { + end + assert_raises(Errno::EISDIR) do @parser.parse_file(File.expand_path(File.dirname(__FILE__))) - } + end end def test_render_parse_nil_param @@ -279,122 +273,120 @@ def test_bad_encoding_args end def test_ctag - @parser.parse_memory(<<-eoxml) + @parser.parse_memory(<<~eoxml)

Paragraph 1

eoxml - assert_equal [" This is a comment "], @parser.document.cdata_blocks + assert_equal([" This is a comment "], @parser.document.cdata_blocks) end def test_comment - @parser.parse_memory(<<-eoxml) + @parser.parse_memory(<<~eoxml)

Paragraph 1

eoxml - assert_equal [" This is a comment "], @parser.document.comments + assert_equal([" This is a comment "], @parser.document.comments) end def test_characters - @parser.parse_memory(<<-eoxml) + @parser.parse_memory(<<~eoxml)

Paragraph 1

eoxml - assert_equal ["Paragraph 1"], @parser.document.data + assert_equal(["Paragraph 1"], @parser.document.data) end def test_end_document - @parser.parse_memory(<<-eoxml) + @parser.parse_memory(<<~eoxml)

Paragraph 1

eoxml - assert @parser.document.end_document_called + assert(@parser.document.end_document_called) end def test_end_element - @parser.parse_memory(<<-eoxml) + @parser.parse_memory(<<~eoxml)

Paragraph 1

eoxml - assert_equal [["p"]], - @parser.document.end_elements + assert_equal([["p"]], @parser.document.end_elements) end def test_start_element_attrs - @parser.parse_memory(<<-eoxml) + @parser.parse_memory(<<~eoxml)

Paragraph 1

eoxml - assert_equal [["p", [["id", "asdfasdf"]]]], - @parser.document.start_elements + assert_equal([["p", [["id", "asdfasdf"]]]], @parser.document.start_elements) end def test_start_element_attrs_include_namespaces - @parser.parse_memory(<<-eoxml) + @parser.parse_memory(<<~eoxml)

Paragraph 1

eoxml - assert_equal [["p", [["xmlns:foo", "http://foo.example.com/"]]]], - @parser.document.start_elements + assert_equal([["p", [["xmlns:foo", "http://foo.example.com/"]]]], + @parser.document.start_elements) end def test_processing_instruction - @parser.parse_memory(<<-eoxml) + @parser.parse_memory(<<~eoxml) eoxml - assert_equal [["xml-stylesheet", 'href="a.xsl" type="text/xsl"']], - @parser.document.processing_instructions + assert_equal([["xml-stylesheet", 'href="a.xsl" type="text/xsl"']], + @parser.document.processing_instructions) end - if Nokogiri.uses_libxml? # JRuby SAXParser only parses well-formed XML documents - def test_parse_document - @parser.parse_memory(<<-eoxml) -

Paragraph 1

-

Paragraph 2

- eoxml - end + def test_parse_document + skip("JRuby SAXParser only parses well-formed XML documents") unless Nokogiri.uses_libxml? + @parser.parse_memory(<<~eoxml) +

Paragraph 1

+

Paragraph 2

+ eoxml end def test_parser_attributes - xml = <<-eoxml - + xml = <<~eoxml + eoxml block_called = false - @parser.parse(xml) { |ctx| + @parser.parse(xml) do |ctx| block_called = true ctx.replace_entities = true - } + end - assert block_called + assert(block_called) - assert_equal [["root", []], ["foo", [["a", "&b"], ["c", ">d"]]]], @parser.document.start_elements + assert_equal([["root", []], ["foo", [["a", "&b"], ["c", ">d"]]]], @parser.document.start_elements) end def test_recovery_from_incorrect_xml - xml = <<-eoxml -heyhey yourself + xml = <<~eoxml + heyhey yourself eoxml block_called = false - @parser.parse(xml) { |ctx| + @parser.parse(xml) do |ctx| block_called = true ctx.recovery = true - } + end - assert block_called + assert(block_called) - assert_equal [["Root", []], ["Data", []], ["Item", []], ["Data", []], ["Item", []]], @parser.document.start_elements + assert_equal([["Root", []], ["Data", []], ["Item", []], ["Data", []], ["Item", []]], + @parser.document.start_elements) end def test_square_bracket_in_text # issue 1261 - xml = <<-eoxml - - en:#:home_page:#:stories:#:[6]:#:name - Sandy S. - + xml = <<~eoxml + + en:#:home_page:#:stories:#:[6]:#:name + Sandy S. + eoxml @parser.parse(xml) - assert_includes @parser.document.data, "en:#:home_page:#:stories:#:[6]:#:name" + assert_includes(@parser.document.data, "en:#:home_page:#:stories:#:[6]:#:name") end end end From bc9edbf619b535b315389e625bdc00542ee1a646 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 31 Jan 2021 10:40:30 -0500 Subject: [PATCH 2/3] test: reproduce large cdata parse failure from #2132 --- test/xml/sax/test_parser.rb | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/test/xml/sax/test_parser.rb b/test/xml/sax/test_parser.rb index 328deb88f7..161ad0c625 100644 --- a/test/xml/sax/test_parser.rb +++ b/test/xml/sax/test_parser.rb @@ -388,6 +388,38 @@ def test_square_bracket_in_text # issue 1261 @parser.parse(xml) assert_includes(@parser.document.data, "en:#:home_page:#:stories:#:[6]:#:name") end + + def test_large_cdata_is_handled + # see #2132 and https://gitlab.gnome.org/GNOME/libxml2/-/issues/200 + skip("Upstream libxml2 <= 2.9.10 needs to be patched") if Nokogiri::VersionInfo.instance.libxml2_using_system? + + template = <<~EOF + + + + + gorilla + secret + + + + + + + + + EOF + + factor = 10 + huge_data = "a" * (1024 * 1024 * factor) + xml = StringIO.new(template % (huge_data)) + + handler = Nokogiri::SAX::TestCase::Doc.new + parser = Nokogiri::XML::SAX::Parser.new(handler) + parser.parse(xml) + + assert_predicate(handler.errors, :empty?) + end end end end From e0da4d229f36c8a6035bdc675d429e9df0918aa6 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 31 Jan 2021 10:07:14 -0500 Subject: [PATCH 3/3] fix(cruby): patch libxml2 to address GNOME/libxml2#200 This patch shrinks the libxml2 input buffer in a few parser functions. Fixes #2132 --- CHANGELOG.md | 1 + ...nk-the-input-buffer-when-appropriate.patch | 70 +++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 patches/libxml2/0010-parser.c-shrink-the-input-buffer-when-appropriate.patch diff --git a/CHANGELOG.md b/CHANGELOG.md index 02f4c831cf..e2c1da982f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA ### Fixed * [CRuby] `NodeSet` may now safely contain `Node` objects from multiple documents. Previously the GC lifecycle of the parent `Document` objects could lead to contained nodes being GCed while still in scope. [[#1952](https://github.com/sparklemotion/nokogiri/issues/1952)] +* [CRuby] Patch libxml2 to avoid "huge input lookup" errors on large CDATA elements. (See upstream [GNOME/libxml2#200](https://gitlab.gnome.org/GNOME/libxml2/-/issues/200) and [GNOME/libxml2!100](https://gitlab.gnome.org/GNOME/libxml2/-/merge_requests/100).) [[#2132](https://github.com/sparklemotion/nokogiri/issues/2132)]. * [CRuby] `{XML,HTML}::Document.parse` now invokes `#initialize` exactly once. Previously `#initialize` was invoked twice on each object. * [JRuby] `{XML,HTML}::Document.parse` now invokes `#initialize` exactly once. Previously `#initialize` was not called, which was a problem for subclassing such as done by `Loofah`. diff --git a/patches/libxml2/0010-parser.c-shrink-the-input-buffer-when-appropriate.patch b/patches/libxml2/0010-parser.c-shrink-the-input-buffer-when-appropriate.patch new file mode 100644 index 0000000000..d3d9ad46a4 --- /dev/null +++ b/patches/libxml2/0010-parser.c-shrink-the-input-buffer-when-appropriate.patch @@ -0,0 +1,70 @@ +From ca565c1edef9a455453fa8564270cc9c5813e1b9 Mon Sep 17 00:00:00 2001 +From: Mike Dalessio +Date: Sun, 31 Jan 2021 09:53:56 -0500 +Subject: [PATCH] parser.c: shrink the input buffer when appropriate + +Fixes GNOME/libxml2#200 + +Also see discussions at: +- GNOME/libxml2#192 +- https://gitlab.gnome.org/nwellnhof/libxml2/-/commit/99bda1e +- https://github.com/sparklemotion/nokogiri/issues/2132 +--- + parser.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/parser.c b/parser.c +index a7bdc7f..efde672 100644 +--- a/parser.c ++++ b/parser.c +@@ -4204,6 +4204,7 @@ xmlParseSystemLiteral(xmlParserCtxtPtr ctxt) { + } + count++; + if (count > 50) { ++ SHRINK; + GROW; + count = 0; + if (ctxt->instate == XML_PARSER_EOF) { +@@ -4291,6 +4292,7 @@ xmlParsePubidLiteral(xmlParserCtxtPtr ctxt) { + buf[len++] = cur; + count++; + if (count > 50) { ++ SHRINK; + GROW; + count = 0; + if (ctxt->instate == XML_PARSER_EOF) { +@@ -4571,6 +4573,7 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int cdata) { + } + count++; + if (count > 50) { ++ SHRINK; + GROW; + count = 0; + if (ctxt->instate == XML_PARSER_EOF) +@@ -4776,6 +4779,7 @@ xmlParseCommentComplex(xmlParserCtxtPtr ctxt, xmlChar *buf, + + count++; + if (count > 50) { ++ SHRINK; + GROW; + count = 0; + if (ctxt->instate == XML_PARSER_EOF) { +@@ -5186,6 +5190,7 @@ xmlParsePI(xmlParserCtxtPtr ctxt) { + } + count++; + if (count > 50) { ++ SHRINK; + GROW; + if (ctxt->instate == XML_PARSER_EOF) { + xmlFree(buf); +@@ -9783,6 +9788,7 @@ xmlParseCDSect(xmlParserCtxtPtr ctxt) { + sl = l; + count++; + if (count > 50) { ++ SHRINK; + GROW; + if (ctxt->instate == XML_PARSER_EOF) { + xmlFree(buf); +-- +2.25.1 +