diff --git a/.github/workflows/upstream.yml b/.github/workflows/upstream.yml index b27aad7f78..73ebb0b9ea 100644 --- a/.github/workflows/upstream.yml +++ b/.github/workflows/upstream.yml @@ -12,6 +12,8 @@ jobs: runs-on: ubuntu-latest container: image: ghcr.io/sparklemotion/nokogiri-test:mri-3.1 + env: + CI_UPSTREAM_XMLSOFT: t steps: - uses: actions/checkout@v2 with: @@ -37,6 +39,8 @@ jobs: runs-on: ubuntu-latest container: image: ghcr.io/sparklemotion/nokogiri-test:mri-3.1 + env: + CI_UPSTREAM_XMLSOFT: t steps: - uses: actions/checkout@v2 with: diff --git a/dependencies.yml b/dependencies.yml index 5e9b83bbc6..66e926e318 100644 --- a/dependencies.yml +++ b/dependencies.yml @@ -1,7 +1,7 @@ libxml2: - version: "2.9.13" - sha256: "276130602d12fe484ecc03447ee5e759d0465558fbc9d6bd144e3745306ebf0e" - # sha-256 hash provided in https://download.gnome.org/sources/libxml2/2.9/libxml2-2.9.13.sha256sum + version: "2.9.14" + sha256: "60d74a257d1ccec0475e749cba2f21559e48139efba6ff28224357c7c798dfee" + # sha-256 hash provided in https://download.gnome.org/sources/libxml2/2.9/libxml2-2.9.14.sha256sum libxslt: version: "1.1.35" diff --git a/patches/libxml2/0010-Revert-Different-approach-to-fix-quadratic-behavior.patch b/patches/libxml2/0010-Revert-Different-approach-to-fix-quadratic-behavior.patch deleted file mode 100644 index 614e227269..0000000000 --- a/patches/libxml2/0010-Revert-Different-approach-to-fix-quadratic-behavior.patch +++ /dev/null @@ -1,45 +0,0 @@ -From ddc5f3d22644e0f6fbcc20541c86825757ffee62 Mon Sep 17 00:00:00 2001 -From: Mike Dalessio -Date: Mon, 21 Feb 2022 18:27:45 -0500 -Subject: [PATCH] Revert "Different approach to fix quadratic behavior in HTML - push parser" - -This reverts commit 798bdf13f6964a650b9a0b7b4b3a769f6f1d509a. ---- - HTMLparser.c | 14 +------------- - 1 file changed, 1 insertion(+), 13 deletions(-) - -diff --git a/HTMLparser.c b/HTMLparser.c -index eba2d7c..c0b8119 100644 ---- a/HTMLparser.c -+++ b/HTMLparser.c -@@ -3960,25 +3960,13 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { - htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, - "htmlParseStartTag: invalid element name\n", - NULL, NULL); -- /* -- * The recovery code is disabled for now as it can result in -- * quadratic behavior with the push parser. htmlParseStartTag -- * must consume all content up to the final '>' in order to avoid -- * rescanning for this terminator. -- * -- * For a proper fix in line with HTML5, htmlParseStartTag and -- * htmlParseElement should only be called when there's an ASCII -- * alpha character following the initial '<'. Otherwise, the '<' -- * should be emitted as text (unless followed by '!', '/' or '?'). -- */ --#if 0 - /* if recover preserve text on classic misconstructs */ - if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') || - (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) { - htmlParseCharDataInternal(ctxt, '<'); - return(-1); - } --#endif -+ - - /* Dump the bogus tag like browsers do */ - while ((CUR != 0) && (CUR != '>') && --- -2.31.0 - diff --git a/test/helper.rb b/test/helper.rb index 39229a237e..d0202ed107 100644 --- a/test/helper.rb +++ b/test/helper.rb @@ -132,6 +132,14 @@ class TestCase < MiniTest::Spec @@test_count = 0 # rubocop:disable Style/ClassVars @@gc_level = nil # rubocop:disable Style/ClassVars + def self.upstream_xmlsoft? + ENV["CI_UPSTREAM_XMLSOFT"] || Nokogiri::LIBXML_LOADED_VERSION.include?("-GIT") + end + + def upstream_xmlsoft? + self.class.upstream_xmlsoft? + end + def initialize_nokogiri_test_gc_level return if Nokogiri.jruby? return if @@gc_level diff --git a/test/html4/test_comments.rb b/test/html4/test_comments.rb index 32d7a87855..96af26a866 100644 --- a/test/html4/test_comments.rb +++ b/test/html4/test_comments.rb @@ -23,7 +23,7 @@ class TestComment < Nokogiri::TestCase let(:html) { "
" } if Nokogiri.uses_libxml? - if Nokogiri.libxml2_patches.include?("0008-htmlParseComment-handle-abruptly-closed-comments.patch") + if Nokogiri.libxml2_patches.include?("0008-htmlParseComment-handle-abruptly-closed-comments.patch") || upstream_xmlsoft? it "behaves as if the comment is closed correctly" do # COMPLIANT assert_equal 1, subject.children.length assert_predicate subject.children.first, :comment? @@ -54,7 +54,7 @@ class TestComment < Nokogiri::TestCase let(:html) { "
" } if Nokogiri.uses_libxml? - if Nokogiri.libxml2_patches.include?("0008-htmlParseComment-handle-abruptly-closed-comments.patch") + if Nokogiri.libxml2_patches.include?("0008-htmlParseComment-handle-abruptly-closed-comments.patch") || upstream_xmlsoft? it "behaves as if the comment is closed correctly" do # COMPLIANT assert_equal 1, subject.children.length assert_predicate subject.children.first, :comment? @@ -173,7 +173,7 @@ class TestComment < Nokogiri::TestCase let(:body) { doc.at_css("body") } let(:subject) { doc.at_css("div#under-test") } - if Nokogiri.uses_libxml? + if Nokogiri.uses_libxml?("<=2.9.13") && !upstream_xmlsoft? it "ignores up to the next '>'" do # NON-COMPLIANT assert_equal 2, body.children.length assert_equal body.children[0], subject @@ -183,10 +183,33 @@ class TestComment < Nokogiri::TestCase assert_predicate body.children[1], :text? assert_equal "-->hello", body.children[1].content end + elsif Nokogiri.uses_libxml? + it "parses as pcdata" do # NON-COMPLIANT + assert_equal 1, body.children.length + assert_equal subject, body.children.first + + assert_equal 3, subject.children.length + subject.children[0].tap do |child| + assert_predicate(child, :text?) + assert_equal("hello", child.content) + end + end end if Nokogiri.jruby? it "ignores up to the next '-->'" do # NON-COMPLIANT + assert_equal 1, body.children.length + assert_equal subject, body.children.first + assert_equal 1, subject.children.length assert_predicate subject.children[0], :text? assert_equal "hello", subject.children[0].content diff --git a/test/html4/test_document.rb b/test/html4/test_document.rb index 7468ece78b..ce7d56f151 100644 --- a/test/html4/test_document.rb +++ b/test/html4/test_document.rb @@ -779,7 +779,7 @@ def test_leaking_dtd_nodes_after_internal_subset_removal doc = Nokogiri::HTML4::Document.parse(html) expected = if Nokogiri.jruby? [Nokogiri::XML::Node::COMMENT_NODE, Nokogiri::XML::Node::PI_NODE] - elsif Nokogiri.libxml2_patches.include?("0008-htmlParseComment-handle-abruptly-closed-comments.patch") + elsif Nokogiri.libxml2_patches.include?("0008-htmlParseComment-handle-abruptly-closed-comments.patch") || upstream_xmlsoft? [Nokogiri::XML::Node::COMMENT_NODE] else [] @@ -801,18 +801,25 @@ def test_leaking_dtd_nodes_after_internal_subset_removal it "skips to the next start tag" do # see https://github.com/sparklemotion/nokogiri/issues/2461 for why we're testing this edge case - if Nokogiri.uses_libxml?(">= 2.9.13") - skip_unless_libxml2_patch("0010-Revert-Different-approach-to-fix-quadratic-behavior.patch") - end - doc = Nokogiri::HTML4.parse(input) body = doc.at_xpath("//body") - expected_error_snippet = Nokogiri.uses_libxml? ? "invalid element name" : "Missing start element name" - assert_includes(doc.errors.first.to_s, expected_error_snippet) - - assert_equal("this < that", body.children.first.text, body.to_html) - assert_equal(["div", "div"], body.children.map(&:name), body.to_html) + if Nokogiri.uses_libxml?("= 2.9.13") && !upstream_xmlsoft? + #
this
second element
+ assert_equal(1, body.children.length) + body.children.first.tap do |div| + assert_equal(2, div.children.length) + assert_equal("this ", div.children[0].content) + assert_equal("div", div.children[1].name) + assert_equal("second element", div.children[1].content) + end + else + #
this < that
second element
+ assert_equal(2, body.children.length) + assert_equal(["div", "div"], body.children.map(&:name), body.to_html) + assert_equal("this < that", body.children[0].text, body.to_html) + assert_equal("second element", body.children[1].text, body.to_html) + end end end