diff --git a/patches/libxml2/0010-Revert-Different-approach-to-fix-quadratic-behavior.patch b/patches/libxml2/0010-Revert-Different-approach-to-fix-quadratic-behavior.patch new file mode 100644 index 0000000000..614e227269 --- /dev/null +++ b/patches/libxml2/0010-Revert-Different-approach-to-fix-quadratic-behavior.patch @@ -0,0 +1,45 @@ +From ddc5f3d22644e0f6fbcc20541c86825757ffee62 Mon Sep 17 00:00:00 2001 +From: Mike Dalessio +Date: Mon, 21 Feb 2022 18:27:45 -0500 +Subject: [PATCH] Revert "Different approach to fix quadratic behavior in HTML + push parser" + +This reverts commit 798bdf13f6964a650b9a0b7b4b3a769f6f1d509a. +--- + HTMLparser.c | 14 +------------- + 1 file changed, 1 insertion(+), 13 deletions(-) + +diff --git a/HTMLparser.c b/HTMLparser.c +index eba2d7c..c0b8119 100644 +--- a/HTMLparser.c ++++ b/HTMLparser.c +@@ -3960,25 +3960,13 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { + htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, + "htmlParseStartTag: invalid element name\n", + NULL, NULL); +- /* +- * The recovery code is disabled for now as it can result in +- * quadratic behavior with the push parser. htmlParseStartTag +- * must consume all content up to the final '>' in order to avoid +- * rescanning for this terminator. +- * +- * For a proper fix in line with HTML5, htmlParseStartTag and +- * htmlParseElement should only be called when there's an ASCII +- * alpha character following the initial '<'. Otherwise, the '<' +- * should be emitted as text (unless followed by '!', '/' or '?'). 
+- */ +-#if 0 + /* if recover preserve text on classic misconstructs */ + if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') || + (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) { + htmlParseCharDataInternal(ctxt, '<'); + return(-1); + } +-#endif ++ + + /* Dump the bogus tag like browsers do */ + while ((CUR != 0) && (CUR != '>') && +-- +2.31.0 + diff --git a/test/html4/test_document.rb b/test/html4/test_document.rb index 0506fb1bea..04ca56a285 100644 --- a/test/html4/test_document.rb +++ b/test/html4/test_document.rb @@ -783,6 +783,26 @@ def test_leaking_dtd_nodes_after_internal_subset_removal assert(html_strict.strict?) end + describe "ill-formed < character" do + let(:input) { %{
<html><body><div>this < that</div><div>second element</div></body></html>
} } + + it "skips to the next start tag" do + # see https://github.com/sparklemotion/nokogiri/issues/2461 for why we're testing this edge case + if Nokogiri.uses_libxml?(">= 2.9.13") + skip_unless_libxml2_patch("0010-Revert-Different-approach-to-fix-quadratic-behavior.patch") + end + + doc = Nokogiri::HTML4.parse(input) + body = doc.at_xpath("//body") + + expected_error_snippet = Nokogiri.uses_libxml? ? "invalid element name" : "Missing start element name" + assert_includes(doc.errors.first.to_s, expected_error_snippet) + + assert_equal("this < that", body.children.first.text, body.to_html) + assert_equal(["div", "div"], body.children.map(&:name), body.to_html) + end + end + describe "read memory" do let(:input) { "