Skip to content

Commit

Permalink
fix: revert libxml2 regression with HTML4 recovery
Browse files Browse the repository at this point in the history
Fixes #2461
  • Loading branch information
flavorjones committed Feb 21, 2022
1 parent 3b0cfed commit 16b4fa2
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 0 deletions.
@@ -0,0 +1,45 @@
From ddc5f3d22644e0f6fbcc20541c86825757ffee62 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Mon, 21 Feb 2022 18:27:45 -0500
Subject: [PATCH] Revert "Different approach to fix quadratic behavior in HTML
push parser"

This reverts commit 798bdf13f6964a650b9a0b7b4b3a769f6f1d509a.
---
HTMLparser.c | 14 +-------------
1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/HTMLparser.c b/HTMLparser.c
index eba2d7c..c0b8119 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -3960,25 +3960,13 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
"htmlParseStartTag: invalid element name\n",
NULL, NULL);
- /*
- * The recovery code is disabled for now as it can result in
- * quadratic behavior with the push parser. htmlParseStartTag
- * must consume all content up to the final '>' in order to avoid
- * rescanning for this terminator.
- *
- * For a proper fix in line with HTML5, htmlParseStartTag and
- * htmlParseElement should only be called when there's an ASCII
- * alpha character following the initial '<'. Otherwise, the '<'
- * should be emitted as text (unless followed by '!', '/' or '?').
- */
-#if 0
/* if recover preserve text on classic misconstructs */
if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
(CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
htmlParseCharDataInternal(ctxt, '<');
return(-1);
}
-#endif
+

/* Dump the bogus tag like browsers do */
while ((CUR != 0) && (CUR != '>') &&
--
2.31.0

20 changes: 20 additions & 0 deletions test/html4/test_document.rb
Expand Up @@ -783,6 +783,26 @@ def test_leaking_dtd_nodes_after_internal_subset_removal
assert(html_strict.strict?)
end

describe "ill-formed < character" do
let(:input) { %{<html><body><div>this < that</div><div>second element</div></body></html>} }

it "skips to the next start tag" do
# see https://github.com/sparklemotion/nokogiri/issues/2461 for why we're testing this edge case
if Nokogiri.uses_libxml?(">= 2.9.13")
skip_unless_libxml2_patch("0010-Revert-Different-approach-to-fix-quadratic-behavior.patch")
end

doc = Nokogiri::HTML4.parse(input)
body = doc.at_xpath("//body")

expected_error_snippet = Nokogiri.uses_libxml? ? "invalid element name" : "Missing start element name"
assert_includes(doc.errors.first.to_s, expected_error_snippet)

assert_equal("this < that", body.children.first.text, body.to_html)
assert_equal(["div", "div"], body.children.map(&:name), body.to_html)
end
end

describe "read memory" do
let(:input) { "<html><body><div" }

Expand Down

0 comments on commit 16b4fa2

Please sign in to comment.