Gracefully fall back to html5lib for parsing non-compliant index pages

This reworks the HTML parsing logic to gracefully fall back to `html5lib` for non-compliant HTML 5 documents, presenting a deprecation warning when it does so. This fallback softens the failure mode for users who are using commercial package index solutions that do not follow the requisite standards and serve malformed HTML documents.
pypa · Jan 30, 2022 · 4e4d7a6 · 4e4d7a6
1 parent cc35c93
commit 4e4d7a6
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 1 deletion.
diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py
@@ -38,6 +38,7 @@
 from pip._internal.models.search_scope import SearchScope
 from pip._internal.network.session import PipSession
 from pip._internal.network.utils import raise_for_status
+from pip._internal.utils.deprecation import deprecated
 from pip._internal.utils.filetypes import is_archive_file
 from pip._internal.utils.misc import pairwise, redact_auth_from_url
 from pip._internal.vcs import vcs
@@ -342,12 +343,34 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin
     """
     Parse an HTML document, and yield its anchor elements as Link objects.
     """
+    encoding = page.encoding or "utf-8"
+
+    # Check if the page starts with a valid doctype, to decide whether to use
+    # html.parser or (deprecated) html5lib for parsing -- unless explicitly
+    # requested to use html5lib.
+    if not use_deprecated_html5lib:
+        expected_doctype = "<!doctype html>".encode(encoding)
+        actual_start = page.content[: len(expected_doctype)]
+        if actual_start.decode(encoding).lower() != "<!doctype html>":
+            deprecated(
+                reason=(
+                    f"The HTML index page being used ({page.url}) is not a proper "
+                    "HTML 5 document. This is in violation of PEP 503 which requires "
+                    "these pages to be well-formed HTML 5 documents. Please reach out "
+                    "to the owners of this index page, and ask them to update this "
+                    "index page to a valid HTML 5 document."
+                ),
+                replacement=None,
+                gone_in="22.2",
+                issue=10825,
+            )
+            use_deprecated_html5lib = True
+
     if use_deprecated_html5lib:
         yield from _parse_links_html5lib(page)
         return
 
     parser = HTMLLinkParser()
-    encoding = page.encoding or "utf-8"
     parser.feed(page.content.decode(encoding))
 
     url = page.url

diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py
@@ -551,6 +551,23 @@ def test_parse_link_handles_deprecated_usage_properly() -> None:
     assert "pkg1-2.0" in parsed_links[1].url
 
 
+@mock.patch("pip._internal.index.collector.deprecated")
+def test_parse_links_presents_deprecation_warning_on_non_html5_page(
+    mock_deprecated: mock.Mock,
+) -> None:
+    html = b'<a href="/pkg1-1.0.tar.gz"><a href="/pkg1-2.0.tar.gz">'
+    url = "https://example.com/simple/"
+    page = HTMLPage(html, encoding=None, url=url)
+
+    parsed_links = list(parse_links(page, use_deprecated_html5lib=False))
+
+    assert len(parsed_links) == 2, parsed_links
+    assert "pkg1-1.0" in parsed_links[0].url
+    assert "pkg1-2.0" in parsed_links[1].url
+
+    mock_deprecated.assert_called_once()
+
+
 @mock.patch("pip._internal.index.collector.raise_for_status")
 def test_request_http_error(
     mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture