Merge pull request #10869 from pradyunsg/put-html5lib-behind-flag

pypa · Feb 3, 2022 · cf4655f · cf4655f
2 parents d7fed8f + 6c92a33
commit cf4655f
Show file tree

Hide file tree

Showing 4 changed files with 80 additions and 47 deletions.
diff --git a/news/10869.bugfix.rst b/news/10869.bugfix.rst
@@ -0,0 +1 @@
+Use ``html.parser`` by default, instead of falling back to ``html5lib`` when ``--use-deprecated=html5lib`` is not passed.
diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py
@@ -181,6 +181,30 @@ class UninstallationError(PipError):
     """General exception during uninstallation"""
 
 
+class BadHTMLDoctypeDeclaration(DiagnosticPipError):
+    reference = "bad-index-doctype"
+
+    def __init__(self, *, url: str) -> None:
+        super().__init__(
+            kind="warning",
+            message=(
+                "The package index page being used does not have a proper HTML "
+                "doctype declaration."
+            ),
+            context=f"Problematic URL: {escape(url)}",
+            note_stmt="This is an issue with the page at the URL mentioned above.",
+            hint_stmt=(
+                "You might need to reach out to the owner of that package index, "
+                "to get this fixed. "
+                "See https://github.com/pypa/pip/issues/10825 for context."
+            ),
+        )
+
+
+class MissingHTMLDoctypeDeclaration(BadHTMLDoctypeDeclaration):
+    reference = "missing-index-doctype"
+
+
 class MissingPyProjectBuildRequires(DiagnosticPipError):
     """Raised when pyproject.toml has `build-system`, but no `build-system.requires`."""
 

diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py
@@ -16,7 +16,6 @@
 from optparse import Values
 from typing import (
     TYPE_CHECKING,
-    Any,
     Callable,
     Dict,
     Iterable,
@@ -33,12 +32,15 @@
 from pip._vendor.requests import Response
 from pip._vendor.requests.exceptions import RetryError, SSLError
 
-from pip._internal.exceptions import NetworkConnectionError
+from pip._internal.exceptions import (
+    BadHTMLDoctypeDeclaration,
+    MissingHTMLDoctypeDeclaration,
+    NetworkConnectionError,
+)
 from pip._internal.models.link import Link
 from pip._internal.models.search_scope import SearchScope
 from pip._internal.network.session import PipSession
 from pip._internal.network.utils import raise_for_status
-from pip._internal.utils.deprecation import deprecated
 from pip._internal.utils.filetypes import is_archive_file
 from pip._internal.utils.misc import pairwise, redact_auth_from_url
 from pip._internal.vcs import vcs
@@ -343,34 +345,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin
     """
     Parse an HTML document, and yield its anchor elements as Link objects.
     """
-    encoding = page.encoding or "utf-8"
-
-    # Check if the page starts with a valid doctype, to decide whether to use
-    # http.parser or (deprecated) html5lib for parsing -- unless explicitly
-    # requested to use html5lib.
-    if not use_deprecated_html5lib:
-        expected_doctype = "<!doctype html>".encode(encoding)
-        actual_start = page.content[: len(expected_doctype)]
-        if actual_start.decode(encoding).lower() != "<!doctype html>":
-            deprecated(
-                reason=(
-                    f"The HTML index page being used ({page.url}) is not a proper "
-                    "HTML 5 document. This is in violation of PEP 503 which requires "
-                    "these pages to be well-formed HTML 5 documents. Please reach out "
-                    "to the owners of this index page, and ask them to update this "
-                    "index page to a valid HTML 5 document."
-                ),
-                replacement=None,
-                gone_in="22.2",
-                issue=10825,
-            )
-            use_deprecated_html5lib = True
 
     if use_deprecated_html5lib:
         yield from _parse_links_html5lib(page)
         return
 
-    parser = HTMLLinkParser()
+    parser = HTMLLinkParser(page.url)
+    encoding = page.encoding or "utf-8"
     parser.feed(page.content.decode(encoding))
 
     url = page.url
@@ -418,20 +399,34 @@ class HTMLLinkParser(HTMLParser):
     elements' attributes.
     """
 
-    def __init__(self, *args: Any, **kwargs: Any) -> None:
-        super().__init__(*args, **kwargs)
-        self._seen_decl = False
+    def __init__(self, url: str) -> None:
+        super().__init__(convert_charrefs=True)
+        self._dealt_with_doctype_issues = False
+
+        self.url: str = url
         self.base_url: Optional[str] = None
         self.anchors: List[Dict[str, Optional[str]]] = []
 
     def handle_decl(self, decl: str) -> None:
-        if decl.lower() != "doctype html":
-            self._raise_error()
-        self._seen_decl = True
+        self._dealt_with_doctype_issues = True
+        match = re.match(
+            r"""doctype\s+html\s*(?:SYSTEM\s+(["'])about:legacy-compat\1)?\s*$""",
+            decl,
+            re.IGNORECASE,
+        )
+        if match is None:
+            logger.warning(
+                "[present-diagnostic] %s",
+                BadHTMLDoctypeDeclaration(url=self.url),
+            )
 
     def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
-        if not self._seen_decl:
-            self._raise_error()
+        if not self._dealt_with_doctype_issues:
+            logger.warning(
+                "[present-diagnostic] %s",
+                MissingHTMLDoctypeDeclaration(url=self.url),
+            )
+            self._dealt_with_doctype_issues = True
 
         if tag == "base" and self.base_url is None:
             href = self.get_href(attrs)
@@ -446,14 +441,6 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
                 return value
         return None
 
-    def _raise_error(self) -> None:
-        raise ValueError(
-            "HTML doctype missing or incorrect. Expected <!DOCTYPE html>.\n\n"
-            "If you believe this error to be incorrect, try passing the "
-            "command line option --use-deprecated=html5lib and please leave "
-            "a comment on the pip issue at https://github.com/pypa/pip/issues/10825."
-        )
-
 
 def _handle_get_page_fail(
     link: Link,

diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py
@@ -551,21 +551,42 @@ def test_parse_link_handles_deprecated_usage_properly() -> None:
     assert "pkg1-2.0" in parsed_links[1].url
 
 
-@mock.patch("pip._internal.index.collector.deprecated")
-def test_parse_links_presents_deprecation_warning_on_non_html5_page(
-    mock_deprecated: mock.Mock,
+def test_parse_links_presents_warning_on_missing_doctype(
+    caplog: pytest.LogCaptureFixture,
 ) -> None:
     html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
     url = "https://example.com/simple/"
     page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)
 
-    parsed_links = list(parse_links(page, use_deprecated_html5lib=False))
+    with caplog.at_level(logging.WARN):
+        parsed_links = list(parse_links(page, use_deprecated_html5lib=False))
 
     assert len(parsed_links) == 2, parsed_links
     assert "pkg1-1.0" in parsed_links[0].url
     assert "pkg1-2.0" in parsed_links[1].url
 
-    mock_deprecated.assert_called_once()
+    assert len(caplog.records) == 1
+
+
+def test_parse_links_presents_warning_on_html4_doctype(
+    caplog: pytest.LogCaptureFixture,
+) -> None:
+    html = (
+        b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
+        b'"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
+        b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
+    )
+    url = "https://example.com/simple/"
+    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)
+
+    with caplog.at_level(logging.WARN):
+        parsed_links = list(parse_links(page, use_deprecated_html5lib=False))
+
+    assert len(parsed_links) == 2, parsed_links
+    assert "pkg1-1.0" in parsed_links[0].url
+    assert "pkg1-2.0" in parsed_links[1].url
+
+    assert len(caplog.records) == 1
 
 
 @mock.patch("pip._internal.index.collector.raise_for_status")