Skip to content

Commit

Permalink
Merge pull request #10869 from pradyunsg/put-html5lib-behind-flag
Browse files Browse the repository at this point in the history
  • Loading branch information
pradyunsg committed Feb 3, 2022
2 parents d7fed8f + 6c92a33 commit cf4655f
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 47 deletions.
1 change: 1 addition & 0 deletions news/10869.bugfix.rst
@@ -0,0 +1 @@
Use ``html.parser`` by default, instead of falling back to ``html5lib`` when ``--use-deprecated=html5lib`` is not passed.
24 changes: 24 additions & 0 deletions src/pip/_internal/exceptions.py
Expand Up @@ -181,6 +181,30 @@ class UninstallationError(PipError):
"""General exception during uninstallation"""


class BadHTMLDoctypeDeclaration(DiagnosticPipError):
reference = "bad-index-doctype"

def __init__(self, *, url: str) -> None:
super().__init__(
kind="warning",
message=(
"The package index page being used does not have a proper HTML "
"doctype declaration."
),
context=f"Problematic URL: {escape(url)}",
note_stmt="This is an issue with the page at the URL mentioned above.",
hint_stmt=(
"You might need to reach out to the owner of that package index, "
"to get this fixed. "
"See https://github.com/pypa/pip/issues/10825 for context."
),
)


class MissingHTMLDoctypeDeclaration(BadHTMLDoctypeDeclaration):
reference = "missing-index-doctype"


class MissingPyProjectBuildRequires(DiagnosticPipError):
"""Raised when pyproject.toml has `build-system`, but no `build-system.requires`."""

Expand Down
71 changes: 29 additions & 42 deletions src/pip/_internal/index/collector.py
Expand Up @@ -16,7 +16,6 @@
from optparse import Values
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
Expand All @@ -33,12 +32,15 @@
from pip._vendor.requests import Response
from pip._vendor.requests.exceptions import RetryError, SSLError

from pip._internal.exceptions import NetworkConnectionError
from pip._internal.exceptions import (
BadHTMLDoctypeDeclaration,
MissingHTMLDoctypeDeclaration,
NetworkConnectionError,
)
from pip._internal.models.link import Link
from pip._internal.models.search_scope import SearchScope
from pip._internal.network.session import PipSession
from pip._internal.network.utils import raise_for_status
from pip._internal.utils.deprecation import deprecated
from pip._internal.utils.filetypes import is_archive_file
from pip._internal.utils.misc import pairwise, redact_auth_from_url
from pip._internal.vcs import vcs
Expand Down Expand Up @@ -343,34 +345,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin
"""
Parse an HTML document, and yield its anchor elements as Link objects.
"""
encoding = page.encoding or "utf-8"

# Check if the page starts with a valid doctype, to decide whether to use
# http.parser or (deprecated) html5lib for parsing -- unless explicitly
# requested to use html5lib.
if not use_deprecated_html5lib:
expected_doctype = "<!doctype html>".encode(encoding)
actual_start = page.content[: len(expected_doctype)]
if actual_start.decode(encoding).lower() != "<!doctype html>":
deprecated(
reason=(
f"The HTML index page being used ({page.url}) is not a proper "
"HTML 5 document. This is in violation of PEP 503 which requires "
"these pages to be well-formed HTML 5 documents. Please reach out "
"to the owners of this index page, and ask them to update this "
"index page to a valid HTML 5 document."
),
replacement=None,
gone_in="22.2",
issue=10825,
)
use_deprecated_html5lib = True

if use_deprecated_html5lib:
yield from _parse_links_html5lib(page)
return

parser = HTMLLinkParser()
parser = HTMLLinkParser(page.url)
encoding = page.encoding or "utf-8"
parser.feed(page.content.decode(encoding))

url = page.url
Expand Down Expand Up @@ -418,20 +399,34 @@ class HTMLLinkParser(HTMLParser):
elements' attributes.
"""

def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self._seen_decl = False
def __init__(self, url: str) -> None:
super().__init__(convert_charrefs=True)
self._dealt_with_doctype_issues = False

self.url: str = url
self.base_url: Optional[str] = None
self.anchors: List[Dict[str, Optional[str]]] = []

def handle_decl(self, decl: str) -> None:
if decl.lower() != "doctype html":
self._raise_error()
self._seen_decl = True
self._dealt_with_doctype_issues = True
match = re.match(
r"""doctype\s+html\s*(?:SYSTEM\s+(["'])about:legacy-compat\1)?\s*$""",
decl,
re.IGNORECASE,
)
if match is None:
logger.warning(
"[present-diagnostic] %s",
BadHTMLDoctypeDeclaration(url=self.url),
)

def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
if not self._seen_decl:
self._raise_error()
if not self._dealt_with_doctype_issues:
logger.warning(
"[present-diagnostic] %s",
MissingHTMLDoctypeDeclaration(url=self.url),
)
self._dealt_with_doctype_issues = True

if tag == "base" and self.base_url is None:
href = self.get_href(attrs)
Expand All @@ -446,14 +441,6 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
return value
return None

def _raise_error(self) -> None:
raise ValueError(
"HTML doctype missing or incorrect. Expected <!DOCTYPE html>.\n\n"
"If you believe this error to be incorrect, try passing the "
"command line option --use-deprecated=html5lib and please leave "
"a comment on the pip issue at https://github.com/pypa/pip/issues/10825."
)


def _handle_get_page_fail(
link: Link,
Expand Down
31 changes: 26 additions & 5 deletions tests/unit/test_collector.py
Expand Up @@ -551,21 +551,42 @@ def test_parse_link_handles_deprecated_usage_properly() -> None:
assert "pkg1-2.0" in parsed_links[1].url


@mock.patch("pip._internal.index.collector.deprecated")
def test_parse_links_presents_deprecation_warning_on_non_html5_page(
mock_deprecated: mock.Mock,
def test_parse_links_presents_warning_on_missing_doctype(
caplog: pytest.LogCaptureFixture,
) -> None:
html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
url = "https://example.com/simple/"
page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

parsed_links = list(parse_links(page, use_deprecated_html5lib=False))
with caplog.at_level(logging.WARN):
parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

assert len(parsed_links) == 2, parsed_links
assert "pkg1-1.0" in parsed_links[0].url
assert "pkg1-2.0" in parsed_links[1].url

mock_deprecated.assert_called_once()
assert len(caplog.records) == 1


def test_parse_links_presents_warning_on_html4_doctype(
caplog: pytest.LogCaptureFixture,
) -> None:
html = (
b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
b'"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
)
url = "https://example.com/simple/"
page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

with caplog.at_level(logging.WARN):
parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

assert len(parsed_links) == 2, parsed_links
assert "pkg1-1.0" in parsed_links[0].url
assert "pkg1-2.0" in parsed_links[1].url

assert len(caplog.records) == 1


@mock.patch("pip._internal.index.collector.raise_for_status")
Expand Down

0 comments on commit cf4655f

Please sign in to comment.