Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Put html5lib logic behind a flag, unconditionally #10869

Merged
merged 4 commits into from
Feb 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions news/10869.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Use ``html.parser`` by default, instead of falling back to ``html5lib`` when ``--use-deprecated=html5lib`` is not passed.
24 changes: 24 additions & 0 deletions src/pip/_internal/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,30 @@ class UninstallationError(PipError):
"""General exception during uninstallation"""


class BadHTMLDoctypeDeclaration(DiagnosticPipError):
reference = "bad-index-doctype"

def __init__(self, *, url: str) -> None:
super().__init__(
kind="warning",
message=(
"The package index page being used does not have a proper HTML "
"doctype declaration."
),
context=f"Problematic URL: {escape(url)}",
note_stmt="This is an issue with the page at the URL mentioned above.",
hint_stmt=(
"You might need to reach out to the owner of that package index, "
"to get this fixed. "
"See https://github.com/pypa/pip/issues/10825 for context."
),
)


class MissingHTMLDoctypeDeclaration(BadHTMLDoctypeDeclaration):
reference = "missing-index-doctype"


class MissingPyProjectBuildRequires(DiagnosticPipError):
"""Raised when pyproject.toml has `build-system`, but no `build-system.requires`."""

Expand Down
71 changes: 29 additions & 42 deletions src/pip/_internal/index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from optparse import Values
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
Expand All @@ -33,12 +32,15 @@
from pip._vendor.requests import Response
from pip._vendor.requests.exceptions import RetryError, SSLError

from pip._internal.exceptions import NetworkConnectionError
from pip._internal.exceptions import (
BadHTMLDoctypeDeclaration,
MissingHTMLDoctypeDeclaration,
NetworkConnectionError,
)
from pip._internal.models.link import Link
from pip._internal.models.search_scope import SearchScope
from pip._internal.network.session import PipSession
from pip._internal.network.utils import raise_for_status
from pip._internal.utils.deprecation import deprecated
from pip._internal.utils.filetypes import is_archive_file
from pip._internal.utils.misc import pairwise, redact_auth_from_url
from pip._internal.vcs import vcs
Expand Down Expand Up @@ -343,34 +345,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin
"""
Parse an HTML document, and yield its anchor elements as Link objects.
"""
encoding = page.encoding or "utf-8"

# Check if the page starts with a valid doctype, to decide whether to use
# http.parser or (deprecated) html5lib for parsing -- unless explicitly
# requested to use html5lib.
if not use_deprecated_html5lib:
expected_doctype = "<!doctype html>".encode(encoding)
actual_start = page.content[: len(expected_doctype)]
if actual_start.decode(encoding).lower() != "<!doctype html>":
deprecated(
reason=(
f"The HTML index page being used ({page.url}) is not a proper "
"HTML 5 document. This is in violation of PEP 503 which requires "
"these pages to be well-formed HTML 5 documents. Please reach out "
"to the owners of this index page, and ask them to update this "
"index page to a valid HTML 5 document."
),
replacement=None,
gone_in="22.2",
issue=10825,
)
use_deprecated_html5lib = True

if use_deprecated_html5lib:
yield from _parse_links_html5lib(page)
return

parser = HTMLLinkParser()
parser = HTMLLinkParser(page.url)
encoding = page.encoding or "utf-8"
parser.feed(page.content.decode(encoding))

url = page.url
Expand Down Expand Up @@ -418,20 +399,34 @@ class HTMLLinkParser(HTMLParser):
elements' attributes.
"""

def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self._seen_decl = False
def __init__(self, url: str) -> None:
super().__init__(convert_charrefs=True)
self._dealt_with_doctype_issues = False

self.url: str = url
self.base_url: Optional[str] = None
self.anchors: List[Dict[str, Optional[str]]] = []

def handle_decl(self, decl: str) -> None:
if decl.lower() != "doctype html":
self._raise_error()
self._seen_decl = True
self._dealt_with_doctype_issues = True
match = re.match(
r"""doctype\s+html\s*(?:SYSTEM\s+(["'])about:legacy-compat\1)?\s*$""",
decl,
re.IGNORECASE,
)
if match is None:
logger.warning(
"[present-diagnostic] %s",
BadHTMLDoctypeDeclaration(url=self.url),
)

def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
if not self._seen_decl:
self._raise_error()
if not self._dealt_with_doctype_issues:
logger.warning(
"[present-diagnostic] %s",
MissingHTMLDoctypeDeclaration(url=self.url),
)
self._dealt_with_doctype_issues = True

if tag == "base" and self.base_url is None:
href = self.get_href(attrs)
Expand All @@ -446,14 +441,6 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
return value
return None

def _raise_error(self) -> None:
raise ValueError(
"HTML doctype missing or incorrect. Expected <!DOCTYPE html>.\n\n"
"If you believe this error to be incorrect, try passing the "
"command line option --use-deprecated=html5lib and please leave "
"a comment on the pip issue at https://github.com/pypa/pip/issues/10825."
)


def _handle_get_page_fail(
link: Link,
Expand Down
31 changes: 26 additions & 5 deletions tests/unit/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,21 +551,42 @@ def test_parse_link_handles_deprecated_usage_properly() -> None:
assert "pkg1-2.0" in parsed_links[1].url


@mock.patch("pip._internal.index.collector.deprecated")
def test_parse_links_presents_deprecation_warning_on_non_html5_page(
mock_deprecated: mock.Mock,
def test_parse_links_presents_warning_on_missing_doctype(
caplog: pytest.LogCaptureFixture,
) -> None:
html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
url = "https://example.com/simple/"
page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

parsed_links = list(parse_links(page, use_deprecated_html5lib=False))
with caplog.at_level(logging.WARN):
parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

assert len(parsed_links) == 2, parsed_links
assert "pkg1-1.0" in parsed_links[0].url
assert "pkg1-2.0" in parsed_links[1].url

mock_deprecated.assert_called_once()
assert len(caplog.records) == 1


def test_parse_links_presents_warning_on_html4_doctype(
caplog: pytest.LogCaptureFixture,
) -> None:
html = (
b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
b'"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
)
url = "https://example.com/simple/"
page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

with caplog.at_level(logging.WARN):
parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

assert len(parsed_links) == 2, parsed_links
assert "pkg1-1.0" in parsed_links[0].url
assert "pkg1-2.0" in parsed_links[1].url

assert len(caplog.records) == 1


@mock.patch("pip._internal.index.collector.raise_for_status")
Expand Down