Switch to using html.parser even when the doctype isn't proper
This ensures that html5lib parsing is handled as non-default, deprecated behaviour.
pradyunsg committed Feb 1, 2022
1 parent d5aeced commit a12598f
Showing 2 changed files with 30 additions and 31 deletions.
src/pip/_internal/index/collector.py (30 changes: 4 additions, 26 deletions)
@@ -41,7 +41,6 @@
 from pip._internal.models.search_scope import SearchScope
 from pip._internal.network.session import PipSession
 from pip._internal.network.utils import raise_for_status
-from pip._internal.utils.deprecation import deprecated
 from pip._internal.utils.filetypes import is_archive_file
 from pip._internal.utils.misc import pairwise, redact_auth_from_url
 from pip._internal.vcs import vcs
@@ -346,34 +345,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
     """
     Parse an HTML document, and yield its anchor elements as Link objects.
     """
-    encoding = page.encoding or "utf-8"
-
-    # Check if the page starts with a valid doctype, to decide whether to use
-    # http.parser or (deprecated) html5lib for parsing -- unless explicitly
-    # requested to use html5lib.
-    if not use_deprecated_html5lib:
-        expected_doctype = "<!doctype html>".encode(encoding)
-        actual_start = page.content[: len(expected_doctype)]
-        if actual_start.decode(encoding).lower() != "<!doctype html>":
-            deprecated(
-                reason=(
-                    f"The HTML index page being used ({page.url}) is not a proper "
-                    "HTML 5 document. This is in violation of PEP 503 which requires "
-                    "these pages to be well-formed HTML 5 documents. Please reach out "
-                    "to the owners of this index page, and ask them to update this "
-                    "index page to a valid HTML 5 document."
-                ),
-                replacement=None,
-                gone_in="22.2",
-                issue=10825,
-            )
-            use_deprecated_html5lib = True

     if use_deprecated_html5lib:
         yield from _parse_links_html5lib(page)
         return

-    parser = HTMLLinkParser()
+    parser = HTMLLinkParser(page.url)
+    encoding = page.encoding or "utf-8"
     parser.feed(page.content.decode(encoding))

     url = page.url
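The net effect of the hunk above is that link extraction always goes through the stdlib html.parser machinery, doctype or not. A minimal, self-contained sketch of that approach (illustrative only, not pip's HTMLLinkParser; the sample bytes are borrowed from the tests below):

from html.parser import HTMLParser
from typing import List, Optional, Tuple


class AnchorCollector(HTMLParser):
    """Collect href attributes of <a> tags using only the standard library."""

    def __init__(self) -> None:
        super().__init__()
        self.hrefs: List[str] = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value is not None:
                    self.hrefs.append(value)


# No doctype at all -- html.parser still yields the anchors.
content = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
collector = AnchorCollector()
collector.feed(content.decode("utf-8"))
print(collector.hrefs)  # ['/pkg1-1.0.tar.gz', '/pkg1-2.0.tar.gz']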
@@ -437,14 +415,14 @@ def handle_decl(self, decl: str) -> None:
             re.IGNORECASE,
         )
         if match is None:
-            logger.warn(
+            logger.warning(
                 "[present-diagnostic] %s",
                 BadHTMLDoctypeDeclaration(url=self.url),
             )

     def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
         if not self._dealt_with_doctype_issues:
-            logger.warn(
+            logger.warning(
                 "[present-diagnostic] %s",
                 MissingHTMLDoctypeDeclaration(url=self.url),
             )
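For context on the handle_decl/handle_starttag changes above: html.parser reports a <!DOCTYPE ...> declaration via handle_decl(), and a start tag arriving before any declaration means the doctype is missing. The sketch below mirrors that diagnostic pattern; the regex and the plain warning strings are stand-ins, since the actual pattern and the BadHTMLDoctypeDeclaration / MissingHTMLDoctypeDeclaration diagnostics are only partially visible in this diff.

import logging
import re
from html.parser import HTMLParser
from typing import List, Optional, Tuple

logger = logging.getLogger(__name__)


class DoctypeCheckingParser(HTMLParser):
    """Warn once if a page's doctype is missing or is not plain '<!DOCTYPE html>'."""

    def __init__(self, url: str) -> None:
        super().__init__()
        self.url = url
        self._dealt_with_doctype_issues = False

    def handle_decl(self, decl: str) -> None:
        # Called for "<!DOCTYPE ...>"; decl is e.g. "DOCTYPE html".
        self._dealt_with_doctype_issues = True
        if re.match(r"doctype\s+html\s*$", decl, re.IGNORECASE) is None:
            logger.warning("Index page %s does not declare an HTML 5 doctype: %r", self.url, decl)

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        # Reaching a start tag before any declaration means the doctype is missing.
        if not self._dealt_with_doctype_issues:
            self._dealt_with_doctype_issues = True
            logger.warning("Index page %s has no doctype declaration", self.url)


logging.basicConfig(level=logging.WARNING)
parser = DoctypeCheckingParser("https://example.com/simple/")
parser.feed('<a href="/pkg1-1.0.tar.gz"></a>')  # logs the missing-doctype warning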
tests/unit/test_collector.py (31 changes: 26 additions, 5 deletions)
@@ -551,21 +551,42 @@ def test_parse_link_handles_deprecated_usage_properly() -> None:
     assert "pkg1-2.0" in parsed_links[1].url


-@mock.patch("pip._internal.index.collector.deprecated")
-def test_parse_links_presents_deprecation_warning_on_non_html5_page(
-    mock_deprecated: mock.Mock,
+def test_parse_links_presents_warning_on_missing_doctype(
+    caplog: pytest.LogCaptureFixture,
 ) -> None:
     html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
     url = "https://example.com/simple/"
     page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

-    parsed_links = list(parse_links(page, use_deprecated_html5lib=False))
+    with caplog.at_level(logging.WARN):
+        parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

     assert len(parsed_links) == 2, parsed_links
     assert "pkg1-1.0" in parsed_links[0].url
     assert "pkg1-2.0" in parsed_links[1].url

-    mock_deprecated.assert_called_once()
+    assert len(caplog.records) == 1
+
+
+def test_parse_links_presents_warning_on_html4_doctype(
+    caplog: pytest.LogCaptureFixture,
+) -> None:
+    html = (
+        b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
+        b'"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
+        b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
+    )
+    url = "https://example.com/simple/"
+    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)
+
+    with caplog.at_level(logging.WARN):
+        parsed_links = list(parse_links(page, use_deprecated_html5lib=False))
+
+    assert len(parsed_links) == 2, parsed_links
+    assert "pkg1-1.0" in parsed_links[0].url
+    assert "pkg1-2.0" in parsed_links[1].url
+
+    assert len(caplog.records) == 1


 @mock.patch("pip._internal.index.collector.raise_for_status")
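The rewritten tests swap the deprecated() mock for pytest's caplog fixture: any records logged inside the with block are captured and can be counted afterwards. A stripped-down illustration of that pattern (generic pytest usage, not pip's test code):

import logging

import pytest


def emit_diagnostic() -> None:
    # Stand-in for parse_links() logging a doctype diagnostic.
    logging.getLogger("collector").warning("index page has no doctype declaration")


def test_exactly_one_warning_is_recorded(caplog: pytest.LogCaptureFixture) -> None:
    with caplog.at_level(logging.WARNING):
        emit_diagnostic()

    assert len(caplog.records) == 1
    assert caplog.records[0].levelno == logging.WARNING
    assert "doctype" in caplog.records[0].getMessage()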
