Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gracefully fall back to html5lib for parsing non-compliant index pages #10847

Merged
merged 3 commits into from Jan 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions news/10847.removal.rst
@@ -0,0 +1 @@
Instead of failing on index pages that use non-compliant HTML 5, print a deprecation warning and fall back to ``html5lib``-based parsing for now. This simplifies the migration for non-compliant index pages by letting such indexes keep functioning, with a warning.
25 changes: 24 additions & 1 deletion src/pip/_internal/index/collector.py
Expand Up @@ -38,6 +38,7 @@
from pip._internal.models.search_scope import SearchScope
from pip._internal.network.session import PipSession
from pip._internal.network.utils import raise_for_status
from pip._internal.utils.deprecation import deprecated
from pip._internal.utils.filetypes import is_archive_file
from pip._internal.utils.misc import pairwise, redact_auth_from_url
from pip._internal.vcs import vcs
Expand Down Expand Up @@ -342,12 +343,34 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin
"""
Parse an HTML document, and yield its anchor elements as Link objects.
"""
encoding = page.encoding or "utf-8"

# Check if the page starts with a valid doctype, to decide whether to use
# html.parser or (deprecated) html5lib for parsing -- unless explicitly
# requested to use html5lib.
if not use_deprecated_html5lib:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess --use-deprecated=html5lib is also a "suppress the warning" flag now, in addition to being an "oh no, the new parser doesn't work for me and I need something NOW" flag.

expected_doctype = "<!doctype html>".encode(encoding)
actual_start = page.content[: len(expected_doctype)]
if actual_start.decode(encoding).lower() != "<!doctype html>":
deprecated(
reason=(
f"The HTML index page being used ({page.url}) is not a proper "
"HTML 5 document. This is in violation of PEP 503 which requires "
"these pages to be well-formed HTML 5 documents. Please reach out "
"to the owners of this index page, and ask them to update this "
"index page to a valid HTML 5 document."
),
replacement=None,
gone_in="22.2",
issue=10825,
)
use_deprecated_html5lib = True

if use_deprecated_html5lib:
yield from _parse_links_html5lib(page)
return

parser = HTMLLinkParser()
encoding = page.encoding or "utf-8"
parser.feed(page.content.decode(encoding))

url = page.url
Expand Down
6 changes: 4 additions & 2 deletions tests/functional/test_install_config.py
Expand Up @@ -16,6 +16,8 @@
)
from tests.lib.venv import VirtualEnvironment

TEST_PYPI_INITOOLS = "https://test.pypi.org/simple/initools/"


def test_options_from_env_vars(script: PipTestEnvironment) -> None:
"""
Expand Down Expand Up @@ -94,7 +96,7 @@ def test_command_line_append_flags(
variables.

"""
script.environ["PIP_FIND_LINKS"] = "https://test.pypi.org"
script.environ["PIP_FIND_LINKS"] = TEST_PYPI_INITOOLS
result = script.pip(
"install",
"-vvv",
Expand Down Expand Up @@ -133,7 +135,7 @@ def test_command_line_appends_correctly(
Test multiple appending options set by environmental variables.

"""
script.environ["PIP_FIND_LINKS"] = f"https://test.pypi.org {data.find_links}"
script.environ["PIP_FIND_LINKS"] = f"{TEST_PYPI_INITOOLS} {data.find_links}"
result = script.pip(
"install",
"-vvv",
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/test_new_resolver_hashes.py
Expand Up @@ -36,7 +36,7 @@ def _create_find_links(script: PipTestEnvironment) -> _FindLinks:
wheel_url=path_to_url(wheel_path),
wheel_hash=wheel_hash,
wheel_path=wheel_path,
)
).strip()
)

return _FindLinks(index_html, sdist_hash, wheel_hash)
Expand Down
21 changes: 19 additions & 2 deletions tests/unit/test_collector.py
Expand Up @@ -540,9 +540,9 @@ def test_parse_links_caches_same_page_by_url() -> None:


def test_parse_link_handles_deprecated_usage_properly() -> None:
html = b'<a href="/pkg1-1.0.tar.gz"><a href="/pkg1-2.0.tar.gz">'
html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
url = "https://example.com/simple/"
page = HTMLPage(html, encoding=None, url=url)
page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

parsed_links = list(parse_links(page, use_deprecated_html5lib=True))

Expand All @@ -551,6 +551,23 @@ def test_parse_link_handles_deprecated_usage_properly() -> None:
assert "pkg1-2.0" in parsed_links[1].url


@mock.patch("pip._internal.index.collector.deprecated")
def test_parse_links_presents_deprecation_warning_on_non_html5_page(
    mock_deprecated: mock.Mock,
) -> None:
    """
    A page without an HTML 5 doctype still yields its links, and the
    deprecation helper is invoked exactly once while parsing it.
    """
    # The content deliberately does not start with "<!doctype html>".
    # NOTE(review): cache_link_parsing=False presumably keeps parse_links
    # from serving a cached result for this URL -- confirm against HTMLPage.
    page = HTMLPage(
        b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>',
        encoding=None,
        url="https://example.com/simple/",
        cache_link_parsing=False,
    )

    links = list(parse_links(page, use_deprecated_html5lib=False))

    # Both anchors must be found, in document order.
    assert len(links) == 2, links
    first_link, second_link = links
    assert "pkg1-1.0" in first_link.url
    assert "pkg1-2.0" in second_link.url

    mock_deprecated.assert_called_once()


@mock.patch("pip._internal.index.collector.raise_for_status")
def test_request_http_error(
mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture
Expand Down