diff --git a/news/10847.removal.rst b/news/10847.removal.rst new file mode 100644 index 00000000000..d29913d986f --- /dev/null +++ b/news/10847.removal.rst @@ -0,0 +1 @@ +Instead of failing on index pages that use non-compliant HTML 5, print a deprecation warning and fall back to ``html5lib``-based parsing for now. This simplifies the migration for non-compliant index pages, by letting such indexes function with a warning. diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 5de084a8ad9..4ecbb337805 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -38,6 +38,7 @@ from pip._internal.models.search_scope import SearchScope from pip._internal.network.session import PipSession from pip._internal.network.utils import raise_for_status +from pip._internal.utils.deprecation import deprecated from pip._internal.utils.filetypes import is_archive_file from pip._internal.utils.misc import pairwise, redact_auth_from_url from pip._internal.vcs import vcs @@ -342,12 +343,34 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin """ Parse an HTML document, and yield its anchor elements as Link objects. """ + encoding = page.encoding or "utf-8" + + # Check if the page starts with a valid doctype, to decide whether to use + # html.parser or (deprecated) html5lib for parsing -- unless explicitly + # requested to use html5lib. + if not use_deprecated_html5lib: + expected_doctype = "".encode(encoding) + actual_start = page.content[: len(expected_doctype)] + if actual_start.decode(encoding).lower() != "": + deprecated( + reason=( + f"The HTML index page being used ({page.url}) is not a proper " + "HTML 5 document. This is in violation of PEP 503 which requires " + "these pages to be well-formed HTML 5 documents. Please reach out " + "to the owners of this index page, and ask them to update this " + "index page to a valid HTML 5 document." 
+ ), + replacement=None, + gone_in="22.2", + issue=10825, + ) + use_deprecated_html5lib = True + if use_deprecated_html5lib: yield from _parse_links_html5lib(page) return parser = HTMLLinkParser() - encoding = page.encoding or "utf-8" parser.feed(page.content.decode(encoding)) url = page.url diff --git a/tests/functional/test_install_config.py b/tests/functional/test_install_config.py index 5eae0401aa7..2e4bb742785 100644 --- a/tests/functional/test_install_config.py +++ b/tests/functional/test_install_config.py @@ -16,6 +16,8 @@ ) from tests.lib.venv import VirtualEnvironment +TEST_PYPI_INITOOLS = "https://test.pypi.org/simple/initools/" + def test_options_from_env_vars(script: PipTestEnvironment) -> None: """ @@ -94,7 +96,7 @@ def test_command_line_append_flags( variables. """ - script.environ["PIP_FIND_LINKS"] = "https://test.pypi.org" + script.environ["PIP_FIND_LINKS"] = TEST_PYPI_INITOOLS result = script.pip( "install", "-vvv", @@ -133,7 +135,7 @@ def test_command_line_appends_correctly( Test multiple appending options set by environmental variables. 
""" - script.environ["PIP_FIND_LINKS"] = f"https://test.pypi.org {data.find_links}" + script.environ["PIP_FIND_LINKS"] = f"{TEST_PYPI_INITOOLS} {data.find_links}" result = script.pip( "install", "-vvv", diff --git a/tests/functional/test_new_resolver_hashes.py b/tests/functional/test_new_resolver_hashes.py index 4c4c2253e99..80ed86219d4 100644 --- a/tests/functional/test_new_resolver_hashes.py +++ b/tests/functional/test_new_resolver_hashes.py @@ -36,7 +36,7 @@ def _create_find_links(script: PipTestEnvironment) -> _FindLinks: wheel_url=path_to_url(wheel_path), wheel_hash=wheel_hash, wheel_path=wheel_path, - ) + ).strip() ) return _FindLinks(index_html, sdist_hash, wheel_hash) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 4b51d1cff98..403da9c4005 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -540,9 +540,9 @@ def test_parse_links_caches_same_page_by_url() -> None: def test_parse_link_handles_deprecated_usage_properly() -> None: - html = b'' + html = b'' url = "https://example.com/simple/" - page = HTMLPage(html, encoding=None, url=url) + page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False) parsed_links = list(parse_links(page, use_deprecated_html5lib=True)) @@ -551,6 +551,23 @@ def test_parse_link_handles_deprecated_usage_properly() -> None: assert "pkg1-2.0" in parsed_links[1].url +@mock.patch("pip._internal.index.collector.deprecated") +def test_parse_links_presents_deprecation_warning_on_non_html5_page( + mock_deprecated: mock.Mock, +) -> None: + html = b'' + url = "https://example.com/simple/" + page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False) + + parsed_links = list(parse_links(page, use_deprecated_html5lib=False)) + + assert len(parsed_links) == 2, parsed_links + assert "pkg1-1.0" in parsed_links[0].url + assert "pkg1-2.0" in parsed_links[1].url + + mock_deprecated.assert_called_once() + + @mock.patch("pip._internal.index.collector.raise_for_status") def 
test_request_http_error( mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture