Skip to content

Commit

Permalink
Merge pull request #10291 from jdufresne/html5lib
Browse files Browse the repository at this point in the history
  • Loading branch information
pradyunsg committed Jan 28, 2022
2 parents 98b1022 + 20fe83f commit 649048b
Show file tree
Hide file tree
Showing 26 changed files with 177 additions and 28 deletions.
5 changes: 5 additions & 0 deletions news/10291.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Changed ``PackageFinder`` to parse HTML documents using the stdlib
:class:`html.parser.HTMLParser` class instead of the ``html5lib`` package. For
now, the deprecated ``html5lib`` code remains and can be used with the
``--use-deprecated=html5lib`` command line option, but it will be removed in a
future pip release.
7 changes: 6 additions & 1 deletion src/pip/_internal/cli/cmdoptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -964,7 +964,12 @@ def check_list_path_option(options: Values) -> None:
metavar="feature",
action="append",
default=[],
choices=["legacy-resolver", "out-of-tree-build", "backtrack-on-build-failures"],
choices=[
"legacy-resolver",
"out-of-tree-build",
"backtrack-on-build-failures",
"html5lib",
],
help=("Enable deprecated functionality, that will be removed in the future."),
)

Expand Down
1 change: 1 addition & 0 deletions src/pip/_internal/cli/req_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,4 +502,5 @@ def _build_package_finder(
link_collector=link_collector,
selection_prefs=selection_prefs,
target_python=target_python,
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
)
1 change: 1 addition & 0 deletions src/pip/_internal/commands/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def _build_package_finder(
link_collector=link_collector,
selection_prefs=selection_prefs,
target_python=target_python,
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
)

def get_available_package_versions(self, options: Values, args: List[Any]) -> None:
Expand Down
1 change: 1 addition & 0 deletions src/pip/_internal/commands/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def _build_package_finder(
return PackageFinder.create(
link_collector=link_collector,
selection_prefs=selection_prefs,
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
)

def run(self, options: Values, args: List[str]) -> int:
Expand Down
118 changes: 103 additions & 15 deletions src/pip/_internal/index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,20 @@
import urllib.parse
import urllib.request
import xml.etree.ElementTree
from html.parser import HTMLParser
from optparse import Values
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
MutableMapping,
NamedTuple,
Optional,
Sequence,
Tuple,
Union,
)

Expand All @@ -39,6 +44,11 @@

from .sources import CandidatesFromPage, LinkSource, build_source

if TYPE_CHECKING:
from typing import Protocol
else:
Protocol = object

logger = logging.getLogger(__name__)

HTMLElement = xml.etree.ElementTree.Element
Expand Down Expand Up @@ -163,6 +173,8 @@ def _determine_base_url(document: HTMLElement, page_url: str) -> str:
:param document: An HTML document representation. The current
implementation expects the result of ``html5lib.parse()``.
:param page_url: The URL of the HTML document.
TODO: Remove when `html5lib` is dropped.
"""
for base in document.findall(".//base"):
href = base.get("href")
Expand Down Expand Up @@ -234,20 +246,20 @@ def _clean_link(url: str) -> str:


def _create_link_from_element(
anchor: HTMLElement,
element_attribs: Dict[str, Optional[str]],
page_url: str,
base_url: str,
) -> Optional[Link]:
"""
Convert an anchor element in a simple repository page to a Link.
Convert an anchor element's attributes in a simple repository page to a Link.
"""
href = anchor.get("href")
href = element_attribs.get("href")
if not href:
return None

url = _clean_link(urllib.parse.urljoin(base_url, href))
pyrequire = anchor.get("data-requires-python")
yanked_reason = anchor.get("data-yanked")
pyrequire = element_attribs.get("data-requires-python")
yanked_reason = element_attribs.get("data-yanked")

link = Link(
url,
Expand All @@ -271,32 +283,40 @@ def __hash__(self) -> int:
return hash(self.page.url)


def with_cached_html_pages(
fn: Callable[["HTMLPage"], Iterable[Link]],
) -> Callable[["HTMLPage"], List[Link]]:
class ParseLinks(Protocol):
def __call__(
self, page: "HTMLPage", use_deprecated_html5lib: bool
) -> Iterable[Link]:
...


def with_cached_html_pages(fn: ParseLinks) -> ParseLinks:
"""
Given a function that parses an Iterable[Link] from an HTMLPage, cache the
function's result (keyed by CacheablePageContent), unless the HTMLPage
`page` has `page.cache_link_parsing == False`.
"""

@functools.lru_cache(maxsize=None)
def wrapper(cacheable_page: CacheablePageContent) -> List[Link]:
return list(fn(cacheable_page.page))
def wrapper(
cacheable_page: CacheablePageContent, use_deprecated_html5lib: bool
) -> List[Link]:
return list(fn(cacheable_page.page, use_deprecated_html5lib))

@functools.wraps(fn)
def wrapper_wrapper(page: "HTMLPage") -> List[Link]:
def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]:
if page.cache_link_parsing:
return wrapper(CacheablePageContent(page))
return list(fn(page))
return wrapper(CacheablePageContent(page), use_deprecated_html5lib)
return list(fn(page, use_deprecated_html5lib))

return wrapper_wrapper


@with_cached_html_pages
def parse_links(page: "HTMLPage") -> Iterable[Link]:
def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
"""
Parse an HTML document, and yield its anchor elements as Link objects.
TODO: Remove when `html5lib` is dropped.
"""
document = html5lib.parse(
page.content,
Expand All @@ -307,6 +327,31 @@ def parse_links(page: "HTMLPage") -> Iterable[Link]:
url = page.url
base_url = _determine_base_url(document, url)
for anchor in document.findall(".//a"):
link = _create_link_from_element(
anchor.attrib,
page_url=url,
base_url=base_url,
)
if link is None:
continue
yield link


@with_cached_html_pages
def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
"""
Parse an HTML document, and yield its anchor elements as Link objects.
"""
if use_deprecated_html5lib:
return _parse_links_html5lib(page)

parser = HTMLLinkParser()
encoding = page.encoding or "utf-8"
parser.feed(page.content.decode(encoding))

url = page.url
base_url = parser.base_url or url
for anchor in parser.anchors:
link = _create_link_from_element(
anchor,
page_url=url,
Expand Down Expand Up @@ -343,6 +388,49 @@ def __str__(self) -> str:
return redact_auth_from_url(self.url)


class HTMLLinkParser(HTMLParser):
    """
    HTMLParser that keeps the first base HREF and a list of all anchor
    elements' attributes.

    The document must start with an HTML5 doctype declaration before any
    start tag is seen; otherwise a ``ValueError`` is raised while feeding.
    The doctype is matched case-insensitively (``<!DOCTYPE html>`` and
    ``<!doctype html>`` are equivalent) and surrounding/internal runs of
    whitespace are ignored.

    Attributes:
        base_url: ``href`` of the first ``<base>`` element that has one,
            or ``None`` if no such element was seen.
        anchors: One ``{attribute-name: value}`` dict per ``<a>`` start tag,
            in document order. Valueless attributes map to ``None``.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Set once a valid HTML5 doctype declaration has been parsed.
        self._seen_decl = False
        self.base_url: Optional[str] = None
        self.anchors: List[Dict[str, Optional[str]]] = []

    def handle_decl(self, decl: str) -> None:
        """Validate the doctype declaration (text between ``<!`` and ``>``).

        :raises ValueError: if the declaration is not an HTML5 doctype.
        """
        # The HTML standard treats both "DOCTYPE" and "html" as ASCII
        # case-insensitive and allows arbitrary whitespace between/after
        # them, so normalise both before comparing.
        normalized = " ".join(decl.split()).lower()
        if normalized != "doctype html":
            self._raise_error()
        self._seen_decl = True

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        """Record ``<base>`` and ``<a>`` start tags.

        :raises ValueError: if a start tag appears before a valid doctype.
        """
        if not self._seen_decl:
            self._raise_error()

        if tag == "base" and self.base_url is None:
            # Only the first <base> carrying an href is honoured.
            href = self.get_href(attrs)
            if href is not None:
                self.base_url = href
        elif tag == "a":
            self.anchors.append(dict(attrs))

    def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
        """Return the value of the first ``href`` attribute, or ``None``."""
        for name, value in attrs:
            if name == "href":
                return value
        return None

    def _raise_error(self) -> None:
        """Raise the user-facing error for a missing or invalid doctype."""
        raise ValueError(
            "HTML doctype missing or incorrect. Expected <!DOCTYPE html>.\n\n"
            "If you believe this error to be incorrect, try passing the "
            "command line option --use-deprecated=html5lib and please leave "
            "a comment on the pip issue at https://github.com/pypa/pip/issues/10825."
        )


def _handle_get_page_fail(
link: Link,
reason: Union[str, Exception],
Expand Down
7 changes: 6 additions & 1 deletion src/pip/_internal/index/package_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,7 @@ def __init__(
link_collector: LinkCollector,
target_python: TargetPython,
allow_yanked: bool,
use_deprecated_html5lib: bool,
format_control: Optional[FormatControl] = None,
candidate_prefs: Optional[CandidatePreferences] = None,
ignore_requires_python: Optional[bool] = None,
Expand All @@ -604,6 +605,7 @@ def __init__(
self._ignore_requires_python = ignore_requires_python
self._link_collector = link_collector
self._target_python = target_python
self._use_deprecated_html5lib = use_deprecated_html5lib

self.format_control = format_control

Expand All @@ -620,6 +622,8 @@ def create(
link_collector: LinkCollector,
selection_prefs: SelectionPreferences,
target_python: Optional[TargetPython] = None,
*,
use_deprecated_html5lib: bool,
) -> "PackageFinder":
"""Create a PackageFinder.
Expand All @@ -644,6 +648,7 @@ def create(
allow_yanked=selection_prefs.allow_yanked,
format_control=selection_prefs.format_control,
ignore_requires_python=selection_prefs.ignore_requires_python,
use_deprecated_html5lib=use_deprecated_html5lib,
)

@property
Expand Down Expand Up @@ -765,7 +770,7 @@ def process_project_url(
if html_page is None:
return []

page_links = list(parse_links(html_page))
page_links = list(parse_links(html_page, self._use_deprecated_html5lib))

with indent_log():
package_links = self.evaluate_links(
Expand Down
3 changes: 3 additions & 0 deletions src/pip/_internal/self_outdated_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ def pip_self_version_check(session: PipSession, options: optparse.Values) -> Non
finder = PackageFinder.create(
link_collector=link_collector,
selection_prefs=selection_prefs,
use_deprecated_html5lib=(
"html5lib" in options.deprecated_features_enabled
),
)
best_candidate = finder.find_best_candidate("pip").best_candidate
if best_candidate is None:
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/datarequire/fakepackage/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html><head><title>Links for fakepackage</title><meta name="api-version" value="2" /></head><body><h1>Links for fakepackage</h1>
<a data-requires-python='' href="/fakepackage-1.0.0.tar.gz#md5=00000000000000000000000000000000" rel="internal">fakepackage-1.0.0.tar.gz</a><br/>
<a data-requires-python='&lt;2.7' href="/fakepackage-2.6.0.tar.gz#md5=00000000000000000000000000000000" rel="internal">fakepackage-2.6.0.tar.gz</a><br/>
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/dev/bar/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html>
<body>
<a href="bar-1.0.tar.gz">bar-1.0.tar.gz</a>
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/in dex/simple/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html>
<body>
<a href="../../../packages/simple-1.0.tar.gz#md5=4bdf78ebb7911f215c1972cf71b378f0">simple-1.0.tar.gz</a>
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/pre/bar/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html>
<body>
<a href="bar-1.0.tar.gz">bar-1.0.tar.gz</a>
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/simple/simple/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html>
<body>
<a href="../../../packages/simple-1.0.tar.gz#md5=4bdf78ebb7911f215c1972cf71b378f0">simple-1.0.tar.gz</a>
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/yanked/simple/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html>
<body>
<a href="../../../packages/simple-1.0.tar.gz">simple-1.0.tar.gz</a>
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/yanked_all/simple/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html>
<body>
<a data-yanked="test reason message" href="../../../packages/simple-1.0.tar.gz">simple-1.0.tar.gz</a>
Expand Down
1 change: 1 addition & 0 deletions tests/data/packages3/dinner/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html><head><title>PyPI Mirror</title></head>
<body>
<h1>PyPI Mirror</h1>
Expand Down
1 change: 1 addition & 0 deletions tests/data/packages3/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html><head><title>PyPI Mirror</title></head>
<body>
<h1>PyPI Mirror</h1>
Expand Down
1 change: 1 addition & 0 deletions tests/data/packages3/requiredinner/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html><head><title>PyPI Mirror</title></head>
<body>
<h1>PyPI Mirror</h1>
Expand Down
1 change: 1 addition & 0 deletions tests/functional/test_build_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def run_with_build_env(
finder = PackageFinder.create(
link_collector=link_collector,
selection_prefs=selection_prefs,
use_deprecated_html5lib=False,
)
with global_tempdir_manager():
Expand Down
1 change: 1 addition & 0 deletions tests/functional/test_new_resolver_hashes.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def _create_find_links(script: PipTestEnvironment) -> _FindLinks:
index_html = script.scratch_path / "index.html"
index_html.write_text(
"""
<!DOCTYPE html>
<a href="{sdist_url}#sha256={sdist_hash}">{sdist_path.stem}</a>
<a href="{wheel_url}#sha256={wheel_hash}">{wheel_path.stem}</a>
""".format(
Expand Down
2 changes: 2 additions & 0 deletions tests/lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ def make_test_finder(
allow_all_prereleases: bool = False,
session: Optional[PipSession] = None,
target_python: Optional[TargetPython] = None,
use_deprecated_html5lib: bool = False,
) -> PackageFinder:
"""
Create a PackageFinder for testing purposes.
Expand All @@ -159,6 +160,7 @@ def make_test_finder(
link_collector=link_collector,
selection_prefs=selection_prefs,
target_python=target_python,
use_deprecated_html5lib=use_deprecated_html5lib,
)


Expand Down
2 changes: 1 addition & 1 deletion tests/unit/resolution_resolvelib/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def finder(data: TestData) -> Iterator[PackageFinder]:
scope = SearchScope([str(data.packages)], [])
collector = LinkCollector(session, scope)
prefs = SelectionPreferences(allow_yanked=False)
finder = PackageFinder.create(collector, prefs)
finder = PackageFinder.create(collector, prefs, use_deprecated_html5lib=False)
yield finder


Expand Down

0 comments on commit 649048b

Please sign in to comment.