Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add method to extract from a urllib.parse.{ParseResult,SplitResult} #274

Merged
1 commit merged on Oct 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
17 changes: 17 additions & 0 deletions tldextract/remote.py
Expand Up @@ -12,6 +12,23 @@
SCHEME_RE = re.compile(r"^([" + scheme_chars + "]+:)?//")


def lenient_netloc(url: str) -> str:
    """Return the netloc portion of a URL-like string.

    Mirrors the ``netloc`` attribute produced by
    ``urllib.parse.{urlparse,urlsplit}``, but is deliberately lenient:
    malformed input never raises; the string is simply trimmed down
    piece by piece.
    """
    # Drop any leading scheme (e.g. "https://", "//"), leaving the
    # authority component at the front of the string.
    remainder = SCHEME_RE.sub("", url)
    # Cut the string at the first path, query, or fragment delimiter;
    # everything before it is the authority.
    authority = remainder.partition("/")[0]
    authority = authority.partition("?")[0]
    authority = authority.partition("#")[0]
    # Discard userinfo ("user:pass@") and any trailing ":port".
    host = authority.split("@")[-1].partition(":")[0]
    # Trim surrounding whitespace and a trailing root-label dot.
    return host.strip().rstrip(".")


def looks_like_ip(maybe_ip: str) -> bool:
"""Does the given str look like an IP address?"""
if not maybe_ip[0].isdigit():
Expand Down
51 changes: 36 additions & 15 deletions tldextract/tldextract.py
Expand Up @@ -53,11 +53,12 @@
import re
from functools import wraps
from typing import FrozenSet, List, NamedTuple, Optional, Sequence, Union
import urllib.parse

import idna

from .cache import DiskCache, get_cache_dir
from .remote import IP_RE, SCHEME_RE, looks_like_ip
from .remote import IP_RE, lenient_netloc, looks_like_ip
from .suffix_list import get_suffix_lists

LOG = logging.getLogger("tldextract")
Expand Down Expand Up @@ -207,29 +208,49 @@ def __init__( # pylint: disable=too-many-arguments

def __call__(
    self, url: str, include_psl_private_domains: Optional[bool] = None
) -> ExtractResult:
    """Make the extractor directly callable. Alias for `extract_str`."""
    return self.extract_str(url, include_psl_private_domains)

def extract_str(
    self, url: str, include_psl_private_domains: Optional[bool] = None
) -> ExtractResult:
    """
    Takes a string URL and splits it into its subdomain, domain, and
    suffix (effective TLD, gTLD, ccTLD, etc.) components.

    >>> extractor = TLDExtract()
    >>> extractor.extract_str('http://forums.news.cnn.com/')
    ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
    >>> extractor.extract_str('http://forums.bbc.co.uk/')
    ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
    """
    # Leniently pull the host portion out of the raw string, then hand
    # off to the shared netloc-splitting logic.
    netloc = lenient_netloc(url)
    return self._extract_netloc(netloc, include_psl_private_domains)

netloc = (
SCHEME_RE.sub("", url)
.partition("/")[0]
.partition("?")[0]
.partition("#")[0]
.split("@")[-1]
.partition(":")[0]
.strip()
.rstrip(".")
)
def extract_urllib(
    self,
    url: Union[urllib.parse.ParseResult, urllib.parse.SplitResult],
    include_psl_private_domains: Optional[bool] = None,
) -> ExtractResult:
    """
    Takes the output of urllib.parse URL parsing methods and further splits
    the parsed URL into its subdomain, domain, and suffix (effective TLD,
    gTLD, ccTLD, etc.) components.

    This method is like `extract_str` but faster, as the string's domain
    name has already been parsed.

    >>> extractor = TLDExtract()
    >>> extractor.extract_urllib(urllib.parse.urlsplit('http://forums.news.cnn.com/'))
    ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
    >>> extractor.extract_urllib(urllib.parse.urlsplit('http://forums.bbc.co.uk/'))
    ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
    """
    # urllib has already isolated the netloc for us; no lenient parsing
    # is needed here — delegate straight to the shared splitting logic.
    netloc = url.netloc
    return self._extract_netloc(netloc, include_psl_private_domains)

def _extract_netloc(
self, netloc: str, include_psl_private_domains: Optional[bool]
) -> ExtractResult:
labels = _UNICODE_DOTS_RE.split(netloc)

translations = [_decode_punycode(label) for label in labels]
Expand Down