diff --git a/tldextract/remote.py b/tldextract/remote.py
index 04886ede..225043fd 100644
--- a/tldextract/remote.py
+++ b/tldextract/remote.py
@@ -12,6 +12,23 @@
 SCHEME_RE = re.compile(r"^([" + scheme_chars + "]+:)?//")
 
 
+def lenient_netloc(url: str) -> str:
+    """Extract the netloc of a URL-like string, similar to the netloc attribute
+    returned by urllib.parse.{urlparse,urlsplit}, but extract more leniently,
+    without raising errors."""
+
+    return (
+        SCHEME_RE.sub("", url)
+        .partition("/")[0]
+        .partition("?")[0]
+        .partition("#")[0]
+        .split("@")[-1]
+        .partition(":")[0]
+        .strip()
+        .rstrip(".")
+    )
+
+
 def looks_like_ip(maybe_ip: str) -> bool:
     """Does the given str look like an IP address?"""
     if not maybe_ip[0].isdigit():
diff --git a/tldextract/tldextract.py b/tldextract/tldextract.py
index f3351073..b964f808 100644
--- a/tldextract/tldextract.py
+++ b/tldextract/tldextract.py
@@ -53,11 +53,12 @@
 import re
 from functools import wraps
 from typing import FrozenSet, List, NamedTuple, Optional, Sequence, Union
+import urllib.parse
 
 import idna
 
 from .cache import DiskCache, get_cache_dir
-from .remote import IP_RE, SCHEME_RE, looks_like_ip
+from .remote import IP_RE, lenient_netloc, looks_like_ip
 from .suffix_list import get_suffix_lists
 
 LOG = logging.getLogger("tldextract")
@@ -207,29 +208,49 @@ def __init__(  # pylint: disable=too-many-arguments
 
     def __call__(
         self, url: str, include_psl_private_domains: Optional[bool] = None
+    ) -> ExtractResult:
+        """Alias for `extract_str`."""
+        return self.extract_str(url, include_psl_private_domains)
+
+    def extract_str(
+        self, url: str, include_psl_private_domains: Optional[bool] = None
     ) -> ExtractResult:
         """
         Takes a string URL and splits it into its subdomain, domain, and
-        suffix (effective TLD, gTLD, ccTLD, etc.) component.
+        suffix (effective TLD, gTLD, ccTLD, etc.) components.
 
-        >>> extract = TLDExtract()
-        >>> extract('http://forums.news.cnn.com/')
+        >>> extractor = TLDExtract()
+        >>> extractor.extract_str('http://forums.news.cnn.com/')
         ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
-        >>> extract('http://forums.bbc.co.uk/')
+        >>> extractor.extract_str('http://forums.bbc.co.uk/')
         ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
         """
+        return self._extract_netloc(lenient_netloc(url), include_psl_private_domains)
 
-        netloc = (
-            SCHEME_RE.sub("", url)
-            .partition("/")[0]
-            .partition("?")[0]
-            .partition("#")[0]
-            .split("@")[-1]
-            .partition(":")[0]
-            .strip()
-            .rstrip(".")
-        )
-
+    def extract_urllib(
+        self,
+        url: Union[urllib.parse.ParseResult, urllib.parse.SplitResult],
+        include_psl_private_domains: Optional[bool] = None,
+    ) -> ExtractResult:
+        """
+        Takes the output of urllib.parse URL parsing methods and further splits
+        the parsed URL into its subdomain, domain, and suffix (effective TLD,
+        gTLD, ccTLD, etc.) components.
+
+        This method is like `extract_str` but faster, as the string's domain
+        name has already been parsed.
+
+        >>> extractor = TLDExtract()
+        >>> extractor.extract_urllib(urllib.parse.urlsplit('http://forums.news.cnn.com/'))
+        ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
+        >>> extractor.extract_urllib(urllib.parse.urlsplit('http://forums.bbc.co.uk/'))
+        ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
+        """
+        return self._extract_netloc(url.netloc, include_psl_private_domains)
+
+    def _extract_netloc(
+        self, netloc: str, include_psl_private_domains: Optional[bool]
+    ) -> ExtractResult:
         labels = _UNICODE_DOTS_RE.split(netloc)
 
         translations = [_decode_punycode(label) for label in labels]
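
Not part of the diff, just a quick sketch of how the new surface fits together once this branch is installed. The names used below (TLDExtract, extract_str, extract_urllib, lenient_netloc) all come from the diff above; the last example URL is arbitrary:

    import urllib.parse

    import tldextract
    from tldextract.remote import lenient_netloc

    extractor = tldextract.TLDExtract()

    # Calling the extractor directly still works: `__call__` now delegates
    # to the new `extract_str`.
    assert extractor("http://forums.news.cnn.com/") == extractor.extract_str(
        "http://forums.news.cnn.com/"
    )

    # `extract_urllib` accepts an already-parsed URL and skips the lenient
    # netloc scan, since urllib has isolated the netloc for us.
    split = urllib.parse.urlsplit("http://forums.bbc.co.uk/")
    assert extractor.extract_urllib(split) == extractor.extract_str(
        "http://forums.bbc.co.uk/"
    )

    # `lenient_netloc` strips scheme, path, query, fragment, userinfo, and
    # port without ever raising, per the chain of partitions above.
    assert lenient_netloc("https://user@example.co.uk:8080/a?q=1#f") == "example.co.uk"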