Skip to content

Commit

Permalink
Add method to extract from a urllib.parse.{ParseResult,SplitResult} (#274)
Browse files Browse the repository at this point in the history

* Add method `extract_urllib`
* Add method `extract_str`, an alias for `__call__`
* Move this library's lenient netloc extraction to its own function

The new method `extract_urllib` is like `extract_str` or `__call__` but faster, as the string's domain name has already been parsed.
  • Loading branch information
john-kurkowski committed Oct 3, 2022
1 parent 76f9802 commit 3087963
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 15 deletions.
17 changes: 17 additions & 0 deletions tldextract/remote.py
Expand Up @@ -12,6 +12,23 @@
SCHEME_RE = re.compile(r"^([" + scheme_chars + "]+:)?//")


def lenient_netloc(url: str) -> str:
    """Return the netloc portion of a URL-like string.

    Similar to the ``netloc`` attribute produced by
    ``urllib.parse.{urlparse,urlsplit}``, but extracted more leniently:
    never raises on malformed input.
    """
    # Drop an optional "scheme://" prefix, then cut everything after the
    # authority component: path, query, and fragment, in that order.
    authority = SCHEME_RE.sub("", url)
    authority = authority.partition("/")[0]
    authority = authority.partition("?")[0]
    authority = authority.partition("#")[0]
    # Discard userinfo ("user:pass@") and any port suffix (":8080"),
    # then trim surrounding whitespace and a trailing root-label dot.
    host = authority.split("@")[-1].partition(":")[0]
    return host.strip().rstrip(".")


def looks_like_ip(maybe_ip: str) -> bool:
"""Does the given str look like an IP address?"""
if not maybe_ip[0].isdigit():
Expand Down
51 changes: 36 additions & 15 deletions tldextract/tldextract.py
Expand Up @@ -53,11 +53,12 @@
import re
from functools import wraps
from typing import FrozenSet, List, NamedTuple, Optional, Sequence, Union
import urllib.parse

import idna

from .cache import DiskCache, get_cache_dir
from .remote import IP_RE, SCHEME_RE, looks_like_ip
from .remote import IP_RE, lenient_netloc, looks_like_ip
from .suffix_list import get_suffix_lists

LOG = logging.getLogger("tldextract")
Expand Down Expand Up @@ -207,29 +208,49 @@ def __init__( # pylint: disable=too-many-arguments

    def __call__(
        self, url: str, include_psl_private_domains: Optional[bool] = None
    ) -> ExtractResult:
        """Alias for `extract_str`.

        Lets an extractor instance be called directly on a URL string;
        forwards both arguments to `extract_str` unchanged.
        """
        return self.extract_str(url, include_psl_private_domains)

def extract_str(
self, url: str, include_psl_private_domains: Optional[bool] = None
) -> ExtractResult:
"""
Takes a string URL and splits it into its subdomain, domain, and
suffix (effective TLD, gTLD, ccTLD, etc.) component.
suffix (effective TLD, gTLD, ccTLD, etc.) components.
>>> extract = TLDExtract()
>>> extract('http://forums.news.cnn.com/')
>>> extractor = TLDExtract()
>>> extractor.extract_str('http://forums.news.cnn.com/')
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
>>> extract('http://forums.bbc.co.uk/')
>>> extractor.extract_str('http://forums.bbc.co.uk/')
ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
"""
return self._extract_netloc(lenient_netloc(url), include_psl_private_domains)

netloc = (
SCHEME_RE.sub("", url)
.partition("/")[0]
.partition("?")[0]
.partition("#")[0]
.split("@")[-1]
.partition(":")[0]
.strip()
.rstrip(".")
)
def extract_urllib(
self,
url: Union[urllib.parse.ParseResult, urllib.parse.SplitResult],
include_psl_private_domains: Optional[bool] = None,
) -> ExtractResult:
"""
Takes the output of urllib.parse URL parsing methods and further splits
the parsed URL into its subdomain, domain, and suffix (effective TLD,
gTLD, ccTLD, etc.) components.
This method is like `extract_str` but faster, as the string's domain
name has already been parsed.
>>> extractor = TLDExtract()
>>> extractor.extract_urllib(urllib.parse.urlsplit('http://forums.news.cnn.com/'))
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
>>> extractor.extract_urllib(urllib.parse.urlsplit('http://forums.bbc.co.uk/'))
ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
"""
return self._extract_netloc(url.netloc, include_psl_private_domains)

def _extract_netloc(
self, netloc: str, include_psl_private_domains: Optional[bool]
) -> ExtractResult:
labels = _UNICODE_DOTS_RE.split(netloc)

translations = [_decode_punycode(label) for label in labels]
Expand Down

0 comments on commit 3087963

Please sign in to comment.