Skip to content

Commit

Permalink
Add method to extract from a urllib.parse.{ParseResult,SplitResult} (#274)
Browse files Browse the repository at this point in the history

* Add method `extract_urllib`
* Add method `extract_str`, an alias for `__call__`
* Move this library's lenient netloc extraction to its own function

The new method `extract_urllib` is like `extract_str` or `__call__` but faster, as the string's domain name has already been parsed.
  • Loading branch information
john-kurkowski committed Oct 3, 2022
1 parent 76f9802 commit 3087963
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 15 deletions.
17 changes: 17 additions & 0 deletions tldextract/remote.py
Expand Up @@ -12,6 +12,23 @@
SCHEME_RE = re.compile(r"^([" + scheme_chars + "]+:)?//")


def lenient_netloc(url: str) -> str:
    """Return the netloc portion of a URL-like string.

    Similar to the ``netloc`` attribute produced by
    ``urllib.parse.{urlparse,urlsplit}``, but extracted more leniently:
    never raises on malformed input.
    """
    # Drop an optional "scheme://" prefix, then cut everything after the
    # authority component: path, query, and fragment, in that order.
    authority = SCHEME_RE.sub("", url)
    authority = authority.partition("/")[0]
    authority = authority.partition("?")[0]
    authority = authority.partition("#")[0]
    # Discard userinfo ("user:pass@") and any port suffix (":8080"),
    # then trim surrounding whitespace and a trailing root-label dot.
    host = authority.split("@")[-1].partition(":")[0]
    return host.strip().rstrip(".")


def looks_like_ip(maybe_ip: str) -> bool:
"""Does the given str look like an IP address?"""
if not maybe_ip[0].isdigit():
Expand Down
51 changes: 36 additions & 15 deletions tldextract/tldextract.py
Expand Up @@ -53,11 +53,12 @@
import re
from functools import wraps
from typing import FrozenSet, List, NamedTuple, Optional, Sequence, Union
import urllib.parse

import idna

from .cache import DiskCache, get_cache_dir
from .remote import IP_RE, SCHEME_RE, looks_like_ip
from .remote import IP_RE, lenient_netloc, looks_like_ip
from .suffix_list import get_suffix_lists

LOG = logging.getLogger("tldextract")
Expand Down Expand Up @@ -207,29 +208,49 @@ def __init__( # pylint: disable=too-many-arguments

    def __call__(
        self, url: str, include_psl_private_domains: Optional[bool] = None
    ) -> ExtractResult:
        """Alias for `extract_str`.

        Lets an extractor instance be called directly on a URL string;
        forwards both arguments to `extract_str` unchanged.
        """
        return self.extract_str(url, include_psl_private_domains)

def extract_str(
self, url: str, include_psl_private_domains: Optional[bool] = None
) -> ExtractResult:
"""
Takes a string URL and splits it into its subdomain, domain, and
suffix (effective TLD, gTLD, ccTLD, etc.) component.
suffix (effective TLD, gTLD, ccTLD, etc.) components.
>>> extract = TLDExtract()
>>> extract('http://forums.news.cnn.com/')
>>> extractor = TLDExtract()
>>> extractor.extract_str('http://forums.news.cnn.com/')
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
>>> extract('http://forums.bbc.co.uk/')
>>> extractor.extract_str('http://forums.bbc.co.uk/')
ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
"""
return self._extract_netloc(lenient_netloc(url), include_psl_private_domains)

netloc = (
SCHEME_RE.sub("", url)
.partition("/")[0]
.partition("?")[0]
.partition("#")[0]
.split("@")[-1]
.partition(":")[0]
.strip()
.rstrip(".")
)
def extract_urllib(
self,
url: Union[urllib.parse.ParseResult, urllib.parse.SplitResult],
include_psl_private_domains: Optional[bool] = None,
) -> ExtractResult:
"""
Takes the output of urllib.parse URL parsing methods and further splits
the parsed URL into its subdomain, domain, and suffix (effective TLD,
gTLD, ccTLD, etc.) components.
This method is like `extract_str` but faster, as the string's domain
name has already been parsed.
>>> extractor = TLDExtract()
>>> extractor.extract_urllib(urllib.parse.urlsplit('http://forums.news.cnn.com/'))
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
>>> extractor.extract_urllib(urllib.parse.urlsplit('http://forums.bbc.co.uk/'))
ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
"""
return self._extract_netloc(url.netloc, include_psl_private_domains)

def _extract_netloc(
self, netloc: str, include_psl_private_domains: Optional[bool]
) -> ExtractResult:
labels = _UNICODE_DOTS_RE.split(netloc)

translations = [_decode_punycode(label) for label in labels]
Expand Down

0 comments on commit 3087963

Please sign in to comment.