Skip to content

Commit

Permalink
Fix sphinx-doc#6629: linkcheck: Handle rate-limiting
Browse files Browse the repository at this point in the history
Follow the Retry-After header if present, otherwise use an exponential
back-off.
  • Loading branch information
francoisfreitag committed Nov 20, 2020
1 parent 13a986b commit 28c8667
Show file tree
Hide file tree
Showing 5 changed files with 384 additions and 9 deletions.
5 changes: 4 additions & 1 deletion doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,10 @@
1),
]

intersphinx_mapping = {'python': ('https://docs.python.org/3/', None)}
intersphinx_mapping = {
'python': ('https://docs.python.org/3/', None),
'requests': ('https://requests.readthedocs.io/en/master', None),
}

# Sphinx document translation with sphinx gettext feature uses these settings:
locale_dirs = ['locale/']
Expand Down
5 changes: 5 additions & 0 deletions doc/usage/builders/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,11 @@ name is ``rinoh``. Refer to the `rinohtype manual`_ for details.

Since Sphinx 1.5, the linkcheck builder uses the requests module.

.. versionchanged:: 3.2

The linkcheck builder retries links when the server replies with rate
limits.

.. module:: sphinx.builders.xml
.. class:: XMLBuilder

Expand Down
58 changes: 58 additions & 0 deletions doc/usage/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2525,6 +2525,64 @@ Options for the linkcheck builder

.. versionadded:: 2.3

.. confval:: linkcheck_retry_on_rate_limit

The ``linkcheck`` builder may issue a large number of requests to the same
site over a short period of time. This setting controls the builder behavior
when servers indicate the requests are rate-limited.

When a server indicates when to retry (using the `Retry-After`_ header),
``linkcheck`` follows the server indication. Otherwise, it retries after a
configurable delay.

.. _Retry-After: https://tools.ietf.org/html/rfc2616#section-14.37

``linkcheck_retry_on_rate_limit`` is a mapping of domains to retry policies.

The key is a network location, like ``"sphinx-doc.org"`` or
``"localhost:7777"``. A catch-all policy can be specified with the ``"*"``
key. The default policy is to retry after a minute, and keep doubling the
wait time between attempts until the wait time exceeds 5 minutes, after
which the link is marked as broken.

The value is a function with signature:

.. function:: retry(response, delay) -> Union[bool, float]
:noindex:

:param requests.Response response: returned by the server.
:param int delay: last wait time for that domain, initially 0.
:return: The return value can either be a bool or a float:

- **bool** ``True`` to keep retrying, ``False`` to stop.
- **float** time in seconds to wait before the next attempt.

The retry policy function is called immediately after a server returns an
HTTP response with status code `429
<https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429>`_ (Too Many
Requests).

Example:

.. code-block:: python

   linkcheck_retry_on_rate_limit = {
       # Never retry.
       "wikipedia.org": lambda response, delay: False,
       # Retry until wait time exceeds 10 minutes.
       "sphinx-doc.org": lambda response, delay: delay <= 600.0,
       # Retry after 2 minutes.
       "example.org": lambda response, delay: 120.0,
       # Retry after 10 times the last wait time.
       "linux.org": lambda response, delay: 10.0 * delay,
       # Retry while header X-Go-Away is present in the response.
       "localhost:7777": lambda response, delay: "X-Go-Away" in response.headers,
       # Retry forever.
       "*": lambda response, delay: True,
   }

.. versionadded:: 3.2


Options for the XML builder
---------------------------
Expand Down
120 changes: 112 additions & 8 deletions sphinx/builders/linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
import re
import socket
import threading
import time
from collections import namedtuple
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from html.parser import HTMLParser
from os import path
from typing import Any, Dict, List, Set, Tuple
Expand All @@ -33,10 +37,13 @@

uri_re = re.compile('([a-z]+:)?//') # matches to foo:// and // (a protocol relative URL)

# Per-host rate-limit state: the last wait time in seconds, and the POSIX
# timestamp before which the host must not be contacted again.
RateLimit = namedtuple('RateLimit', ('delay', 'next_check'))

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
}
# Seconds a worker sleeps before re-queueing a link whose check is not yet due.
QUEUE_POLL_SECS = 1
# Initial back-off in seconds when a server rate-limits without a Retry-After
# header; doubled on each subsequent 429 from the same host.
DEFAULT_DELAY = 60.0


class AnchorCheckParser(HTMLParser):
Expand Down Expand Up @@ -98,7 +105,8 @@ def init(self) -> None:
open(path.join(self.outdir, 'output.json'), 'w').close()

# create queues and worker threads
self.wqueue = queue.Queue() # type: queue.Queue
self.rate_limits = {} # type: Dict[str, RateLimit]
self.wqueue = queue.PriorityQueue() # type: queue.Queue
self.rqueue = queue.Queue() # type: queue.Queue
self.workers = [] # type: List[threading.Thread]
for i in range(self.app.config.linkcheck_workers):
Expand Down Expand Up @@ -172,23 +180,38 @@ def check_uri() -> Tuple[str, str, int]:
config=self.app.config, auth=auth_info,
**kwargs)
response.raise_for_status()
except HTTPError:
except HTTPError as err:
if err.response.status_code == 429:
raise
# retry with GET request if that fails, some servers
# don't like HEAD requests.
response = requests.get(req_url, stream=True, config=self.app.config,
response = requests.get(req_url, stream=True,
config=self.app.config,
auth=auth_info, **kwargs)
response.raise_for_status()
except HTTPError as err:
if err.response.status_code == 401:
# We'll take "Unauthorized" as working.
return 'working', ' - unauthorized', 0
elif err.response.status_code == 429:
next_check = self.limit_rate(err.response)
if next_check is not None:
self.wqueue.put((next_check, (uri, docname, lineno)), False)
return 'rate-limited', '', 0
return 'broken', str(err), 0
elif err.response.status_code == 503:
# We'll take "Service Unavailable" as ignored.
return 'ignored', str(err), 0
else:
return 'broken', str(err), 0
except Exception as err:
return 'broken', str(err), 0
else:
netloc = urlparse(req_url).netloc
try:
del self.rate_limits[netloc]
except KeyError:
pass
if response.url.rstrip('/') == req_url.rstrip('/'):
return 'working', '', 0
else:
Expand Down Expand Up @@ -247,11 +270,80 @@ def check(docname: str) -> Tuple[str, str, int]:
return (status, info, code)

while True:
uri, docname, lineno = self.wqueue.get()
next_check, (uri, docname, lineno) = self.wqueue.get()
if uri is None:
break
if next_check > time.time():
# Sleep before putting message back in the queue to avoid
# waking up other threads.
time.sleep(QUEUE_POLL_SECS)
self.wqueue.put((next_check, (uri, docname, lineno)), False)
self.wqueue.task_done()
continue
status, info, code = check(docname)
self.rqueue.put((uri, docname, lineno, status, info, code))
if status == 'rate-limited':
logger.info(darkgray('-rate limited- ') + uri + darkgray(' | sleeping...'))
else:
self.rqueue.put((uri, docname, lineno, status, info, code))
self.wqueue.task_done()

def limit_rate(self, response):
    """Handle a 429 (Too Many Requests) response for *response*'s host.

    Consults :confval:`linkcheck_retry_on_rate_limit` for a per-netloc
    retry policy (falling back to the ``"*"`` catch-all policy, then to the
    default exponential back-off that gives up once the wait exceeds five
    minutes).  When the server sends a ``Retry-After`` header (either a
    number of seconds or an HTTP-date), that timing takes precedence over
    any delay the policy returned.

    :param response: the HTTP response that carried the 429 status.
    :returns: the POSIX timestamp at which the link may be checked again,
        or ``None`` when the policy says to stop retrying (the link is
        then reported as broken by the caller).
    :raises ValueError: if a user policy returns neither a bool nor a
        number.
    """
    netloc = urlparse(response.url).netloc
    try:
        rate_limit = self.rate_limits[netloc]
    except KeyError:
        last_delay = 0
    else:
        last_delay = rate_limit.delay

    retry_config = self.app.config.linkcheck_retry_on_rate_limit
    try:
        retry = retry_config[netloc]
    except KeyError:
        try:
            retry = retry_config["*"]
        except KeyError:
            # Default policy: keep retrying until the wait exceeds 5 minutes.
            def retry(response, delay):
                return delay <= 300.0
    retry_or_delay = retry(response, last_delay)
    user_delay = None
    # NOTE: bool must be tested before (float, int) since bool is an int
    # subclass -- ``True``/``False`` mean retry/stop, not a delay of 1/0.
    if isinstance(retry_or_delay, bool):
        if not retry_or_delay:
            return None
    elif isinstance(retry_or_delay, (float, int)):
        user_delay = retry_or_delay
    else:
        # Bug fix: the message must be %-formatted; ValueError does not
        # perform logging-style lazy interpolation of extra arguments.
        raise ValueError(
            "linkcheck_retry_on_rate_limit function for netloc %s must "
            "return a float or a bool, got %r (type %s)."
            % (netloc, retry_or_delay, type(retry_or_delay)))

    next_check = None
    retry_after = response.headers.get("Retry-After")
    if retry_after:
        try:
            # A number: seconds to wait before the next attempt.
            delay = float(retry_after)
        except ValueError:
            try:
                # An HTTP-date: absolute time of the next attempt.
                until = parsedate_to_datetime(retry_after)
            except (TypeError, ValueError):
                # TypeError: Invalid date format.
                # ValueError: Invalid date, e.g. Oct 52th.
                pass
            else:
                next_check = datetime.timestamp(until)
                delay = (until - datetime.now(timezone.utc)).total_seconds()
        else:
            next_check = time.time() + delay
    if next_check is None:
        # No usable Retry-After header: use the user-supplied delay, or
        # exponential back-off starting from DEFAULT_DELAY.
        if user_delay is None:
            delay = 2.0 * last_delay if last_delay else DEFAULT_DELAY
        else:
            delay = user_delay
        next_check = time.time() + delay
    self.rate_limits[netloc] = RateLimit(delay, next_check)
    return next_check

def process_result(self, result: Tuple[str, str, int, str, str, int]) -> None:
uri, docname, lineno, status, info, code = result
Expand Down Expand Up @@ -319,21 +411,30 @@ def write_doc(self, docname: str, doctree: Node) -> None:
logger.info('')
n = 0

def get_next_check(uri):
    # Priority for the work queue: the earliest POSIX timestamp at which
    # *uri* may be checked.  If its host is currently rate-limited, reuse
    # the recorded next_check; otherwise 0, i.e. check immediately.
    netloc = urlparse(uri).netloc
    try:
        return self.rate_limits[netloc].next_check
    except KeyError:
        return 0  # Check immediately.

# reference nodes
for refnode in doctree.traverse(nodes.reference):
if 'refuri' not in refnode:
continue
uri = refnode['refuri']
lineno = get_node_line(refnode)
self.wqueue.put((uri, docname, lineno), False)
uri_info = (get_next_check(uri), (uri, docname, lineno))
self.wqueue.put(uri_info, False)
n += 1

# image nodes
for imgnode in doctree.traverse(nodes.image):
uri = imgnode['candidates'].get('?')
if uri and '://' in uri:
lineno = get_node_line(imgnode)
self.wqueue.put((uri, docname, lineno), False)
uri_info = (get_next_check(uri), (uri, docname, lineno))
self.wqueue.put(uri_info, False)
n += 1

done = 0
Expand All @@ -355,8 +456,10 @@ def write_linkstat(self, data: dict) -> None:
output.write('\n')

def finish(self) -> None:
    """Wait for every queued link check to complete, then stop the workers."""
    # Block until all queued work items have been marked done.
    self.wqueue.join()
    # Wake each worker thread with a highest-priority sentinel so it exits.
    sentinel = (0, (None, None, None))
    for _ in self.workers:
        self.wqueue.put(sentinel, False)


def setup(app: Sphinx) -> Dict[str, Any]:
Expand All @@ -372,6 +475,7 @@ def setup(app: Sphinx) -> Dict[str, Any]:
# Anchors starting with ! are ignored since they are
# commonly used for dynamic pages
app.add_config_value('linkcheck_anchors_ignore', ["^!"], None)
app.add_config_value('linkcheck_retry_on_rate_limit', {}, None)

return {
'version': 'builtin',
Expand Down

0 comments on commit 28c8667

Please sign in to comment.