Skip to content

Commit

Permalink
Fix sphinx-doc#6629: linkcheck: Handle rate-limiting
Browse files Browse the repository at this point in the history
Follow the Retry-After header if present, otherwise use an exponential
back-off.
  • Loading branch information
francoisfreitag committed Nov 20, 2020
1 parent 13a986b commit 28c8667
Show file tree
Hide file tree
Showing 5 changed files with 384 additions and 9 deletions.
5 changes: 4 additions & 1 deletion doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,10 @@
1),
]

intersphinx_mapping = {'python': ('https://docs.python.org/3/', None)}
intersphinx_mapping = {
'python': ('https://docs.python.org/3/', None),
'requests': ('https://requests.readthedocs.io/en/master', None),
}

# Sphinx document translation with sphinx gettext feature uses these settings:
locale_dirs = ['locale/']
Expand Down
5 changes: 5 additions & 0 deletions doc/usage/builders/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,11 @@ name is ``rinoh``. Refer to the `rinohtype manual`_ for details.

Since Sphinx 1.5, the linkcheck builder uses the requests module.

.. versionchanged:: 3.2

The linkcheck builder retries links when the server replies with rate
limits.

.. module:: sphinx.builders.xml
.. class:: XMLBuilder

Expand Down
58 changes: 58 additions & 0 deletions doc/usage/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2525,6 +2525,64 @@ Options for the linkcheck builder

.. versionadded:: 2.3

.. confval:: linkcheck_retry_on_rate_limit

The ``linkcheck`` builder may issue a large number of requests to the same
site over a short period of time. This setting controls the builder behavior
when servers indicate the requests are rate-limited.

When a server indicates when to retry (using the `Retry-After`_ header),
``linkcheck`` follows the server indication. Otherwise, it retries after a
configurable delay.

.. _Retry-After: https://tools.ietf.org/html/rfc2616#section-14.37

``linkcheck_retry_on_rate_limit`` is a mapping of domains to retry policies.

The key is a network location, like ``"sphinx-doc.org"`` or
``"localhost:7777"``. A catch-all policy can be specified with the ``"*"``
key. The default policy is to retry after a minute, and keep doubling the
wait time between attempts until the wait time exceeds 5 minutes, after
which the link is marked as broken.

The value is a function with signature:

.. function:: retry(response, delay) -> Union[bool, float]
:noindex:

:param requests.Response response: returned by the server.
:param int delay: last wait time for that domain, initially 0.
:return: The return value can either be a bool or a float:

- **bool** ``True`` to keep retrying, ``False`` to stop.
- **float** time in seconds to wait before the next attempt.

The retry policy function is called immediately after a server returns an
HTTP response with status code `429
<https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429>`_ (Too Many
Requests).

Example:

.. code-block:: python

   linkcheck_retry_on_rate_limit = {
       # Never retry.
       "wikipedia.org": lambda response, delay: False,
       # Retry until wait time exceeds 10 minutes.
       "sphinx-doc.org": lambda response, delay: delay <= 600.0,
       # Retry after 2 minutes.
       "example.org": lambda response, delay: 120.0,
       # Retry after 10 times the last wait time.
       "linux.org": lambda response, delay: 10.0 * delay,
       # Retry while header X-Go-Away is present in the response.
       "localhost:7777": lambda response, delay: "X-Go-Away" in response.headers,
       # Retry forever.
       "*": lambda response, delay: True,
   }

.. versionadded:: 3.2


Options for the XML builder
---------------------------
Expand Down
120 changes: 112 additions & 8 deletions sphinx/builders/linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
import re
import socket
import threading
import time
from collections import namedtuple
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from html.parser import HTMLParser
from os import path
from typing import Any, Dict, List, Set, Tuple
Expand All @@ -33,10 +37,13 @@

uri_re = re.compile('([a-z]+:)?//') # matches to foo:// and // (a protocol relative URL)

# Per-host rate-limit state: the last wait time in seconds, and the POSIX
# timestamp before which the host must not be contacted again.
RateLimit = namedtuple('RateLimit', ('delay', 'next_check'))

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
}
# Seconds a worker sleeps before re-queueing a link whose check is not yet due.
QUEUE_POLL_SECS = 1
# Initial back-off in seconds when a server rate-limits without a Retry-After
# header; doubled on each subsequent 429 from the same host.
DEFAULT_DELAY = 60.0


class AnchorCheckParser(HTMLParser):
Expand Down Expand Up @@ -98,7 +105,8 @@ def init(self) -> None:
open(path.join(self.outdir, 'output.json'), 'w').close()

# create queues and worker threads
self.wqueue = queue.Queue() # type: queue.Queue
self.rate_limits = {} # type: Dict[str, RateLimit]
self.wqueue = queue.PriorityQueue() # type: queue.Queue
self.rqueue = queue.Queue() # type: queue.Queue
self.workers = [] # type: List[threading.Thread]
for i in range(self.app.config.linkcheck_workers):
Expand Down Expand Up @@ -172,23 +180,38 @@ def check_uri() -> Tuple[str, str, int]:
config=self.app.config, auth=auth_info,
**kwargs)
response.raise_for_status()
except HTTPError:
except HTTPError as err:
if err.response.status_code == 429:
raise
# retry with GET request if that fails, some servers
# don't like HEAD requests.
response = requests.get(req_url, stream=True, config=self.app.config,
response = requests.get(req_url, stream=True,
config=self.app.config,
auth=auth_info, **kwargs)
response.raise_for_status()
except HTTPError as err:
if err.response.status_code == 401:
# We'll take "Unauthorized" as working.
return 'working', ' - unauthorized', 0
elif err.response.status_code == 429:
next_check = self.limit_rate(err.response)
if next_check is not None:
self.wqueue.put((next_check, (uri, docname, lineno)), False)
return 'rate-limited', '', 0
return 'broken', str(err), 0
elif err.response.status_code == 503:
# We'll take "Service Unavailable" as ignored.
return 'ignored', str(err), 0
else:
return 'broken', str(err), 0
except Exception as err:
return 'broken', str(err), 0
else:
netloc = urlparse(req_url).netloc
try:
del self.rate_limits[netloc]
except KeyError:
pass
if response.url.rstrip('/') == req_url.rstrip('/'):
return 'working', '', 0
else:
Expand Down Expand Up @@ -247,11 +270,80 @@ def check(docname: str) -> Tuple[str, str, int]:
return (status, info, code)

while True:
uri, docname, lineno = self.wqueue.get()
next_check, (uri, docname, lineno) = self.wqueue.get()
if uri is None:
break
if next_check > time.time():
# Sleep before putting message back in the queue to avoid
# waking up other threads.
time.sleep(QUEUE_POLL_SECS)
self.wqueue.put((next_check, (uri, docname, lineno)), False)
self.wqueue.task_done()
continue
status, info, code = check(docname)
self.rqueue.put((uri, docname, lineno, status, info, code))
if status == 'rate-limited':
logger.info(darkgray('-rate limited- ') + uri + darkgray(' | sleeping...'))
else:
self.rqueue.put((uri, docname, lineno, status, info, code))
self.wqueue.task_done()

def limit_rate(self, response):
    """Handle a 429 (Too Many Requests) response for *response*'s host.

    Consults :confval:`linkcheck_retry_on_rate_limit` for a per-netloc
    retry policy (falling back to the ``"*"`` catch-all policy, then to the
    default exponential back-off that gives up once the wait exceeds five
    minutes).  When the server sends a ``Retry-After`` header (either a
    number of seconds or an HTTP-date), that timing takes precedence over
    any delay the policy returned.

    :param response: the HTTP response that carried the 429 status.
    :returns: the POSIX timestamp at which the link may be checked again,
        or ``None`` when the policy says to stop retrying (the link is
        then reported as broken by the caller).
    :raises ValueError: if a user policy returns neither a bool nor a
        number.
    """
    netloc = urlparse(response.url).netloc
    try:
        rate_limit = self.rate_limits[netloc]
    except KeyError:
        last_delay = 0
    else:
        last_delay = rate_limit.delay

    retry_config = self.app.config.linkcheck_retry_on_rate_limit
    try:
        retry = retry_config[netloc]
    except KeyError:
        try:
            retry = retry_config["*"]
        except KeyError:
            # Default policy: keep retrying until the wait exceeds 5 minutes.
            def retry(response, delay):
                return delay <= 300.0
    retry_or_delay = retry(response, last_delay)
    user_delay = None
    # NOTE: bool must be tested before (float, int) since bool is an int
    # subclass -- ``True``/``False`` mean retry/stop, not a delay of 1/0.
    if isinstance(retry_or_delay, bool):
        if not retry_or_delay:
            return None
    elif isinstance(retry_or_delay, (float, int)):
        user_delay = retry_or_delay
    else:
        # Bug fix: the message must be %-formatted; ValueError does not
        # perform logging-style lazy interpolation of extra arguments.
        raise ValueError(
            "linkcheck_retry_on_rate_limit function for netloc %s must "
            "return a float or a bool, got %r (type %s)."
            % (netloc, retry_or_delay, type(retry_or_delay)))

    next_check = None
    retry_after = response.headers.get("Retry-After")
    if retry_after:
        try:
            # A number: seconds to wait before the next attempt.
            delay = float(retry_after)
        except ValueError:
            try:
                # An HTTP-date: absolute time of the next attempt.
                until = parsedate_to_datetime(retry_after)
            except (TypeError, ValueError):
                # TypeError: Invalid date format.
                # ValueError: Invalid date, e.g. Oct 52th.
                pass
            else:
                next_check = datetime.timestamp(until)
                delay = (until - datetime.now(timezone.utc)).total_seconds()
        else:
            next_check = time.time() + delay
    if next_check is None:
        # No usable Retry-After header: use the user-supplied delay, or
        # exponential back-off starting from DEFAULT_DELAY.
        if user_delay is None:
            delay = 2.0 * last_delay if last_delay else DEFAULT_DELAY
        else:
            delay = user_delay
        next_check = time.time() + delay
    self.rate_limits[netloc] = RateLimit(delay, next_check)
    return next_check

def process_result(self, result: Tuple[str, str, int, str, str, int]) -> None:
uri, docname, lineno, status, info, code = result
Expand Down Expand Up @@ -319,21 +411,30 @@ def write_doc(self, docname: str, doctree: Node) -> None:
logger.info('')
n = 0

def get_next_check(uri):
    # Priority for the work queue: the earliest POSIX timestamp at which
    # *uri* may be checked.  If its host is currently rate-limited, reuse
    # the recorded next_check; otherwise 0, i.e. check immediately.
    netloc = urlparse(uri).netloc
    try:
        return self.rate_limits[netloc].next_check
    except KeyError:
        return 0  # Check immediately.

# reference nodes
for refnode in doctree.traverse(nodes.reference):
if 'refuri' not in refnode:
continue
uri = refnode['refuri']
lineno = get_node_line(refnode)
self.wqueue.put((uri, docname, lineno), False)
uri_info = (get_next_check(uri), (uri, docname, lineno))
self.wqueue.put(uri_info, False)
n += 1

# image nodes
for imgnode in doctree.traverse(nodes.image):
uri = imgnode['candidates'].get('?')
if uri and '://' in uri:
lineno = get_node_line(imgnode)
self.wqueue.put((uri, docname, lineno), False)
uri_info = (get_next_check(uri), (uri, docname, lineno))
self.wqueue.put(uri_info, False)
n += 1

done = 0
Expand All @@ -355,8 +456,10 @@ def write_linkstat(self, data: dict) -> None:
output.write('\n')

def finish(self) -> None:
    """Wait for every queued link check to complete, then stop the workers."""
    # Block until all queued work items have been marked done.
    self.wqueue.join()
    # Wake each worker thread with a highest-priority sentinel so it exits.
    sentinel = (0, (None, None, None))
    for _ in self.workers:
        self.wqueue.put(sentinel, False)


def setup(app: Sphinx) -> Dict[str, Any]:
Expand All @@ -372,6 +475,7 @@ def setup(app: Sphinx) -> Dict[str, Any]:
# Anchors starting with ! are ignored since they are
# commonly used for dynamic pages
app.add_config_value('linkcheck_anchors_ignore', ["^!"], None)
app.add_config_value('linkcheck_retry_on_rate_limit', {}, None)

return {
'version': 'builtin',
Expand Down

0 comments on commit 28c8667

Please sign in to comment.