Skip to content

Commit

Permalink
Fix sphinx-doc#6629: linkcheck: Handle rate-limiting
Browse files Browse the repository at this point in the history
Follow the Retry-After header if present, otherwise use an exponential
back-off.

Close sphinx-doc#7388
  • Loading branch information
francoisfreitag committed Nov 22, 2020
1 parent a440270 commit 3238ac3
Show file tree
Hide file tree
Showing 6 changed files with 277 additions and 10 deletions.
2 changes: 2 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ Features added
* #6914: Add a new event :event:`warn-missing-reference` to custom warning
messages when failed to resolve a cross-reference
* #6914: Emit a detailed warning when failed to resolve a ``:ref:`` reference
* #6629: The linkcheck builder now handles rate limits. See
:confval:`linkcheck_rate_limit_timeout` for details.

Bugs fixed
----------
Expand Down
5 changes: 4 additions & 1 deletion doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,10 @@
1),
]

intersphinx_mapping = {'python': ('https://docs.python.org/3/', None)}
intersphinx_mapping = {
'python': ('https://docs.python.org/3/', None),
'requests': ('https://requests.readthedocs.io/en/master', None),
}

# Sphinx document translation with sphinx gettext feature uses these settings:
locale_dirs = ['locale/']
Expand Down
4 changes: 4 additions & 0 deletions doc/usage/builders/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,10 @@ name is ``rinoh``. Refer to the `rinohtype manual`_ for details.

Since Sphinx 1.5, the linkcheck builder uses the requests module.

.. versionchanged:: 3.4

The linkcheck builder retries links when servers apply rate limits.

.. module:: sphinx.builders.xml
.. class:: XMLBuilder

Expand Down
17 changes: 17 additions & 0 deletions doc/usage/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2525,6 +2525,23 @@ Options for the linkcheck builder

.. versionadded:: 2.3

.. confval:: linkcheck_rate_limit_timeout

The ``linkcheck`` builder may issue a large number of requests to the same
site over a short period of time. This setting controls the builder behavior
when servers indicate that requests are rate-limited.

If a server indicates when to retry (using the `Retry-After`_ header),
``linkcheck`` always follows the server indication.

Otherwise, ``linkcheck`` waits for a minute before retrying and keeps
doubling the wait time between attempts until it succeeds or exceeds the
``linkcheck_rate_limit_timeout``. By default, the timeout is 5 minutes.

.. _Retry-After: https://tools.ietf.org/html/rfc7231#section-7.1.3

.. versionadded:: 3.4


Options for the XML builder
---------------------------
Expand Down
105 changes: 96 additions & 9 deletions sphinx/builders/linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,17 @@
import re
import socket
import threading
import time
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from html.parser import HTMLParser
from os import path
from typing import Any, Dict, List, Set, Tuple
from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple
from urllib.parse import unquote, urlparse

from docutils import nodes
from docutils.nodes import Node
from requests import Response
from requests.exceptions import HTTPError

from sphinx.application import Sphinx
Expand All @@ -33,10 +37,14 @@

uri_re = re.compile('([a-z]+:)?//') # matches to foo:// and // (a protocol relative URL)

RateLimit = NamedTuple('RateLimit', (('delay', float), ('next_check', float)))

DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
}
CHECK_IMMEDIATELY = 0
QUEUE_POLL_SECS = 1
DEFAULT_DELAY = 60.0


class AnchorCheckParser(HTMLParser):
Expand Down Expand Up @@ -98,7 +106,8 @@ def init(self) -> None:
open(path.join(self.outdir, 'output.json'), 'w').close()

# create queues and worker threads
self.wqueue = queue.Queue() # type: queue.Queue
self.rate_limits = {} # type: Dict[str, RateLimit]
self.wqueue = queue.PriorityQueue() # type: queue.PriorityQueue
self.rqueue = queue.Queue() # type: queue.Queue
self.workers = [] # type: List[threading.Thread]
for i in range(self.app.config.linkcheck_workers):
Expand Down Expand Up @@ -172,23 +181,38 @@ def check_uri() -> Tuple[str, str, int]:
config=self.app.config, auth=auth_info,
**kwargs)
response.raise_for_status()
except HTTPError:
except HTTPError as err:
if err.response.status_code == 429:
raise
# retry with GET request if that fails, some servers
# don't like HEAD requests.
response = requests.get(req_url, stream=True, config=self.app.config,
response = requests.get(req_url, stream=True,
config=self.app.config,
auth=auth_info, **kwargs)
response.raise_for_status()
except HTTPError as err:
if err.response.status_code == 401:
# We'll take "Unauthorized" as working.
return 'working', ' - unauthorized', 0
elif err.response.status_code == 429:
next_check = self.limit_rate(err.response)
if next_check is not None:
self.wqueue.put((next_check, uri, docname, lineno), False)
return 'rate-limited', '', 0
return 'broken', str(err), 0
elif err.response.status_code == 503:
# We'll take "Service Unavailable" as ignored.
return 'ignored', str(err), 0
else:
return 'broken', str(err), 0
except Exception as err:
return 'broken', str(err), 0
else:
netloc = urlparse(req_url).netloc
try:
del self.rate_limits[netloc]
except KeyError:
pass
if response.url.rstrip('/') == req_url.rstrip('/'):
return 'working', '', 0
else:
Expand Down Expand Up @@ -247,11 +271,69 @@ def check(docname: str) -> Tuple[str, str, int]:
return (status, info, code)

while True:
uri, docname, lineno = self.wqueue.get()
next_check, uri, docname, lineno = self.wqueue.get()
if uri is None:
break
netloc = urlparse(uri).netloc
try:
# Refresh rate limit.
# When there are many links in the queue, workers are all stuck waiting
# for responses, but the builder keeps queuing. Links in the queue may
# have been queued before rate limits were discovered.
next_check = self.rate_limits[netloc].next_check
except KeyError:
pass
if next_check > time.time():
# Sleep before putting message back in the queue to avoid
# waking up other threads.
time.sleep(QUEUE_POLL_SECS)
self.wqueue.put((next_check, uri, docname, lineno), False)
self.wqueue.task_done()
continue
status, info, code = check(docname)
self.rqueue.put((uri, docname, lineno, status, info, code))
if status == 'rate-limited':
logger.info(darkgray('-rate limited- ') + uri + darkgray(' | sleeping...'))
else:
self.rqueue.put((uri, docname, lineno, status, info, code))
self.wqueue.task_done()

def limit_rate(self, response: Response) -> Optional[float]:
    """Register or refresh the rate limit for the host of *response*.

    Called after receiving a 429 (Too Many Requests) response.  The next
    allowed check time is taken from the server's ``Retry-After`` header
    when present (either a delay in seconds or an HTTP-date); otherwise an
    exponential back-off is used, starting at ``DEFAULT_DELAY`` and
    doubling on each subsequent 429 from the same host.

    :param response: the 429 response that triggered rate limiting.
    :returns: the POSIX timestamp of the next allowed check, or ``None``
        when the back-off delay would exceed
        ``linkcheck_rate_limit_timeout`` (the caller then gives up and
        reports the link as broken instead of retrying).
    """
    next_check = None
    retry_after = response.headers.get("Retry-After")
    if retry_after:
        try:
            # Retry-After as a number: seconds to wait before the next attempt.
            delay = float(retry_after)
        except ValueError:
            try:
                # Retry-After as an HTTP-date: the time of the next attempt.
                until = parsedate_to_datetime(retry_after)
            except (TypeError, ValueError):
                # TypeError: invalid date format.
                # ValueError: invalid date, e.g. Oct 52th.
                pass
            else:
                if until.tzinfo is None:
                    # HTTP-dates are always expressed in GMT (RFC 7231),
                    # but parsedate_to_datetime() returns a *naive*
                    # datetime when the zone is missing or "-0000".
                    # Without this coercion, the aware-minus-naive
                    # subtraction below raises TypeError (crashing the
                    # worker thread) and timestamp() would interpret the
                    # value in local time.
                    until = until.replace(tzinfo=timezone.utc)
                next_check = until.timestamp()
                delay = (until - datetime.now(timezone.utc)).total_seconds()
        else:
            next_check = time.time() + delay
    netloc = urlparse(response.url).netloc
    if next_check is None:
        # No usable Retry-After header: exponential back-off, capped at
        # linkcheck_rate_limit_timeout.
        max_delay = self.app.config.linkcheck_rate_limit_timeout
        try:
            rate_limit = self.rate_limits[netloc]
        except KeyError:
            delay = DEFAULT_DELAY
        else:
            last_wait_time = rate_limit.delay
            delay = 2.0 * last_wait_time
            if delay > max_delay and last_wait_time < max_delay:
                # Allow one final attempt at exactly the timeout cap.
                delay = max_delay
        if delay > max_delay:
            # Back-off exhausted: signal the caller to stop retrying.
            return None
        next_check = time.time() + delay
    self.rate_limits[netloc] = RateLimit(delay, next_check)
    return next_check

def process_result(self, result: Tuple[str, str, int, str, str, int]) -> None:
uri, docname, lineno, status, info, code = result
Expand Down Expand Up @@ -325,15 +407,17 @@ def write_doc(self, docname: str, doctree: Node) -> None:
continue
uri = refnode['refuri']
lineno = get_node_line(refnode)
self.wqueue.put((uri, docname, lineno), False)
uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
self.wqueue.put(uri_info, False)
n += 1

# image nodes
for imgnode in doctree.traverse(nodes.image):
uri = imgnode['candidates'].get('?')
if uri and '://' in uri:
lineno = get_node_line(imgnode)
self.wqueue.put((uri, docname, lineno), False)
uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
self.wqueue.put(uri_info, False)
n += 1

done = 0
Expand All @@ -355,8 +439,10 @@ def write_linkstat(self, data: dict) -> None:
output.write('\n')

def finish(self) -> None:
self.wqueue.join()
# Shutdown threads.
for worker in self.workers:
self.wqueue.put((None, None, None), False)
self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)


def setup(app: Sphinx) -> Dict[str, Any]:
Expand All @@ -372,6 +458,7 @@ def setup(app: Sphinx) -> Dict[str, Any]:
# Anchors starting with ! are ignored since they are
# commonly used for dynamic pages
app.add_config_value('linkcheck_anchors_ignore', ["^!"], None)
app.add_config_value('linkcheck_rate_limit_timeout', 300.0, None)

return {
'version': 'builtin',
Expand Down

0 comments on commit 3238ac3

Please sign in to comment.