diff --git a/docs/_ext/scrapydocs.py b/docs/_ext/scrapydocs.py index d02a2e17bb6..f0f382da326 100644 --- a/docs/_ext/scrapydocs.py +++ b/docs/_ext/scrapydocs.py @@ -15,7 +15,7 @@ def run(self): def is_setting_index(node): - if node.tagname == 'index': + if node.tagname == 'index' and node['entries']: # index entries for setting directives look like: # [('pair', 'SETTING_NAME; setting', 'std:setting-SETTING_NAME', '')] entry_type, info, refid = node['entries'][0][:3] diff --git a/docs/news.rst b/docs/news.rst index 9469d0fe5e6..d8b9fcd1ea2 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -3,6 +3,195 @@ Release notes ============= +.. _release-2.7.0: + +Scrapy 2.7.0 (to be released) +----------------------------- + +Highlights: + +- Added Python 3.11 support, dropped Python 3.6 support +- Improved support for :ref:`asynchronous callbacks <topics-coroutines>` +- :ref:`Asyncio support <using-asyncio>` is enabled by default on new + projects +- Output names of item fields can now be arbitrary strings +- Centralized :ref:`request fingerprinting <request-fingerprints>` + configuration is now possible + +Modified requirements +~~~~~~~~~~~~~~~~~~~~~ + +Python 3.7 or greater is now required; support for Python 3.6 has been dropped. +Support for the upcoming Python 3.11 has been added. + +The minimum required version of some dependencies has changed as well: + +- lxml_: 3.5.0 → 4.3.0 + +- Pillow_ (:ref:`images pipeline <images-pipeline>`): 4.0.0 → 7.1.0 + +- zope.interface_: 5.0.0 → 5.1.0 + +(:issue:`5512`, :issue:`5514`, :issue:`5524`, :issue:`5563`, :issue:`5664`, +:issue:`5670`, :issue:`5678`) + + +Deprecations +~~~~~~~~~~~~ + +- :meth:`ImagesPipeline.thumb_path + <scrapy.pipelines.images.ImagesPipeline.thumb_path>` must now accept an + ``item`` parameter (:issue:`5504`, :issue:`5508`). + +- The ``scrapy.downloadermiddlewares.decompression`` module is now + deprecated (:issue:`5546`, :issue:`5547`). + + +New features +~~~~~~~~~~~~ + +- The + :meth:`~scrapy.spidermiddlewares.SpiderMiddleware.process_spider_output` + method of :ref:`spider middlewares <topics-spider-middleware>` can now be + defined as an :term:`asynchronous generator` (:issue:`4978`). + +- The output of :class:`~scrapy.Request` callbacks defined as + :ref:`coroutines <topics-coroutines>` is now processed asynchronously + (:issue:`4978`). + +- :class:`~scrapy.spiders.crawl.CrawlSpider` now supports :ref:`asynchronous + callbacks <topics-coroutines>` (:issue:`5657`). + +- New projects created with the :command:`startproject` command have + :ref:`asyncio support <using-asyncio>` enabled by default (:issue:`5590`, + :issue:`5679`). + +- The :setting:`FEED_EXPORT_FIELDS` setting can now be defined as a + dictionary to customize the output name of item fields, lifting the + restriction that required output names to be valid Python identifiers, e.g. + preventing them from having whitespace (:issue:`1008`, :issue:`3266`, + :issue:`3696`). + +- You can now customize :ref:`request fingerprinting <request-fingerprints>` + through the new :setting:`REQUEST_FINGERPRINTER_CLASS` setting, instead of + having to change it on every Scrapy component that relies on request + fingerprinting (:issue:`900`, :issue:`3420`, :issue:`4113`, :issue:`4762`, + :issue:`4524`). + +- ``jsonl`` is now supported and encouraged as a file extension for `JSON + Lines`_ files (:issue:`4848`). + + .. _JSON Lines: https://jsonlines.org/ + +- :meth:`ImagesPipeline.thumb_path + <scrapy.pipelines.images.ImagesPipeline.thumb_path>` now receives the + source :ref:`item <topics-items>` (:issue:`5504`, :issue:`5508`).
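As an illustration of the first two "New features" entries above, here is a minimal sketch of a spider middleware whose ``process_spider_output`` is defined as an :term:`asynchronous generator`; the middleware name and the pass-through logic are hypothetical, not code from this release::

    class PassThroughSpiderMiddleware:
        async def process_spider_output(self, response, result, spider):
            # When this method is an asynchronous generator, ``result`` is
            # an asynchronous iterable and is consumed with ``async for``.
            async for item_or_request in result:
                yield item_or_request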
+ + Bug fixes ~~~~~~~~~ + +- When using Google Cloud Storage with a :ref:`media pipeline + <topics-media-pipeline>`, :setting:`FILES_EXPIRES` now also works when + :setting:`FILES_STORE` does not point at the root of your Google Cloud + Storage bucket (:issue:`5317`, :issue:`5318`). + +- The :command:`parse` command now supports :ref:`asynchronous callbacks + <topics-coroutines>` (:issue:`5424`, :issue:`5577`). + +- When using the :command:`parse` command with a URL for which there is no + available spider, an exception is no longer raised (:issue:`3264`, + :issue:`3265`, :issue:`5375`, :issue:`5376`, :issue:`5497`). + +- :class:`~scrapy.http.TextResponse` now gives higher priority to the `byte + order mark`_ when determining the text encoding of the response body, + following the `HTML living standard`_ (:issue:`5601`, :issue:`5611`). + + .. _byte order mark: https://en.wikipedia.org/wiki/Byte_order_mark + .. _HTML living standard: https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding + +- MIME sniffing takes the response body into account in FTP and HTTP/1.0 + requests, as well as in cached requests (:issue:`4873`). + +- MIME sniffing now detects valid HTML 5 documents even if the ``html`` tag + is missing (:issue:`4873`). + +- An exception is now raised if :setting:`ASYNCIO_EVENT_LOOP` has a value + that does not match the asyncio event loop actually installed + (:issue:`5529`). + +- Fixed :meth:`Headers.getlist <scrapy.http.headers.Headers.getlist>` + returning only the last header (:issue:`5515`, :issue:`5526`). + +- Fixed :class:`LinkExtractor + <scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor>` not ignoring the + ``tar.gz`` file extension by default (:issue:`1837`, :issue:`2067`, + :issue:`4066`). + + +Documentation +~~~~~~~~~~~~~ + +- Clarified the return type of :meth:`Spider.parse <scrapy.Spider.parse>` + (:issue:`5602`, :issue:`5608`). + +- To enable + :class:`~scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware` + to do `brotli compression`_, installing brotli_ is now recommended instead + of installing brotlipy_, as the former provides a more recent version of + brotli. + + .. _brotli: https://github.com/google/brotli + .. _brotli compression: https://www.ietf.org/rfc/rfc7932.txt + +- :ref:`Signal documentation <topics-signals>` now mentions :ref:`coroutine + support <coroutine-support>` and uses it in code examples (:issue:`4852`, + :issue:`5358`). + +- :ref:`bans` now recommends `Common Crawl`_ instead of `Google cache`_ + (:issue:`3582`, :issue:`5432`). + + .. _Common Crawl: https://commoncrawl.org/ + .. _Google cache: http://www.googleguide.com/cached_pages.html + +- The new :ref:`topics-components` topic covers enforcing requirements on + Scrapy components, like :ref:`downloader middlewares + <topics-downloader-middleware>`, :ref:`extensions <topics-extensions>`, + :ref:`item pipelines <topics-item-pipeline>`, :ref:`spider middlewares + <topics-spider-middleware>`, and more; :ref:`enforce-asyncio-requirement` + has also been added (:issue:`4978`). + +- :ref:`topics-settings` now indicates that setting values must be + :ref:`picklable <pickle-picklable>` (:issue:`5607`, :issue:`5629`). + +- Removed outdated documentation (:issue:`5446`, :issue:`5373`, + :issue:`5369`, :issue:`5370`, :issue:`5554`). + +- Fixed typos (:issue:`5442`, :issue:`5455`, :issue:`5457`, :issue:`5461`, + :issue:`5538`, :issue:`5553`, :issue:`5558`, :issue:`5624`, :issue:`5631`). + +- Fixed other issues (:issue:`5283`, :issue:`5284`, :issue:`5559`, + :issue:`5567`, :issue:`5648`, :issue:`5659`, :issue:`5665`). + + +Quality assurance +~~~~~~~~~~~~~~~~~ + +- Added a continuous integration job to run `twine check`_ (:issue:`5655`, + :issue:`5656`). + + ..
_twine check: https://twine.readthedocs.io/en/stable/#twine-check + +- Addressed test issues and warnings (:issue:`5560`, :issue:`5561`, + :issue:`5612`, :issue:`5617`, :issue:`5639`, :issue:`5645`, :issue:`5662`, + :issue:`5671`, :issue:`5675`). + +- Cleaned up code (:issue:`4991`, :issue:`4995`, :issue:`5451`, + :issue:`5487`, :issue:`5542`, :issue:`5667`, :issue:`5668`, :issue:`5672`). + +- Applied minor code improvements (:issue:`5661`). + + .. _release-2.6.3: Scrapy 2.6.3 (2022-09-27) @@ -3139,7 +3328,7 @@ New Features ~~~~~~~~~~~~ - Accept proxy credentials in :reqmeta:`proxy` request meta key (:issue:`2526`) -- Support `brotli`_-compressed content; requires optional `brotlipy`_ +- Support `brotli-compressed`_ content; requires optional `brotlipy`_ (:issue:`2535`) - New :ref:`response.follow <response-follow-example>` shortcut for creating requests (:issue:`1940`) @@ -3176,7 +3365,7 @@ New Features - ``python -m scrapy`` as a more explicit alternative to ``scrapy`` command (:issue:`2740`) -.. _brotli: https://github.com/google/brotli +.. _brotli-compressed: https://www.ietf.org/rfc/rfc7932.txt .. _brotlipy: https://github.com/python-hyper/brotlipy/ Bug fixes diff --git a/docs/topics/components.rst b/docs/topics/components.rst index c44f3def207..ca301b82742 100644 --- a/docs/topics/components.rst +++ b/docs/topics/components.rst @@ -75,9 +75,9 @@ If your requirement is a minimum Scrapy version, you may use class MyComponent: def __init__(self): - if parse_version(scrapy.__version__) < parse_version('VERSION'): + if parse_version(scrapy.__version__) < parse_version('2.7'): raise RuntimeError( - f"{MyComponent.__qualname__} requires Scrapy VERSION or " + f"{MyComponent.__qualname__} requires Scrapy 2.7 or " f"later, which allow defining the process_spider_output " f"method of spider middlewares as an asynchronous " f"generator." diff --git a/docs/topics/coroutines.rst b/docs/topics/coroutines.rst index 7502633857a..a1ba4ba5cd3 100644 --- a/docs/topics/coroutines.rst +++ b/docs/topics/coroutines.rst @@ -22,7 +22,7 @@ hence use coroutine syntax (e.g. ``await``, ``async for``, ``async with``): If you are using any custom or third-party :ref:`spider middleware <topics-spider-middleware>`, see :ref:`sync-async-spider-middleware`. - .. versionchanged:: VERSION + .. versionchanged:: 2.7 Output of async callbacks is now processed asynchronously instead of collecting all of it first. @@ -49,7 +49,7 @@ hence use coroutine syntax (e.g. ``await``, ``async for``, ``async with``): See also :ref:`sync-async-spider-middleware` and :ref:`universal-spider-middleware`. - .. versionadded:: VERSION + .. versionadded:: 2.7 General usage ============= @@ -129,7 +129,7 @@ Common use cases for asynchronous code include: Mixing synchronous and asynchronous spider middlewares ====================================================== -.. versionadded:: VERSION +.. versionadded:: 2.7 The output of a :class:`~scrapy.Request` callback is passed as the ``result`` parameter to the @@ -182,10 +182,10 @@ process_spider_output_async method `. Universal spider middlewares ============================ -.. versionadded:: VERSION +..
versionadded:: 2.7 To allow writing a spider middleware that supports asynchronous execution of -its ``process_spider_output`` method in Scrapy VERSION and later (avoiding +its ``process_spider_output`` method in Scrapy 2.7 and later (avoiding :ref:`asynchronous-to-synchronous conversions <sync-async-spider-middleware>`) while maintaining support for older Scrapy versions, you may define ``process_spider_output`` as a synchronous method and define an @@ -206,7 +206,7 @@ For example:: yield r .. note:: This is an interim measure to allow, for a time, writing code that - works in Scrapy VERSION and later without requiring + works in Scrapy 2.7 and later without requiring asynchronous-to-synchronous conversions, and works in earlier Scrapy versions as well. diff --git a/docs/topics/request-response.rst b/docs/topics/request-response.rst index 49cb69f6775..4393e1c6889 100644 --- a/docs/topics/request-response.rst +++ b/docs/topics/request-response.rst @@ -394,7 +394,7 @@ To change how request fingerprints are built for your requests, use the REQUEST_FINGERPRINTER_CLASS ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: VERSION +.. versionadded:: 2.7 Default: :class:`scrapy.utils.request.RequestFingerprinter` @@ -409,38 +409,38 @@ import path. REQUEST_FINGERPRINTER_IMPLEMENTATION ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: VERSION +.. versionadded:: 2.7 -Default: ``'PREVIOUS_VERSION'`` +Default: ``'2.6'`` Determines which request fingerprinting algorithm is used by the default request fingerprinter class (see :setting:`REQUEST_FINGERPRINTER_CLASS`). Possible values are: -- ``'PREVIOUS_VERSION'`` (default) +- ``'2.6'`` (default) This implementation uses the same request fingerprinting algorithm as - Scrapy PREVIOUS_VERSION and earlier versions. + Scrapy 2.6 and earlier versions. Even though this is the default value for backward compatibility reasons, it is a deprecated value. -- ``'VERSION'`` +- ``'2.7'`` - This implementation was introduced in Scrapy VERSION to fix an issue of the + This implementation was introduced in Scrapy 2.7 to fix an issue of the previous implementation. New projects should use this value. The :command:`startproject` command sets this value in the generated ``settings.py`` file. -If you are using the default value (``'PREVIOUS_VERSION'``) for this setting, and you are +If you are using the default value (``'2.6'``) for this setting, and you are using Scrapy components where changing the request fingerprinting algorithm would cause undesired results, you need to carefully decide when to change the value of this setting, or switch the :setting:`REQUEST_FINGERPRINTER_CLASS` -setting to a custom request fingerprinter class that implements the PREVIOUS_VERSION request +setting to a custom request fingerprinter class that implements the 2.6 request fingerprinting algorithm and does not log this warning ( -:ref:`PREVIOUS_VERSION-request-fingerprinter` includes an example implementation of such a +:ref:`2.6-request-fingerprinter` includes an example implementation of such a class). Scenarios where changing the request fingerprinting algorithm may cause @@ -449,14 +449,14 @@ undesired results include, for example, using the HTTP cache middleware (see Changing the request fingerprinting algorithm would invalidate the current cache, requiring you to redownload all requests.
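A concrete sketch of opting in to the new implementation, mirroring the ``settings.py.tmpl`` change further down in this diff (the comments are explanatory additions, not template content)::

    # settings.py
    # Use the fixed fingerprinting algorithm introduced in Scrapy 2.7 and
    # avoid the deprecation warning triggered by the default '2.6' value.
    REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'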
-Otherwise, set :setting:`REQUEST_FINGERPRINTER_IMPLEMENTATION` to ``'VERSION'`` in +Otherwise, set :setting:`REQUEST_FINGERPRINTER_IMPLEMENTATION` to ``'2.7'`` in your settings to switch now to the request fingerprinting implementation that will be the only request fingerprinting implementation available in a future version of Scrapy, and remove the deprecation warning triggered by using -the default value (``'PREVIOUS_VERSION'``). +the default value (``'2.6'``). -.. _PREVIOUS_VERSION-request-fingerprinter: +.. _2.6-request-fingerprinter: .. _custom-request-fingerprinter: Writing your own request fingerprinter ====================================== @@ -464,6 +464,8 @@ Writing your own request fingerprinter A request fingerprinter is a class that must implement the following method: +.. currentmodule:: None + .. method:: fingerprint(self, request) Return a :class:`bytes` object that uniquely identifies *request*. @@ -476,6 +478,7 @@ A request fingerprinter is a class that must implement the following method: Additionally, it may also implement the following methods: .. classmethod:: from_crawler(cls, crawler) + :noindex: If present, this class method is called to create a request fingerprinter instance from a :class:`~scrapy.crawler.Crawler` object. It must return a @@ -495,11 +498,13 @@ Additionally, it may also implement the following methods: :class:`~scrapy.settings.Settings` object. It must return a new instance of the request fingerprinter. -The ``fingerprint`` method of the default request fingerprinter, +.. currentmodule:: scrapy.http + +The :meth:`fingerprint` method of the default request fingerprinter, :class:`scrapy.utils.request.RequestFingerprinter`, uses :func:`scrapy.utils.request.fingerprint` with its default parameters. For some -common use cases you can use :func:`~scrapy.utils.request.fingerprint` as well -in your ``fingerprint`` method implementation: +common use cases you can use :func:`scrapy.utils.request.fingerprint` as well +in your :meth:`fingerprint` method implementation: .. autofunction:: scrapy.utils.request.fingerprint @@ -519,7 +524,7 @@ account:: You can also write your own fingerprinting logic from scratch. -However, if you do not use :func:`~scrapy.utils.request.fingerprint`, make sure +However, if you do not use :func:`scrapy.utils.request.fingerprint`, make sure you use :class:`~weakref.WeakKeyDictionary` to cache request fingerprints: - Caching saves CPU by ensuring that fingerprints are calculated only once @@ -553,7 +558,7 @@ If you need to be able to override the request fingerprinting for arbitrary requests from your spider callbacks, you may implement a request fingerprinter that reads fingerprints from :attr:`request.meta <scrapy.http.Request.meta>` when available, and then falls back to -:func:`~scrapy.utils.request.fingerprint`. For example:: +:func:`scrapy.utils.request.fingerprint`.
For example:: from scrapy.utils.request import fingerprint @@ -564,8 +569,8 @@ when available, and then falls back to return request.meta['fingerprint'] return fingerprint(request) -If you need to reproduce the same fingerprinting algorithm as Scrapy PREVIOUS_VERSION -without using the deprecated ``'PREVIOUS_VERSION'`` value of the +If you need to reproduce the same fingerprinting algorithm as Scrapy 2.6 +without using the deprecated ``'2.6'`` value of the :setting:`REQUEST_FINGERPRINTER_IMPLEMENTATION` setting, use the following request fingerprinter:: diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst index 0b1ef71cfa3..40bcda288b1 100644 --- a/docs/topics/settings.rst +++ b/docs/topics/settings.rst @@ -1642,7 +1642,7 @@ install the default reactor defined by Twisted for the current platform. This is to maintain backward compatibility and avoid possible problems caused by using a non-default reactor. -.. versionchanged:: VERSION +.. versionchanged:: 2.7 The :command:`startproject` command now sets this setting to ``twisted.internet.asyncioreactor.AsyncioSelectorReactor`` in the generated ``settings.py`` file. @@ -1661,14 +1661,14 @@ Scope: ``spidermiddlewares.urllength`` The maximum URL length to allow for crawled URLs. -This setting can act as a stopping condition in case of URLs of ever-increasing -length, which may be caused for example by a programming error either in the -target server or in your code. See also :setting:`REDIRECT_MAX_TIMES` and +This setting can act as a stopping condition in case of URLs of ever-increasing +length, which may be caused for example by a programming error either in the +target server or in your code. See also :setting:`REDIRECT_MAX_TIMES` and :setting:`DEPTH_LIMIT`. Use ``0`` to allow URLs of any length. -The default value is copied from the `Microsoft Internet Explorer maximum URL +The default value is copied from the `Microsoft Internet Explorer maximum URL length`_, even though this setting exists for different reasons. .. _Microsoft Internet Explorer maximum URL length: https://support.microsoft.com/en-us/topic/maximum-url-length-is-2-083-characters-in-internet-explorer-174e7c8a-6666-f4e0-6fd6-908b53c12246 diff --git a/docs/topics/spider-middleware.rst b/docs/topics/spider-middleware.rst index 816cb5e03bf..303401a3c63 100644 --- a/docs/topics/spider-middleware.rst +++ b/docs/topics/spider-middleware.rst @@ -105,17 +105,17 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`. :class:`~scrapy.Request` objects and :ref:`item objects <topics-items>`. - .. versionchanged:: VERSION + .. versionchanged:: 2.7 This method may be defined as an :term:`asynchronous generator`, in which case ``result`` is an :term:`asynchronous iterable`. Consider defining this method as an :term:`asynchronous generator`, which will be a requirement in a future version of Scrapy. However, if you plan on sharing your spider middleware with other people, consider - either :ref:`enforcing Scrapy VERSION ` + either :ref:`enforcing Scrapy 2.7 ` as a minimum requirement of your spider middleware, or :ref:`making your spider middleware universal <universal-spider-middleware>` so that - it works with Scrapy versions earlier than Scrapy VERSION. + it works with Scrapy versions earlier than Scrapy 2.7. :param response: the response which generated this output from the spider @@ -130,7 +130,7 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`. .. method:: process_spider_output_async(response, result, spider) - .. versionadded:: VERSION + ..
versionadded:: 2.7 If defined, this method must be an :term:`asynchronous generator`, which will be called instead of :meth:`process_spider_output` if diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index ff86af125e6..29ff028bef3 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -248,7 +248,7 @@ REFERRER_POLICY = 'scrapy.spidermiddlewares.referer.DefaultReferrerPolicy' REQUEST_FINGERPRINTER_CLASS = 'scrapy.utils.request.RequestFingerprinter' -REQUEST_FINGERPRINTER_IMPLEMENTATION = 'PREVIOUS_VERSION' +REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.6' RETRY_ENABLED = True RETRY_TIMES = 2 # initial response + 2 retries = 3 requests diff --git a/scrapy/templates/project/module/settings.py.tmpl b/scrapy/templates/project/module/settings.py.tmpl index c0c34e986cb..bbf60982c23 100644 --- a/scrapy/templates/project/module/settings.py.tmpl +++ b/scrapy/templates/project/module/settings.py.tmpl @@ -88,5 +88,5 @@ ROBOTSTXT_OBEY = True #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' # Set settings whose default value is deprecated to a future-proof value -REQUEST_FINGERPRINTER_IMPLEMENTATION = 'VERSION' +REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7' TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' diff --git a/scrapy/utils/request.py b/scrapy/utils/request.py index cf33317ce0f..fbddc41fbe8 100644 --- a/scrapy/utils/request.py +++ b/scrapy/utils/request.py @@ -236,10 +236,10 @@ def __init__(self, crawler=None): 'REQUEST_FINGERPRINTER_IMPLEMENTATION' ) else: - implementation = 'PREVIOUS_VERSION' - if implementation == 'PREVIOUS_VERSION': + implementation = '2.6' + if implementation == '2.6': message = ( - '\'PREVIOUS_VERSION\' is a deprecated value for the ' + '\'2.6\' is a deprecated value for the ' '\'REQUEST_FINGERPRINTER_IMPLEMENTATION\' setting.\n' '\n' 'It is also the default value. In other words, it is normal ' @@ -254,14 +254,14 @@ def __init__(self, crawler=None): ) warnings.warn(message, category=ScrapyDeprecationWarning, stacklevel=2) self._fingerprint = _request_fingerprint_as_bytes - elif implementation == 'VERSION': + elif implementation == '2.7': self._fingerprint = fingerprint else: raise ValueError( f'Got an invalid value on setting ' f'\'REQUEST_FINGERPRINTER_IMPLEMENTATION\': ' - f'{implementation!r}. Valid values are \'PREVIOUS_VERSION\' (deprecated) ' - f'and \'VERSION\'.' + f'{implementation!r}. Valid values are \'2.6\' (deprecated) ' + f'and \'2.7\'.' ) def fingerprint(self, request): diff --git a/scrapy/utils/test.py b/scrapy/utils/test.py index 0b828f7c099..445cd2e3aa8 100644 --- a/scrapy/utils/test.py +++ b/scrapy/utils/test.py @@ -65,7 +65,7 @@ def get_crawler(spidercls=None, settings_dict=None, prevent_warnings=True): # Set by default settings that prevent deprecation warnings. 
settings = {} if prevent_warnings: - settings['REQUEST_FINGERPRINTER_IMPLEMENTATION'] = 'VERSION' + settings['REQUEST_FINGERPRINTER_IMPLEMENTATION'] = '2.7' settings.update(settings_dict or {}) runner = CrawlerRunner(settings) return runner.create_crawler(spidercls or Spider) diff --git a/tests/test_crawl.py b/tests/test_crawl.py index 8be4b6fe15c..5383ec65298 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -350,7 +350,7 @@ def test_crawlerrunner_accepts_crawler(self): @defer.inlineCallbacks def test_crawl_multiple(self): - runner = CrawlerRunner({'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION'}) + runner = CrawlerRunner({'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7'}) runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"), mockserver=self.mockserver) diff --git a/tests/test_crawler.py b/tests/test_crawler.py index c61d461f71a..da6024c2b74 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -110,7 +110,7 @@ class MySpider(scrapy.Spider): 'LOG_LEVEL': 'INFO', 'LOG_FILE': log_file, # settings to avoid extra warnings - 'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION', + 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7', 'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE, } @@ -235,7 +235,7 @@ def start_requests(self): class CrawlerRunnerHasSpider(unittest.TestCase): def _runner(self): - return CrawlerRunner({'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION'}) + return CrawlerRunner({'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7'}) @defer.inlineCallbacks def test_crawler_runner_bootstrap_successful(self): @@ -283,14 +283,14 @@ def test_crawler_runner_asyncio_enabled_true(self): if self.reactor_pytest == 'asyncio': CrawlerRunner(settings={ "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", - "REQUEST_FINGERPRINTER_IMPLEMENTATION": "VERSION", + "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7", }) else: msg = r"The installed reactor \(.*?\) does not match the requested one \(.*?\)" with self.assertRaisesRegex(Exception, msg): runner = CrawlerRunner(settings={ "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", - "REQUEST_FINGERPRINTER_IMPLEMENTATION": "VERSION", + "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7", }) yield runner.crawl(NoRequestsSpider) diff --git a/tests/test_dupefilters.py b/tests/test_dupefilters.py index 8a37a8ebec4..6ebb716b012 100644 --- a/tests/test_dupefilters.py +++ b/tests/test_dupefilters.py @@ -51,7 +51,7 @@ class RFPDupeFilterTest(unittest.TestCase): def test_df_from_crawler_scheduler(self): settings = {'DUPEFILTER_DEBUG': True, 'DUPEFILTER_CLASS': FromCrawlerRFPDupeFilter, - 'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION'} + 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7'} crawler = get_crawler(settings_dict=settings) scheduler = Scheduler.from_crawler(crawler) self.assertTrue(scheduler.df.debug) @@ -60,7 +60,7 @@ def test_df_from_crawler_scheduler(self): def test_df_from_settings_scheduler(self): settings = {'DUPEFILTER_DEBUG': True, 'DUPEFILTER_CLASS': FromSettingsRFPDupeFilter, - 'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION'} + 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7'} crawler = get_crawler(settings_dict=settings) scheduler = Scheduler.from_crawler(crawler) self.assertTrue(scheduler.df.debug) @@ -68,7 +68,7 @@ def test_df_from_settings_scheduler(self): def test_df_direct_scheduler(self): settings = {'DUPEFILTER_CLASS': DirectDupeFilter, - 'REQUEST_FINGERPRINTER_IMPLEMENTATION': 
'VERSION'} + 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7'} crawler = get_crawler(settings_dict=settings) scheduler = Scheduler.from_crawler(crawler) self.assertEqual(scheduler.df.method, 'n/a') @@ -172,7 +172,7 @@ def test_log(self): with LogCapture() as log: settings = {'DUPEFILTER_DEBUG': False, 'DUPEFILTER_CLASS': FromCrawlerRFPDupeFilter, - 'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION'} + 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7'} crawler = get_crawler(SimpleSpider, settings_dict=settings) spider = SimpleSpider.from_crawler(crawler) dupefilter = _get_dupefilter(crawler=crawler) @@ -199,7 +199,7 @@ def test_log_debug(self): with LogCapture() as log: settings = {'DUPEFILTER_DEBUG': True, 'DUPEFILTER_CLASS': FromCrawlerRFPDupeFilter, - 'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION'} + 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7'} crawler = get_crawler(SimpleSpider, settings_dict=settings) spider = SimpleSpider.from_crawler(crawler) dupefilter = _get_dupefilter(crawler=crawler) @@ -233,7 +233,7 @@ def test_log_debug(self): def test_log_debug_default_dupefilter(self): with LogCapture() as log: settings = {'DUPEFILTER_DEBUG': True, - 'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION'} + 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7'} crawler = get_crawler(SimpleSpider, settings_dict=settings) spider = SimpleSpider.from_crawler(crawler) dupefilter = _get_dupefilter(crawler=crawler) diff --git a/tests/test_pipeline_crawl.py b/tests/test_pipeline_crawl.py index e46532a1cc8..0e174cd34b4 100644 --- a/tests/test_pipeline_crawl.py +++ b/tests/test_pipeline_crawl.py @@ -64,7 +64,7 @@ def setUp(self): self.tmpmediastore = self.mktemp() os.mkdir(self.tmpmediastore) self.settings = { - 'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION', + 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7', 'ITEM_PIPELINES': {self.pipeline_class: 1}, self.store_setting_key: self.tmpmediastore, } diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index ac66056ba8d..50a7755c1a1 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -52,7 +52,7 @@ def __init__(self, priority_queue_cls, jobdir): SCHEDULER_PRIORITY_QUEUE=priority_queue_cls, JOBDIR=jobdir, DUPEFILTER_CLASS='scrapy.dupefilters.BaseDupeFilter', - REQUEST_FINGERPRINTER_IMPLEMENTATION='VERSION', + REQUEST_FINGERPRINTER_IMPLEMENTATION='2.7', ) super().__init__(Spider, settings) self.engine = MockEngine(downloader=MockDownloader()) diff --git a/tests/test_spiderloader/__init__.py b/tests/test_spiderloader/__init__.py index 3719c7c9fb6..7a590f96cbc 100644 --- a/tests/test_spiderloader/__init__.py +++ b/tests/test_spiderloader/__init__.py @@ -98,7 +98,7 @@ def test_crawler_runner_loading(self): module = 'tests.test_spiderloader.test_spiders.spider1' runner = CrawlerRunner({ 'SPIDER_MODULES': [module], - 'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION', + 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7', }) self.assertRaisesRegex(KeyError, 'Spider not found', diff --git a/tests/test_utils_request.py b/tests/test_utils_request.py index 8bc7922b628..a92d9a0acf1 100644 --- a/tests/test_utils_request.py +++ b/tests/test_utils_request.py @@ -505,7 +505,7 @@ def test_default_implementation(self): def test_deprecated_implementation(self): settings = { - 'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'PREVIOUS_VERSION', + 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.6', } with warnings.catch_warnings(record=True) as logged_warnings: crawler = get_crawler(settings_dict=settings) @@ -518,7 +518,7 @@ def test_deprecated_implementation(self): def 
test_recommended_implementation(self): settings = { - 'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION', + 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7', } with warnings.catch_warnings(record=True) as logged_warnings: crawler = get_crawler(settings_dict=settings)
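For reference, the fingerprinter interface documented in the ``request-response.rst`` hunks above can be assembled into a self-contained sketch. The class name, the ``if`` condition around the ``request.meta`` fallback, and the combination with :class:`~weakref.WeakKeyDictionary` caching are assumptions filled in around the fragments visible in this diff::

    from weakref import WeakKeyDictionary

    from scrapy.utils.request import fingerprint


    class MetaFallbackRequestFingerprinter:
        """Use a fingerprint pre-computed by a spider callback when one is
        present in request.meta, and otherwise compute one with
        scrapy.utils.request.fingerprint(), caching results in a
        WeakKeyDictionary as the guide above recommends."""

        def __init__(self):
            self._cache = WeakKeyDictionary()

        def fingerprint(self, request):
            if 'fingerprint' in request.meta:
                return request.meta['fingerprint']
            if request not in self._cache:
                self._cache[request] = fingerprint(request)
            return self._cache[request]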