scrapy · BurnzZ · Nov 22, 2023 · Nov 28, 2023 · Dec 13, 2023 · May 16, 2024
diff --git a/docs/topics/spider-middleware.rst b/docs/topics/spider-middleware.rst
@@ -345,9 +345,14 @@ OffsiteMiddleware
    :attr:`~scrapy.Spider.allowed_domains` attribute, or the
    attribute is empty, the offsite middleware will allow all requests.
 
-   If the request has the :attr:`~scrapy.Request.dont_filter` attribute
-   set, the offsite middleware will allow the request even if its domain is not
-   listed in allowed domains.
+   If ``allow_offsite`` is set to ``True`` in :attr:`Request.meta`, then the
+   offsite middleware will allow the request even if its domain is not listed
+   in allowed domains.
+
+   .. caution:: Setting :attr:`~scrapy.Request.dont_filter` to ``True`` also
+                causes the offsite middleware to allow the request. However,
+                this behavior is deprecated. Set ``allow_offsite`` to ``True``
+                in :attr:`Request.meta` instead.
 
 
 RefererMiddleware

diff --git a/scrapy/spidermiddlewares/offsite.py b/scrapy/spidermiddlewares/offsite.py
@@ -12,6 +12,7 @@
 
 from scrapy import Spider, signals
 from scrapy.crawler import Crawler
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Request, Response
 from scrapy.statscollectors import StatsCollector
 from scrapy.utils.httpobj import urlparse_cached
@@ -50,7 +51,15 @@
     def _filter(self, request: Any, spider: Spider) -> bool:
         if not isinstance(request, Request):
             return True
-        if request.dont_filter or self.should_follow(request, spider):
+        if request.dont_filter:
+            warnings.warn(
+                "The dont_filter filter flag is deprecated in OffsiteMiddleware. "
+                "Set 'allow_offsite' to True in Request.meta instead.",
+                ScrapyDeprecationWarning,
+                stacklevel=2,
+            )
+            return True
+        if request.meta.get("allow_offsite") or self.should_follow(request, spider):
             return True
         domain = urlparse_cached(request).hostname
         if domain and domain not in self.domains_seen:

diff --git a/tests/test_spidermiddleware_offsite.py b/tests/test_spidermiddleware_offsite.py
@@ -29,6 +29,7 @@ def test_process_spider_output(self):
             Request("http://scrapy.org/1"),
             Request("http://sub.scrapy.org/1"),
             Request("http://offsite.tld/letmepass", dont_filter=True),
+            Request("http://offsite-2.tld/allow", meta={"allow_offsite": True}),
             Request("http://scrapy.test.org/"),
             Request("http://scrapy.test.org:8000/"),
         ]