Add Content-Encoding header in response flag #5943

Open
wants to merge 5 commits into base: master
Changes from 3 commits
83 changes: 55 additions & 28 deletions scrapy/downloadermiddlewares/httpcompression.py
@@ -29,24 +29,53 @@ class HttpCompressionMiddleware:
"""This middleware allows compressed (gzip, deflate) traffic to be
sent/received from web sites"""

def __init__(self, stats=None):
def __init__(self, stats=None, settings=None):
self.stats = stats
if not stats:
warnings.warn(
"The default value of COMPRESSION_KEEP_ENCODING_HEADER, "
"False, is deprecated, and will stop working and stop "
"being its default value in a future version of Scrapy. "
"Set COMPRESSION_KEEP_ENCODING_HEADER=True in your "
"settings to remove this warning.",
ScrapyDeprecationWarning,
stacklevel=2,
)
if settings:
self.keep_encoding_header = settings.getbool('COMPRESSION_KEEP_ENCODING_HEADER')
if not self.keep_encoding_header:
warnings.warn(
"Setting COMPRESSION_KEEP_ENCODING_HEADER=False is deprecated",
ScrapyDeprecationWarning,
)
else:
self.keep_encoding_header = False
warnings.warn(
"The default value of COMPRESSION_KEEP_ENCODING_HEADER, "
"False, is deprecated, and will stop working and stop "
"being its default value in a future version of Scrapy. "
"Set COMPRESSION_KEEP_ENCODING_HEADER=True in your "
"settings to remove this warning.",
ScrapyDeprecationWarning,
stacklevel=2,
)

@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool("COMPRESSION_ENABLED"):
raise NotConfigured
try:
return cls(stats=crawler.stats)
return cls(stats=crawler.stats, settings=crawler.settings)
except TypeError:
warnings.warn(
"HttpCompressionMiddleware subclasses must either modify "
"their '__init__' method to support a 'stats' parameter or "
"their '__init__' method to support 'stats' and 'settings' parameters or "
"reimplement the 'from_crawler' method.",
ScrapyDeprecationWarning,
)
result = cls()
result.stats = crawler.stats
result.keep_encoding_header = False
return result

def process_request(self, request, spider):
@@ -55,32 +84,30 @@ def process_request(self, request, spider):
def process_response(self, request, response, spider):
if request.method == "HEAD":
return response
if isinstance(response, Response):
content_encoding = response.headers.getlist("Content-Encoding")
if content_encoding:
encoding = content_encoding.pop()
decoded_body = self._decode(response.body, encoding.lower())
if self.stats:
self.stats.inc_value(
"httpcompression/response_bytes",
len(decoded_body),
spider=spider,
)
self.stats.inc_value(
"httpcompression/response_count", spider=spider
)
respcls = responsetypes.from_args(
headers=response.headers, url=response.url, body=decoded_body
)
kwargs = dict(cls=respcls, body=decoded_body)
if issubclass(respcls, TextResponse):
# force recalculating the encoding until we make sure the
# responsetypes guessing is reliable
kwargs["encoding"] = None
response = response.replace(**kwargs)
if not content_encoding:
del response.headers["Content-Encoding"]
if b'decoded' in response.flags:
return response
content_encoding = response.headers.getlist('Content-Encoding')
if not content_encoding:
return response

encoding = content_encoding[0]
decoded_body = self._decode(response.body, encoding.lower())
if self.stats:
self.stats.inc_value('httpcompression/response_bytes', len(decoded_body), spider=spider)
self.stats.inc_value('httpcompression/response_count', spider=spider)
respcls = responsetypes.from_args(
headers=response.headers, url=response.url, body=decoded_body
)
kwargs = dict(cls=respcls, body=decoded_body)
if issubclass(respcls, TextResponse):
# force recalculating the encoding until we make sure the
# responsetypes guessing is reliable
kwargs['encoding'] = None

kwargs['flags'] = response.flags + [b'decoded']
response = response.replace(**kwargs)
if not self.keep_encoding_header:
del response.headers['Content-Encoding']
return response

def _decode(self, body, encoding):
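For illustration, a minimal spider-side sketch of how the behavior added in this diff could be used: the middleware decompresses the body, appends b"decoded" to response.flags, and, when COMPRESSION_KEEP_ENCODING_HEADER is True, leaves the original Content-Encoding header in place. The spider name, URL, and yielded item below are placeholders, not part of this pull request.

import scrapy


class EncodingAwareSpider(scrapy.Spider):
    # Hypothetical spider, only to illustrate the flag/setting from this patch.
    name = "encoding_aware"
    start_urls = ["https://example.com"]

    custom_settings = {
        # Opt in to the new behavior so the original header is preserved
        # after decompression.
        "COMPRESSION_KEEP_ENCODING_HEADER": True,
    }

    def parse(self, response):
        # Per this patch, the middleware marks decompressed responses
        # by appending b"decoded" to response.flags.
        was_compressed = b"decoded" in response.flags
        if was_compressed:
            # With COMPRESSION_KEEP_ENCODING_HEADER=True the original
            # Content-Encoding (e.g. b"gzip") is still visible here.
            original_encoding = response.headers.get("Content-Encoding")
            self.logger.info("Server compressed this response with %s", original_encoding)
        yield {"url": response.url, "decoded": was_compressed}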
2 changes: 1 addition & 1 deletion scrapy/settings/default_settings.py
@@ -37,7 +37,7 @@
COMMANDS_MODULE = ""

COMPRESSION_ENABLED = True

You should keep the empty line before CONCURRENT_ITEMS

COMPRESSION_KEEP_ENCODING_HEADER = False
CONCURRENT_ITEMS = 100

CONCURRENT_REQUESTS = 16
1 change: 1 addition & 0 deletions scrapy/templates/project/module/settings.py.tmpl
@@ -86,6 +86,7 @@ ROBOTSTXT_OBEY = True
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
COMPRESSION_KEEP_ENCODING_HEADER = True

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"