diff --git a/docs/news.rst b/docs/news.rst index 921089ccd96..d79844ed25f 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -6,6 +6,17 @@ Release notes .. note:: Scrapy 1.x will be the last series supporting Python 2. Scrapy 2.0, planned for Q4 2019 or Q1 2020, will support **Python 3 only**. +Scrapy 1.7.2 (2019-07-23) +------------------------- + +Fix Python 2 support (:issue:`3889`, :issue:`3893`, :issue:`3896`). + + +Scrapy 1.7.1 (2019-07-18) +------------------------- + +Re-packaging of Scrapy 1.7.0, which was missing some changes in PyPI. + .. _release-1.7.0: Scrapy 1.7.0 (2019-07-18) diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst index 38a4fdb2593..a3780a177cc 100644 --- a/docs/topics/downloader-middleware.rst +++ b/docs/topics/downloader-middleware.rst @@ -963,7 +963,7 @@ precedence over the :setting:`RETRY_TIMES` setting. RETRY_HTTP_CODES ^^^^^^^^^^^^^^^^ -Default: ``[500, 502, 503, 504, 522, 524, 408]`` +Default: ``[500, 502, 503, 504, 522, 524, 408, 429]`` Which HTTP response codes to retry. Other errors (DNS lookup issues, connections lost, etc) are always retried. diff --git a/docs/topics/logging.rst b/docs/topics/logging.rst index dea0528db0a..87ea43c7dd0 100644 --- a/docs/topics/logging.rst +++ b/docs/topics/logging.rst @@ -193,6 +193,17 @@ to override some of the Scrapy settings regarding logging. Module `logging.handlers `_ Further documentation on available handlers +.. _custom-log-formats: + +Custom Log Formats +------------------ + +A custom log format can be set for different actions by extending :class:`~scrapy.logformatter.LogFormatter` class +and making :setting:`LOG_FORMATTER` point to your new class. + +.. 
autoclass:: scrapy.logformatter.LogFormatter + :members: + Advanced customization ---------------------- diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst index fd46c614e01..85ae2a3058c 100644 --- a/docs/topics/settings.rst +++ b/docs/topics/settings.rst @@ -440,9 +440,10 @@ or even enable client-side authentication (and various other things). which uses the platform's certificates to validate remote endpoints. **This is only available if you use Twisted>=14.0.** -If you do use a custom ContextFactory, make sure it accepts a ``method`` -parameter at init (this is the ``OpenSSL.SSL`` method mapping -:setting:`DOWNLOADER_CLIENT_TLS_METHOD`). +If you do use a custom ContextFactory, make sure its ``__init__`` method +accepts a ``method`` parameter (this is the ``OpenSSL.SSL`` method mapping +:setting:`DOWNLOADER_CLIENT_TLS_METHOD`) and a ``tls_verbose_logging`` +parameter (``bool``). .. setting:: DOWNLOADER_CLIENT_TLS_METHOD @@ -470,6 +471,20 @@ This setting must be one of these string values: We recommend that you use PyOpenSSL>=0.13 and Twisted>=0.13 or above (Twisted>=14.0 if you can). +.. setting:: DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING + +DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING +------------------------------------- + +Default: ``False`` + +Setting this to ``True`` will enable DEBUG level messages about TLS connection +parameters after establishing HTTPS connections. The kind of information logged +depends on the versions of OpenSSL and pyOpenSSL. + +This setting is only used for the default +:setting:`DOWNLOADER_CLIENTCONTEXTFACTORY`. + .. setting:: DOWNLOADER_MIDDLEWARES DOWNLOADER_MIDDLEWARES @@ -870,6 +885,15 @@ directives. .. _Python datetime documentation: https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior +.. setting:: LOG_FORMATTER + +LOG_FORMATTER +------------- + +Default: :class:`scrapy.logformatter.LogFormatter` + +The class to use for :ref:`formatting log messages ` for different actions. + .. 
setting:: LOG_LEVEL LOG_LEVEL diff --git a/requirements-py3.txt b/requirements-py3.txt index 5a5d4c95af4..478ed0010dd 100644 --- a/requirements-py3.txt +++ b/requirements-py3.txt @@ -1,5 +1,6 @@ Twisted>=17.9.0 -lxml>=3.2.4 +lxml;python_version!="3.4" +lxml<=4.3.5;python_version=="3.4" pyOpenSSL>=0.13.1 cssselect>=0.9 queuelib>=1.1.1 diff --git a/scrapy/core/downloader/contextfactory.py b/scrapy/core/downloader/contextfactory.py index 783d4c38341..5ac20c0bbb2 100644 --- a/scrapy/core/downloader/contextfactory.py +++ b/scrapy/core/downloader/contextfactory.py @@ -28,9 +28,15 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS): understand the SSLv3, TLSv1, TLSv1.1 and TLSv1.2 protocols.' """ - def __init__(self, method=SSL.SSLv23_METHOD, *args, **kwargs): + def __init__(self, method=SSL.SSLv23_METHOD, tls_verbose_logging=False, *args, **kwargs): super(ScrapyClientContextFactory, self).__init__(*args, **kwargs) self._ssl_method = method + self.tls_verbose_logging = tls_verbose_logging + + @classmethod + def from_settings(cls, settings, method=SSL.SSLv23_METHOD, *args, **kwargs): + tls_verbose_logging = settings.getbool('DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING') + return cls(method=method, tls_verbose_logging=tls_verbose_logging, *args, **kwargs) def getCertificateOptions(self): # setting verify=True will require you to provide CAs @@ -56,7 +62,8 @@ def getContext(self, hostname=None, port=None): return self.getCertificateOptions().getContext() def creatorForNetloc(self, hostname, port): - return ScrapyClientTLSOptions(hostname.decode("ascii"), self.getContext()) + return ScrapyClientTLSOptions(hostname.decode("ascii"), self.getContext(), + verbose_logging=self.tls_verbose_logging) @implementer(IPolicyForHTTPS) diff --git a/scrapy/core/downloader/handlers/http10.py b/scrapy/core/downloader/handlers/http10.py index d875fb1e441..be729853111 100644 --- a/scrapy/core/downloader/handlers/http10.py +++ b/scrapy/core/downloader/handlers/http10.py @@ -1,7 +1,7 @@ 
"""Download handlers for http and https schemes """ from twisted.internet import reactor -from scrapy.utils.misc import load_object +from scrapy.utils.misc import load_object, create_instance from scrapy.utils.python import to_unicode @@ -11,6 +11,7 @@ class HTTP10DownloadHandler(object): def __init__(self, settings): self.HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY']) self.ClientContextFactory = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY']) + self._settings = settings def download_request(self, request, spider): """Return a deferred for the HTTP download""" @@ -21,7 +22,7 @@ def download_request(self, request, spider): def _connect(self, factory): host, port = to_unicode(factory.host), factory.port if factory.scheme == b'https': - return reactor.connectSSL(host, port, factory, - self.ClientContextFactory()) + client_context_factory = create_instance(self.ClientContextFactory, settings=self._settings, crawler=None) + return reactor.connectSSL(host, port, factory, client_context_factory) else: return reactor.connectTCP(host, port, factory) diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py index cbaa36b2dfb..2ccab261469 100644 --- a/scrapy/core/downloader/handlers/http11.py +++ b/scrapy/core/downloader/handlers/http11.py @@ -25,7 +25,7 @@ from scrapy.responsetypes import responsetypes from scrapy.core.downloader.webclient import _parse from scrapy.core.downloader.tls import openssl_methods -from scrapy.utils.misc import load_object +from scrapy.utils.misc import load_object, create_instance from scrapy.utils.python import to_bytes, to_unicode from scrapy import twisted_version @@ -44,14 +44,15 @@ def __init__(self, settings): self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY']) # try method-aware context factory try: - self._contextFactory = self._contextFactoryClass(method=self._sslMethod) + self._contextFactory = 
create_instance(self._contextFactoryClass, settings=settings, crawler=None, + method=self._sslMethod) except TypeError: # use context factory defaults - self._contextFactory = self._contextFactoryClass() + self._contextFactory = create_instance(self._contextFactoryClass, settings=settings, crawler=None) msg = """ '%s' does not accept `method` argument (type OpenSSL.SSL method,\ - e.g. OpenSSL.SSL.SSLv23_METHOD).\ - Please upgrade your context factory class to handle it or ignore it.""" % ( + e.g. OpenSSL.SSL.SSLv23_METHOD) and/or `tls_verbose_logging` argument.\ + Please upgrade your context factory class to handle them or ignore them.""" % ( settings['DOWNLOADER_CLIENTCONTEXTFACTORY'],) warnings.warn(msg) self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE') diff --git a/scrapy/core/downloader/tls.py b/scrapy/core/downloader/tls.py index df805118249..74afb3f102c 100644 --- a/scrapy/core/downloader/tls.py +++ b/scrapy/core/downloader/tls.py @@ -2,6 +2,7 @@ from OpenSSL import SSL from scrapy import twisted_version +from scrapy.utils.ssl import x509name_to_string, get_temp_key_info logger = logging.getLogger(__name__) @@ -20,6 +21,7 @@ METHOD_TLSv12: getattr(SSL, 'TLSv1_2_METHOD', 6), # TLS 1.2 only } + if twisted_version >= (14, 0, 0): # ClientTLSOptions requires a recent-enough version of Twisted. # Not having ScrapyClientTLSOptions should not matter for older @@ -65,13 +67,39 @@ class ScrapyClientTLSOptions(ClientTLSOptions): Same as Twisted's private _sslverify.ClientTLSOptions, except that VerificationError, CertificateError and ValueError exceptions are caught, so that the connection is not closed, only - logging warnings. + logging warnings. Also, HTTPS connection parameters logging is added. 
""" + def __init__(self, hostname, ctx, verbose_logging=False): + super(ScrapyClientTLSOptions, self).__init__(hostname, ctx) + self.verbose_logging = verbose_logging + def _identityVerifyingInfoCallback(self, connection, where, ret): if where & SSL_CB_HANDSHAKE_START: set_tlsext_host_name(connection, self._hostnameBytes) elif where & SSL_CB_HANDSHAKE_DONE: + if self.verbose_logging: + if hasattr(connection, 'get_cipher_name'): # requires pyOPenSSL 0.15 + if hasattr(connection, 'get_protocol_version_name'): # requires pyOPenSSL 16.0.0 + logger.debug('SSL connection to %s using protocol %s, cipher %s', + self._hostnameASCII, + connection.get_protocol_version_name(), + connection.get_cipher_name(), + ) + else: + logger.debug('SSL connection to %s using cipher %s', + self._hostnameASCII, + connection.get_cipher_name(), + ) + server_cert = connection.get_peer_certificate() + logger.debug('SSL connection certificate: issuer "%s", subject "%s"', + x509name_to_string(server_cert.get_issuer()), + x509name_to_string(server_cert.get_subject()), + ) + key_info = get_temp_key_info(connection._ssl) + if key_info: + logger.debug('SSL temp key: %s', key_info) + try: verifyHostname(connection, self._hostnameASCII) except verification_errors as e: diff --git a/scrapy/logformatter.py b/scrapy/logformatter.py index 65f347dcfe3..b4d6787ffc6 100644 --- a/scrapy/logformatter.py +++ b/scrapy/logformatter.py @@ -12,26 +12,40 @@ class LogFormatter(object): """Class for generating log messages for different actions. - - All methods must return a dictionary listing the parameters ``level``, - ``msg`` and ``args`` which are going to be used for constructing the log - message when calling logging.log. + + All methods must return a dictionary listing the parameters ``level``, ``msg`` + and ``args`` which are going to be used for constructing the log message when + calling ``logging.log``. 
Dictionary keys for the method outputs: - * ``level`` should be the log level for that action, you can use those - from the python logging library: logging.DEBUG, logging.INFO, - logging.WARNING, logging.ERROR and logging.CRITICAL. - * ``msg`` should be a string that can contain different formatting - placeholders. This string, formatted with the provided ``args``, is - going to be the log message for that action. + * ``level`` is the log level for that action, you can use those from the + `python logging library `_ : + ``logging.DEBUG``, ``logging.INFO``, ``logging.WARNING``, ``logging.ERROR`` + and ``logging.CRITICAL``. + * ``msg`` should be a string that can contain different formatting placeholders. + This string, formatted with the provided ``args``, is going to be the log message + for that action. + * ``args`` should be a tuple or dict with the formatting placeholders for ``msg``. + The final log message is computed as ``msg % args``. - * ``args`` should be a tuple or dict with the formatting placeholders - for ``msg``. The final log message is computed as output['msg'] % - output['args']. 
- """ + Here is an example on how to create a custom log formatter to lower the severity level of + the log message when an item is dropped from the pipeline:: + class PoliteLogFormatter(logformatter.LogFormatter): + def dropped(self, item, exception, response, spider): + return { + 'level': logging.INFO, # lowering the level from logging.WARNING + 'msg': u"Dropped: %(exception)s" + os.linesep + "%(item)s", + 'args': { + 'exception': exception, + 'item': item, + } + } + """ + def crawled(self, request, response, spider): + """Logs a message when the crawler finds a webpage.""" request_flags = ' %s' % str(request.flags) if request.flags else '' response_flags = ' %s' % str(response.flags) if response.flags else '' return { @@ -40,7 +54,7 @@ def crawled(self, request, response, spider): 'args': { 'status': response.status, 'request': request, - 'request_flags' : request_flags, + 'request_flags': request_flags, 'referer': referer_str(request), 'response_flags': response_flags, # backward compatibility with Scrapy logformatter below 1.4 version @@ -49,6 +63,7 @@ def crawled(self, request, response, spider): } def scraped(self, item, response, spider): + """Logs a message when an item is scraped by a spider.""" if isinstance(response, Failure): src = response.getErrorMessage() else: @@ -63,6 +78,7 @@ def scraped(self, item, response, spider): } def dropped(self, item, exception, response, spider): + """Logs a message when an item is dropped while it is passing through the item pipeline.""" return { 'level': logging.WARNING, 'msg': DROPPEDMSG, diff --git a/scrapy/pipelines/files.py b/scrapy/pipelines/files.py index 2145e6d2b5e..ea06d2ae87e 100644 --- a/scrapy/pipelines/files.py +++ b/scrapy/pipelines/files.py @@ -189,6 +189,19 @@ def _headers_to_botocore_kwargs(self, headers): 'X-Amz-Grant-Read': 'GrantRead', 'X-Amz-Grant-Read-ACP': 'GrantReadACP', 'X-Amz-Grant-Write-ACP': 'GrantWriteACP', + 'X-Amz-Object-Lock-Legal-Hold': 'ObjectLockLegalHoldStatus', + 
'X-Amz-Object-Lock-Mode': 'ObjectLockMode', + 'X-Amz-Object-Lock-Retain-Until-Date': 'ObjectLockRetainUntilDate', + 'X-Amz-Request-Payer': 'RequestPayer', + 'X-Amz-Server-Side-Encryption': 'ServerSideEncryption', + 'X-Amz-Server-Side-Encryption-Aws-Kms-Key-Id': 'SSEKMSKeyId', + 'X-Amz-Server-Side-Encryption-Context': 'SSEKMSEncryptionContext', + 'X-Amz-Server-Side-Encryption-Customer-Algorithm': 'SSECustomerAlgorithm', + 'X-Amz-Server-Side-Encryption-Customer-Key': 'SSECustomerKey', + 'X-Amz-Server-Side-Encryption-Customer-Key-Md5': 'SSECustomerKeyMD5', + 'X-Amz-Storage-Class': 'StorageClass', + 'X-Amz-Tagging': 'Tagging', + 'X-Amz-Website-Redirect-Location': 'WebsiteRedirectLocation', }) extra = {} for key, value in six.iteritems(headers): diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index d17eb31257d..086adf48ef6 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -87,6 +87,7 @@ DOWNLOADER_CLIENTCONTEXTFACTORY = 'scrapy.core.downloader.contextfactory.ScrapyClientContextFactory' DOWNLOADER_CLIENT_TLS_METHOD = 'TLS' # Use highest TLS/SSL protocol version supported by the platform, # also allowing negotiation +DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING = False DOWNLOADER_MIDDLEWARES = {} diff --git a/scrapy/utils/conf.py b/scrapy/utils/conf.py index 26d66eaf893..fb7ca3310d6 100644 --- a/scrapy/utils/conf.py +++ b/scrapy/utils/conf.py @@ -1,10 +1,13 @@ import os import sys import numbers -import configparser from operator import itemgetter import six +if six.PY2: + from ConfigParser import SafeConfigParser as ConfigParser +else: + from configparser import ConfigParser from scrapy.settings import BaseSettings from scrapy.utils.deprecate import update_classpath @@ -94,7 +97,7 @@ def init_env(project='default', set_syspath=True): def get_config(use_closest=True): """Get Scrapy config file as a ConfigParser""" sources = get_sources(use_closest) - cfg = configparser.ConfigParser() + cfg = 
ConfigParser() cfg.read(sources) return cfg diff --git a/scrapy/utils/ssl.py b/scrapy/utils/ssl.py new file mode 100644 index 00000000000..5db1608bf48 --- /dev/null +++ b/scrapy/utils/ssl.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- + +import OpenSSL._util as pyOpenSSLutil + +from scrapy.utils.python import to_native_str + + +def ffi_buf_to_string(buf): + return to_native_str(pyOpenSSLutil.ffi.string(buf)) + + +def x509name_to_string(x509name): + # from OpenSSL.crypto.X509Name.__repr__ + result_buffer = pyOpenSSLutil.ffi.new("char[]", 512) + pyOpenSSLutil.lib.X509_NAME_oneline(x509name._name, result_buffer, len(result_buffer)) + + return ffi_buf_to_string(result_buffer) + + +def get_temp_key_info(ssl_object): + if not hasattr(pyOpenSSLutil.lib, 'SSL_get_server_tmp_key'): # requires OpenSSL 1.0.2 + return None + + # adapted from OpenSSL apps/s_cb.c::ssl_print_tmp_key() + temp_key_p = pyOpenSSLutil.ffi.new("EVP_PKEY **") + pyOpenSSLutil.lib.SSL_get_server_tmp_key(ssl_object, temp_key_p) + if temp_key_p == pyOpenSSLutil.ffi.NULL: + return None + + temp_key = temp_key_p[0] + pyOpenSSLutil.ffi.gc(temp_key, pyOpenSSLutil.lib.EVP_PKEY_free) + key_info = [] + key_type = pyOpenSSLutil.lib.EVP_PKEY_id(temp_key) + if key_type == pyOpenSSLutil.lib.EVP_PKEY_RSA: + key_info.append('RSA') + elif key_type == pyOpenSSLutil.lib.EVP_PKEY_DH: + key_info.append('DH') + elif key_type == pyOpenSSLutil.lib.EVP_PKEY_EC: + key_info.append('ECDH') + ec_key = pyOpenSSLutil.lib.EVP_PKEY_get1_EC_KEY(temp_key) + pyOpenSSLutil.ffi.gc(ec_key, pyOpenSSLutil.lib.EC_KEY_free) + nid = pyOpenSSLutil.lib.EC_GROUP_get_curve_name(pyOpenSSLutil.lib.EC_KEY_get0_group(ec_key)) + cname = pyOpenSSLutil.lib.EC_curve_nid2nist(nid) + if cname == pyOpenSSLutil.ffi.NULL: + cname = pyOpenSSLutil.lib.OBJ_nid2sn(nid) + key_info.append(ffi_buf_to_string(cname)) + else: + key_info.append(ffi_buf_to_string(pyOpenSSLutil.lib.OBJ_nid2sn(key_type))) + key_info.append('%s bits' % pyOpenSSLutil.lib.EVP_PKEY_bits(temp_key)) + 
return ', '.join(key_info) diff --git a/setup.py b/setup.py index 4dc6d18c165..ee0aaabf0aa 100644 --- a/setup.py +++ b/setup.py @@ -69,7 +69,8 @@ def has_environment_marker_platform_impl_support(): 'Twisted>=13.1.0,<=19.2.0;python_version=="3.4"', 'w3lib>=1.17.0', 'queuelib', - 'lxml', + 'lxml;python_version!="3.4"', + 'lxml<=4.3.5;python_version=="3.4"', 'pyOpenSSL', 'cssselect>=0.9', 'six>=1.5.2', diff --git a/tests/constraints.txt b/tests/constraints.txt index e59e68b3f20..5655ac2d374 100644 --- a/tests/constraints.txt +++ b/tests/constraints.txt @@ -1,2 +1 @@ -Twisted!=18.4.0 -lxml!=4.2.2 \ No newline at end of file +Twisted!=18.4.0 \ No newline at end of file diff --git a/tests/test_downloader_handlers.py b/tests/test_downloader_handlers.py index d2151e10e0d..efef4192c39 100644 --- a/tests/test_downloader_handlers.py +++ b/tests/test_downloader_handlers.py @@ -8,6 +8,7 @@ except ImportError: import mock +from testfixtures import LogCapture from twisted.trial import unittest from twisted.protocols.policies import WrappingFactory from twisted.python.filepath import FilePath @@ -498,6 +499,24 @@ def test_download_broken_chunked_content_allow_data_loss_via_setting(self): class Https11TestCase(Http11TestCase): scheme = 'https' + tls_log_message = 'SSL connection certificate: issuer "/C=IE/O=Scrapy/CN=localhost", subject "/C=IE/O=Scrapy/CN=localhost"' + + @defer.inlineCallbacks + def test_tls_logging(self): + download_handler = self.download_handler_cls(Settings({ + 'DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING': True, + })) + try: + with LogCapture() as log_capture: + request = Request(self.getURL('file')) + d = download_handler.download_request(request, Spider('foo')) + d.addCallback(lambda r: r.body) + d.addCallback(self.assertEqual, b"0123456789") + yield d + log_capture.check_present(('scrapy.core.downloader.tls', 'DEBUG', self.tls_log_message)) + finally: + yield download_handler.close() + class Https11WrongHostnameTestCase(Http11TestCase): scheme = 'https' @@ -518,6 
+537,7 @@ def setUp(self): super(Https11InvalidDNSId, self).setUp() self.host = '127.0.0.1' + class Https11InvalidDNSPattern(Https11TestCase): """Connect to HTTPS hosts where the certificate are issued to an ip instead of a domain.""" @@ -529,6 +549,7 @@ def setUp(self): from service_identity.exceptions import CertificateError except ImportError: raise unittest.SkipTest("cryptography lib is too old") + self.tls_log_message = 'SSL connection certificate: issuer "/C=IE/O=Scrapy/CN=127.0.0.1", subject "/C=IE/O=Scrapy/CN=127.0.0.1"' super(Https11InvalidDNSPattern, self).setUp()