From bf37ba18c03a23decc634e8c47ff402dbf24d0f9 Mon Sep 17 00:00:00 2001 From: Jorge Date: Thu, 12 Mar 2020 01:39:39 -0700 Subject: [PATCH] Add support for HTTPS connections to proxies. (#1679) * Add support to talk HTTPS to proxies. Currently there's no way to validate identify for the proxy you might be connecting. Proxies supporting HTTPS endpoints are becoming more common and we need to extend the support for them. When an HTTPS proxy is provided, instead of doing the HTTP CONNECT, we'll forward any requests directly to the proxy and ultimately to the destination. * Fix proxy_headers missing on HTTPS proxy connections. * blackfmt missing files. * Prevent usage of HTTPS proxies when fetching HTTPS resources. - Will be supported by default when we can do TLS within TLS. * Update proxy documentation with more information. * Renamed flag for HTTPS websites through HTTPS proxies. * Added myself to contributors. * Documentation and contributors fixes. * Removed mention that TLS in TLS is being developed as requested. * Space in between my name and the github page. * Add flag to enable HTTPS proxy support. Now that we're adding support for HTTPS proxies we want to avoid a breaking change with clients that had an improper proxy configuration. For now, we're adding a warning an defaulting to the previous behavior. In the future we'll change the behavior to enable HTTPS proxies by default. * Remove guard flag, error out on HTTPS/HTTPS. As requested in the last revision for the PR: - Removed the _enable_https_proxies flag. Instead the feature will be enabled and will error out on invalid configurations. (HTTPS + HTTPS) - Other comments: rename a method, parentheses to clarify order of operations. --- docs/advanced-usage.rst | 22 +++++-- dummyserver/proxy.py | 8 +++ dummyserver/testcase.py | 9 +++ src/urllib3/connection.py | 12 ++-- src/urllib3/connectionpool.py | 19 ++++-- src/urllib3/exceptions.py | 5 ++ src/urllib3/poolmanager.py | 65 +++++++++++++++---- test/test_proxymanager.py | 7 +- .../test_proxy_poolmanager.py | 61 +++++++++++++++++ 9 files changed, 176 insertions(+), 32 deletions(-) diff --git a/docs/advanced-usage.rst b/docs/advanced-usage.rst index 1442975f48..84c0eef1f5 100644 --- a/docs/advanced-usage.rst +++ b/docs/advanced-usage.rst @@ -122,10 +122,24 @@ HTTP proxy:: The usage of :class:`~poolmanager.ProxyManager` is the same as :class:`~poolmanager.PoolManager`. -You can use :class:`~contrib.socks.SOCKSProxyManager` to connect to SOCKS4 or -SOCKS5 proxies. In order to use SOCKS proxies you will need to install -`PySocks `_ or install urllib3 with the -``socks`` extra:: +You can connect to a proxy using HTTP, HTTPS or SOCKS. urllib3's behavior will +be different depending on the type of proxy you selected and the destination +you're contacting. + +When contacting a HTTP website through a HTTP or HTTPS proxy, the request will +be forwarded with the `absolute URI +`_. + +When contacting a HTTPS website through a HTTP proxy, a TCP tunnel will be +established with a HTTP CONNECT. Afterward a TLS connection will be established +with the destination and your request will be sent. + +Contacting HTTPS websites through HTTPS proxies is currently not supported. + +For SOCKS, you can use :class:`~contrib.socks.SOCKSProxyManager` to connect to +SOCKS4 or SOCKS5 proxies. In order to use SOCKS proxies you will need to +install `PySocks `_ or install urllib3 with +the ``socks`` extra:: pip install urllib3[socks] diff --git a/dummyserver/proxy.py b/dummyserver/proxy.py index c4f0b824f7..42f293104d 100755 --- a/dummyserver/proxy.py +++ b/dummyserver/proxy.py @@ -34,6 +34,7 @@ import tornado.iostream import tornado.web import tornado.httpclient +import ssl __all__ = ["ProxyHandler", "run_proxy"] @@ -66,6 +67,12 @@ def handle_response(response): self.write(response.body) self.finish() + upstream_ca_certs = self.application.settings.get("upstream_ca_certs", None) + ssl_options = None + + if upstream_ca_certs: + ssl_options = ssl.create_default_context(cafile=upstream_ca_certs) + req = tornado.httpclient.HTTPRequest( url=self.request.uri, method=self.request.method, @@ -73,6 +80,7 @@ def handle_response(response): headers=self.request.headers, follow_redirects=False, allow_nonstandard_methods=True, + ssl_options=ssl_options, ) client = tornado.httpclient.AsyncHTTPClient() diff --git a/dummyserver/testcase.py b/dummyserver/testcase.py index 90c6b2240c..4e3cf593c5 100644 --- a/dummyserver/testcase.py +++ b/dummyserver/testcase.py @@ -180,6 +180,14 @@ def setup_class(cls): app, cls.io_loop, None, "http", cls.proxy_host ) + upstream_ca_certs = cls.https_certs.get("ca_certs", None) + app = web.Application( + [(r".*", ProxyHandler)], upstream_ca_certs=upstream_ca_certs + ) + cls.https_proxy_server, cls.https_proxy_port = run_tornado_app( + app, cls.io_loop, cls.https_certs, "https", cls.proxy_host + ) + cls.server_thread = run_loop_in_thread(cls.io_loop) @classmethod @@ -187,6 +195,7 @@ def teardown_class(cls): cls.io_loop.add_callback(cls.http_server.stop) cls.io_loop.add_callback(cls.https_server.stop) cls.io_loop.add_callback(cls.proxy_server.stop) + cls.io_loop.add_callback(cls.https_proxy_server.stop) cls.io_loop.add_callback(cls.io_loop.stop) cls.server_thread.join() diff --git a/src/urllib3/connection.py b/src/urllib3/connection.py index d67c0a70b3..0b1ac06340 100644 --- a/src/urllib3/connection.py +++ b/src/urllib3/connection.py @@ -111,7 +111,6 @@ def __init__(self, *args, **kw): #: The socket options provided by the user. If no options are #: provided, we use the default options. self.socket_options = kw.pop("socket_options", self.default_socket_options) - _HTTPConnection.__init__(self, *args, **kw) @property @@ -174,10 +173,13 @@ def _new_conn(self): return conn + def _is_using_tunnel(self): + # Google App Engine's httplib does not define _tunnel_host + return getattr(self, "_tunnel_host", None) + def _prepare_conn(self, conn): self.sock = conn - # Google App Engine's httplib does not define _tunnel_host - if getattr(self, "_tunnel_host", None): + if self._is_using_tunnel(): # TODO: Fix tunnel so it doesn't depend on self.sock state. self._tunnel() # Mark this connection as not reusable @@ -309,9 +311,9 @@ def connect(self): conn = self._new_conn() hostname = self.host - # Google App Engine's httplib does not define _tunnel_host - if getattr(self, "_tunnel_host", None): + if self._is_using_tunnel(): self.sock = conn + # Calls self._set_hostport(), so self.host is # self._tunnel_host below. self._tunnel() diff --git a/src/urllib3/connectionpool.py b/src/urllib3/connectionpool.py index 174fe6c2e1..492590fb9e 100644 --- a/src/urllib3/connectionpool.py +++ b/src/urllib3/connectionpool.py @@ -634,10 +634,10 @@ def urlopen( # [1] release_this_conn = release_conn - # Merge the proxy headers. Only do this in HTTP. We have to copy the - # headers dict so we can safely change it without those changes being - # reflected in anyone else's copy. - if self.scheme == "http": + # Merge the proxy headers. Only done when not using HTTP CONNECT. We + # have to copy the headers dict so we can safely change it without those + # changes being reflected in anyone else's copy. + if self.scheme == "http" or (self.proxy and self.proxy.scheme == "https"): headers = headers.copy() headers.update(self.proxy_headers) @@ -925,10 +925,15 @@ def _prepare_conn(self, conn): def _prepare_proxy(self, conn): """ - Establish tunnel connection early, because otherwise httplib - would improperly set Host: header to proxy's IP:port. + Establishes a tunnel connection through HTTP CONNECT. + + Tunnel connection is established early because otherwise httplib would + improperly set Host: header to proxy's IP:port. """ - conn.set_tunnel(self._proxy_host, self.port, self.proxy_headers) + + if self.proxy.scheme != "https": + conn.set_tunnel(self._proxy_host, self.port, self.proxy_headers) + conn.connect() def _new_conn(self): diff --git a/src/urllib3/exceptions.py b/src/urllib3/exceptions.py index 5cc4d8a4f1..1c7de4bb46 100644 --- a/src/urllib3/exceptions.py +++ b/src/urllib3/exceptions.py @@ -259,6 +259,11 @@ def __init__(self, scheme): super(ProxySchemeUnknown, self).__init__(message) +class ProxySchemeUnsupported(ValueError): + "Fetching HTTPS resources through HTTPS proxies is unsupported" + pass + + class HeaderParsingError(HTTPError): "Raised by assert_header_parsing, but we convert it to a log.warning statement." diff --git a/src/urllib3/poolmanager.py b/src/urllib3/poolmanager.py index e2bd3bd8db..db7ce8c39a 100644 --- a/src/urllib3/poolmanager.py +++ b/src/urllib3/poolmanager.py @@ -8,10 +8,11 @@ from .connectionpool import HTTPConnectionPool, HTTPSConnectionPool from .connectionpool import port_by_scheme from .exceptions import ( + HTTPWarning, LocationValueError, MaxRetryError, ProxySchemeUnknown, - InvalidProxyConfigurationWarning, + ProxySchemeUnsupported, ) from .packages import six from .packages.six.moves.urllib.parse import urljoin @@ -23,6 +24,12 @@ __all__ = ["PoolManager", "ProxyManager", "proxy_from_url"] +class InvalidProxyConfigurationWarning(HTTPWarning): + """Raised when a user has an HTTPS proxy without enabling HTTPS proxies.""" + + pass + + log = logging.getLogger(__name__) SSL_KEYWORDS = ( @@ -312,6 +319,18 @@ def _merge_pool_kwargs(self, override): base_pool_kwargs[key] = value return base_pool_kwargs + def _proxy_requires_url_absolute_form(self, parsed_url): + """ + Indicates if the proxy requires the complete destination URL in the + request. + + Normally this is only needed when not using an HTTP CONNECT tunnel. + """ + if self.proxy is None: + return False + + return parsed_url.scheme == "http" or self.proxy.scheme == "https" + def urlopen(self, method, url, redirect=True, **kw): """ Same as :meth:`urllib3.connectionpool.HTTPConnectionPool.urlopen` @@ -330,7 +349,7 @@ def urlopen(self, method, url, redirect=True, **kw): if "headers" not in kw: kw["headers"] = self.headers.copy() - if self.proxy is not None and u.scheme == "http": + if self._proxy_requires_url_absolute_form(u): response = conn.urlopen(method, url, **kw) else: response = conn.urlopen(method, u.request_uri, **kw) @@ -392,6 +411,12 @@ class ProxyManager(PoolManager): HTTPS/CONNECT case they are sent only once. Could be used for proxy authentication. + :param _allow_https_proxy_to_see_traffic: + Allows forwarding of HTTPS requests to HTTPS proxies. The proxy will + have visibility of all the traffic sent. ONLY USE IF YOU KNOW WHAT + YOU'RE DOING. This flag might be removed at any time in any future + update. + Example: >>> proxy = urllib3.ProxyManager('http://localhost:3128/') >>> r1 = proxy.request('GET', 'http://google.com/') @@ -411,6 +436,7 @@ def __init__( num_pools=10, headers=None, proxy_headers=None, + _allow_https_proxy_to_see_traffic=False, **connection_pool_kw ): @@ -421,19 +447,22 @@ def __init__( proxy_url.port, ) proxy = parse_url(proxy_url) - if not proxy.port: - port = port_by_scheme.get(proxy.scheme, 80) - proxy = proxy._replace(port=port) if proxy.scheme not in ("http", "https"): raise ProxySchemeUnknown(proxy.scheme) + if not proxy.port: + port = port_by_scheme.get(proxy.scheme, 80) + proxy = proxy._replace(port=port) + self.proxy = proxy self.proxy_headers = proxy_headers or {} connection_pool_kw["_proxy"] = self.proxy connection_pool_kw["_proxy_headers"] = self.proxy_headers + self.allow_insecure_proxy = _allow_https_proxy_to_see_traffic + super(ProxyManager, self).__init__(num_pools, headers, **connection_pool_kw) def connection_from_host(self, host, port=None, scheme="http", pool_kwargs=None): @@ -462,15 +491,22 @@ def _set_proxy_headers(self, url, headers=None): return headers_ def _validate_proxy_scheme_url_selection(self, url_scheme): - if url_scheme == "https" and self.proxy.scheme == "https": + if ( + url_scheme == "https" + and self.proxy.scheme == "https" + and not self.allow_insecure_proxy + ): warnings.warn( "Your proxy configuration specified an HTTPS scheme for the proxy. " "Are you sure you want to use HTTPS to contact the proxy? " - "This most likely indicates an error in your configuration. " - "Read this issue for more info: " - "https://github.com/urllib3/urllib3/issues/1850", + "This most likely indicates an error in your configuration." + "If you are sure you want use HTTPS to contact the proxy, enable " + "the _allow_https_proxy_to_see_traffic.", InvalidProxyConfigurationWarning, - stacklevel=3, + ) + + raise ProxySchemeUnsupported( + "Contacting HTTPS destinations through HTTPS proxies is not supported." ) def urlopen(self, method, url, redirect=True, **kw): @@ -478,10 +514,11 @@ def urlopen(self, method, url, redirect=True, **kw): u = parse_url(url) self._validate_proxy_scheme_url_selection(u.scheme) - if u.scheme == "http": - # For proxied HTTPS requests, httplib sets the necessary headers - # on the CONNECT to the proxy. For HTTP, we'll definitely - # need to set 'Host' at the very least. + if u.scheme == "http" or self.proxy.scheme == "https": + # For connections using HTTP CONNECT, httplib sets the necessary + # headers on the CONNECT to the proxy. For HTTP or when talking + # HTTPS to the proxy, we'll definitely need to set 'Host' at the + # very least. headers = kw.get("headers", self.headers) kw["headers"] = self._set_proxy_headers(url, headers) diff --git a/test/test_proxymanager.py b/test/test_proxymanager.py index 0fcbe85eb4..0e1b13af33 100644 --- a/test/test_proxymanager.py +++ b/test/test_proxymanager.py @@ -8,12 +8,15 @@ ProxyError, NewConnectionError, ) +from urllib3.util.url import parse_url class TestProxyManager(object): - def test_proxy_headers(self): + @pytest.mark.parametrize("proxy_scheme", ["http", "https"]) + def test_proxy_headers(self, proxy_scheme): url = "http://pypi.org/project/urllib3/" - with ProxyManager("http://something:1234") as p: + proxy_url = "{}://something:1234".format(proxy_scheme) + with ProxyManager(proxy_url) as p: # Verify default headers default_headers = {"Accept": "*/*", "Host": "pypi.org"} headers = p._set_proxy_headers(url) diff --git a/test/with_dummyserver/test_proxy_poolmanager.py b/test/with_dummyserver/test_proxy_poolmanager.py index 8993ed1d06..acdb0729bb 100644 --- a/test/with_dummyserver/test_proxy_poolmanager.py +++ b/test/with_dummyserver/test_proxy_poolmanager.py @@ -20,6 +20,7 @@ ProxyError, ConnectTimeoutError, InvalidProxyConfigurationWarning, + ProxySchemeUnsupported, ) from urllib3.connectionpool import connection_from_url, VerifiedHTTPSConnection @@ -38,6 +39,7 @@ def setup_class(cls): cls.https_url = "https://%s:%d" % (cls.https_host, cls.https_port) cls.https_url_alt = "https://%s:%d" % (cls.https_host_alt, cls.https_port) cls.proxy_url = "http://%s:%d" % (cls.proxy_host, cls.proxy_port) + cls.https_proxy_url = "https://%s:%d" % (cls.proxy_host, cls.https_proxy_port,) # This URL is used only to test that a warning is # raised due to an improper config. urllib3 doesn't @@ -80,6 +82,24 @@ def test_https_proxy_warning(self): "https://github.com/urllib3/urllib3/issues/1850" ) + def test_https_proxy(self): + with proxy_from_url(self.https_proxy_url, ca_certs=DEFAULT_CA) as https: + r = https.request("GET", "%s/" % self.http_url) + assert r.status == 200 + + with pytest.raises(ProxySchemeUnsupported): + https.request("GET", "%s/" % self.https_url) + + with proxy_from_url( + self.https_proxy_url, + ca_certs=DEFAULT_CA, + _allow_https_proxy_to_see_traffic=True, + ) as https: + r = https.request("GET", "%s/" % self.http_url) + https.request("GET", "%s/" % self.https_url) + assert r.status == 200 +>>>>>>> 8c7a43b4... Add support for HTTPS connections to proxies. (#1679) + def test_nagle_proxy(self): """ Test that proxy connections do not have TCP_NODELAY turned on """ with ProxyManager(self.proxy_url) as http: @@ -302,6 +322,47 @@ def test_headers(self): self.https_port, ) + def test_https_headers(self): + with proxy_from_url( + self.https_proxy_url, + headers={"Foo": "bar"}, + proxy_headers={"Hickory": "dickory"}, + ca_certs=DEFAULT_CA, + ) as http: + + r = http.request_encode_url("GET", "%s/headers" % self.http_url) + returned_headers = json.loads(r.data.decode()) + assert returned_headers.get("Foo") == "bar" + assert returned_headers.get("Hickory") == "dickory" + assert returned_headers.get("Host") == "%s:%s" % ( + self.http_host, + self.http_port, + ) + + r = http.request_encode_url("GET", "%s/headers" % self.http_url_alt) + returned_headers = json.loads(r.data.decode()) + assert returned_headers.get("Foo") == "bar" + assert returned_headers.get("Hickory") == "dickory" + assert returned_headers.get("Host") == "%s:%s" % ( + self.http_host_alt, + self.http_port, + ) + + with pytest.raises(ProxySchemeUnsupported): + http.request_encode_url("GET", "%s/headers" % self.https_url) + + r = http.request_encode_url( + "GET", "%s/headers" % self.http_url, headers={"Baz": "quux"} + ) + returned_headers = json.loads(r.data.decode()) + assert returned_headers.get("Foo") is None + assert returned_headers.get("Baz") == "quux" + assert returned_headers.get("Hickory") == "dickory" + assert returned_headers.get("Host") == "%s:%s" % ( + self.http_host, + self.http_port, + ) + def test_headerdict(self): default_headers = HTTPHeaderDict(a="b") proxy_headers = HTTPHeaderDict()