diff --git a/changelog.d/281.doc.rst b/changelog.d/281.doc.rst new file mode 100644 index 00000000..a8682598 --- /dev/null +++ b/changelog.d/281.doc.rst @@ -0,0 +1 @@ +The documentation of the *params* argument has been updated to more accurately describe its type-coercion behavior. diff --git a/changelog.d/303.bugfix.rst b/changelog.d/303.bugfix.rst new file mode 100644 index 00000000..2571c50a --- /dev/null +++ b/changelog.d/303.bugfix.rst @@ -0,0 +1 @@ +The *params* argument once more accepts non-ASCII ``bytes``, fixing a regression first introduced in treq 20.4.1. diff --git a/docs/examples/query_params.py b/docs/examples/query_params.py index 7c7b19af..ad2d4916 100644 --- a/docs/examples/query_params.py +++ b/docs/examples/query_params.py @@ -20,13 +20,13 @@ def main(reactor): print('Multi value dictionary') resp = yield treq.get('https://httpbin.org/get', - params={'foo': ['bar', 'baz', 'bax']}) + params={b'foo': [b'bar', b'baz', b'bax']}) content = yield resp.text() print(content) print('Mixed value dictionary') resp = yield treq.get('https://httpbin.org/get', - params={'foo': ['bar', 'baz'], 'bax': 'quux'}) + params={'foo': [1, 2, 3], 'bax': b'quux', b'bar': 'foo'}) content = yield resp.text() print(content) diff --git a/docs/howto.rst b/docs/howto.rst index 01cf3f31..652000df 100644 --- a/docs/howto.rst +++ b/docs/howto.rst @@ -49,8 +49,9 @@ parameters that may already exist. The ``params`` argument may be either a ``dict`` or a ``list`` of ``(key, value)`` tuples. -If it is a ``dict`` then the values in the dict may either be a ``str`` value -or a ``list`` of ``str`` values. +If it is a ``dict`` then the values in the dict may either be scalar values or a ``list`` or ``tuple`` thereof. +Scalar values means ``str``, ``bytes``, or anything else — even ``None`` — which will be coerced to ``str``. +Strings are UTF-8 encoded. .. literalinclude:: examples/query_params.py :linenos: @@ -58,6 +59,9 @@ or a ``list`` of ``str`` values. Full example: :download:`query_params.py ` +If you prefer a strictly-typed API, try :class:`hyperlink.DecodedURL`. +Use its :meth:`~hyperlink.URL.add` and :meth:`~hyperlink.URL.set` methods to add query parameters without risk of accidental type coercion. + JSON ---- diff --git a/setup.py b/setup.py index 24fadf58..528bed28 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ install_requires=[ "incremental", "requests >= 2.1.0", - "hyperlink >= 19.0.0", + "hyperlink >= 21.0.0", "six >= 1.13.0", "Twisted[tls] >= 18.7.0", "attrs", diff --git a/src/treq/client.py b/src/treq/client.py index 509160b0..98b56892 100644 --- a/src/treq/client.py +++ b/src/treq/client.py @@ -9,7 +9,7 @@ import six from six.moves.collections_abc import Mapping from six.moves.http_cookiejar import CookieJar -from six.moves.urllib.parse import urlencode as _urlencode +from six.moves.urllib.parse import quote_plus, urlencode as _urlencode from twisted.internet.interfaces import IProtocol from twisted.internet.defer import Deferred @@ -153,13 +153,15 @@ def request(self, method, url, **kwargs): stacklevel = kwargs.pop('_stacklevel', 2) if isinstance(url, DecodedURL): - parsed_url = url + parsed_url = url.encoded_url elif isinstance(url, EncodedURL): - parsed_url = DecodedURL(url) + parsed_url = url elif isinstance(url, six.text_type): - parsed_url = DecodedURL.from_text(url) + # We use hyperlink in lazy mode so that users can pass arbitrary + # bytes in the path and querystring. + parsed_url = EncodedURL.from_text(url) else: - parsed_url = DecodedURL.from_text(url.decode('ascii')) + parsed_url = EncodedURL.from_text(url.decode('ascii')) # Join parameters provided in the URL # and the ones passed as argument. @@ -418,12 +420,37 @@ def _convert_files(files): yield (param, (file_name, content_type, IBodyProducer(fobj))) +def _query_quote(v): + # (Any) -> Text + """ + Percent-encode a querystring name or value. + + :param v: A value. + + :returns: + The value, coerced to a string and percent-encoded as appropriate for + a querystring (with space as ``+``). + """ + if not isinstance(v, (str, bytes)): + v = six.text_type(v) + if not isinstance(v, bytes): + v = v.encode("utf-8") + q = quote_plus(v) + if isinstance(q, bytes): + # Python 2.7 returnes bytes when given bytes, but Python 3.x always + # returns str. Coverage disabled here to stop Coveralls complaining + # until we can drop Python 2.7 support. + q = q.decode("ascii") # pragma: no cover + return q + + def _coerced_query_params(params): """ Carefully coerce *params* in the same way as `urllib.parse.urlencode()` - Parameter names and values are coerced to unicode. As a special case, - `bytes` are decoded as ASCII. + Parameter names and values are coerced to unicode, which is encoded as + UTF-8 and then percent-encoded. As a special case, `bytes` are directly + percent-encoded. :param params: A mapping or sequence of (name, value) two-tuples. The value may be @@ -431,7 +458,8 @@ def _coerced_query_params(params): any type. :returns: - A generator that yields two-tuples containing text strings. + A generator that yields two-tuples containing percent-encoded text + strings. :rtype: Iterator[Tuple[Text, Text]] """ @@ -441,18 +469,12 @@ def _coerced_query_params(params): items = params for key, values in items: - if isinstance(key, bytes): - key = key.decode('ascii') - elif not isinstance(key, six.text_type): - key = six.text_type(key) + key_quoted = _query_quote(key) + if not isinstance(values, (list, tuple)): - values = [values] + values = (values,) for value in values: - if isinstance(value, bytes): - value = value.decode('ascii') - elif not isinstance(value, six.text_type): - value = six.text_type(value) - yield key, value + yield key_quoted, _query_quote(value) def _from_bytes(orig_bytes): diff --git a/src/treq/test/test_client.py b/src/treq/test/test_client.py index 571353b5..e4481c62 100644 --- a/src/treq/test/test_client.py +++ b/src/treq/test/test_client.py @@ -73,6 +73,50 @@ def test_request_uri_encodedurl(self): None, ) + def test_request_uri_bytes_pass(self): + """ + The URL parameter may contain path segments or querystring parameters + that are not valid UTF-8. These pass through. + """ + # This URL is http://example.com/hello?who=you, but "hello", "who", and + # "you" are encoded as UTF-16. The particulars of the encoding aren't + # important; what matters is that those segments can't be decoded by + # Hyperlink's UTF-8 default. + self.client.request( + "GET", + ( + "http://example.com/%FF%FEh%00e%00l%00l%00o%00" + "?%FF%FEw%00h%00o%00=%FF%FEy%00o%00u%00" + ), + ) + self.agent.request.assert_called_once_with( + b'GET', + ( + b'http://example.com/%FF%FEh%00e%00l%00l%00o%00' + b'?%FF%FEw%00h%00o%00=%FF%FEy%00o%00u%00' + ), + Headers({b'accept-encoding': [b'gzip']}), + None, + ) + + def test_request_uri_plus_pass(self): + """ + URL parameters may contain spaces encoded as ``+``. These remain as + such and are not mangled. + + This reproduces `Klein #339 `_. + """ + self.client.request( + "GET", + "https://example.com/?foo+bar=baz+biff", + ) + self.agent.request.assert_called_once_with( + b'GET', + b"https://example.com/?foo+bar=baz+biff", + Headers({b'accept-encoding': [b'gzip']}), + None, + ) + def test_request_uri_idn_params(self): """ A URL that contains non-ASCII characters can be augmented with @@ -160,9 +204,13 @@ def test_request_tuple_query_param_coercion(self): treq coerces non-string param names passed to *params* like `urllib.urlencode()` """ + # A value used to test that it is never encoded or decoded. + # It should be invalid UTF-8 or UTF-32 (at least). + raw_bytes = b"\x00\xff\xfb" + self.client.request('GET', 'http://example.com/', params=[ (u'text', u'A\u03a9'), - (b'bytes', ['ascii']), + (b'bytes', ['ascii', raw_bytes]), ('native', 'native'), (1, 'int'), (None, ['none']), @@ -172,7 +220,7 @@ def test_request_tuple_query_param_coercion(self): b'GET', ( b'http://example.com/' - b'?text=A%CE%A9&bytes=ascii' + b'?text=A%CE%A9&bytes=ascii&bytes=%00%FF%FB' b'&native=native&1=int&None=none' ), Headers({b'accept-encoding': [b'gzip']}), @@ -548,6 +596,27 @@ def test_request_dict_headers(self): b'Accept': [b'application/json', b'text/plain']}), None) + def test_request_headers_object(self): + """ + The *headers* parameter accepts a `twisted.web.http_headers.Headers` + instance. + """ + self.client.request( + "GET", + "https://example.com", + headers=Headers({"X-Foo": ["bar"]}), + ) + + self.agent.request.assert_called_once_with( + b"GET", + b"https://example.com", + Headers({ + "X-Foo": ["bar"], + "Accept-Encoding": ["gzip"], + }), + None, + ) + def test_request_headers_invalid_type(self): """ `HTTPClient.request()` warns that headers of an unexpected type are