Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pass through bytes in path and querystring #304

Merged
merged 18 commits into from Jan 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/281.doc.rst
@@ -0,0 +1 @@
The documentation of the *params* argument has been updated to more accurately describe its type-coercion behavior.
1 change: 1 addition & 0 deletions changelog.d/303.bugfix.rst
@@ -0,0 +1 @@
The *params* argument once more accepts non-ASCII ``bytes``, fixing a regression first introduced in treq 20.4.1.
4 changes: 2 additions & 2 deletions docs/examples/query_params.py
Expand Up @@ -20,13 +20,13 @@ def main(reactor):

print('Multi value dictionary')
resp = yield treq.get('https://httpbin.org/get',
params={'foo': ['bar', 'baz', 'bax']})
params={b'foo': [b'bar', b'baz', b'bax']})
content = yield resp.text()
print(content)

print('Mixed value dictionary')
resp = yield treq.get('https://httpbin.org/get',
params={'foo': ['bar', 'baz'], 'bax': 'quux'})
params={'foo': [1, 2, 3], 'bax': b'quux', b'bar': 'foo'})
content = yield resp.text()
print(content)

Expand Down
8 changes: 6 additions & 2 deletions docs/howto.rst
Expand Up @@ -49,15 +49,19 @@ parameters that may already exist.
The ``params`` argument may be either a ``dict`` or a ``list`` of
``(key, value)`` tuples.

If it is a ``dict`` then the values in the dict may either be a ``str`` value
or a ``list`` of ``str`` values.
If it is a ``dict`` then the values in the dict may either be scalar values or a ``list`` or ``tuple`` thereof.
Scalar values means ``str``, ``bytes``, or anything else — even ``None`` — which will be coerced to ``str``.
Strings are UTF-8 encoded.

.. literalinclude:: examples/query_params.py
:linenos:
:lines: 7-37

Full example: :download:`query_params.py <examples/query_params.py>`

If you prefer a strictly-typed API, try :class:`hyperlink.DecodedURL`.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be hyperlink.EncodedURL ?

I think that examples/query_params.py or inline here, add an example of using hyperlink.EncodedURL together with treq.

from hyperlink import EncodedURL

resp = yield treq.get(EncodedURL('https://httpbin.org/get?raw-value-here'))

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It very much should not be EncodedURL. IMO that API is dangerous because it doesn't handle percent-encoding for you. If you use it with untrusted input you'll get security injection vulnerabilities! Sometimes you need that lower-level interface, but it's not something I'd want to suggest. If you need it, you'll know.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(I think this question speaks to the way that EncodedURL and DecodedURL are confusing names. Particularly since URL is EncodedURL and so seems like the default.)

Use its :meth:`~hyperlink.URL.add` and :meth:`~hyperlink.URL.set` methods to add query parameters without risk of accidental type coercion.

JSON
----

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -32,7 +32,7 @@
install_requires=[
"incremental",
"requests >= 2.1.0",
"hyperlink >= 19.0.0",
"hyperlink >= 21.0.0",
"six >= 1.13.0",
"Twisted[tls] >= 18.7.0",
"attrs",
Expand Down
58 changes: 40 additions & 18 deletions src/treq/client.py
Expand Up @@ -9,7 +9,7 @@
import six
from six.moves.collections_abc import Mapping
from six.moves.http_cookiejar import CookieJar
from six.moves.urllib.parse import urlencode as _urlencode
from six.moves.urllib.parse import quote_plus, urlencode as _urlencode

from twisted.internet.interfaces import IProtocol
from twisted.internet.defer import Deferred
Expand Down Expand Up @@ -153,13 +153,15 @@ def request(self, method, url, **kwargs):
stacklevel = kwargs.pop('_stacklevel', 2)

if isinstance(url, DecodedURL):
parsed_url = url
parsed_url = url.encoded_url
elif isinstance(url, EncodedURL):
parsed_url = DecodedURL(url)
parsed_url = url
elif isinstance(url, six.text_type):
parsed_url = DecodedURL.from_text(url)
# We use hyperlink in lazy mode so that users can pass arbitrary
# bytes in the path and querystring.
parsed_url = EncodedURL.from_text(url)
else:
parsed_url = DecodedURL.from_text(url.decode('ascii'))
parsed_url = EncodedURL.from_text(url.decode('ascii'))

# Join parameters provided in the URL
# and the ones passed as argument.
Expand Down Expand Up @@ -418,20 +420,46 @@ def _convert_files(files):
yield (param, (file_name, content_type, IBodyProducer(fobj)))


def _query_quote(v):
# (Any) -> Text
"""
Percent-encode a querystring name or value.

:param v: A value.

:returns:
The value, coerced to a string and percent-encoded as appropriate for
a querystring (with space as ``+``).
"""
if not isinstance(v, (str, bytes)):
v = six.text_type(v)
if not isinstance(v, bytes):
v = v.encode("utf-8")
q = quote_plus(v)
if isinstance(q, bytes):
# Python 2.7 returnes bytes when given bytes, but Python 3.x always
# returns str. Coverage disabled here to stop Coveralls complaining
# until we can drop Python 2.7 support.
q = q.decode("ascii") # pragma: no cover
return q


def _coerced_query_params(params):
"""
Carefully coerce *params* in the same way as `urllib.parse.urlencode()`

Parameter names and values are coerced to unicode. As a special case,
`bytes` are decoded as ASCII.
Parameter names and values are coerced to unicode, which is encoded as
UTF-8 and then percent-encoded. As a special case, `bytes` are directly
percent-encoded.

:param params:
A mapping or sequence of (name, value) two-tuples. The value may be
a list or tuple of multiple values. Names and values may be pretty much
any type.

:returns:
A generator that yields two-tuples containing text strings.
A generator that yields two-tuples containing percent-encoded text
strings.
:rtype:
Iterator[Tuple[Text, Text]]
"""
Expand All @@ -441,18 +469,12 @@ def _coerced_query_params(params):
items = params

for key, values in items:
if isinstance(key, bytes):
key = key.decode('ascii')
elif not isinstance(key, six.text_type):
key = six.text_type(key)
key_quoted = _query_quote(key)

if not isinstance(values, (list, tuple)):
values = [values]
values = (values,)
for value in values:
if isinstance(value, bytes):
value = value.decode('ascii')
elif not isinstance(value, six.text_type):
value = six.text_type(value)
yield key, value
yield key_quoted, _query_quote(value)


def _from_bytes(orig_bytes):
Expand Down
73 changes: 71 additions & 2 deletions src/treq/test/test_client.py
Expand Up @@ -73,6 +73,50 @@ def test_request_uri_encodedurl(self):
None,
)

def test_request_uri_bytes_pass(self):
"""
The URL parameter may contain path segments or querystring parameters
that are not valid UTF-8. These pass through.
"""
# This URL is http://example.com/hello?who=you, but "hello", "who", and
# "you" are encoded as UTF-16. The particulars of the encoding aren't
# important; what matters is that those segments can't be decoded by
# Hyperlink's UTF-8 default.
self.client.request(
"GET",
(
"http://example.com/%FF%FEh%00e%00l%00l%00o%00"
"?%FF%FEw%00h%00o%00=%FF%FEy%00o%00u%00"
),
)
self.agent.request.assert_called_once_with(
b'GET',
(
b'http://example.com/%FF%FEh%00e%00l%00l%00o%00'
b'?%FF%FEw%00h%00o%00=%FF%FEy%00o%00u%00'
),
Headers({b'accept-encoding': [b'gzip']}),
None,
)

def test_request_uri_plus_pass(self):
"""
URL parameters may contain spaces encoded as ``+``. These remain as
such and are not mangled.

This reproduces `Klein #339 <https://github.com/twisted/klein/issues/339>`_.
"""
self.client.request(
"GET",
"https://example.com/?foo+bar=baz+biff",
)
self.agent.request.assert_called_once_with(
b'GET',
b"https://example.com/?foo+bar=baz+biff",
Headers({b'accept-encoding': [b'gzip']}),
None,
)

def test_request_uri_idn_params(self):
"""
A URL that contains non-ASCII characters can be augmented with
Expand Down Expand Up @@ -160,9 +204,13 @@ def test_request_tuple_query_param_coercion(self):
treq coerces non-string param names passed to *params* like
`urllib.urlencode()`
"""
# A value used to test that it is never encoded or decoded.
# It should be invalid UTF-8 or UTF-32 (at least).
raw_bytes = b"\x00\xff\xfb"

self.client.request('GET', 'http://example.com/', params=[
(u'text', u'A\u03a9'),
(b'bytes', ['ascii']),
(b'bytes', ['ascii', raw_bytes]),
('native', 'native'),
(1, 'int'),
(None, ['none']),
Expand All @@ -172,7 +220,7 @@ def test_request_tuple_query_param_coercion(self):
b'GET',
(
b'http://example.com/'
b'?text=A%CE%A9&bytes=ascii'
b'?text=A%CE%A9&bytes=ascii&bytes=%00%FF%FB'
b'&native=native&1=int&None=none'
),
Headers({b'accept-encoding': [b'gzip']}),
Expand Down Expand Up @@ -548,6 +596,27 @@ def test_request_dict_headers(self):
b'Accept': [b'application/json', b'text/plain']}),
None)

def test_request_headers_object(self):
"""
The *headers* parameter accepts a `twisted.web.http_headers.Headers`
instance.
"""
self.client.request(
"GET",
"https://example.com",
headers=Headers({"X-Foo": ["bar"]}),
)

self.agent.request.assert_called_once_with(
b"GET",
b"https://example.com",
Headers({
"X-Foo": ["bar"],
"Accept-Encoding": ["gzip"],
}),
None,
)

def test_request_headers_invalid_type(self):
"""
`HTTPClient.request()` warns that headers of an unexpected type are
Expand Down