From f288d6c50bb41905a0d40ccb811e50a2aab2d297 Mon Sep 17 00:00:00 2001 From: Vytautas Liuolia Date: Mon, 26 Apr 2021 18:53:31 +0200 Subject: [PATCH 1/6] fix(falcon.asgi): assume latin-1 encoding for incoming headers --- falcon/asgi/request.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/falcon/asgi/request.py b/falcon/asgi/request.py index 8167f70f2..d0c2ef424 100644 --- a/falcon/asgi/request.py +++ b/falcon/asgi/request.py @@ -459,14 +459,14 @@ def __init__(self, scope, receive, first_event=None, options=None): # PERF(kgriffs): Normally we expect no Content-Type header, so # use this pattern which is a little bit faster than dict.get() if b'content-type' in req_headers: - self.content_type = req_headers[b'content-type'].decode() + self.content_type = req_headers[b'content-type'].decode('latin1') else: self.content_type = None else: # PERF(kgriffs): This is the most performant pattern when we expect # the key to be present most of the time. try: - self.content_type = req_headers[b'content-type'].decode() + self.content_type = req_headers[b'content-type'].decode('latin1') except KeyError: self.content_type = None @@ -516,14 +516,14 @@ def accept(self): # NOTE(kgriffs): Per RFC, a missing accept header is # equivalent to '*/*' try: - return self._asgi_headers[b'accept'].decode() or '*/*' + return self._asgi_headers[b'accept'].decode('latin1') or '*/*' except KeyError: return '*/*' @property def content_length(self): try: - value = self._asgi_headers[b'content-length'].decode() + value = self._asgi_headers[b'content-length'].decode('latin1') except KeyError: return None @@ -612,7 +612,7 @@ def forwarded_scheme(self): # first. Note also that the indexing operator is # slightly faster than using get(). try: - scheme = self._asgi_headers[b'x-forwarded-proto'].decode().lower() + scheme = self._asgi_headers[b'x-forwarded-proto'].decode('latin1').lower() except KeyError: scheme = self.scheme @@ -624,7 +624,7 @@ def host(self): # NOTE(kgriffs): Prefer the host header; the web server # isn't supposed to mess with it, so it should be what # the client actually sent. - host_header = self._asgi_headers[b'host'].decode() + host_header = self._asgi_headers[b'host'].decode('latin1') host, __ = parse_host(host_header) except KeyError: host, __ = self._asgi_server @@ -647,7 +647,7 @@ def forwarded_host(self): # just go for it without wasting time checking it # first. try: - host = self._asgi_headers[b'x-forwarded-host'].decode() + host = self._asgi_headers[b'x-forwarded-host'].decode('latin1') except KeyError: host = self.netloc @@ -682,10 +682,10 @@ def access_route(self): host, __ = parse_host(hop.src) self._cached_access_route.append(host) elif b'x-forwarded-for' in headers: - addresses = headers[b'x-forwarded-for'].decode().split(',') + addresses = headers[b'x-forwarded-for'].decode('latin1').split(',') self._cached_access_route = [ip.strip() for ip in addresses] elif b'x-real-ip' in headers: - self._cached_access_route = [headers[b'x-real-ip'].decode()] + self._cached_access_route = [headers[b'x-real-ip'].decode('latin1')] if self._cached_access_route: if self._cached_access_route[-1] != client: @@ -703,7 +703,7 @@ def remote_addr(self): @property def port(self): try: - host_header = self._asgi_headers[b'host'].decode() + host_header = self._asgi_headers[b'host'].decode('latin1') default_port = 443 if self._secure_scheme else 80 __, port = parse_host(host_header, default_port=default_port) except KeyError: @@ -716,7 +716,7 @@ def netloc(self): # PERF(kgriffs): try..except is faster than get() when we # expect the key to be present most of the time. try: - netloc_value = self._asgi_headers[b'host'].decode() + netloc_value = self._asgi_headers[b'host'].decode('latin1') except KeyError: netloc_value, port = self._asgi_server @@ -825,7 +825,7 @@ def if_match(self): if self._cached_if_match is None: header_value = self._asgi_headers.get(b'if-match') if header_value: - self._cached_if_match = helpers._parse_etags(header_value.decode()) + self._cached_if_match = helpers._parse_etags(header_value.decode('latin1')) return self._cached_if_match @@ -834,7 +834,7 @@ def if_none_match(self): if self._cached_if_none_match is None: header_value = self._asgi_headers.get(b'if-none-match') if header_value: - self._cached_if_none_match = helpers._parse_etags(header_value.decode()) + self._cached_if_none_match = helpers._parse_etags(header_value.decode('latin1')) return self._cached_if_none_match @@ -844,7 +844,7 @@ def headers(self): # have to do is clone it in the future. if self._cached_headers is None: self._cached_headers = { - name.decode(): value.decode() + name.decode('latin1'): value.decode('latin1') for name, value in self._asgi_headers.items() } @@ -885,7 +885,7 @@ def get_header(self, name, required=False, default=None, _name_cache={}): try: asgi_name = _name_cache[name] except KeyError: - asgi_name = name.lower().encode() + asgi_name = name.lower().encode('latin1') if len(_name_cache) < 64: # Somewhat arbitrary ceiling to mitigate abuse _name_cache[name] = asgi_name @@ -894,7 +894,7 @@ def get_header(self, name, required=False, default=None, _name_cache={}): # Don't take the time to cache beforehand, using HTTP naming. # This will be faster, assuming that most headers are looked # up only once, and not all headers will be requested. - return self._asgi_headers[asgi_name].decode() + return self._asgi_headers[asgi_name].decode('latin1') except KeyError: if not required: From 1d07eca0258a43aabf3a44b362b0ac218444ce2e Mon Sep 17 00:00:00 2001 From: Vytautas Liuolia Date: Sun, 2 May 2021 17:42:40 +0200 Subject: [PATCH 2/6] fix(ASGI): assume ISO-8859-1 for ASGI headers --- falcon/asgi/_request_helpers.py | 5 ++- falcon/asgi/request.py | 29 +++++++++++------ falcon/asgi/response.py | 9 ++++-- falcon/cyutil/misc.pyx | 13 +++++++- falcon/testing/helpers.py | 18 ++++++++--- falcon/util/misc.py | 25 +++++++++++++++ tests/test_headers.py | 55 +++++++++++++++++++++++---------- 7 files changed, 120 insertions(+), 34 deletions(-) diff --git a/falcon/asgi/_request_helpers.py b/falcon/asgi/_request_helpers.py index bb3e3c574..55e7b5661 100644 --- a/falcon/asgi/_request_helpers.py +++ b/falcon/asgi/_request_helpers.py @@ -29,7 +29,10 @@ def header_property(header_name): def fget(self): try: - return self._asgi_headers[header_name].decode() or None + # NOTE(vytas): Supporting ISO-8859-1 for historical reasons as per + # RFC 7230, Section 3.2.4; and to strive for maximum + # compatibility with WSGI. + return self._asgi_headers[header_name].decode('latin1') or None except KeyError: return None diff --git a/falcon/asgi/request.py b/falcon/asgi/request.py index d0c2ef424..3cd8d681a 100644 --- a/falcon/asgi/request.py +++ b/falcon/asgi/request.py @@ -456,6 +456,10 @@ def __init__(self, scope, receive, first_event=None, options=None): # self._cached_uri = None if self.method == 'GET': + # NOTE(vytas): We do not really expect the Content-Type to be + # non-ASCII, however we assume ISO-8859-1 here for maximum + # compatibility with WSGI. + # PERF(kgriffs): Normally we expect no Content-Type header, so # use this pattern which is a little bit faster than dict.get() if b'content-type' in req_headers: @@ -523,22 +527,27 @@ def accept(self): @property def content_length(self): try: - value = self._asgi_headers[b'content-length'].decode('latin1') + value = self._asgi_headers[b'content-length'] except KeyError: return None - # NOTE(kgriffs): Normalize an empty value to behave as if - # the header were not included; wsgiref, at least, inserts - # an empty CONTENT_LENGTH value if the request does not - # set the header. Gunicorn and uWSGI do not do this, but - # others might if they are trying to match wsgiref's - # behavior too closely. - if not value: - return None - try: + # PERF(vytas): int() also works with a bytestring argument. value_as_int = int(value) except ValueError: + # PERF(vytas): Check for an empty value in the except clause, + # because we do not expect ASGI servers to inject any headers + # that the client did not provide. + + # NOTE(kgriffs): Normalize an empty value to behave as if + # the header were not included; wsgiref, at least, inserts + # an empty CONTENT_LENGTH value if the request does not + # set the header. Gunicorn and uWSGI do not do this, but + # others might if they are trying to match wsgiref's + # behavior too closely. + if not value: + return None + msg = 'The value of the header must be a number.' raise errors.HTTPInvalidHeader(msg, 'Content-Length') diff --git a/falcon/asgi/response.py b/falcon/asgi/response.py index e19a8f7d9..4237bcd96 100644 --- a/falcon/asgi/response.py +++ b/falcon/asgi/response.py @@ -20,7 +20,7 @@ from falcon import response from falcon.constants import _UNSET -from falcon.util.misc import is_python_func +from falcon.util.misc import _encode_items_to_latin1, is_python_func __all__ = ['Response'] @@ -415,8 +415,13 @@ def _asgi_headers(self, media_type=None): headers['content-type'] = media_type try: - items = [(n.encode('ascii'), v.encode('ascii')) for n, v in headers.items()] + # NOTE(vytas): Supporting ISO-8859-1 for historical reasons as per + # RFC 7230, Section 3.2.4; and to strive for maximum + # compatibility with WSGI. + items = _encode_items_to_latin1(headers) except UnicodeEncodeError as ex: + # TODO(vytas): In 3.1.0, update this error message to highlight the + # fact that we decided to allow ISO-8859-1? raise ValueError( 'The modern series of HTTP standards require that header names and values ' f'use only ASCII characters: {ex}' diff --git a/falcon/cyutil/misc.pyx b/falcon/cyutil/misc.pyx index 095dddd4c..f4e2b1229 100644 --- a/falcon/cyutil/misc.pyx +++ b/falcon/cyutil/misc.pyx @@ -1,4 +1,4 @@ -# Copyright 2020 by Vytautas Liuolia. +# Copyright 2020-2021 by Vytautas Liuolia. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -38,3 +38,14 @@ def isascii(unicode string not None): return False return True + + +def encode_items_to_latin1(dict data not None): + cdef list result = [] + cdef unicode key + cdef unicode value + + for key, value in data.items(): + result.append((key.encode('latin1'), value.encode('latin1'))) + + return result diff --git a/falcon/testing/helpers.py b/falcon/testing/helpers.py index a23fcf5ba..fff856e37 100644 --- a/falcon/testing/helpers.py +++ b/falcon/testing/helpers.py @@ -33,6 +33,7 @@ import itertools import json import random +import re import socket import sys import time @@ -278,7 +279,7 @@ class ASGIResponseEventCollector: events (iterable): An iterable of events that were emitted by the app, collected as-is from the app. headers (iterable): An iterable of (str, str) tuples representing - the UTF-8 decoded headers emitted by the app in the body of + the ISO-8859-1 decoded headers emitted by the app in the body of the ``'http.response.start'`` event. status (int): HTTP status code emitted by the app in the body of the ``'http.response.start'`` event. @@ -300,6 +301,9 @@ class ASGIResponseEventCollector: 'lifespan.shutdown.failed', ]) + _HEADER_NAME_RE = re.compile(br'^[a-zA-Z][a-zA-Z0-9\-_]*$') + _BAD_HEADER_VALUE_RE = re.compile(br'[\000-\037]') + def __init__(self): self.events = [] self.headers = [] @@ -327,11 +331,17 @@ async def collect(self, event: Dict[str, Any]): if not isinstance(value, bytes): raise TypeError('ASGI header names must be byte strings') + # NOTE(vytas): Ported basic validation from wsgiref.validate. + if not self._HEADER_NAME_RE.match(name): + raise ValueError('Bad header name: {!r}'.format(name)) + if self._BAD_HEADER_VALUE_RE.search(value): + raise ValueError('Bad header value: {!r}'.format(value)) + name_decoded = name.decode() if not name_decoded.islower(): raise ValueError('ASGI header names must be lowercase') - self.headers.append((name_decoded, value.decode())) + self.headers.append((name_decoded, value.decode('latin1'))) self.status = event['status'] @@ -1345,12 +1355,12 @@ def _add_headers_to_scope(scope, headers, content_length, host, items = headers for name, value in items: - n = name.lower().encode() + n = name.lower().encode('latin1') found_ua = found_ua or (n == b'user-agent') # NOTE(kgriffs): Value is stripped if not empty, otherwise defaults # to b'' to be consistent with _add_headers_to_environ(). - v = b'' if value is None else value.strip().encode() + v = b'' if value is None else value.strip().encode('latin1') # NOTE(kgriffs): Expose as an iterable to ensure the framework/app # isn't hard-coded to only work with a list or tuple. diff --git a/falcon/util/misc.py b/falcon/util/misc.py index bf6e11262..f25ba002d 100644 --- a/falcon/util/misc.py +++ b/falcon/util/misc.py @@ -39,11 +39,17 @@ # public Falcon interface. from .deprecation import deprecated +try: + from falcon.cyutil.misc import encode_items_to_latin1 as _cy_encode_items_to_latin1 +except ImportError: + _cy_encode_items_to_latin1 = None + try: from falcon.cyutil.misc import isascii as _cy_isascii except ImportError: _cy_isascii = None + __all__ = ( 'is_python_func', 'deprecated', @@ -470,6 +476,24 @@ def code_to_http_status(status): return '{} {}'.format(code, _DEFAULT_HTTP_REASON) +def _encode_items_to_latin1(data): + """Decode all key/values of a dict to Latin-1. + + Args: + data (dict): A dict of string key/values to encode to a list of + bytestring items. + + Returns: + A list of (bytes, bytes) tuples. + """ + result = [] + + for key, value in data.items(): + result.append((key.encode('latin1'), value.encode('latin1'))) + + return result + + def _isascii(string): """Return ``True`` if all characters in the string are ASCII. @@ -495,4 +519,5 @@ def _isascii(string): return False +_encode_items_to_latin1 = _cy_encode_items_to_latin1 or _encode_items_to_latin1 isascii = getattr(str, 'isascii', _cy_isascii or _isascii) diff --git a/tests/test_headers.py b/tests/test_headers.py index 335cf27c4..3d4892943 100644 --- a/tests/test_headers.py +++ b/tests/test_headers.py @@ -125,6 +125,10 @@ def on_head(self, req, resp): class UnicodeHeaderResource: + def on_connect(self, req, resp): + # A way to CONNECT with people. + resp.set_header('X-Clinking-Beer-Mugs', 'đŸș') + def on_get(self, req, resp): resp.set_headers([ ('X-auTH-toKEN', 'toomanysecrets'), @@ -132,6 +136,11 @@ def on_get(self, req, resp): ('X-symbOl', '@'), ]) + def on_patch(self, req, resp): + resp.set_headers([ + ('X-Thing', '\x01\x02\xff'), + ]) + def on_post(self, req, resp): resp.set_headers([ ('X-symb\u00F6l', 'thing'), @@ -551,28 +560,42 @@ def test_unicode_headers_contain_only_ascii(self, client): assert result.headers['X-Auth-Token'] == 'toomanysecrets' assert result.headers['X-Symbol'] == '@' - @pytest.mark.parametrize('method', ['POST', 'PUT']) + @pytest.mark.parametrize('method', ['CONNECT', 'POST', 'PUT']) def test_unicode_headers_contain_non_ascii(self, method, client): app = client.app app.add_route('/', UnicodeHeaderResource()) - if app._ASGI: - # NOTE(kgriffs): Unlike PEP-3333, the ASGI spec requires the - # app to encode header names and values to a byte string. This - # gives Falcon the opportunity to verify the character set - # in the process and raise an error as appropriate. - error_type, pattern = ValueError, 'ASCII' + if method == 'CONNECT': + # NOTE(vytas): Response headers cannot be encoded to Latin-1. + if not app._ASGI: + pytest.skip('wsgiref.validate sees no evil here') + + # NOTE(vytas): Shouldn't this result in an HTTP 500 instead of + # bubbling up a ValueError to the app server? + with pytest.raises(ValueError): + client.simulate_request(method, '/') elif method == 'PUT': - pytest.skip('The wsgiref validator does not check header values.') + # NOTE(vytas): Latin-1 header values are allowed. + resp = client.simulate_request(method, '/') + assert resp.headers else: - # NOTE(kgriffs): The wsgiref validator that is integrated into - # Falcon's testing framework will catch this. However, Falcon - # itself does not do the check to avoid potential overhead - # in a production deployment. - error_type, pattern = AssertionError, 'Bad header name' - - with pytest.raises(error_type, match=pattern): - client.simulate_request(method, '/') + if app._ASGI: + # NOTE(kgriffs): Unlike PEP-3333, the ASGI spec requires the + # app to encode header names and values to a byte string. This + # gives Falcon the opportunity to verify the character set + # in the process and raise an error as appropriate. + error_type = ValueError + else: + # NOTE(kgriffs): The wsgiref validator that is integrated into + # Falcon's testing framework will catch this. However, Falcon + # itself does not do the check to avoid potential overhead + # in a production deployment. + error_type = AssertionError + + pattern = 'Bad header name' if method == 'POST' else 'Bad header value' + + with pytest.raises(error_type, match=pattern): + client.simulate_request(method, '/') def test_response_set_and_get_header(self, client): resource = HeaderHelpersResource() From 38a6caf8c27ef35117cd67a9674c2cff669abdf0 Mon Sep 17 00:00:00 2001 From: Vytautas Liuolia Date: Sun, 2 May 2021 18:04:28 +0200 Subject: [PATCH 3/6] docs(changelog): add a news fragment for the fix --- docs/_newsfragments/1902.bugfix.rst | 7 +++++-- docs/_newsfragments/1911.bugfix.rst | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 docs/_newsfragments/1911.bugfix.rst diff --git a/docs/_newsfragments/1902.bugfix.rst b/docs/_newsfragments/1902.bugfix.rst index 2d0ebd868..982128842 100644 --- a/docs/_newsfragments/1902.bugfix.rst +++ b/docs/_newsfragments/1902.bugfix.rst @@ -1,2 +1,5 @@ -Re-add ``api_helpers`` module that was renamed ``app_helpers``. -This module is considered deprecated, and will be removed in a future Falcon version. \ No newline at end of file +The ``api_helpers`` module was re-added, since it was renamed to +``app_helpers`` (and effectively removed) without announcing a corresponding +breaking change. +This module is now considered deprecated, and will be removed in a future +Falcon version. diff --git a/docs/_newsfragments/1911.bugfix.rst b/docs/_newsfragments/1911.bugfix.rst new file mode 100644 index 000000000..8dbe2f0d9 --- /dev/null +++ b/docs/_newsfragments/1911.bugfix.rst @@ -0,0 +1,3 @@ +ASGI HTTP headers were treated as UTF-8 encoded, not taking the incompatibility +with WSGI and porting of WSGI applications into consideration. +This was fixed, and ASGI headers are now decoded and encoded as ISO-8859-1. From adb6fbb120d1303c484ff1e89ffc1f1e2058a64b Mon Sep 17 00:00:00 2001 From: Vytautas Liuolia Date: Sun, 2 May 2021 18:17:14 +0200 Subject: [PATCH 4/6] test(test_headers.py): add forgotten item to parameterized test (restore 100% coverage) --- tests/test_headers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_headers.py b/tests/test_headers.py index 3d4892943..272aac7b8 100644 --- a/tests/test_headers.py +++ b/tests/test_headers.py @@ -560,7 +560,7 @@ def test_unicode_headers_contain_only_ascii(self, client): assert result.headers['X-Auth-Token'] == 'toomanysecrets' assert result.headers['X-Symbol'] == '@' - @pytest.mark.parametrize('method', ['CONNECT', 'POST', 'PUT']) + @pytest.mark.parametrize('method', ['CONNECT', 'PATCH', 'POST', 'PUT']) def test_unicode_headers_contain_non_ascii(self, method, client): app = client.app app.add_route('/', UnicodeHeaderResource()) From 7e002893764d0763ee0dabd858f09ce4d297a2f1 Mon Sep 17 00:00:00 2001 From: Vytautas Liuolia Date: Sun, 2 May 2021 19:12:33 +0200 Subject: [PATCH 5/6] test(headers): add actual regression tests for the reported issue --- falcon/testing/helpers.py | 2 ++ tests/test_headers.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/falcon/testing/helpers.py b/falcon/testing/helpers.py index fff856e37..4507a570d 100644 --- a/falcon/testing/helpers.py +++ b/falcon/testing/helpers.py @@ -337,6 +337,8 @@ async def collect(self, event: Dict[str, Any]): if self._BAD_HEADER_VALUE_RE.search(value): raise ValueError('Bad header value: {!r}'.format(value)) + # NOTE(vytas): After the name validation above, the name is + # guaranteed to only contain a subset of ASCII. name_decoded = name.decode() if not name_decoded.islower(): raise ValueError('ASGI header names must be lowercase') diff --git a/tests/test_headers.py b/tests/test_headers.py index 272aac7b8..4f31bb72e 100644 --- a/tests/test_headers.py +++ b/tests/test_headers.py @@ -276,6 +276,14 @@ def on_post(self, req, resp): resp.set_headers(CustomHeadersNotCallable()) +class HeadersDebugResource: + def on_get(self, req, resp): + resp.media = {key.lower(): value for key, value in req.headers.items()} + + def on_get_header(self, req, resp, header): + resp.media = {header.lower(): req.get_header(header)} + + class TestHeaders: def test_content_length(self, client): @@ -539,6 +547,29 @@ def test_content_disposition_header(self, client, filename, expected): assert resp.status_code == 200 assert resp.headers['Content-Disposition'] == expected + def test_request_latin1_headers(self, client): + client.app.add_route('/headers', HeadersDebugResource()) + client.app.add_route('/headers/{header}', HeadersDebugResource(), suffix='header') + + headers = { + 'User-Agent': 'Mosaic/0.9', + 'X-Latin1-Header': 'FörmĂ„nsrĂ€tt', + 'X-Size': 'groß', + } + resp = client.simulate_get('/headers', headers=headers) + assert resp.status_code == 200 + assert resp.json == { + 'host': 'falconframework.org', + 'user-agent': 'Mosaic/0.9', + 'x-latin1-header': 'FörmĂ„nsrĂ€tt', + 'x-size': 'groß', + } + + resp = client.simulate_get('/headers/X-Latin1-Header', headers=headers) + assert resp.json == {'x-latin1-header': 'FörmĂ„nsrĂ€tt'} + resp = client.simulate_get('/headers/X-Size', headers=headers) + assert resp.json == {'x-size': 'groß'} + def test_unicode_location_headers(self, client): client.app.add_route('/', LocationHeaderUnicodeResource()) From 7ceb7a547d25ae328e353e51888dadfc45b28969 Mon Sep 17 00:00:00 2001 From: Vytautas Liuolia Date: Wed, 5 May 2021 21:29:05 +0200 Subject: [PATCH 6/6] docs(asgi.response): add a PERF note on _encode_items_to_latin1() usage --- falcon/asgi/response.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/falcon/asgi/response.py b/falcon/asgi/response.py index 4237bcd96..15ebfb06f 100644 --- a/falcon/asgi/response.py +++ b/falcon/asgi/response.py @@ -418,6 +418,11 @@ def _asgi_headers(self, media_type=None): # NOTE(vytas): Supporting ISO-8859-1 for historical reasons as per # RFC 7230, Section 3.2.4; and to strive for maximum # compatibility with WSGI. + + # PERF(vytas): On CPython, _encode_items_to_latin1 is implemented + # in Cython (with a pure Python fallback), where the resulting + # C code speeds up the method substantially by directly invoking + # CPython's C API functions such as PyUnicode_EncodeLatin1. items = _encode_items_to_latin1(headers) except UnicodeEncodeError as ex: # TODO(vytas): In 3.1.0, update this error message to highlight the