Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(ASGI): assume ISO-8859-1 for ASGI headers #1914

Merged
merged 6 commits into from May 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 5 additions & 2 deletions docs/_newsfragments/1902.bugfix.rst
@@ -1,2 +1,5 @@
Re-add ``api_helpers`` module that was renamed ``app_helpers``.
This module is considered deprecated, and will be removed in a future Falcon version.
The ``api_helpers`` module was re-added, since it was renamed to
``app_helpers`` (and effectively removed) without announcing a corresponding
breaking change.
This module is now considered deprecated, and will be removed in a future
Falcon version.
3 changes: 3 additions & 0 deletions docs/_newsfragments/1911.bugfix.rst
@@ -0,0 +1,3 @@
ASGI HTTP headers were treated as UTF-8 encoded, not taking the incompatibility
with WSGI and porting of WSGI applications into consideration.
This was fixed, and ASGI headers are now decoded and encoded as ISO-8859-1.
5 changes: 4 additions & 1 deletion falcon/asgi/_request_helpers.py
Expand Up @@ -29,7 +29,10 @@ def header_property(header_name):

def fget(self):
try:
return self._asgi_headers[header_name].decode() or None
# NOTE(vytas): Supporting ISO-8859-1 for historical reasons as per
# RFC 7230, Section 3.2.4; and to strive for maximum
# compatibility with WSGI.
return self._asgi_headers[header_name].decode('latin1') or None
except KeyError:
return None

Expand Down
59 changes: 34 additions & 25 deletions falcon/asgi/request.py
Expand Up @@ -456,17 +456,21 @@ def __init__(self, scope, receive, first_event=None, options=None):
# self._cached_uri = None

if self.method == 'GET':
# NOTE(vytas): We do not really expect the Content-Type to be
# non-ASCII, however we assume ISO-8859-1 here for maximum
# compatibility with WSGI.

# PERF(kgriffs): Normally we expect no Content-Type header, so
# use this pattern which is a little bit faster than dict.get()
if b'content-type' in req_headers:
self.content_type = req_headers[b'content-type'].decode()
self.content_type = req_headers[b'content-type'].decode('latin1')
else:
self.content_type = None
else:
# PERF(kgriffs): This is the most performant pattern when we expect
# the key to be present most of the time.
try:
self.content_type = req_headers[b'content-type'].decode()
self.content_type = req_headers[b'content-type'].decode('latin1')
except KeyError:
self.content_type = None

Expand Down Expand Up @@ -516,29 +520,34 @@ def accept(self):
# NOTE(kgriffs): Per RFC, a missing accept header is
# equivalent to '*/*'
try:
return self._asgi_headers[b'accept'].decode() or '*/*'
return self._asgi_headers[b'accept'].decode('latin1') or '*/*'
except KeyError:
return '*/*'

@property
def content_length(self):
try:
value = self._asgi_headers[b'content-length'].decode()
value = self._asgi_headers[b'content-length']
except KeyError:
return None

# NOTE(kgriffs): Normalize an empty value to behave as if
# the header were not included; wsgiref, at least, inserts
# an empty CONTENT_LENGTH value if the request does not
# set the header. Gunicorn and uWSGI do not do this, but
# others might if they are trying to match wsgiref's
# behavior too closely.
if not value:
return None

try:
# PERF(vytas): int() also works with a bytestring argument.
value_as_int = int(value)
except ValueError:
# PERF(vytas): Check for an empty value in the except clause,
# because we do not expect ASGI servers to inject any headers
# that the client did not provide.

# NOTE(kgriffs): Normalize an empty value to behave as if
# the header were not included; wsgiref, at least, inserts
# an empty CONTENT_LENGTH value if the request does not
# set the header. Gunicorn and uWSGI do not do this, but
# others might if they are trying to match wsgiref's
# behavior too closely.
if not value:
return None

msg = 'The value of the header must be a number.'
raise errors.HTTPInvalidHeader(msg, 'Content-Length')

Expand Down Expand Up @@ -612,7 +621,7 @@ def forwarded_scheme(self):
# first. Note also that the indexing operator is
# slightly faster than using get().
try:
scheme = self._asgi_headers[b'x-forwarded-proto'].decode().lower()
scheme = self._asgi_headers[b'x-forwarded-proto'].decode('latin1').lower()
except KeyError:
scheme = self.scheme

Expand All @@ -624,7 +633,7 @@ def host(self):
# NOTE(kgriffs): Prefer the host header; the web server
# isn't supposed to mess with it, so it should be what
# the client actually sent.
host_header = self._asgi_headers[b'host'].decode()
host_header = self._asgi_headers[b'host'].decode('latin1')
host, __ = parse_host(host_header)
except KeyError:
host, __ = self._asgi_server
Expand All @@ -647,7 +656,7 @@ def forwarded_host(self):
# just go for it without wasting time checking it
# first.
try:
host = self._asgi_headers[b'x-forwarded-host'].decode()
host = self._asgi_headers[b'x-forwarded-host'].decode('latin1')
except KeyError:
host = self.netloc

Expand Down Expand Up @@ -682,10 +691,10 @@ def access_route(self):
host, __ = parse_host(hop.src)
self._cached_access_route.append(host)
elif b'x-forwarded-for' in headers:
addresses = headers[b'x-forwarded-for'].decode().split(',')
addresses = headers[b'x-forwarded-for'].decode('latin1').split(',')
self._cached_access_route = [ip.strip() for ip in addresses]
elif b'x-real-ip' in headers:
self._cached_access_route = [headers[b'x-real-ip'].decode()]
self._cached_access_route = [headers[b'x-real-ip'].decode('latin1')]

if self._cached_access_route:
if self._cached_access_route[-1] != client:
Expand All @@ -703,7 +712,7 @@ def remote_addr(self):
@property
def port(self):
try:
host_header = self._asgi_headers[b'host'].decode()
host_header = self._asgi_headers[b'host'].decode('latin1')
default_port = 443 if self._secure_scheme else 80
__, port = parse_host(host_header, default_port=default_port)
except KeyError:
Expand All @@ -716,7 +725,7 @@ def netloc(self):
# PERF(kgriffs): try..except is faster than get() when we
# expect the key to be present most of the time.
try:
netloc_value = self._asgi_headers[b'host'].decode()
netloc_value = self._asgi_headers[b'host'].decode('latin1')
except KeyError:
netloc_value, port = self._asgi_server

Expand Down Expand Up @@ -825,7 +834,7 @@ def if_match(self):
if self._cached_if_match is None:
header_value = self._asgi_headers.get(b'if-match')
if header_value:
self._cached_if_match = helpers._parse_etags(header_value.decode())
self._cached_if_match = helpers._parse_etags(header_value.decode('latin1'))

return self._cached_if_match

Expand All @@ -834,7 +843,7 @@ def if_none_match(self):
if self._cached_if_none_match is None:
header_value = self._asgi_headers.get(b'if-none-match')
if header_value:
self._cached_if_none_match = helpers._parse_etags(header_value.decode())
self._cached_if_none_match = helpers._parse_etags(header_value.decode('latin1'))

return self._cached_if_none_match

Expand All @@ -844,7 +853,7 @@ def headers(self):
# have to do is clone it in the future.
if self._cached_headers is None:
self._cached_headers = {
name.decode(): value.decode()
name.decode('latin1'): value.decode('latin1')
for name, value in self._asgi_headers.items()
}

Expand Down Expand Up @@ -885,7 +894,7 @@ def get_header(self, name, required=False, default=None, _name_cache={}):
try:
asgi_name = _name_cache[name]
except KeyError:
asgi_name = name.lower().encode()
asgi_name = name.lower().encode('latin1')
if len(_name_cache) < 64: # Somewhat arbitrary ceiling to mitigate abuse
_name_cache[name] = asgi_name

Expand All @@ -894,7 +903,7 @@ def get_header(self, name, required=False, default=None, _name_cache={}):
# Don't take the time to cache beforehand, using HTTP naming.
# This will be faster, assuming that most headers are looked
# up only once, and not all headers will be requested.
return self._asgi_headers[asgi_name].decode()
return self._asgi_headers[asgi_name].decode('latin1')

except KeyError:
if not required:
Expand Down
14 changes: 12 additions & 2 deletions falcon/asgi/response.py
Expand Up @@ -20,7 +20,7 @@

from falcon import response
from falcon.constants import _UNSET
from falcon.util.misc import is_python_func
from falcon.util.misc import _encode_items_to_latin1, is_python_func

__all__ = ['Response']

Expand Down Expand Up @@ -415,8 +415,18 @@ def _asgi_headers(self, media_type=None):
headers['content-type'] = media_type

try:
items = [(n.encode('ascii'), v.encode('ascii')) for n, v in headers.items()]
# NOTE(vytas): Supporting ISO-8859-1 for historical reasons as per
# RFC 7230, Section 3.2.4; and to strive for maximum
# compatibility with WSGI.

# PERF(vytas): On CPython, _encode_items_to_latin1 is implemented
# in Cython (with a pure Python fallback), where the resulting
# C code speeds up the method substantially by directly invoking
# CPython's C API functions such as PyUnicode_EncodeLatin1.
items = _encode_items_to_latin1(headers)
vytas7 marked this conversation as resolved.
Show resolved Hide resolved
except UnicodeEncodeError as ex:
# TODO(vytas): In 3.1.0, update this error message to highlight the
# fact that we decided to allow ISO-8859-1?
raise ValueError(
'The modern series of HTTP standards require that header names and values '
f'use only ASCII characters: {ex}'
Expand Down
13 changes: 12 additions & 1 deletion falcon/cyutil/misc.pyx
@@ -1,4 +1,4 @@
# Copyright 2020 by Vytautas Liuolia.
# Copyright 2020-2021 by Vytautas Liuolia.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -38,3 +38,14 @@ def isascii(unicode string not None):
return False

return True


def encode_items_to_latin1(dict data not None):
vytas7 marked this conversation as resolved.
Show resolved Hide resolved
cdef list result = []
cdef unicode key
cdef unicode value

for key, value in data.items():
result.append((key.encode('latin1'), value.encode('latin1')))

return result
20 changes: 16 additions & 4 deletions falcon/testing/helpers.py
Expand Up @@ -33,6 +33,7 @@
import itertools
import json
import random
import re
import socket
import sys
import time
Expand Down Expand Up @@ -278,7 +279,7 @@ class ASGIResponseEventCollector:
events (iterable): An iterable of events that were emitted by
the app, collected as-is from the app.
headers (iterable): An iterable of (str, str) tuples representing
the UTF-8 decoded headers emitted by the app in the body of
the ISO-8859-1 decoded headers emitted by the app in the body of
the ``'http.response.start'`` event.
status (int): HTTP status code emitted by the app in the body of
the ``'http.response.start'`` event.
Expand All @@ -300,6 +301,9 @@ class ASGIResponseEventCollector:
'lifespan.shutdown.failed',
])

_HEADER_NAME_RE = re.compile(br'^[a-zA-Z][a-zA-Z0-9\-_]*$')
_BAD_HEADER_VALUE_RE = re.compile(br'[\000-\037]')

def __init__(self):
self.events = []
self.headers = []
Expand Down Expand Up @@ -327,11 +331,19 @@ async def collect(self, event: Dict[str, Any]):
if not isinstance(value, bytes):
raise TypeError('ASGI header names must be byte strings')

# NOTE(vytas): Ported basic validation from wsgiref.validate.
if not self._HEADER_NAME_RE.match(name):
raise ValueError('Bad header name: {!r}'.format(name))
if self._BAD_HEADER_VALUE_RE.search(value):
raise ValueError('Bad header value: {!r}'.format(value))

# NOTE(vytas): After the name validation above, the name is
# guaranteed to only contain a subset of ASCII.
name_decoded = name.decode()
CaselIT marked this conversation as resolved.
Show resolved Hide resolved
if not name_decoded.islower():
raise ValueError('ASGI header names must be lowercase')

self.headers.append((name_decoded, value.decode()))
self.headers.append((name_decoded, value.decode('latin1')))

self.status = event['status']

Expand Down Expand Up @@ -1345,12 +1357,12 @@ def _add_headers_to_scope(scope, headers, content_length, host,
items = headers

for name, value in items:
n = name.lower().encode()
n = name.lower().encode('latin1')
found_ua = found_ua or (n == b'user-agent')

# NOTE(kgriffs): Value is stripped if not empty, otherwise defaults
# to b'' to be consistent with _add_headers_to_environ().
v = b'' if value is None else value.strip().encode()
v = b'' if value is None else value.strip().encode('latin1')

# NOTE(kgriffs): Expose as an iterable to ensure the framework/app
# isn't hard-coded to only work with a list or tuple.
Expand Down
25 changes: 25 additions & 0 deletions falcon/util/misc.py
Expand Up @@ -39,11 +39,17 @@
# public Falcon interface.
from .deprecation import deprecated

try:
from falcon.cyutil.misc import encode_items_to_latin1 as _cy_encode_items_to_latin1
except ImportError:
_cy_encode_items_to_latin1 = None

try:
from falcon.cyutil.misc import isascii as _cy_isascii
except ImportError:
_cy_isascii = None


__all__ = (
'is_python_func',
'deprecated',
Expand Down Expand Up @@ -470,6 +476,24 @@ def code_to_http_status(status):
return '{} {}'.format(code, _DEFAULT_HTTP_REASON)


def _encode_items_to_latin1(data):
"""Decode all key/values of a dict to Latin-1.

Args:
data (dict): A dict of string key/values to encode to a list of
bytestring items.

Returns:
A list of (bytes, bytes) tuples.
"""
result = []

for key, value in data.items():
result.append((key.encode('latin1'), value.encode('latin1')))

return result


def _isascii(string):
"""Return ``True`` if all characters in the string are ASCII.

Expand All @@ -495,4 +519,5 @@ def _isascii(string):
return False


_encode_items_to_latin1 = _cy_encode_items_to_latin1 or _encode_items_to_latin1
isascii = getattr(str, 'isascii', _cy_isascii or _isascii)