falconry · kgriffs · May 5, 2021 · Apr 26, 2021 · May 2, 2021 · May 2, 2021
diff --git a/docs/_newsfragments/1902.bugfix.rst b/docs/_newsfragments/1902.bugfix.rst
@@ -1,2 +1,5 @@
-Re-add ``api_helpers`` module that was renamed ``app_helpers``.
-This module is considered deprecated, and will be removed in a future Falcon version.
+The ``api_helpers`` module was re-added, since it was renamed to
+``app_helpers`` (and effectively removed) without announcing a corresponding
+breaking change.
+This module is now considered deprecated, and will be removed in a future
+Falcon version.
diff --git a/docs/_newsfragments/1911.bugfix.rst b/docs/_newsfragments/1911.bugfix.rst
@@ -0,0 +1,3 @@
+ASGI HTTP headers were treated as UTF-8 encoded, not taking the incompatibility
+with WSGI and porting of WSGI applications into consideration.
+This was fixed, and ASGI headers are now decoded and encoded as ISO-8859-1.
diff --git a/falcon/asgi/_request_helpers.py b/falcon/asgi/_request_helpers.py
@@ -29,7 +29,10 @@ def header_property(header_name):
 
     def fget(self):
         try:
-            return self._asgi_headers[header_name].decode() or None
+            # NOTE(vytas): Supporting ISO-8859-1 for historical reasons as per
+            #   RFC 7230, Section 3.2.4; and to strive for maximum
+            #   compatibility with WSGI.
+            return self._asgi_headers[header_name].decode('latin1') or None
         except KeyError:
             return None
 

diff --git a/falcon/asgi/request.py b/falcon/asgi/request.py
@@ -456,17 +456,21 @@ def __init__(self, scope, receive, first_event=None, options=None):
         # self._cached_uri = None
 
         if self.method == 'GET':
+            # NOTE(vytas): We do not really expect the Content-Type to be
+            #   non-ASCII, however we assume ISO-8859-1 here for maximum
+            #   compatibility with WSGI.
+
             # PERF(kgriffs): Normally we expect no Content-Type header, so
             #   use this pattern which is a little bit faster than dict.get()
             if b'content-type' in req_headers:
-                self.content_type = req_headers[b'content-type'].decode()
+                self.content_type = req_headers[b'content-type'].decode('latin1')
             else:
                 self.content_type = None
         else:
             # PERF(kgriffs): This is the most performant pattern when we expect
             #   the key to be present most of the time.
             try:
-                self.content_type = req_headers[b'content-type'].decode()
+                self.content_type = req_headers[b'content-type'].decode('latin1')
             except KeyError:
                 self.content_type = None
 
@@ -516,29 +520,34 @@ def accept(self):
         # NOTE(kgriffs): Per RFC, a missing accept header is
         # equivalent to '*/*'
         try:
-            return self._asgi_headers[b'accept'].decode() or '*/*'
+            return self._asgi_headers[b'accept'].decode('latin1') or '*/*'
         except KeyError:
             return '*/*'
 
     @property
     def content_length(self):
         try:
-            value = self._asgi_headers[b'content-length'].decode()
+            value = self._asgi_headers[b'content-length']
         except KeyError:
             return None
 
-        # NOTE(kgriffs): Normalize an empty value to behave as if
-        # the header were not included; wsgiref, at least, inserts
-        # an empty CONTENT_LENGTH value if the request does not
-        # set the header. Gunicorn and uWSGI do not do this, but
-        # others might if they are trying to match wsgiref's
-        # behavior too closely.
-        if not value:
-            return None
-
         try:
+            # PERF(vytas): int() also works with a bytestring argument.
             value_as_int = int(value)
         except ValueError:
+            # PERF(vytas): Check for an empty value in the except clause,
+            #   because we do not expect ASGI servers to inject any headers
+            #   that the client did not provide.
+
+            # NOTE(kgriffs): Normalize an empty value to behave as if
+            # the header were not included; wsgiref, at least, inserts
+            # an empty CONTENT_LENGTH value if the request does not
+            # set the header. Gunicorn and uWSGI do not do this, but
+            # others might if they are trying to match wsgiref's
+            # behavior too closely.
+            if not value:
+                return None
+
             msg = 'The value of the header must be a number.'
             raise errors.HTTPInvalidHeader(msg, 'Content-Length')
 
@@ -612,7 +621,7 @@ def forwarded_scheme(self):
             # first. Note also that the indexing operator is
             # slightly faster than using get().
             try:
-                scheme = self._asgi_headers[b'x-forwarded-proto'].decode().lower()
+                scheme = self._asgi_headers[b'x-forwarded-proto'].decode('latin1').lower()
             except KeyError:
                 scheme = self.scheme
 
@@ -624,7 +633,7 @@ def host(self):
             # NOTE(kgriffs): Prefer the host header; the web server
             # isn't supposed to mess with it, so it should be what
             # the client actually sent.
-            host_header = self._asgi_headers[b'host'].decode()
+            host_header = self._asgi_headers[b'host'].decode('latin1')
             host, __ = parse_host(host_header)
         except KeyError:
             host, __ = self._asgi_server
@@ -647,7 +656,7 @@ def forwarded_host(self):
             # just go for it without wasting time checking it
             # first.
             try:
-                host = self._asgi_headers[b'x-forwarded-host'].decode()
+                host = self._asgi_headers[b'x-forwarded-host'].decode('latin1')
             except KeyError:
                 host = self.netloc
 
@@ -682,10 +691,10 @@ def access_route(self):
                         host, __ = parse_host(hop.src)
                         self._cached_access_route.append(host)
             elif b'x-forwarded-for' in headers:
-                addresses = headers[b'x-forwarded-for'].decode().split(',')
+                addresses = headers[b'x-forwarded-for'].decode('latin1').split(',')
                 self._cached_access_route = [ip.strip() for ip in addresses]
             elif b'x-real-ip' in headers:
-                self._cached_access_route = [headers[b'x-real-ip'].decode()]
+                self._cached_access_route = [headers[b'x-real-ip'].decode('latin1')]
 
             if self._cached_access_route:
                 if self._cached_access_route[-1] != client:
@@ -703,7 +712,7 @@ def remote_addr(self):
     @property
     def port(self):
         try:
-            host_header = self._asgi_headers[b'host'].decode()
+            host_header = self._asgi_headers[b'host'].decode('latin1')
             default_port = 443 if self._secure_scheme else 80
             __, port = parse_host(host_header, default_port=default_port)
         except KeyError:
@@ -716,7 +725,7 @@ def netloc(self):
         # PERF(kgriffs): try..except is faster than get() when we
         # expect the key to be present most of the time.
         try:
-            netloc_value = self._asgi_headers[b'host'].decode()
+            netloc_value = self._asgi_headers[b'host'].decode('latin1')
         except KeyError:
             netloc_value, port = self._asgi_server
 
@@ -825,7 +834,7 @@ def if_match(self):
         if self._cached_if_match is None:
             header_value = self._asgi_headers.get(b'if-match')
             if header_value:
-                self._cached_if_match = helpers._parse_etags(header_value.decode())
+                self._cached_if_match = helpers._parse_etags(header_value.decode('latin1'))
 
         return self._cached_if_match
 
@@ -834,7 +843,7 @@ def if_none_match(self):
         if self._cached_if_none_match is None:
             header_value = self._asgi_headers.get(b'if-none-match')
             if header_value:
-                self._cached_if_none_match = helpers._parse_etags(header_value.decode())
+                self._cached_if_none_match = helpers._parse_etags(header_value.decode('latin1'))
 
         return self._cached_if_none_match
 
@@ -844,7 +853,7 @@ def headers(self):
         # have to do is clone it in the future.
         if self._cached_headers is None:
             self._cached_headers = {
-                name.decode(): value.decode()
+                name.decode('latin1'): value.decode('latin1')
                 for name, value in self._asgi_headers.items()
             }
 
@@ -885,7 +894,7 @@ def get_header(self, name, required=False, default=None, _name_cache={}):
         try:
             asgi_name = _name_cache[name]
         except KeyError:
-            asgi_name = name.lower().encode()
+            asgi_name = name.lower().encode('latin1')
             if len(_name_cache) < 64:  # Somewhat arbitrary ceiling to mitigate abuse
                 _name_cache[name] = asgi_name
 
@@ -894,7 +903,7 @@ def get_header(self, name, required=False, default=None, _name_cache={}):
             # Don't take the time to cache beforehand, using HTTP naming.
             # This will be faster, assuming that most headers are looked
             # up only once, and not all headers will be requested.
-            return self._asgi_headers[asgi_name].decode()
+            return self._asgi_headers[asgi_name].decode('latin1')
 
         except KeyError:
             if not required:

diff --git a/falcon/asgi/response.py b/falcon/asgi/response.py
@@ -20,7 +20,7 @@
 
 from falcon import response
 from falcon.constants import _UNSET
-from falcon.util.misc import is_python_func
+from falcon.util.misc import _encode_items_to_latin1, is_python_func
 
 __all__ = ['Response']
 
@@ -415,8 +415,18 @@ def _asgi_headers(self, media_type=None):
             headers['content-type'] = media_type
 
         try:
-            items = [(n.encode('ascii'), v.encode('ascii')) for n, v in headers.items()]
+            # NOTE(vytas): Supporting ISO-8859-1 for historical reasons as per
+            #   RFC 7230, Section 3.2.4; and to strive for maximum
+            #   compatibility with WSGI.
+
+            # PERF(vytas): On CPython, _encode_items_to_latin1 is implemented
+            #   in Cython (with a pure Python fallback), where the resulting
+            #   C code speeds up the method substantially by directly invoking
+            #   CPython's C API functions such as PyUnicode_EncodeLatin1.
+            items = _encode_items_to_latin1(headers)
         except UnicodeEncodeError as ex:
+            # TODO(vytas): In 3.1.0, update this error message to highlight the
+            #   fact that we decided to allow ISO-8859-1?
             raise ValueError(
                 'The modern series of HTTP standards require that header names and values '
                 f'use only ASCII characters: {ex}'

diff --git a/falcon/cyutil/misc.pyx b/falcon/cyutil/misc.pyx
@@ -1,4 +1,4 @@
-# Copyright 2020 by Vytautas Liuolia.
+# Copyright 2020-2021 by Vytautas Liuolia.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -38,3 +38,14 @@ def isascii(unicode string not None):
             return False
 
     return True
+
+
+def encode_items_to_latin1(dict data not None):
+    cdef list result = []
+    cdef unicode key
+    cdef unicode value
+
+    for key, value in data.items():
+        result.append((key.encode('latin1'), value.encode('latin1')))
+
+    return result
diff --git a/falcon/testing/helpers.py b/falcon/testing/helpers.py
@@ -33,6 +33,7 @@
 import itertools
 import json
 import random
+import re
 import socket
 import sys
 import time
@@ -278,7 +279,7 @@ class ASGIResponseEventCollector:
         events (iterable): An iterable of events that were emitted by
             the app, collected as-is from the app.
         headers (iterable): An iterable of (str, str) tuples representing
-            the UTF-8 decoded headers emitted by the app in the body of
+            the ISO-8859-1 decoded headers emitted by the app in the body of
             the ``'http.response.start'`` event.
         status (int): HTTP status code emitted by the app in the body of
             the ``'http.response.start'`` event.
@@ -300,6 +301,9 @@ class ASGIResponseEventCollector:
         'lifespan.shutdown.failed',
     ])
 
+    _HEADER_NAME_RE = re.compile(br'^[a-zA-Z][a-zA-Z0-9\-_]*$')
+    _BAD_HEADER_VALUE_RE = re.compile(br'[\000-\037]')
+
     def __init__(self):
         self.events = []
         self.headers = []
@@ -327,11 +331,19 @@ async def collect(self, event: Dict[str, Any]):
                 if not isinstance(value, bytes):
                     raise TypeError('ASGI header names must be byte strings')
 
+                # NOTE(vytas): Ported basic validation from wsgiref.validate.
+                if not self._HEADER_NAME_RE.match(name):
+                    raise ValueError('Bad header name: {!r}'.format(name))
+                if self._BAD_HEADER_VALUE_RE.search(value):
+                    raise ValueError('Bad header value: {!r}'.format(value))
+
+                # NOTE(vytas): After the name validation above, the name is
+                #   guaranteed to only contain a subset of ASCII.
                 name_decoded = name.decode()
                 if not name_decoded.islower():
                     raise ValueError('ASGI header names must be lowercase')
 
-                self.headers.append((name_decoded, value.decode()))
+                self.headers.append((name_decoded, value.decode('latin1')))
 
             self.status = event['status']
 
@@ -1345,12 +1357,12 @@ def _add_headers_to_scope(scope, headers, content_length, host,
             items = headers
 
         for name, value in items:
-            n = name.lower().encode()
+            n = name.lower().encode('latin1')
             found_ua = found_ua or (n == b'user-agent')
 
             # NOTE(kgriffs): Value is stripped if not empty, otherwise defaults
             #   to b'' to be consistent with _add_headers_to_environ().
-            v = b'' if value is None else value.strip().encode()
+            v = b'' if value is None else value.strip().encode('latin1')
 
             # NOTE(kgriffs): Expose as an iterable to ensure the framework/app
             #   isn't hard-coded to only work with a list or tuple.

diff --git a/falcon/util/misc.py b/falcon/util/misc.py
@@ -39,11 +39,17 @@
 # public Falcon interface.
 from .deprecation import deprecated
 
+try:
+    from falcon.cyutil.misc import encode_items_to_latin1 as _cy_encode_items_to_latin1
+except ImportError:
+    _cy_encode_items_to_latin1 = None
+
 try:
     from falcon.cyutil.misc import isascii as _cy_isascii
 except ImportError:
     _cy_isascii = None
 
+
 __all__ = (
     'is_python_func',
     'deprecated',
@@ -470,6 +476,24 @@ def code_to_http_status(status):
         return '{} {}'.format(code, _DEFAULT_HTTP_REASON)
 
 
+def _encode_items_to_latin1(data):
+    """Decode all key/values of a dict to Latin-1.
+
+    Args:
+        data (dict): A dict of string key/values to encode to a list of
+        bytestring items.
+
+    Returns:
+        A list of (bytes, bytes) tuples.
+    """
+    result = []
+
+    for key, value in data.items():
+        result.append((key.encode('latin1'), value.encode('latin1')))
+
+    return result
+
+
 def _isascii(string):
     """Return ``True`` if all characters in the string are ASCII.
 
@@ -495,4 +519,5 @@ def _isascii(string):
         return False
 
 
+_encode_items_to_latin1 = _cy_encode_items_to_latin1 or _encode_items_to_latin1
 isascii = getattr(str, 'isascii', _cy_isascii or _isascii)