check max_content_length consistently (#2620)
davidism committed Mar 14, 2023
2 parents 8fe91b7 + 4f7048e commit 77c420b
Showing 10 changed files with 324 additions and 364 deletions.
8 changes: 8 additions & 0 deletions CHANGES.rst
@@ -64,6 +64,14 @@ Unreleased
multiple header values. However, accessing the property only returns the first
instance.

- If ``request.max_content_length`` is set, it is checked immediately when accessing
the stream, and while reading from the stream in general, rather than only during
form parsing. :issue:`1513`
- The development server, which must not be used in production, will exhaust the
request stream up to 10GB or 1000 reads. This allows clients to see a 413 error if
``max_content_length`` is exceeded, instead of a "connection reset" failure.
:pr:`2620`


Version 2.2.3
-------------
5 changes: 5 additions & 0 deletions docs/request_data.rst
@@ -94,6 +94,11 @@ and HTTPS servers should set their own limits on size and timeouts. The operating system
or container manager should set limits on memory and processing time for server
processes.

If a 413 Content Too Large error is returned before the entire request is read, clients
may show a "connection reset" failure instead of the 413 error. This depends on how the
WSGI/HTTP server and the client handle connections; it is not something the WSGI
application (Werkzeug) has control over.
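As a hedged illustration (not part of this change set), a minimal Werkzeug application might enforce a limit like this, assuming ``max_content_length`` can be set as a class attribute as in Werkzeug 2.x; the names ``LimitedRequest`` and ``app`` are hypothetical:

from werkzeug.exceptions import RequestEntityTooLarge
from werkzeug.wrappers import Request, Response

class LimitedRequest(Request):
    # Hypothetical subclass for illustration: cap request bodies at 1 MiB.
    max_content_length = 1024 * 1024

@LimitedRequest.application
def app(request):
    try:
        # Accessing the stream (directly, or via get_data/form) now enforces
        # max_content_length and raises RequestEntityTooLarge for oversized bodies.
        body = request.get_data()
    except RequestEntityTooLarge as e:
        return e  # HTTPException instances are valid WSGI responses (413)
    return Response(f"received {len(body)} bytes\n")

Running this with ``werkzeug.serving.run_simple`` shows the 413 response; as of this change the development server also drains the request, so clients are more likely to see that response instead of a reset connection.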


How to extend Parsing?
----------------------
102 changes: 14 additions & 88 deletions src/werkzeug/formparser.py
@@ -1,13 +1,12 @@
import typing as t
from functools import update_wrapper
from io import BytesIO
from typing import Union
from urllib.parse import parse_qsl

from . import exceptions
from .datastructures import FileStorage
from .datastructures import Headers
from .datastructures import MultiDict
from .exceptions import RequestEntityTooLarge
from .http import parse_options_header
from .sansio.multipart import Data
from .sansio.multipart import Epilogue
@@ -47,12 +46,6 @@ def __call__(
F = t.TypeVar("F", bound=t.Callable[..., t.Any])


def _exhaust(stream: t.IO[bytes]) -> None:
bts = stream.read(64 * 1024)
while bts:
bts = stream.read(64 * 1024)


def default_stream_factory(
total_content_length: t.Optional[int],
content_type: t.Optional[str],
@@ -130,27 +123,6 @@ def parse_form_data(
).parse_from_environ(environ)


def exhaust_stream(f: F) -> F:
"""Helper decorator for methods that exhausts the stream on return."""

def wrapper(self, stream, *args, **kwargs): # type: ignore
try:
return f(self, stream, *args, **kwargs)
finally:
exhaust = getattr(stream, "exhaust", None)

if exhaust is not None:
exhaust()
else:
while True:
chunk = stream.read(1024 * 64)

if not chunk:
break

return update_wrapper(t.cast(F, wrapper), f)


class FormDataParser:
"""This class implements parsing of form data for Werkzeug. By itself
it can parse multipart and url encoded form data. It can be subclassed
@@ -247,15 +219,6 @@ def parse(
the multipart boundary for instance)
:return: A tuple in the form ``(stream, form, files)``.
"""
if (
self.max_content_length is not None
and content_length is not None
and content_length > self.max_content_length
):
# if the input stream is not exhausted, firefox reports Connection Reset
_exhaust(stream)
raise exceptions.RequestEntityTooLarge()

if options is None:
options = {}

@@ -270,7 +233,6 @@ def parse(

return stream, self.cls(), self.cls()

@exhaust_stream
def _parse_multipart(
self,
stream: t.IO[bytes],
@@ -294,50 +256,6 @@ def _parse_multipart(
form, files = parser.parse(stream, boundary, content_length)
return stream, form, files

def _parse_urlencoded_stream(
self, stream: t.IO[bytes]
) -> t.Iterator[t.Tuple[str, str]]:
"""Read the stream in chunks and yield parsed ``key=value`` tuples. Data is
accumulated until at least one full field is available. This avoids reading the
whole stream into memory at once if possible, and reduces the number of calls to
``parse_qsl``.
"""
remaining_parts = self.max_form_parts
last_chunk = b""

while True:
chunks = [last_chunk]

while True:
chunk = stream.read(10_000)
chunk, has_sep, last_chunk = chunk.rpartition(b"&")
chunks.append(chunk)

if not chunk or has_sep:
break

data = b"".join(chunks)

if not data and not last_chunk:
break

try:
items = parse_qsl(
data.decode(self.charset),
keep_blank_values=True,
encoding=self.charset,
errors=self.errors,
max_num_fields=remaining_parts,
)
except ValueError as e:
raise exceptions.RequestEntityTooLarge() from e

if remaining_parts is not None:
remaining_parts -= len(items)

yield from items

@exhaust_stream
def _parse_urlencoded(
self,
stream: t.IO[bytes],
@@ -350,12 +268,20 @@ def _parse_urlencoded(
and content_length is not None
and content_length > self.max_form_memory_size
):
# if the input stream is not exhausted, firefox reports Connection Reset
_exhaust(stream)
raise exceptions.RequestEntityTooLarge()
raise RequestEntityTooLarge()

form = self.cls(self._parse_urlencoded_stream(stream))
return stream, form, self.cls()
try:
items = parse_qsl(
stream.read().decode(self.charset),
keep_blank_values=True,
encoding=self.charset,
errors=self.errors,
max_num_fields=self.max_form_parts,
)
except ValueError as e:
raise RequestEntityTooLarge() from e

return stream, self.cls(items), self.cls()

#: mapping of mimetypes to parsing functions
parse_functions: t.Dict[
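For context on the ``parse_qsl`` call in the new ``_parse_urlencoded`` above (not part of the diff): ``urllib.parse.parse_qsl`` raises ``ValueError`` when ``max_num_fields`` is exceeded, which is why the parser translates that error into ``RequestEntityTooLarge``. A minimal standalone sketch:

from urllib.parse import parse_qsl

# keep_blank_values=True keeps fields with empty values, matching the parser above.
print(parse_qsl("a=1&b=2&c=", keep_blank_values=True))
# [('a', '1'), ('b', '2'), ('c', '')]

try:
    parse_qsl("a=1&b=2&c=3", max_num_fields=2)
except ValueError:
    # FormDataParser._parse_urlencoded re-raises this as RequestEntityTooLarge,
    # which renders as a 413 response.
    print("too many form fields")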
16 changes: 5 additions & 11 deletions src/werkzeug/sansio/request.py
@@ -30,6 +30,7 @@
from ..utils import cached_property
from ..utils import header_property
from .http import parse_cookie
from .utils import get_content_length
from .utils import get_current_url
from .utils import get_host

@@ -274,17 +275,10 @@ def content_length(self) -> t.Optional[int]:
the entity-body that would have been sent had the request been a
GET.
"""
if self.headers.get("Transfer-Encoding", "") == "chunked":
return None

content_length = self.headers.get("Content-Length")
if content_length is not None:
try:
return max(0, int(content_length))
except (ValueError, TypeError):
pass

return None
return get_content_length(
http_content_length=self.headers.get("Content-Length"),
http_transfer_encoding=self.headers.get("Transfer-Encoding"),
)

content_encoding = header_property[str](
"Content-Encoding",
19 changes: 9 additions & 10 deletions src/werkzeug/sansio/utils.py
@@ -146,22 +146,21 @@ def get_current_url(

def get_content_length(
http_content_length: t.Union[str, None] = None,
http_transfer_encoding: t.Union[str, None] = "",
http_transfer_encoding: t.Union[str, None] = None,
) -> t.Optional[int]:
"""Returns the content length as an integer or ``None`` if
unavailable or chunked transfer encoding is used.
"""Return the ``Content-Length`` header value as an int. If the header is not given
or the ``Transfer-Encoding`` header is ``chunked``, ``None`` is returned to indicate
a streaming request. If the value is not an integer, or negative, 0 is returned.
:param http_content_length: The Content-Length HTTP header.
:param http_transfer_encoding: The Transfer-Encoding HTTP header.
.. versionadded:: 2.2
"""
if http_transfer_encoding == "chunked":
if http_transfer_encoding == "chunked" or http_content_length is None:
return None

if http_content_length is not None:
try:
return max(0, int(http_content_length))
except (ValueError, TypeError):
pass
return None
try:
return max(0, int(http_content_length))
except ValueError:
return 0
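A brief usage sketch of the revised helper, assuming the Werkzeug 2.3 development code shown above:

from werkzeug.sansio.utils import get_content_length

# A valid Content-Length header is returned as an int.
assert get_content_length(http_content_length="1024") == 1024
# Chunked transfer encoding or a missing header signals a streaming request.
assert get_content_length(http_content_length="1024", http_transfer_encoding="chunked") is None
assert get_content_length() is None
# Malformed or negative values fall back to 0.
assert get_content_length(http_content_length="abc") == 0
assert get_content_length(http_content_length="-5") == 0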
27 changes: 27 additions & 0 deletions src/werkzeug/serving.py
@@ -14,6 +14,7 @@
import errno
import io
import os
import selectors
import socket
import socketserver
import sys
@@ -326,6 +327,32 @@ def execute(app: "WSGIApplication") -> None:
if chunk_response:
self.wfile.write(b"0\r\n\r\n")
finally:
# Check for any remaining data in the read socket, and discard it. This
# will read past request.max_content_length, but lets the client see a
# 413 response instead of a connection reset failure. If we supported
# keep-alive connections, this naive approach would break by reading the
# next request line. Since we know that write (above) closes every
# connection we can read everything.
selector = selectors.DefaultSelector()
selector.register(self.connection, selectors.EVENT_READ)
total_size = 0
total_reads = 0

# A timeout of 0 tends to fail because a client needs a small amount of
# time to continue sending its data.
while selector.select(timeout=0.01):
# Only read 10MB into memory at a time.
data = self.rfile.read(10_000_000)
total_size += len(data)
total_reads += 1

# Stop reading on no data, >=10GB, or 1000 reads. If a client sends
# more than that, they'll get a connection reset failure.
if not data or total_size >= 10_000_000_000 or total_reads > 1000:
break

selector.close()

if hasattr(application_iter, "close"):
application_iter.close()

60 changes: 38 additions & 22 deletions src/werkzeug/wrappers/request.py
@@ -11,15 +11,15 @@
from ..datastructures import ImmutableMultiDict
from ..datastructures import iter_multi_items
from ..datastructures import MultiDict
from ..exceptions import BadRequest
from ..exceptions import UnsupportedMediaType
from ..formparser import default_stream_factory
from ..formparser import FormDataParser
from ..sansio.request import Request as _SansIORequest
from ..utils import cached_property
from ..utils import environ_property
from ..wsgi import _get_server
from ..wsgi import get_input_stream
from werkzeug.exceptions import BadRequest
from werkzeug.exceptions import UnsupportedMediaType

if t.TYPE_CHECKING:
import typing_extensions as te
@@ -323,44 +323,60 @@ def __exit__(self, exc_type, exc_value, tb) -> None: # type: ignore

@cached_property
def stream(self) -> t.IO[bytes]:
"""
If the incoming form data was not encoded with a known mimetype
the data is stored unmodified in this stream for consumption. Most
of the time it is a better idea to use :attr:`data` which will give
you that data as a string. The stream only returns the data once.
"""The WSGI input stream, with safety checks. This stream can only be consumed
once.
Use :meth:`get_data` to get the full data as bytes or text. The :attr:`data`
attribute will contain the full bytes only if they do not represent form data.
The :attr:`form` attribute will contain the parsed form data in that case.
Unlike :attr:`input_stream`, this stream guards against infinite streams or
reading past :attr:`content_length` or :attr:`max_content_length`.
If :attr:`max_content_length` is set and the request has a
:attr:`content_length` (is not a streaming request), this will raise
:exc:`.RequestEntityTooLarge` if the max length is exceeded. Otherwise, the
limit will be checked during reads.
If the limit is reached before the underlying stream is exhausted (such as a
file that is too large, or an infinite stream), the remaining contents of the
stream cannot be read safely. Depending on how the server handles this, clients
may show a "connection reset" failure instead of seeing the 413 response.
Unlike :attr:`input_stream` this stream is properly guarded that you
can't accidentally read past the length of the input. Werkzeug will
internally always refer to this stream to read data which makes it
possible to wrap this object with a stream that does filtering.
.. versionchanged:: 2.3
Check ``max_content_length`` preemptively and while reading.
.. versionchanged:: 0.9
This stream is now always available but might be consumed by the
form parser later on. Previously the stream was only set if no
parsing happened.
The stream is always set (but may be consumed) even if form parsing was
accessed first.
"""
if self.shallow:
raise RuntimeError(
"This request was created with 'shallow=True', reading"
" from the input stream is disabled."
)

return get_input_stream(self.environ)
return get_input_stream(
self.environ, max_content_length=self.max_content_length
)
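A hedged sketch of the per-read check described in the docstring above; the helper name ``read_body`` is hypothetical and not part of this change:

def read_body(request):
    # Hypothetical helper: consume request.stream in chunks. For a streaming
    # (chunked) request there is no Content-Length to check up front, so
    # stream.read() itself may raise RequestEntityTooLarge once the configured
    # max_content_length is exceeded partway through the body.
    received = 0
    while True:
        chunk = request.stream.read(64 * 1024)
        if not chunk:
            break
        received += len(chunk)
    return received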

input_stream = environ_property[t.IO[bytes]](
"wsgi.input",
doc="""The WSGI input stream.
doc="""The raw WSGI input stream, without any safety checks.
This is dangerous to use. It does not guard against infinite streams or reading
past :attr:`content_length` or :attr:`max_content_length`.
In general it's a bad idea to use this one because you can
easily read past the boundary. Use the :attr:`stream`
instead.""",
Use :attr:`stream` instead.
""",
)

@cached_property
def data(self) -> bytes:
"""
Contains the incoming request data as string in case it came with
a mimetype Werkzeug does not handle.
"""The raw data read from :attr:`stream`. Will be empty if the request
represents form data.
To get the raw data even if it represents form data, use :meth:`get_data`.
"""
return self.get_data(parse_form_data=True)
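A short illustration of the distinction drawn above, assuming a ``Request`` instance named ``request`` whose body is a form submission (hypothetical, not part of the diff):

# With a form content type, form parsing consumes the body: the parsed fields
# are available on request.form and request.data is empty.
fields = request.form
raw = request.data  # b"" for form data

# To get the raw bytes even for a form content type, call get_data() before
# anything triggers form parsing:
# raw = request.get_data(parse_form_data=False)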

