check max_content_length consistently (#2620)
davidism committed Mar 14, 2023
2 parents 8fe91b7 + 4f7048e commit 77c420b
Showing 10 changed files with 324 additions and 364 deletions.
8 changes: 8 additions & 0 deletions CHANGES.rst
@@ -64,6 +64,14 @@ Unreleased
multiple header values. However, accessing the property only returns the first
instance.

- If ``request.max_content_length`` is set, it is checked immediately when accessing
the stream, and while reading from the stream in general, rather than only during
form parsing. :issue:`1513`
- The development server, which must not be used in production, will exhaust the
request stream up to 10GB or 1000 reads. This allows clients to see a 413 error if
``max_content_length`` is exceeded, instead of a "connection reset" failure.
:pr:`2620`


Version 2.2.3
-------------
5 changes: 5 additions & 0 deletions docs/request_data.rst
@@ -94,6 +94,11 @@ and HTTPS servers should set their own limits on size and timeouts. The operating system
or container manager should set limits on memory and processing time for server
processes.

If a 413 Content Too Large error is returned before the entire request is read, clients
may show a "connection reset" failure instead of the 413 error. This depends on how the
WSGI/HTTP server and the client handle connections; it is not something the WSGI
application (Werkzeug) has control over.
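As a hedged illustration (not part of this change set), a minimal Werkzeug application might enforce a limit like this, assuming ``max_content_length`` can be set as a class attribute as in Werkzeug 2.x; the names ``LimitedRequest`` and ``app`` are hypothetical:

from werkzeug.exceptions import RequestEntityTooLarge
from werkzeug.wrappers import Request, Response

class LimitedRequest(Request):
    # Hypothetical subclass for illustration: cap request bodies at 1 MiB.
    max_content_length = 1024 * 1024

@LimitedRequest.application
def app(request):
    try:
        # Accessing the stream (directly, or via get_data/form) now enforces
        # max_content_length and raises RequestEntityTooLarge for oversized bodies.
        body = request.get_data()
    except RequestEntityTooLarge as e:
        return e  # HTTPException instances are valid WSGI responses (413)
    return Response(f"received {len(body)} bytes\n")

Running this with ``werkzeug.serving.run_simple`` shows the 413 response; as of this change the development server also drains the request, so clients are more likely to see that response instead of a reset connection.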


How to extend Parsing?
----------------------
102 changes: 14 additions & 88 deletions src/werkzeug/formparser.py
@@ -1,13 +1,12 @@
import typing as t
from functools import update_wrapper
from io import BytesIO
from typing import Union
from urllib.parse import parse_qsl

from . import exceptions
from .datastructures import FileStorage
from .datastructures import Headers
from .datastructures import MultiDict
from .exceptions import RequestEntityTooLarge
from .http import parse_options_header
from .sansio.multipart import Data
from .sansio.multipart import Epilogue
@@ -47,12 +46,6 @@ def __call__(
F = t.TypeVar("F", bound=t.Callable[..., t.Any])


def _exhaust(stream: t.IO[bytes]) -> None:
bts = stream.read(64 * 1024)
while bts:
bts = stream.read(64 * 1024)


def default_stream_factory(
total_content_length: t.Optional[int],
content_type: t.Optional[str],
@@ -130,27 +123,6 @@ def parse_form_data(
).parse_from_environ(environ)


def exhaust_stream(f: F) -> F:
"""Helper decorator for methods that exhausts the stream on return."""

def wrapper(self, stream, *args, **kwargs): # type: ignore
try:
return f(self, stream, *args, **kwargs)
finally:
exhaust = getattr(stream, "exhaust", None)

if exhaust is not None:
exhaust()
else:
while True:
chunk = stream.read(1024 * 64)

if not chunk:
break

return update_wrapper(t.cast(F, wrapper), f)


class FormDataParser:
"""This class implements parsing of form data for Werkzeug. By itself
it can parse multipart and url encoded form data. It can be subclassed
@@ -247,15 +219,6 @@ def parse(
the multipart boundary for instance)
:return: A tuple in the form ``(stream, form, files)``.
"""
if (
self.max_content_length is not None
and content_length is not None
and content_length > self.max_content_length
):
# if the input stream is not exhausted, firefox reports Connection Reset
_exhaust(stream)
raise exceptions.RequestEntityTooLarge()

if options is None:
options = {}

@@ -270,7 +233,6 @@ def parse(

return stream, self.cls(), self.cls()

@exhaust_stream
def _parse_multipart(
self,
stream: t.IO[bytes],
@@ -294,50 +256,6 @@ def _parse_multipart(
form, files = parser.parse(stream, boundary, content_length)
return stream, form, files

def _parse_urlencoded_stream(
self, stream: t.IO[bytes]
) -> t.Iterator[t.Tuple[str, str]]:
"""Read the stream in chunks and yield parsed ``key=value`` tuples. Data is
accumulated until at least one full field is available. This avoids reading the
whole stream into memory at once if possible, and reduces the number of calls to
``parse_qsl``.
"""
remaining_parts = self.max_form_parts
last_chunk = b""

while True:
chunks = [last_chunk]

while True:
chunk = stream.read(10_000)
chunk, has_sep, last_chunk = chunk.rpartition(b"&")
chunks.append(chunk)

if not chunk or has_sep:
break

data = b"".join(chunks)

if not data and not last_chunk:
break

try:
items = parse_qsl(
data.decode(self.charset),
keep_blank_values=True,
encoding=self.charset,
errors=self.errors,
max_num_fields=remaining_parts,
)
except ValueError as e:
raise exceptions.RequestEntityTooLarge() from e

if remaining_parts is not None:
remaining_parts -= len(items)

yield from items

@exhaust_stream
def _parse_urlencoded(
self,
stream: t.IO[bytes],
@@ -350,12 +268,20 @@ def _parse_urlencoded(
and content_length is not None
and content_length > self.max_form_memory_size
):
# if the input stream is not exhausted, firefox reports Connection Reset
_exhaust(stream)
raise exceptions.RequestEntityTooLarge()
raise RequestEntityTooLarge()

form = self.cls(self._parse_urlencoded_stream(stream))
return stream, form, self.cls()
try:
items = parse_qsl(
stream.read().decode(self.charset),
keep_blank_values=True,
encoding=self.charset,
errors=self.errors,
max_num_fields=self.max_form_parts,
)
except ValueError as e:
raise RequestEntityTooLarge() from e

return stream, self.cls(items), self.cls()

#: mapping of mimetypes to parsing functions
parse_functions: t.Dict[
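For context on the ``parse_qsl`` call in the new ``_parse_urlencoded`` above (not part of the diff): ``urllib.parse.parse_qsl`` raises ``ValueError`` when ``max_num_fields`` is exceeded, which is why the parser translates that error into ``RequestEntityTooLarge``. A minimal standalone sketch:

from urllib.parse import parse_qsl

# keep_blank_values=True keeps fields with empty values, matching the parser above.
print(parse_qsl("a=1&b=2&c=", keep_blank_values=True))
# [('a', '1'), ('b', '2'), ('c', '')]

try:
    parse_qsl("a=1&b=2&c=3", max_num_fields=2)
except ValueError:
    # FormDataParser._parse_urlencoded re-raises this as RequestEntityTooLarge,
    # which renders as a 413 response.
    print("too many form fields")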
16 changes: 5 additions & 11 deletions src/werkzeug/sansio/request.py
@@ -30,6 +30,7 @@
from ..utils import cached_property
from ..utils import header_property
from .http import parse_cookie
from .utils import get_content_length
from .utils import get_current_url
from .utils import get_host

@@ -274,17 +275,10 @@ def content_length(self) -> t.Optional[int]:
the entity-body that would have been sent had the request been a
GET.
"""
if self.headers.get("Transfer-Encoding", "") == "chunked":
return None

content_length = self.headers.get("Content-Length")
if content_length is not None:
try:
return max(0, int(content_length))
except (ValueError, TypeError):
pass

return None
return get_content_length(
http_content_length=self.headers.get("Content-Length"),
http_transfer_encoding=self.headers.get("Transfer-Encoding"),
)

content_encoding = header_property[str](
"Content-Encoding",
19 changes: 9 additions & 10 deletions src/werkzeug/sansio/utils.py
@@ -146,22 +146,21 @@ def get_current_url(

def get_content_length(
http_content_length: t.Union[str, None] = None,
http_transfer_encoding: t.Union[str, None] = "",
http_transfer_encoding: t.Union[str, None] = None,
) -> t.Optional[int]:
"""Returns the content length as an integer or ``None`` if
unavailable or chunked transfer encoding is used.
"""Return the ``Content-Length`` header value as an int. If the header is not given
or the ``Transfer-Encoding`` header is ``chunked``, ``None`` is returned to indicate
a streaming request. If the value is not an integer, or negative, 0 is returned.
:param http_content_length: The Content-Length HTTP header.
:param http_transfer_encoding: The Transfer-Encoding HTTP header.
.. versionadded:: 2.2
"""
if http_transfer_encoding == "chunked":
if http_transfer_encoding == "chunked" or http_content_length is None:
return None

if http_content_length is not None:
try:
return max(0, int(http_content_length))
except (ValueError, TypeError):
pass
return None
try:
return max(0, int(http_content_length))
except ValueError:
return 0
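A brief usage sketch of the revised helper, assuming the Werkzeug 2.3 development code shown above:

from werkzeug.sansio.utils import get_content_length

# A valid Content-Length header is returned as an int.
assert get_content_length(http_content_length="1024") == 1024
# Chunked transfer encoding or a missing header signals a streaming request.
assert get_content_length(http_content_length="1024", http_transfer_encoding="chunked") is None
assert get_content_length() is None
# Malformed or negative values fall back to 0.
assert get_content_length(http_content_length="abc") == 0
assert get_content_length(http_content_length="-5") == 0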
27 changes: 27 additions & 0 deletions src/werkzeug/serving.py
@@ -14,6 +14,7 @@
import errno
import io
import os
import selectors
import socket
import socketserver
import sys
@@ -326,6 +327,32 @@ def execute(app: "WSGIApplication") -> None:
if chunk_response:
self.wfile.write(b"0\r\n\r\n")
finally:
# Check for any remaining data in the read socket, and discard it. This
# will read past request.max_content_length, but lets the client see a
# 413 response instead of a connection reset failure. If we supported
# keep-alive connections, this naive approach would break by reading the
# next request line. Since we know that write (above) closes every
# connection we can read everything.
selector = selectors.DefaultSelector()
selector.register(self.connection, selectors.EVENT_READ)
total_size = 0
total_reads = 0

# A timeout of 0 tends to fail because a client needs a small amount of
# time to continue sending its data.
while selector.select(timeout=0.01):
# Only read 10MB into memory at a time.
data = self.rfile.read(10_000_000)
total_size += len(data)
total_reads += 1

# Stop reading on no data, >=10GB, or 1000 reads. If a client sends
# more than that, they'll get a connection reset failure.
if not data or total_size >= 10_000_000_000 or total_reads > 1000:
break

selector.close()

if hasattr(application_iter, "close"):
application_iter.close()

60 changes: 38 additions & 22 deletions src/werkzeug/wrappers/request.py
@@ -11,15 +11,15 @@
from ..datastructures import ImmutableMultiDict
from ..datastructures import iter_multi_items
from ..datastructures import MultiDict
from ..exceptions import BadRequest
from ..exceptions import UnsupportedMediaType
from ..formparser import default_stream_factory
from ..formparser import FormDataParser
from ..sansio.request import Request as _SansIORequest
from ..utils import cached_property
from ..utils import environ_property
from ..wsgi import _get_server
from ..wsgi import get_input_stream
from werkzeug.exceptions import BadRequest
from werkzeug.exceptions import UnsupportedMediaType

if t.TYPE_CHECKING:
import typing_extensions as te
@@ -323,44 +323,60 @@ def __exit__(self, exc_type, exc_value, tb) -> None: # type: ignore

@cached_property
def stream(self) -> t.IO[bytes]:
"""
If the incoming form data was not encoded with a known mimetype
the data is stored unmodified in this stream for consumption. Most
of the time it is a better idea to use :attr:`data` which will give
you that data as a string. The stream only returns the data once.
"""The WSGI input stream, with safety checks. This stream can only be consumed
once.
Use :meth:`get_data` to get the full data as bytes or text. The :attr:`data`
attribute will contain the full bytes only if they do not represent form data.
The :attr:`form` attribute will contain the parsed form data in that case.
Unlike :attr:`input_stream`, this stream guards against infinite streams or
reading past :attr:`content_length` or :attr:`max_content_length`.
If :attr:`max_content_length` is set and the request has a
:attr:`content_length` (is not a streaming request), this will raise
:exc:`.RequestEntityTooLarge` if the max length is exceeded. Otherwise, the
limit will be checked during reads.
If the limit is reached before the underlying stream is exhausted (such as a
file that is too large, or an infinite stream), the remaining contents of the
stream cannot be read safely. Depending on how the server handles this, clients
may show a "connection reset" failure instead of seeing the 413 response.
Unlike :attr:`input_stream` this stream is properly guarded that you
can't accidentally read past the length of the input. Werkzeug will
internally always refer to this stream to read data which makes it
possible to wrap this object with a stream that does filtering.
.. versionchanged:: 2.3
Check ``max_content_length`` preemptively and while reading.
.. versionchanged:: 0.9
This stream is now always available but might be consumed by the
form parser later on. Previously the stream was only set if no
parsing happened.
The stream is always set (but may be consumed) even if form parsing was
accessed first.
"""
if self.shallow:
raise RuntimeError(
"This request was created with 'shallow=True', reading"
" from the input stream is disabled."
)

return get_input_stream(self.environ)
return get_input_stream(
self.environ, max_content_length=self.max_content_length
)
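A hedged sketch of the per-read check described in the docstring above; the helper name ``read_body`` is hypothetical and not part of this change:

def read_body(request):
    # Hypothetical helper: consume request.stream in chunks. For a streaming
    # (chunked) request there is no Content-Length to check up front, so
    # stream.read() itself may raise RequestEntityTooLarge once the configured
    # max_content_length is exceeded partway through the body.
    received = 0
    while True:
        chunk = request.stream.read(64 * 1024)
        if not chunk:
            break
        received += len(chunk)
    return received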

input_stream = environ_property[t.IO[bytes]](
"wsgi.input",
doc="""The WSGI input stream.
doc="""The raw WSGI input stream, without any safety checks.
This is dangerous to use. It does not guard against infinite streams or reading
past :attr:`content_length` or :attr:`max_content_length`.
In general it's a bad idea to use this one because you can
easily read past the boundary. Use the :attr:`stream`
instead.""",
Use :attr:`stream` instead.
""",
)

@cached_property
def data(self) -> bytes:
"""
Contains the incoming request data as string in case it came with
a mimetype Werkzeug does not handle.
"""The raw data read from :attr:`stream`. Will be empty if the request
represents form data.
To get the raw data even if it represents form data, use :meth:`get_data`.
"""
return self.get_data(parse_form_data=True)
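A short illustration of the distinction drawn above, assuming a ``Request`` instance named ``request`` whose body is a form submission (hypothetical, not part of the diff):

# With a form content type, form parsing consumes the body: the parsed fields
# are available on request.form and request.data is empty.
fields = request.form
raw = request.data  # b"" for form data

# To get the raw bytes even for a form content type, call get_data() before
# anything triggers form parsing:
# raw = request.get_data(parse_form_data=False)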

