From e3b1d0fa6ddab5d2222a48ee2ba641408aef3477 Mon Sep 17 00:00:00 2001 From: Marcel Hellkamp Date: Wed, 3 Jan 2024 21:29:41 +0100 Subject: [PATCH] First working version of the new multipart parser. The new parser is used for all python versions, which may break backwards compatibility for certain edge cases and error scenarios. For a 'good' request there should be no difference, though. What changed: * Lax newline parsing (accepting \r or \n in place of \r\n) is a security risk and no longer implemented. Most modern browsers and client libraries follow the spec and should not have any issues producing valid multipart data. * Parsing errors are no longer silently ignored, but trigger an appropiate 400 error. --- bottle.py | 198 +++++++++++++------------------ test/test_multipart.py | 259 +++++++++++++++++++++++++++++++++++++++++ test/tools.py | 16 +-- 3 files changed, 348 insertions(+), 125 deletions(-) create mode 100644 test/test_multipart.py diff --git a/bottle.py b/bottle.py index d5feb5160..6f94fd058 100755 --- a/bottle.py +++ b/bottle.py @@ -1390,39 +1390,21 @@ def POST(self): post[key] = value return post - if py >= (3,11,0): - post.recode_unicode = False - charset = options.get("charset", "utf8") - boundary = options.get("boundary", "") - if not boundary: - raise MultipartError("Invalid content type header, missing boundary") - parser = _MultipartParser(self.body, boundary, self.content_length, - mem_limit=self.MEMFILE_MAX, memfile_limit=self.MEMFILE_MAX, - charset=charset) - - for part in parser: - if not part.filename and part.is_buffered(): - post[part.name] = part.value - else: - post[part.name] = FileUpload(part.file, part.name, - part.filename, part.headerlist) - - else: - safe_env = {'QUERY_STRING': ''} # Build a safe environment for cgi - for key in ('REQUEST_METHOD', 'CONTENT_TYPE', 'CONTENT_LENGTH'): - if key in self.environ: safe_env[key] = self.environ[key] - args = dict(fp=self.body, environ=safe_env, keep_blank_values=True) - if py3k: - args['encoding'] = 'utf8' - post.recode_unicode = False - data = cgi.FieldStorage(**args) - self['_cgi.FieldStorage'] = data #http://bugs.python.org/issue18394 - for item in data.list or []: - if item.filename is None: - post[item.name] = item.value - else: - post[item.name] = FileUpload(item.file, item.name, - item.filename, item.headers) + post.recode_unicode = False + charset = options.get("charset", "utf8") + boundary = options.get("boundary") + if not boundary: + raise MultipartError("Invalid content type header, missing boundary") + parser = _MultipartParser(self.body, boundary, self.content_length, + mem_limit=self.MEMFILE_MAX, memfile_limit=self.MEMFILE_MAX, + charset=charset) + + for part in parser.parse(): + if not part.filename and part.is_buffered(): + post[part.name] = tonat(part.value, 'utf8') + else: + post[part.name] = FileUpload(part.file, part.name, + part.filename, part.headerlist) return post @@ -3036,7 +3018,7 @@ def _parse_http_header(h): values.append((parts[0].strip(), {})) for attr in parts[1:]: name, value = attr.split('=', 1) - values[-1][1][name.strip()] = value.strip() + values[-1][1][name.strip().lower()] = value.strip() else: lop, key, attrs = ',', None, {} for quoted, plain, tok in _hsplit(h): @@ -3048,9 +3030,9 @@ def _parse_http_header(h): if tok == '=': key = value else: - attrs[value] = '' + attrs[value.strip().lower()] = '' elif lop == '=' and key: - attrs[key] = value + attrs[key.strip().lower()] = value key = None lop = tok return values @@ -3240,13 +3222,6 @@ def __init__( buffer_size=2 ** 16, charset="latin1", ): - """ Parse a multipart/form-data byte stream. This object is an iterator - over the parts of the message. - - :param stream: A file-like stream. Must implement ``.read(size)``. - :param boundary: The multipart boundary as a byte string. - :param content_length: The maximum number of bytes to read. - """ self.stream = stream self.boundary = boundary self.content_length = content_length @@ -3256,70 +3231,57 @@ def __init__( self.buffer_size = min(buffer_size, self.mem_limit) self.charset = charset + if not boundary: + raise MultipartError("No boundary.") + if self.buffer_size - 6 < len(boundary): # "--boundary--\r\n" raise MultipartError("Boundary does not fit into buffer_size.") - self._done = [] - self._part_iter = None - - def __iter__(self): - """ Iterate over the parts of the multipart message. """ - if not self._part_iter: - self._part_iter = self._iterparse() - - for part in self._done: - yield part - - for part in self._part_iter: - self._done.append(part) - yield part - def _lineiter(self): - """ Iterate over a binary file-like object line by line. Each line is - returned as a (line, line_ending) tuple. If the line does not fit - into self.buffer_size, line_ending is empty and the rest of the line - is returned with the next iteration. + """ Iterate over a binary file-like object (crlf terminated) line by + line. Each line is returned as a (line, crlf) tuple. Lines larger + than buffer_size are split into chunks where all but the last chunk + has an empty string instead of crlf. Maximum chunk size is twice the + buffer size. """ + read = self.stream.read maxread, maxbuf = self.content_length, self.buffer_size - buffer = b"" # buffer for the last (partial) line + partial = b"" # Contains the last (partial) line while True: - data = read(maxbuf if maxread < 0 else min(maxbuf, maxread)) - maxread -= len(data) - lines = (buffer + data).splitlines(True) - len_first_line = len(lines[0]) - - # be sure that the first line does not become too big - if len_first_line > self.buffer_size: - # at the same time don't split a '\r\n' accidentally - if len_first_line == self.buffer_size + 1 and lines[0].endswith(b"\r\n"): - splitpos = self.buffer_size - 1 - else: - splitpos = self.buffer_size - lines[:1] = [lines[0][:splitpos], lines[0][splitpos:]] - - if data: - buffer = lines[-1] - lines = lines[:-1] - - for line in lines: - if line.endswith(b"\r\n"): - yield line[:-2], b"\r\n" - elif line.endswith(b"\n"): - yield line[:-1], b"\n" - elif line.endswith(b"\r"): - yield line[:-1], b"\r" - else: - yield line, b"" - - if not data: + chunk = read(maxbuf if maxread < 0 else min(maxbuf, maxread)) + maxread -= len(chunk) + if not chunk: + if partial: + yield partial, b'' break - def _iterparse(self): + if partial: + chunk = partial + chunk + + scanpos = 0 + while True: + i = chunk.find(b'\r\n', scanpos) + if i >= 0: + yield chunk[scanpos:i], b'\r\n' + scanpos = i + 2 + else: # CRLF not found + partial = chunk[scanpos:] if scanpos else chunk + break + + if len(partial) > maxbuf: + yield partial[:-1], b"" + partial = partial[-1:] + + def parse(self): + """ Return a MultiPart iterator. Can only be called once. """ + lines, line = self._lineiter(), "" separator = b"--" + tob(self.boundary) - terminator = b"--" + tob(self.boundary) + b"--" + terminator = separator + b"--" + mem_used, disk_used = 0, 0 # Track used resources to prevent DoS + is_tail = False # True if the last line was incomplete (cutted) # Consume first boundary. Ignore any preamble, as required by RFC # 2046, section 5.1.1. @@ -3329,46 +3291,34 @@ def _iterparse(self): else: raise MultipartError("Stream does not contain boundary") - # Check for empty data + # First line is termainating boundary -> empty multipart stream if line == terminator: for _ in lines: - raise MultipartError("Data after end of stream") + raise MultipartError("Found data after empty multipart stream") return - # For each part in stream... - mem_used, disk_used = 0, 0 # Track used resources to prevent DoS - is_tail = False # True if the last line was incomplete (cutted) - - opts = { + part_options = { "buffer_size": self.buffer_size, "memfile_limit": self.memfile_limit, "charset": self.charset, } - - part = _MultipartPart(**opts) + part = _MultipartPart(**part_options) for line, nl in lines: - if line == terminator and not is_tail: - part.file.seek(0) - yield part - break - - elif line == separator and not is_tail: + if not is_tail and (line == separator or line == terminator): + part.finish() if part.is_buffered(): mem_used += part.size else: disk_used += part.size - part.file.seek(0) - yield part - - part = _MultipartPart(**opts) - + if line == terminator: + break + part = _MultipartPart(**part_options) else: is_tail = not nl # The next line continues this one try: part.feed(line, nl) - if part.is_buffered(): if part.size + mem_used > self.mem_limit: raise MultipartError("Memory limit reached.") @@ -3436,7 +3386,13 @@ def write_body(self, line, nl): if self.size > self.memfile_limit and isinstance(self.file, BytesIO): self.file, old = NamedTemporaryFile(mode="w+b"), self.file old.seek(0) - copy_file(old, self.file, self.size, self.buffer_size) + + copied, maxcopy, chunksize = 0, self.size, self.buffer_size + read, write = old.read, self.file.write + while copied < maxcopy: + chunk = read(min(chunksize, maxcopy - copied)) + write(chunk) + copied += len(chunk) def finish_header(self): self.file = BytesIO() @@ -3458,6 +3414,11 @@ def finish_header(self): self.charset = options.get("charset") or self.charset self.content_length = int(self.headers.get("Content-Length", "-1")) + def finish(self): + if not self.file: + raise MultipartError("Incomplete part: Header section not closed.") + self.file.seek(0) + def is_buffered(self): """ Return true if the data is fully buffered in memory.""" return isinstance(self.file, BytesIO) @@ -3479,7 +3440,10 @@ def raw(self): finally: self.file.seek(pos) - + def close(self): + if self.file: + self.file.close() + self.file = False ############################################################################### # Server Adapter ############################################################### diff --git a/test/test_multipart.py b/test/test_multipart.py new file mode 100644 index 000000000..bef3794ef --- /dev/null +++ b/test/test_multipart.py @@ -0,0 +1,259 @@ +# -*- coding: utf-8 -*- +import unittest +import base64 +import sys, os.path, tempfile +from io import BytesIO + +import bottle + +class BaseMultipartTest(unittest.TestCase): + def setUp(self): + # if bottle.py < (3,11,0): + # self.skipTest("Not used") + self.data = BytesIO() + self.parts = None + + def write(self, *lines): + for line in lines: + self.data.write(bottle.tob(line)) + + def parse(self, ctype=None, clen=-1, **kwargs): + self.data.seek(0) + h = bottle._parse_http_header(ctype or "multipart/form-data; boundary=foo") + charset = h[0][1].get("charset", "utf8") + boundary = h[0][1].get("boundary") + parser = bottle._MultipartParser(self.data, boundary, clen, **kwargs) + return list(parser.parse()) + + def assertFile(self, name, filename, ctype, data): + for part in self.parts: + if part.name != name: continue + self.assertEqual(part.filename, expected[0]) + self.assertEqual(part.content_type, expected[1]) + self.assertEqual(part.file.read(), bottle.tob(expected[2])) + break + else: + self.fail("Field %s not found" % name) + + def assertForm(self, name, data): + for part in self.parts: + if part.name != name: continue + self.assertEqual(part.filename, None) + self.assertEqual(part.content_type, None) + self.assertEqual(part.value, data) + break + else: + self.fail("Field %s not found" % name) + + +class TestHeaderParser(BaseMultipartTest): + + def test_options_parser(self): + parse = bottle._parse_http_header + self.assertEqual( + parse('form-data; name="Test"; filename="Test.txt"'), + [('form-data', {"name": "Test", "filename": "Test.txt"})]) + self.assertEqual(parse('form-data; name="Test"; FileName="Te\\"st.txt"'), + [('form-data', {"name": "Test", "filename": "Te\"st.txt"})]) + self.assertEqual(parse('form-data; name="Test"; filename="C:\\test\\bla.txt"'), + [('form-data', {"name": "Test", "filename": "C:\\test\\bla.txt"})]) + self.assertEqual(parse('form-data; name="Test"; filename="\\\\test\\bla.txt"'), + [('form-data', {"name": "Test", "filename": "\\\\test\\bla.txt"})]) + + +class TestMultipartParser(BaseMultipartTest): + + def assertIterline(self, data, *expected, **options): + self.assertEqual( + list(bottle._MultipartParser(BytesIO(bottle.tob(data)), 'foo', **options)._lineiter()), + [(bottle.tob(l), bottle.tob(nl)) for l,nl in expected]) + + def test_iterlines(self): + self.assertIterline('abc\ndef\r\nghi', ('abc\ndef','\r\n'), ('ghi', '')) + + def test_iterlines_limit(self): + self.assertIterline('abc\ndef\r\nghi', ('abc\ndef','\r\n'), ('g', ''), content_length=10) + self.assertIterline('abc\ndef\r\nghi', ('abc\ndef\r',''), content_length=8) + + def test_fuzzy_lineiter(self): + """ Test all possible buffer sizes """ + minbuflen = 9 # boundary size of '--foo--\r\n' + data = b'data\rdata\ndata\r\ndata\n\rdata\r\n'.replace(b'data', b'X'*minbuflen*2) + lines = data.split(b"\r\n")[:-1] + for tail in (b"", b"tail"): + for buffer_size in range(minbuflen, len(data+tail)+1): + splits = list(bottle._MultipartParser( + BytesIO(data+tail), 'foo', + buffer_size=buffer_size)._lineiter()) + partial = b"" + merged = [] + for part, nl in splits: + self.assertTrue(nl in (b"", b"\r\n")) + self.assertTrue(len(part) >= buffer_size or nl or part == tail) + partial += part + if nl: + merged.append(partial) + partial = b"" + self.assertEqual(merged, lines) + self.assertEqual(tail, partial) + + def test_big_file(self): + ''' If the size of an uploaded part exceeds memfile_limit, + it is written to disk. ''' + test_file = 'abc'*1024 + boundary = '---------------------------186454651713519341951581030105' + request = BytesIO(bottle.tob('\r\n').join(map(bottle.tob,[ + '--' + boundary, + 'Content-Disposition: form-data; name="file1"; filename="random.png"', + 'Content-Type: image/png', '', test_file, '--' + boundary, + 'Content-Disposition: form-data; name="file2"; filename="random.png"', + 'Content-Type: image/png', '', test_file + 'a', '--' + boundary, + 'Content-Disposition: form-data; name="file3"; filename="random.png"', + 'Content-Type: image/png', '', test_file*2, '--'+boundary+'--','']))) + parts = list(bottle._MultipartParser(request, boundary, memfile_limit=len(test_file)).parse()) + p = {p.name: p for p in parts} + try: + self.assertEqual(p.get('file1').file.read(), bottle.tob(test_file)) + self.assertTrue(p.get('file1').is_buffered()) + self.assertEqual(p.get('file2').file.read(), bottle.tob(test_file + 'a')) + self.assertFalse(p.get('file2').is_buffered()) + self.assertEqual(p.get('file3').file.read(), bottle.tob(test_file*2)) + self.assertFalse(p.get('file3').is_buffered()) + finally: + for part in parts: + part.close() + + def test_file_seek(self): + ''' The file object should be readable withoud a seek(0). ''' + test_file = 'abc'*1024 + boundary = '---------------------------186454651713519341951581030105' + request = BytesIO(bottle.tob('\r\n').join(map(bottle.tob,[ + '--' + boundary, + 'Content-Disposition: form-data; name="file1"; filename="random.png"', + 'Content-Type: image/png', '', test_file, '--' + boundary + '--','']))) + p = list(bottle._MultipartParser(request, boundary).parse()) + self.assertEqual(p[0].file.read(), bottle.tob(test_file)) + self.assertEqual(p[0].value, test_file) + + def test_unicode_value(self): + ''' The .value property always returns unicode ''' + test_file = 'abc'*1024 + boundary = '---------------------------186454651713519341951581030105' + request = BytesIO(bottle.tob('\r\n').join(map(bottle.tob,[ + '--' + boundary, + 'Content-Disposition: form-data; name="file1"; filename="random.png"', + 'Content-Type: image/png', '', test_file, '--' + boundary + '--','']))) + p = list(bottle._MultipartParser(request, boundary).parse()) + self.assertEqual(p[0].file.read(), bottle.tob(test_file)) + self.assertEqual(p[0].value, test_file) + self.assertTrue(hasattr(p[0].value, 'encode')) + + def test_multiline_header(self): + ''' HTTP allows headers to be multiline. ''' + test_file = bottle.tob('abc'*1024) + test_text = u'Test text\n with\r\n ümläuts!' + boundary = '---------------------------186454651713519341951581030105' + request = BytesIO(bottle.tob('\r\n').join(map(bottle.tob,[ + '--' + boundary, + 'Content-Disposition: form-data;', + '\tname="file1"; filename="random.png"', + 'Content-Type: image/png', '', test_file, '--' + boundary, + 'Content-Disposition: form-data;', + ' name="text"', '', test_text, + '--' + boundary + '--','']))) + p = list(bottle._MultipartParser(request, boundary, charset='utf8').parse()) + self.assertEqual(p[0].name, "file1") + self.assertEqual(p[0].file.read(), test_file) + self.assertEqual(p[0].filename, 'random.png') + self.assertEqual(p[1].name, "text") + self.assertEqual(p[1].value, test_text) + + +class TestBrokenMultipart(BaseMultipartTest): + + def assertMPError(self, **ka): + self.assertRaises(bottle.MultipartError, self.parse, **ka) + + def test_big_boundary(self): + self.assertMPError(buffer_size=1024*3) + + def test_missing_content_type(self): + self.assertMPError(ctype="") + + def test_unsupported_content_type(self): + self.assertMPError(ctype='multipart/fantasy') + + def test_missing_boundary(self): + self.assertMPError(ctype="multipart/form-data") + + def test_no_terminator(self): + self.write('--foo\r\n', + 'Content-Disposition: form-data; name="file1"; filename="random.png"\r\n', + 'Content-Type: image/png\r\n', '\r\n', 'abc') + self.assertMPError() + + def test_no_newline_after_content(self): + self.write('--foo\r\n', + 'Content-Disposition: form-data; name="file1"; filename="random.png"\r\n', + 'Content-Type: image/png\r\n', '\r\n', 'abc', '--foo--') + self.assertMPError() + + def test_no_newline_after_middle_content(self): + self.write('--foo\r\n', + 'Content-Disposition: form-data; name="file1"; filename="random.png"\r\n', + 'Content-Type: image/png\r\n', '\r\n', 'abc', '--foo\r\n' + 'Content-Disposition: form-data; name="file2"; filename="random.png"\r\n', + 'Content-Type: image/png\r\n', '\r\n', 'abc\r\n', '--foo--') + parts = self.parse() + self.assertEqual(len(parts), 1) + self.assertTrue('name="file2"' in parts[0].value) + + def test_preamble_before_start_boundary(self): + parts = self.write('Preamble\r\n', '--foo\r\n' + 'Content-Disposition: form-data; name="file1"; filename="random.png"\r\n', + 'Content-Type: image/png\r\n', '\r\n', 'abc\r\n', '--foo--') + parts = self.parse() + self.assertEqual(parts[0].file.read(), bottle.tob('abc')) + self.assertEqual(parts[0].filename, 'random.png') + self.assertEqual(parts[0].name, 'file1') + self.assertEqual(parts[0].content_type, 'image/png') + + def test_no_start_boundary(self): + self.write('--bar\r\n','--nonsense\r\n' + 'Content-Disposition: form-data; name="file1"; filename="random.png"\r\n', + 'Content-Type: image/png\r\n', '\r\n', 'abc\r\n', '--nonsense--') + self.assertMPError() + + def test_disk_limit(self): + self.write('--foo\r\n', + 'Content-Disposition: form-data; name="file1"; filename="random.png"\r\n', + 'Content-Type: image/png\r\n', '\r\n', 'abc'*1024+'\r\n', '--foo--') + self.assertMPError(memfile_limit=0, disk_limit=1024) + + def test_mem_limit(self): + self.write('--foo\r\n', + 'Content-Disposition: form-data; name="file1"; filename="random.png"\r\n', + 'Content-Type: image/png\r\n', '\r\n', 'abc'*1024+'\r\n', '--foo\r\n', + 'Content-Disposition: form-data; name="file2"; filename="random.png"\r\n', + 'Content-Type: image/png\r\n', '\r\n', 'abc'*1024+'\r\n', '--foo--') + self.assertMPError(mem_limit=1024*3) + + def test_invalid_header(self): + self.write('--foo\r\n', + 'Content-Disposition: form-data; name="file1"; filename="random.png"\r\n', + 'Content-Type: image/png\r\n', + 'Bad header\r\n', '\r\n', 'abc'*1024+'\r\n', '--foo--') + self.assertMPError() + + def test_content_length_to_small(self): + self.write('--foo\r\n', + 'Content-Disposition: form-data; name="file1"; filename="random.png"\r\n', + 'Content-Type: image/png\r\n', + 'Content-Length: 111\r\n', '\r\n', 'abc'*1024+'\r\n', '--foo--') + self.assertMPError() + + def test_no_disposition_header(self): + self.write('--foo\r\n', + 'Content-Type: image/png\r\n', '\r\n', 'abc'*1024+'\r\n', '--foo--') + self.assertMPError() + diff --git a/test/tools.py b/test/tools.py index a66315c1b..ed0319cf7 100755 --- a/test/tools.py +++ b/test/tools.py @@ -166,17 +166,17 @@ def multipart_environ(fields, files): boundary = '--' + boundary body = '' for name, value in fields: - body += boundary + '\n' - body += 'Content-Disposition: form-data; name="%s"\n\n' % name - body += value + '\n' + body += boundary + '\r\n' + body += 'Content-Disposition: form-data; name="%s"\r\n\r\n' % name + body += value + '\r\n' for name, filename, content in files: mimetype = str(mimetypes.guess_type(filename)[0]) or 'application/octet-stream' - body += boundary + '\n' - body += 'Content-Disposition: file; name="%s"; filename="%s"\n' % \ + body += boundary + '\r\n' + body += 'Content-Disposition: file; name="%s"; filename="%s"\r\n' % \ (name, filename) - body += 'Content-Type: %s\n\n' % mimetype - body += content + '\n' - body += boundary + '--\n' + body += 'Content-Type: %s\r\n\r\n' % mimetype + body += content + '\r\n' + body += boundary + '--\r\n' if isinstance(body, unicode): body = body.encode('utf8') env['CONTENT_LENGTH'] = str(len(body))