Skip to content

Commit

Permalink
Fixed MemoryError when decoding large definite strings (#204)
Browse files Browse the repository at this point in the history
Also fixed a return value check in `CBORTag_hash()`.
  • Loading branch information
agronholm committed Jan 14, 2024
1 parent 0d54000 commit 387755e
Show file tree
Hide file tree
Showing 6 changed files with 263 additions and 47 deletions.
29 changes: 27 additions & 2 deletions cbor2/_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
import struct
import sys
from codecs import getincrementaldecoder
from collections.abc import Callable, Mapping, Sequence
from datetime import date, datetime, timedelta, timezone
from io import BytesIO
Expand Down Expand Up @@ -31,6 +32,7 @@
timestamp_re = re.compile(
r"^(\d{4})-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)" r"(?:\.(\d{1,6})\d*)?(?:Z|([+-])(\d\d):(\d\d))$"
)
incremental_utf8_decoder = getincrementaldecoder("utf-8")


class CBORDecoder:
Expand Down Expand Up @@ -305,8 +307,19 @@ def decode_bytestring(self, subtype: int) -> bytes:
else:
if length > sys.maxsize:
raise CBORDecodeValueError("invalid length for bytestring 0x%x" % length)
elif length <= 65536:
result = self.read(length)
else:
# Read large bytestrings 65536 (2 ** 16) bytes at a time
left = length
buffer = bytearray()
while left:
chunk_size = min(left, 65536)
buffer.extend(self.read(chunk_size))
left -= chunk_size

result = bytes(buffer)

result = self.read(length)
self._stringref_namespace_add(result, length)

return self.set_shareable(result)
Expand Down Expand Up @@ -350,7 +363,19 @@ def decode_string(self, subtype: int) -> str:
if length > sys.maxsize:
raise CBORDecodeValueError("invalid length for string 0x%x" % length)

result = self.read(length).decode("utf-8", self._str_errors)
if length <= 65536:
result = self.read(length).decode("utf-8", self._str_errors)
else:
# Read and decode large text strings 65536 (2 ** 16) bytes at a time
codec = incremental_utf8_decoder(self._str_errors)
left = length
result = ""
while left:
chunk_size = min(left, 65536)
final = left <= chunk_size
result += codec.decode(self.read(chunk_size), final)
left -= chunk_size

self._stringref_namespace_add(result, length)

return self.set_shareable(result)
Expand Down
4 changes: 3 additions & 1 deletion docs/versionhistory.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,16 @@ This library adheres to `Semantic Versioning <http://semver.org/>`_.
**UNRELEASED**

- Added the ``cbor2`` command line tool (for ``pipx run cbor2``)
- Added support for native date encoding (bschoenmaeckers)
- Fixed ``SystemError`` in the C extension when decoding a ``Fractional`` with a bad
number of arguments
- Fixed ``SystemError`` in the C extension when the decoder object hook raises an
exception
- Fixed a segmentation fault when decoding invalid unicode data
- Fixed infinite recursion when trying to hash a CBOR tag whose value points to the tag
itself
- Added support for native date encoding (bschoenmaeckers)
- Fixed ``MemoryError`` when maliciously constructed bytestrings or string (declared to be absurdly
large) are being decoded

**5.5.1** (2023-11-02)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def check_libc():
"source/tags.c",
"source/halffloat.c",
],
optional=True,
# optional=True,
)
kwargs = {"ext_modules": [_cbor2]}
else:
Expand Down
232 changes: 195 additions & 37 deletions source/decoder.c
Original file line number Diff line number Diff line change
Expand Up @@ -348,31 +348,44 @@ _CBORDecoder_get_immutable(CBORDecoderObject *self, void *closure)

// Utility functions /////////////////////////////////////////////////////////

static int
fp_read(CBORDecoderObject *self, char *buf, const Py_ssize_t size)
static PyObject *
fp_read_object(CBORDecoderObject *self, const Py_ssize_t size)
{
PyObject *ret = NULL;
PyObject *obj, *size_obj;
char *data;
int ret = -1;

size_obj = PyLong_FromSsize_t(size);
if (size_obj) {
obj = PyObject_CallFunctionObjArgs(self->read, size_obj, NULL);
Py_DECREF(size_obj);
if (obj) {
assert(PyBytes_CheckExact(obj));
if (PyBytes_GET_SIZE(obj) == (Py_ssize_t) size) {
data = PyBytes_AS_STRING(obj);
memcpy(buf, data, size);
ret = 0;
ret = obj;
} else {
Py_DECREF(obj);
PyErr_Format(
_CBOR2_CBORDecodeEOF,
"premature end of stream (expected to read %zd bytes, "
"got %zd instead)", size, PyBytes_GET_SIZE(obj));
}
Py_DECREF(obj);
}
Py_DECREF(size_obj);
}
return ret;
}


static int
fp_read(CBORDecoderObject *self, char *buf, const Py_ssize_t size)
{
int ret = -1;
PyObject *obj = fp_read_object(self, size);
if (obj) {
char *data = PyBytes_AS_STRING(obj);
if (data) {
memcpy(buf, data, size);
ret = 0;
}
Py_DECREF(obj);
}
return ret;
}
Expand Down Expand Up @@ -538,17 +551,12 @@ decode_negint(CBORDecoderObject *self, uint8_t subtype)


static PyObject *
decode_definite_bytestring(CBORDecoderObject *self, Py_ssize_t length)
decode_definite_short_bytestring(CBORDecoderObject *self, Py_ssize_t length)
{
PyObject *ret = NULL;

ret = PyBytes_FromStringAndSize(NULL, length);
PyObject *ret = fp_read_object(self, length);
if (!ret)
return NULL;
if (fp_read(self, PyBytes_AS_STRING(ret), length) == -1) {
Py_DECREF(ret);
return NULL;
}

if (string_namespace_add(self, ret, length) == -1) {
Py_DECREF(ret);
return NULL;
Expand All @@ -557,6 +565,56 @@ decode_definite_bytestring(CBORDecoderObject *self, Py_ssize_t length)
}


static PyObject *
decode_definite_long_bytestring(CBORDecoderObject *self, Py_ssize_t length)
{
PyObject *buffer = NULL;
Py_ssize_t left = length;
while (left) {
Py_ssize_t chunk_length = length <= 65536 ? length : 65536;
PyObject *chunk = fp_read_object(self, chunk_length);
if (!chunk) {
break;
}

if (!PyBytes_CheckExact(chunk)) {
Py_DECREF(chunk);
break;
}

if (buffer) {
PyObject *new_buffer = PyByteArray_Concat(buffer, chunk);
Py_DECREF(chunk);
if (!new_buffer)
break;

if (new_buffer != buffer) {
Py_DECREF(buffer);
buffer = new_buffer;
}
} else {
buffer = PyByteArray_FromObject(chunk);
Py_DECREF(chunk);
if (!buffer)
break;
}
left -= chunk_length;
}

PyObject *ret = NULL;
if (buffer) {
ret = PyBytes_FromObject(buffer);
Py_DECREF(buffer);

if (ret && string_namespace_add(self, ret, length) == -1) {
Py_DECREF(ret);
ret = NULL;
}
}
return ret;
}


static PyObject *
decode_indefinite_bytestrings(CBORDecoderObject *self)
{
Expand Down Expand Up @@ -615,9 +673,14 @@ decode_bytestring(CBORDecoderObject *self, uint8_t subtype)
}
if (indefinite)
ret = decode_indefinite_bytestrings(self);
else if (length <= 65536)
ret = decode_definite_short_bytestring(self, (Py_ssize_t)length);
else
ret = decode_definite_bytestring(self, (Py_ssize_t)length);
set_shareable(self, ret);
ret = decode_definite_long_bytestring(self, (Py_ssize_t)length);

if (ret)
set_shareable(self, ret);

return ret;
}

Expand All @@ -637,31 +700,121 @@ decode_bytestring(CBORDecoderObject *self, uint8_t subtype)


static PyObject *
decode_definite_string(CBORDecoderObject *self, Py_ssize_t length)
decode_definite_short_string(CBORDecoderObject *self, Py_ssize_t length)
{
PyObject *ret = NULL;
char *buf;

buf = PyMem_Malloc(length);
if (!buf)
return PyErr_NoMemory();

if (fp_read(self, buf, length) == 0)
ret = PyUnicode_DecodeUTF8(
buf, length, PyBytes_AS_STRING(self->str_errors));
PyMem_Free(buf);

if (!ret)
PyObject *bytes_obj = fp_read_object(self, length);
if (!bytes_obj)
return NULL;

if (string_namespace_add(self, ret, length) == -1) {
const char *bytes = PyBytes_AS_STRING(bytes_obj);
PyObject *ret = PyUnicode_FromStringAndSize(bytes, length);
Py_DECREF(bytes_obj);
if (ret && string_namespace_add(self, ret, length) == -1) {
Py_DECREF(ret);
return NULL;
}
return ret;
}


static PyObject *
decode_definite_long_string(CBORDecoderObject *self, Py_ssize_t length)
{
PyObject *ret = NULL, *chunk = NULL, *string = NULL;
Py_ssize_t left = length;
Py_ssize_t consumed;
Py_ssize_t buffer_size = 0; // how many bytes are allocated for the buffer
Py_ssize_t buffer_length = 0; // how many bytes are actually stored in the buffer
char *buffer = NULL;
while (left) {
// Read up to 65536 bytes of data from the stream
Py_ssize_t chunk_length = 65536 - buffer_size;
if (left < chunk_length)
chunk_length = left;

PyObject *chunk = fp_read_object(self, chunk_length);
left -= chunk_length;
if (!chunk)
goto error;

// Get the internal buffer of the bytes object
char *bytes_buffer = PyBytes_AsString(chunk);
if (!bytes_buffer)
goto error;

char *source_buffer;
if (buffer) {
// Grow the buffer to accommodate the previous data plus the new chunk
if (buffer_length + chunk_length > buffer_size) {
buffer_size = buffer_length + chunk_length;
char *new_buffer = PyMem_Realloc(buffer, buffer_size);
if (!new_buffer)
goto error;

buffer = new_buffer;
}

// Concatenate the chunk into the buffer
memcpy(buffer + buffer_length, bytes_buffer, chunk_length);
buffer_length += chunk_length;

source_buffer = buffer;
chunk_length = buffer_length;
} else {
// Use the chunk's internal buffer directly to decode as many characters as possible
source_buffer = bytes_buffer;
}

string = PyUnicode_DecodeUTF8Stateful(source_buffer, chunk_length, NULL, &consumed);
if (!string)
goto error;

if (ret) {
// Concatenate the result to the existing result
PyObject *joined = PyUnicode_Concat(ret, string);
if (!joined)
goto error;

Py_DECREF(string);
string = NULL;
ret = joined;
} else {
// Set the result to the decoded string
ret = string;
}

Py_ssize_t unconsumed = chunk_length - consumed;
if (consumed != chunk_length) {
if (buffer) {
// Move the unconsumed bytes to the start of the buffer
memmove(buffer, buffer + consumed, unconsumed);
} else {
// Create a new buffer
buffer = PyMem_Malloc(unconsumed);
if (!buffer)
goto error;

memcpy(buffer, bytes_buffer + consumed, unconsumed);
}
buffer_length = unconsumed;
}
}

if (ret && string_namespace_add(self, ret, length) == -1)
goto error;

return ret;
error:
Py_XDECREF(ret);
Py_XDECREF(chunk);
Py_XDECREF(string);
if (buffer)
PyMem_Free(buffer);

return NULL;
}


static PyObject *
decode_indefinite_strings(CBORDecoderObject *self)
{
Expand Down Expand Up @@ -719,9 +872,14 @@ decode_string(CBORDecoderObject *self, uint8_t subtype)
}
if (indefinite)
ret = decode_indefinite_strings(self);
else if (length <= 65536)
ret = decode_definite_short_string(self, (Py_ssize_t)length);
else
ret = decode_definite_string(self, (Py_ssize_t)length);
set_shareable(self, ret);
ret = decode_definite_long_string(self, (Py_ssize_t)length);

if (ret)
set_shareable(self, ret);

return ret;
}

Expand Down
2 changes: 1 addition & 1 deletion source/tags.c
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ CBORTag_hash(CBORTagObject *self)

// Check how many more references there are in running_hashes
Py_ssize_t length = PySequence_Length(running_hashes);
if (length == 1) {
if (length == -1) {
ret = -1;
goto exit;
}
Expand Down

0 comments on commit 387755e

Please sign in to comment.