Fixed MemoryError when decoding large definite strings (#204)

Also fixed a return value check in `CBORTag_hash()`.
agronholm · Jan 14, 2024 · 387755e · 387755e
1 parent 0d54000
commit 387755e
Show file tree

Hide file tree

Showing 6 changed files with 263 additions and 47 deletions.
diff --git a/cbor2/_decoder.py b/cbor2/_decoder.py
@@ -3,6 +3,7 @@
 import re
 import struct
 import sys
+from codecs import getincrementaldecoder
 from collections.abc import Callable, Mapping, Sequence
 from datetime import date, datetime, timedelta, timezone
 from io import BytesIO
@@ -31,6 +32,7 @@
 timestamp_re = re.compile(
     r"^(\d{4})-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)" r"(?:\.(\d{1,6})\d*)?(?:Z|([+-])(\d\d):(\d\d))$"
 )
+incremental_utf8_decoder = getincrementaldecoder("utf-8")
 
 
 class CBORDecoder:
@@ -305,8 +307,19 @@ def decode_bytestring(self, subtype: int) -> bytes:
         else:
             if length > sys.maxsize:
                 raise CBORDecodeValueError("invalid length for bytestring 0x%x" % length)
+            elif length <= 65536:
+                result = self.read(length)
+            else:
+                # Read large bytestrings 65536 (2 ** 16) bytes at a time
+                left = length
+                buffer = bytearray()
+                while left:
+                    chunk_size = min(left, 65536)
+                    buffer.extend(self.read(chunk_size))
+                    left -= chunk_size
+
+                result = bytes(buffer)
 
-            result = self.read(length)
             self._stringref_namespace_add(result, length)
 
         return self.set_shareable(result)
@@ -350,7 +363,19 @@ def decode_string(self, subtype: int) -> str:
             if length > sys.maxsize:
                 raise CBORDecodeValueError("invalid length for string 0x%x" % length)
 
-            result = self.read(length).decode("utf-8", self._str_errors)
+            if length <= 65536:
+                result = self.read(length).decode("utf-8", self._str_errors)
+            else:
+                # Read and decode large text strings 65536 (2 ** 16) bytes at a time
+                codec = incremental_utf8_decoder(self._str_errors)
+                left = length
+                result = ""
+                while left:
+                    chunk_size = min(left, 65536)
+                    final = left <= chunk_size
+                    result += codec.decode(self.read(chunk_size), final)
+                    left -= chunk_size
+
             self._stringref_namespace_add(result, length)
 
         return self.set_shareable(result)

diff --git a/docs/versionhistory.rst b/docs/versionhistory.rst
@@ -8,14 +8,16 @@ This library adheres to `Semantic Versioning <http://semver.org/>`_.
 **UNRELEASED**
 
 - Added the ``cbor2`` command line tool (for ``pipx run cbor2``)
+- Added support for native date encoding (bschoenmaeckers)
 - Fixed ``SystemError`` in the C extension when decoding a ``Fractional`` with a bad
   number of arguments
 - Fixed ``SystemError`` in the C extension when the decoder object hook raises an
   exception
 - Fixed a segmentation fault when decoding invalid unicode data
 - Fixed infinite recursion when trying to hash a CBOR tag whose value points to the tag
   itself
-- Added support for native date encoding (bschoenmaeckers)
+- Fixed ``MemoryError`` when maliciously constructed bytestrings or string (declared to be absurdly
+  large) are being decoded
 
 **5.5.1** (2023-11-02)
 

diff --git a/setup.py b/setup.py
@@ -57,7 +57,7 @@ def check_libc():
             "source/tags.c",
             "source/halffloat.c",
         ],
-        optional=True,
+        # optional=True,
     )
     kwargs = {"ext_modules": [_cbor2]}
 else:

diff --git a/source/decoder.c b/source/decoder.c
@@ -348,31 +348,44 @@ _CBORDecoder_get_immutable(CBORDecoderObject *self, void *closure)
 
 // Utility functions /////////////////////////////////////////////////////////
 
-static int
-fp_read(CBORDecoderObject *self, char *buf, const Py_ssize_t size)
+static PyObject *
+fp_read_object(CBORDecoderObject *self, const Py_ssize_t size)
 {
+    PyObject *ret = NULL;
     PyObject *obj, *size_obj;
-    char *data;
-    int ret = -1;
-
     size_obj = PyLong_FromSsize_t(size);
     if (size_obj) {
         obj = PyObject_CallFunctionObjArgs(self->read, size_obj, NULL);
+        Py_DECREF(size_obj);
         if (obj) {
             assert(PyBytes_CheckExact(obj));
             if (PyBytes_GET_SIZE(obj) == (Py_ssize_t) size) {
-                data = PyBytes_AS_STRING(obj);
-                memcpy(buf, data, size);
-                ret = 0;
+                ret = obj;
             } else {
+                Py_DECREF(obj);
                 PyErr_Format(
                     _CBOR2_CBORDecodeEOF,
                     "premature end of stream (expected to read %zd bytes, "
                     "got %zd instead)", size, PyBytes_GET_SIZE(obj));
             }
-            Py_DECREF(obj);
         }
-        Py_DECREF(size_obj);
+    }
+    return ret;
+}
+
+
+static int
+fp_read(CBORDecoderObject *self, char *buf, const Py_ssize_t size)
+{
+    int ret = -1;
+    PyObject *obj = fp_read_object(self, size);
+    if (obj) {
+        char *data = PyBytes_AS_STRING(obj);
+        if (data) {
+            memcpy(buf, data, size);
+            ret = 0;
+        }
+        Py_DECREF(obj);
     }
     return ret;
 }
@@ -538,17 +551,12 @@ decode_negint(CBORDecoderObject *self, uint8_t subtype)
 
 
 static PyObject *
-decode_definite_bytestring(CBORDecoderObject *self, Py_ssize_t length)
+decode_definite_short_bytestring(CBORDecoderObject *self, Py_ssize_t length)
 {
-    PyObject *ret = NULL;
-
-    ret = PyBytes_FromStringAndSize(NULL, length);
+    PyObject *ret = fp_read_object(self, length);
     if (!ret)
         return NULL;
-    if (fp_read(self, PyBytes_AS_STRING(ret), length) == -1) {
-        Py_DECREF(ret);
-        return NULL;
-    }
+
     if (string_namespace_add(self, ret, length) == -1) {
         Py_DECREF(ret);
         return NULL;
@@ -557,6 +565,56 @@ decode_definite_bytestring(CBORDecoderObject *self, Py_ssize_t length)
 }
 
 
+static PyObject *
+decode_definite_long_bytestring(CBORDecoderObject *self, Py_ssize_t length)
+{
+    PyObject *buffer = NULL;
+    Py_ssize_t left = length;
+    while (left) {
+        Py_ssize_t chunk_length = length <= 65536 ? length : 65536;
+        PyObject *chunk = fp_read_object(self, chunk_length);
+        if (!chunk) {
+            break;
+        }
+
+        if (!PyBytes_CheckExact(chunk)) {
+            Py_DECREF(chunk);
+            break;
+        }
+
+        if (buffer) {
+            PyObject *new_buffer = PyByteArray_Concat(buffer, chunk);
+            Py_DECREF(chunk);
+            if (!new_buffer)
+                break;
+
+            if (new_buffer != buffer) {
+                Py_DECREF(buffer);
+                buffer = new_buffer;
+            }
+        } else {
+            buffer = PyByteArray_FromObject(chunk);
+            Py_DECREF(chunk);
+            if (!buffer)
+                break;
+        }
+        left -= chunk_length;
+    }
+
+    PyObject *ret = NULL;
+    if (buffer) {
+        ret = PyBytes_FromObject(buffer);
+        Py_DECREF(buffer);
+
+        if (ret && string_namespace_add(self, ret, length) == -1) {
+            Py_DECREF(ret);
+            ret = NULL;
+        }
+    }
+    return ret;
+}
+
+
 static PyObject *
 decode_indefinite_bytestrings(CBORDecoderObject *self)
 {
@@ -615,9 +673,14 @@ decode_bytestring(CBORDecoderObject *self, uint8_t subtype)
     }
     if (indefinite)
         ret = decode_indefinite_bytestrings(self);
+    else if (length <= 65536)
+        ret = decode_definite_short_bytestring(self, (Py_ssize_t)length);
     else
-        ret = decode_definite_bytestring(self, (Py_ssize_t)length);
-    set_shareable(self, ret);
+        ret = decode_definite_long_bytestring(self, (Py_ssize_t)length);
+
+    if (ret)
+        set_shareable(self, ret);
+
     return ret;
 }
 
@@ -637,31 +700,121 @@ decode_bytestring(CBORDecoderObject *self, uint8_t subtype)
 
 
 static PyObject *
-decode_definite_string(CBORDecoderObject *self, Py_ssize_t length)
+decode_definite_short_string(CBORDecoderObject *self, Py_ssize_t length)
 {
-    PyObject *ret = NULL;
-    char *buf;
-
-    buf = PyMem_Malloc(length);
-    if (!buf)
-        return PyErr_NoMemory();
-
-    if (fp_read(self, buf, length) == 0)
-        ret = PyUnicode_DecodeUTF8(
-                buf, length, PyBytes_AS_STRING(self->str_errors));
-    PyMem_Free(buf);
-
-    if (!ret)
+    PyObject *bytes_obj = fp_read_object(self, length);
+    if (!bytes_obj)
         return NULL;
 
-    if (string_namespace_add(self, ret, length) == -1) {
+    const char *bytes = PyBytes_AS_STRING(bytes_obj);
+    PyObject *ret = PyUnicode_FromStringAndSize(bytes, length);
+    Py_DECREF(bytes_obj);
+    if (ret && string_namespace_add(self, ret, length) == -1) {
         Py_DECREF(ret);
         return NULL;
     }
     return ret;
 }
 
 
+static PyObject *
+decode_definite_long_string(CBORDecoderObject *self, Py_ssize_t length)
+{
+    PyObject *ret = NULL, *chunk = NULL, *string = NULL;
+    Py_ssize_t left = length;
+    Py_ssize_t consumed;
+    Py_ssize_t buffer_size = 0;  // how many bytes are allocated for the buffer
+    Py_ssize_t buffer_length = 0;  // how many bytes are actually stored in the buffer
+    char *buffer = NULL;
+    while (left) {
+        // Read up to 65536 bytes of data from the stream
+        Py_ssize_t chunk_length = 65536 - buffer_size;
+        if (left < chunk_length)
+            chunk_length = left;
+
+        PyObject *chunk = fp_read_object(self, chunk_length);
+        left -= chunk_length;
+        if (!chunk)
+            goto error;
+
+        // Get the internal buffer of the bytes object
+        char *bytes_buffer = PyBytes_AsString(chunk);
+        if (!bytes_buffer)
+            goto error;
+
+        char *source_buffer;
+        if (buffer) {
+            // Grow the buffer to accommodate the previous data plus the new chunk
+            if (buffer_length + chunk_length > buffer_size) {
+                buffer_size = buffer_length + chunk_length;
+                char *new_buffer = PyMem_Realloc(buffer, buffer_size);
+                if (!new_buffer)
+                    goto error;
+
+                buffer = new_buffer;
+            }
+
+            // Concatenate the chunk into the buffer
+            memcpy(buffer + buffer_length, bytes_buffer, chunk_length);
+            buffer_length += chunk_length;
+
+            source_buffer = buffer;
+            chunk_length = buffer_length;
+        } else {
+            // Use the chunk's internal buffer directly to decode as many characters as possible
+            source_buffer = bytes_buffer;
+        }
+
+        string = PyUnicode_DecodeUTF8Stateful(source_buffer, chunk_length, NULL, &consumed);
+        if (!string)
+            goto error;
+
+        if (ret) {
+            // Concatenate the result to the existing result
+            PyObject *joined = PyUnicode_Concat(ret, string);
+            if (!joined)
+                goto error;
+
+            Py_DECREF(string);
+            string = NULL;
+            ret = joined;
+        } else {
+            // Set the result to the decoded string
+            ret = string;
+        }
+
+        Py_ssize_t unconsumed = chunk_length - consumed;
+        if (consumed != chunk_length) {
+            if (buffer) {
+                // Move the unconsumed bytes to the start of the buffer
+                memmove(buffer, buffer + consumed, unconsumed);
+            } else {
+                // Create a new buffer
+                buffer = PyMem_Malloc(unconsumed);
+                if (!buffer)
+                    goto error;
+
+                memcpy(buffer, bytes_buffer + consumed, unconsumed);
+            }
+            buffer_length = unconsumed;
+        }
+    }
+
+    if (ret && string_namespace_add(self, ret, length) == -1)
+        goto error;
+
+    return ret;
+error:
+    Py_XDECREF(ret);
+    Py_XDECREF(chunk);
+    Py_XDECREF(string);
+    if (buffer)
+        PyMem_Free(buffer);
+
+    return NULL;
+}
+
+
 static PyObject *
 decode_indefinite_strings(CBORDecoderObject *self)
 {
@@ -719,9 +872,14 @@ decode_string(CBORDecoderObject *self, uint8_t subtype)
     }
     if (indefinite)
         ret = decode_indefinite_strings(self);
+    else if (length <= 65536)
+        ret = decode_definite_short_string(self, (Py_ssize_t)length);
     else
-        ret = decode_definite_string(self, (Py_ssize_t)length);
-    set_shareable(self, ret);
+        ret = decode_definite_long_string(self, (Py_ssize_t)length);
+
+    if (ret)
+        set_shareable(self, ret);
+
     return ret;
 }
 

diff --git a/source/tags.c b/source/tags.c
@@ -191,7 +191,7 @@ CBORTag_hash(CBORTagObject *self)
 
     // Check how many more references there are in running_hashes
     Py_ssize_t length = PySequence_Length(running_hashes);
-    if (length == 1) {
+    if (length == -1) {
         ret = -1;
         goto exit;
     }