From 3d2678d1767ea550435affe4dd469336280012db Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 4 Oct 2022 13:36:08 +0200 Subject: [PATCH] MAINT: Ensure graceful handling of large header sizes This ensures graceful handling of large header files. Unfortunately, it may be a bit inconvenient for users, thus the new kwarg and the work-around of also accepting allow-pickle. See also the documation here: https://docs.python.org/3.10/library/ast.html#ast.literal_eval --- numpy/lib/format.py | 59 ++++++++++++++++++++++++++++------ numpy/lib/npyio.py | 32 ++++++++++++++---- numpy/lib/tests/test_format.py | 47 +++++++++++++++++++++++++-- numpy/lib/utils.py | 6 ++++ 4 files changed, 126 insertions(+), 18 deletions(-) diff --git a/numpy/lib/format.py b/numpy/lib/format.py index 625768b62640..19fec48ed75f 100644 --- a/numpy/lib/format.py +++ b/numpy/lib/format.py @@ -186,6 +186,10 @@ (3, 0): (' max_header_size: + raise ValueError( + f"Header info length ({len(header)}) is large and may not be safe " + "to load securely.\n" + "To allow loading, adjust `max_header_size` or fully trust " + "the `.npy` file using `allow_pickle=True`.\n" + "For safety against large resource use or crashes, sandboxing " + "may be necessary.") # The header is a pretty-printed string representation of a literal # Python dictionary with trailing newlines padded to a ARRAY_ALIGN byte @@ -694,7 +716,8 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None): fp.write(chunk.tobytes('C')) -def read_array(fp, allow_pickle=False, pickle_kwargs=None): +def read_array(fp, allow_pickle=False, pickle_kwargs=None, *, + max_header_size=_MAX_HEADER_SIZE): """ Read an array from an NPY file. @@ -713,6 +736,12 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None): Additional keyword arguments to pass to pickle.load. These are only useful when loading object arrays saved on Python 2 when using Python 3. + max_header_size : int, optional + Maximum allowed size of the header. Large headers may not be safe + to load securely and thus require explicitly passing a larger value. + See :py:meth:`ast.literal_eval()` for details. + This option is ignored when `allow_pickle` is passed. In that case + the file is by definition trusted and the limit is unnecessary. Returns ------- @@ -726,9 +755,15 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None): an object array. """ + if allow_pickle: + # Effectively ignore max_header_size, since `allow_pickle` indicates + # that the input is fully trusted. + max_header_size = 2**64 + version = read_magic(fp) _check_version(version) - shape, fortran_order, dtype = _read_array_header(fp, version) + shape, fortran_order, dtype = _read_array_header( + fp, version, max_header_size=max_header_size) if len(shape) == 0: count = 1 else: @@ -788,7 +823,8 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None): def open_memmap(filename, mode='r+', dtype=None, shape=None, - fortran_order=False, version=None): + fortran_order=False, version=None, *, + max_header_size=_MAX_HEADER_SIZE): """ Open a .npy file as a memory-mapped array. @@ -819,6 +855,10 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, If the mode is a "write" mode, then this is the version of the file format used to create the file. None means use the oldest supported version that is able to store the data. Default: None + max_header_size : int, optional + Maximum allowed size of the header. Large headers may not be safe + to load securely and thus require explicitly passing a larger value. + See :py:meth:`ast.literal_eval()` for details. Returns ------- @@ -866,7 +906,8 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, version = read_magic(fp) _check_version(version) - shape, fortran_order, dtype = _read_array_header(fp, version) + shape, fortran_order, dtype = _read_array_header( + fp, version, max_header_size=max_header_size) if dtype.hasobject: msg = "Array can't be memory-mapped: Python objects in dtype." raise ValueError(msg) diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index f0ec40a2eae8..343d76ae3f7c 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -139,6 +139,12 @@ class NpzFile(Mapping): Additional keyword arguments to pass on to pickle.load. These are only useful when loading object arrays saved on Python 2 when using Python 3. + max_header_size : int, optional + Maximum allowed size of the header. Large headers may not be safe + to load securely and thus require explicitly passing a larger value. + See :py:meth:`ast.literal_eval()` for details. + This option is ignored when `allow_pickle` is passed. In that case + the file is by definition trusted and the limit is unnecessary. Parameters ---------- @@ -174,13 +180,15 @@ class NpzFile(Mapping): fid = None def __init__(self, fid, own_fid=False, allow_pickle=False, - pickle_kwargs=None): + pickle_kwargs=None, *, + max_header_size=format._MAX_HEADER_SIZE): # Import is postponed to here since zipfile depends on gzip, an # optional component of the so-called standard library. _zip = zipfile_factory(fid) self._files = _zip.namelist() self.files = [] self.allow_pickle = allow_pickle + self.max_header_size = max_header_size self.pickle_kwargs = pickle_kwargs for x in self._files: if x.endswith('.npy'): @@ -244,7 +252,8 @@ def __getitem__(self, key): bytes = self.zip.open(key) return format.read_array(bytes, allow_pickle=self.allow_pickle, - pickle_kwargs=self.pickle_kwargs) + pickle_kwargs=self.pickle_kwargs, + max_header_size=self.max_header_size) else: return self.zip.read(key) else: @@ -253,7 +262,7 @@ def __getitem__(self, key): @set_module('numpy') def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True, - encoding='ASCII'): + encoding='ASCII', *, max_header_size=format._MAX_HEADER_SIZE): """ Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files. @@ -297,6 +306,12 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True, npy/npz files containing object arrays. Values other than 'latin1', 'ASCII', and 'bytes' are not allowed, as they can corrupt numerical data. Default: 'ASCII' + max_header_size : int, optional + Maximum allowed size of the header. Large headers may not be safe + to load securely and thus require explicitly passing a larger value. + See :py:meth:`ast.literal_eval()` for details. + This option is ignored when `allow_pickle` is passed. In that case + the file is by definition trusted and the limit is unnecessary. Returns ------- @@ -403,15 +418,20 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True, # Potentially transfer file ownership to NpzFile stack.pop_all() ret = NpzFile(fid, own_fid=own_fid, allow_pickle=allow_pickle, - pickle_kwargs=pickle_kwargs) + pickle_kwargs=pickle_kwargs, + max_header_size=max_header_size) return ret elif magic == format.MAGIC_PREFIX: # .npy file if mmap_mode: - return format.open_memmap(file, mode=mmap_mode) + if allow_pickle: + max_header_size = 2**64 + return format.open_memmap(file, mode=mmap_mode, + max_header_size=max_header_size) else: return format.read_array(fid, allow_pickle=allow_pickle, - pickle_kwargs=pickle_kwargs) + pickle_kwargs=pickle_kwargs, + max_header_size=max_header_size) else: # Try a pickle if not allow_pickle: diff --git a/numpy/lib/tests/test_format.py b/numpy/lib/tests/test_format.py index 581d067de375..53d3bf1d3f8c 100644 --- a/numpy/lib/tests/test_format.py +++ b/numpy/lib/tests/test_format.py @@ -459,6 +459,7 @@ def test_long_str(): assert_array_equal(long_str_arr, long_str_arr2) +@pytest.mark.slow def test_memmap_roundtrip(tmpdir): for i, arr in enumerate(basic_arrays + record_arrays): if arr.dtype.hasobject: @@ -667,7 +668,7 @@ def test_version_2_0(): assert_(len(header) % format.ARRAY_ALIGN == 0) f.seek(0) - n = format.read_array(f) + n = format.read_array(f, max_header_size=200000) assert_array_equal(d, n) # 1.0 requested but data cannot be saved this way @@ -689,7 +690,7 @@ def test_version_2_0_memmap(tmpdir): shape=d.shape, version=(2, 0)) ma[...] = d ma.flush() - ma = format.open_memmap(tf1, mode='r') + ma = format.open_memmap(tf1, mode='r', max_header_size=200000) assert_array_equal(ma, d) with warnings.catch_warnings(record=True) as w: @@ -700,9 +701,49 @@ def test_version_2_0_memmap(tmpdir): ma[...] = d ma.flush() - ma = format.open_memmap(tf2, mode='r') + ma = format.open_memmap(tf2, mode='r', max_header_size=200000) + assert_array_equal(ma, d) +@pytest.mark.parametrize("mmap_mode", ["r", None]) +def test_huge_header(tmpdir, mmap_mode): + f = os.path.join(tmpdir, f'large_header.npy') + arr = np.array(1, dtype="i,"*10000+"i") + + with pytest.warns(UserWarning, match=".*format 2.0"): + np.save(f, arr) + + with pytest.raises(ValueError, match="Header.*large"): + np.load(f, mmap_mode=mmap_mode) + + with pytest.raises(ValueError, match="Header.*large"): + np.load(f, mmap_mode=mmap_mode, max_header_size=20000) + + res = np.load(f, mmap_mode=mmap_mode, allow_pickle=True) + assert_array_equal(res, arr) + + res = np.load(f, mmap_mode=mmap_mode, max_header_size=180000) + assert_array_equal(res, arr) + +def test_huge_header_npz(tmpdir): + f = os.path.join(tmpdir, f'large_header.npz') + arr = np.array(1, dtype="i,"*10000+"i") + + with pytest.warns(UserWarning, match=".*format 2.0"): + np.savez(f, arr=arr) + + # Only getting the array from the file actually reads it + with pytest.raises(ValueError, match="Header.*large"): + np.load(f)["arr"] + + with pytest.raises(ValueError, match="Header.*large"): + np.load(f, max_header_size=20000)["arr"] + + res = np.load(f, allow_pickle=True)["arr"] + assert_array_equal(res, arr) + + res = np.load(f, max_header_size=180000)["arr"] + assert_array_equal(res, arr) def test_write_version(): f = BytesIO() diff --git a/numpy/lib/utils.py b/numpy/lib/utils.py index 9e4fe7ebb2a9..831a78ec1b3a 100644 --- a/numpy/lib/utils.py +++ b/numpy/lib/utils.py @@ -971,6 +971,12 @@ def safe_eval(source): Evaluate a string containing a Python literal expression without allowing the execution of arbitrary non-literal code. + .. warning:: + + This function is identical to :py:meth:`ast.literal_eval` and + has the same security implications. It may not always be safe + to evaluate large input strings. + Parameters ---------- source : str