Skip to content

Commit

Permalink
Merge pull request #22592 from charris/backport-22393
Browse files Browse the repository at this point in the history
MAINT: Ensure graceful handling of large header sizes
  • Loading branch information
charris committed Nov 15, 2022
2 parents e5c39c8 + 3d2678d commit 8cededd
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 18 deletions.
59 changes: 50 additions & 9 deletions numpy/lib/format.py
Expand Up @@ -186,6 +186,10 @@
(3, 0): ('<I', 'utf8'),
}

# Python's literal_eval is not actually safe for large inputs, since parsing
# may become slow or even cause interpreter crashes.
# This is an arbitrary, low limit which should make it safe in practice.
# NOTE: this serves as the default for the `max_header_size` keyword of the
# header-reading functions; callers may pass a larger value for trusted files.
_MAX_HEADER_SIZE = 10000

def _check_version(version):
if version not in [(1, 0), (2, 0), (3, 0), None]:
Expand Down Expand Up @@ -465,7 +469,7 @@ def write_array_header_2_0(fp, d):
"""
_write_array_header(fp, d, (2, 0))

def read_array_header_1_0(fp):
def read_array_header_1_0(fp, max_header_size=_MAX_HEADER_SIZE):
"""
Read an array header from a filelike object using the 1.0 file format
version.
Expand All @@ -487,16 +491,21 @@ def read_array_header_1_0(fp):
contiguous before writing it out.
dtype : dtype
The dtype of the file's data.
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
See :py:meth:`ast.literal_eval()` for details.
Raises
------
ValueError
If the data is invalid.
"""
return _read_array_header(fp, version=(1, 0))
return _read_array_header(
fp, version=(1, 0), max_header_size=max_header_size)

def read_array_header_2_0(fp):
def read_array_header_2_0(fp, max_header_size=_MAX_HEADER_SIZE):
"""
Read an array header from a filelike object using the 2.0 file format
version.
Expand All @@ -509,6 +518,10 @@ def read_array_header_2_0(fp):
----------
fp : filelike object
A file object or something with a `.read()` method like a file.
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
See :py:meth:`ast.literal_eval()` for details.
Returns
-------
Expand All @@ -527,7 +540,8 @@ def read_array_header_2_0(fp):
If the data is invalid.
"""
return _read_array_header(fp, version=(2, 0))
return _read_array_header(
fp, version=(2, 0), max_header_size=max_header_size)


def _filter_header(s):
Expand Down Expand Up @@ -565,7 +579,7 @@ def _filter_header(s):
return tokenize.untokenize(tokens)


def _read_array_header(fp, version):
def _read_array_header(fp, version, max_header_size=_MAX_HEADER_SIZE):
"""
see read_array_header_1_0
"""
Expand All @@ -581,6 +595,14 @@ def _read_array_header(fp, version):
header_length = struct.unpack(hlength_type, hlength_str)[0]
header = _read_bytes(fp, header_length, "array header")
header = header.decode(encoding)
if len(header) > max_header_size:
raise ValueError(
f"Header info length ({len(header)}) is large and may not be safe "
"to load securely.\n"
"To allow loading, adjust `max_header_size` or fully trust "
"the `.npy` file using `allow_pickle=True`.\n"
"For safety against large resource use or crashes, sandboxing "
"may be necessary.")

# The header is a pretty-printed string representation of a literal
# Python dictionary with trailing newlines padded to a ARRAY_ALIGN byte
Expand Down Expand Up @@ -694,7 +716,8 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
fp.write(chunk.tobytes('C'))


def read_array(fp, allow_pickle=False, pickle_kwargs=None):
def read_array(fp, allow_pickle=False, pickle_kwargs=None, *,
max_header_size=_MAX_HEADER_SIZE):
"""
Read an array from an NPY file.
Expand All @@ -713,6 +736,12 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
Additional keyword arguments to pass to pickle.load. These are only
useful when loading object arrays saved on Python 2 when using
Python 3.
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
See :py:meth:`ast.literal_eval()` for details.
This option is ignored when `allow_pickle` is passed. In that case
the file is by definition trusted and the limit is unnecessary.
Returns
-------
Expand All @@ -726,9 +755,15 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
an object array.
"""
if allow_pickle:
# Effectively ignore max_header_size, since `allow_pickle` indicates
# that the input is fully trusted.
max_header_size = 2**64

version = read_magic(fp)
_check_version(version)
shape, fortran_order, dtype = _read_array_header(fp, version)
shape, fortran_order, dtype = _read_array_header(
fp, version, max_header_size=max_header_size)
if len(shape) == 0:
count = 1
else:
Expand Down Expand Up @@ -788,7 +823,8 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):


def open_memmap(filename, mode='r+', dtype=None, shape=None,
fortran_order=False, version=None):
fortran_order=False, version=None, *,
max_header_size=_MAX_HEADER_SIZE):
"""
Open a .npy file as a memory-mapped array.
Expand Down Expand Up @@ -819,6 +855,10 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
If the mode is a "write" mode, then this is the version of the file
format used to create the file. None means use the oldest
supported version that is able to store the data. Default: None
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
See :py:meth:`ast.literal_eval()` for details.
Returns
-------
Expand Down Expand Up @@ -866,7 +906,8 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
version = read_magic(fp)
_check_version(version)

shape, fortran_order, dtype = _read_array_header(fp, version)
shape, fortran_order, dtype = _read_array_header(
fp, version, max_header_size=max_header_size)
if dtype.hasobject:
msg = "Array can't be memory-mapped: Python objects in dtype."
raise ValueError(msg)
Expand Down
32 changes: 26 additions & 6 deletions numpy/lib/npyio.py
Expand Up @@ -139,6 +139,12 @@ class NpzFile(Mapping):
Additional keyword arguments to pass on to pickle.load.
These are only useful when loading object arrays saved on
Python 2 when using Python 3.
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
See :py:meth:`ast.literal_eval()` for details.
This option is ignored when `allow_pickle` is passed. In that case
the file is by definition trusted and the limit is unnecessary.
Parameters
----------
Expand Down Expand Up @@ -174,13 +180,15 @@ class NpzFile(Mapping):
fid = None

def __init__(self, fid, own_fid=False, allow_pickle=False,
pickle_kwargs=None):
pickle_kwargs=None, *,
max_header_size=format._MAX_HEADER_SIZE):
# Import is postponed to here since zipfile depends on gzip, an
# optional component of the so-called standard library.
_zip = zipfile_factory(fid)
self._files = _zip.namelist()
self.files = []
self.allow_pickle = allow_pickle
self.max_header_size = max_header_size
self.pickle_kwargs = pickle_kwargs
for x in self._files:
if x.endswith('.npy'):
Expand Down Expand Up @@ -244,7 +252,8 @@ def __getitem__(self, key):
bytes = self.zip.open(key)
return format.read_array(bytes,
allow_pickle=self.allow_pickle,
pickle_kwargs=self.pickle_kwargs)
pickle_kwargs=self.pickle_kwargs,
max_header_size=self.max_header_size)
else:
return self.zip.read(key)
else:
Expand All @@ -253,7 +262,7 @@ def __getitem__(self, key):

@set_module('numpy')
def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
encoding='ASCII'):
encoding='ASCII', *, max_header_size=format._MAX_HEADER_SIZE):
"""
Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files.
Expand Down Expand Up @@ -297,6 +306,12 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
npy/npz files containing object arrays. Values other than 'latin1',
'ASCII', and 'bytes' are not allowed, as they can corrupt numerical
data. Default: 'ASCII'
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
See :py:meth:`ast.literal_eval()` for details.
This option is ignored when `allow_pickle` is passed. In that case
the file is by definition trusted and the limit is unnecessary.
Returns
-------
Expand Down Expand Up @@ -403,15 +418,20 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
# Potentially transfer file ownership to NpzFile
stack.pop_all()
ret = NpzFile(fid, own_fid=own_fid, allow_pickle=allow_pickle,
pickle_kwargs=pickle_kwargs)
pickle_kwargs=pickle_kwargs,
max_header_size=max_header_size)
return ret
elif magic == format.MAGIC_PREFIX:
# .npy file
if mmap_mode:
return format.open_memmap(file, mode=mmap_mode)
if allow_pickle:
max_header_size = 2**64
return format.open_memmap(file, mode=mmap_mode,
max_header_size=max_header_size)
else:
return format.read_array(fid, allow_pickle=allow_pickle,
pickle_kwargs=pickle_kwargs)
pickle_kwargs=pickle_kwargs,
max_header_size=max_header_size)
else:
# Try a pickle
if not allow_pickle:
Expand Down
47 changes: 44 additions & 3 deletions numpy/lib/tests/test_format.py
Expand Up @@ -459,6 +459,7 @@ def test_long_str():
assert_array_equal(long_str_arr, long_str_arr2)


@pytest.mark.slow
def test_memmap_roundtrip(tmpdir):
for i, arr in enumerate(basic_arrays + record_arrays):
if arr.dtype.hasobject:
Expand Down Expand Up @@ -667,7 +668,7 @@ def test_version_2_0():
assert_(len(header) % format.ARRAY_ALIGN == 0)

f.seek(0)
n = format.read_array(f)
n = format.read_array(f, max_header_size=200000)
assert_array_equal(d, n)

# 1.0 requested but data cannot be saved this way
Expand All @@ -689,7 +690,7 @@ def test_version_2_0_memmap(tmpdir):
shape=d.shape, version=(2, 0))
ma[...] = d
ma.flush()
ma = format.open_memmap(tf1, mode='r')
ma = format.open_memmap(tf1, mode='r', max_header_size=200000)
assert_array_equal(ma, d)

with warnings.catch_warnings(record=True) as w:
Expand All @@ -700,9 +701,49 @@ def test_version_2_0_memmap(tmpdir):
ma[...] = d
ma.flush()

ma = format.open_memmap(tf2, mode='r')
ma = format.open_memmap(tf2, mode='r', max_header_size=200000)

assert_array_equal(ma, d)

@pytest.mark.parametrize("mmap_mode", ["r", None])
def test_huge_header(tmpdir, mmap_mode):
    """`np.load` must reject an .npy file whose header exceeds the size
    limit, unless the limit is raised via ``max_header_size`` or the file
    is declared fully trusted via ``allow_pickle=True``.
    """
    # Fix: plain string literal — the original used an f-string with no
    # placeholders (lint F541).
    f = os.path.join(tmpdir, 'large_header.npy')
    # A structured dtype with 10001 fields yields a header far above the
    # default limit (_MAX_HEADER_SIZE == 10000).
    arr = np.array(1, dtype="i,"*10000+"i")

    # The oversized header forces format version 2.0, which warns on save.
    with pytest.warns(UserWarning, match=".*format 2.0"):
        np.save(f, arr)

    # Default limit: loading must be refused.
    with pytest.raises(ValueError, match="Header.*large"):
        np.load(f, mmap_mode=mmap_mode)

    # An explicit limit that is still too small must also be refused.
    with pytest.raises(ValueError, match="Header.*large"):
        np.load(f, mmap_mode=mmap_mode, max_header_size=20000)

    # allow_pickle=True marks the file as trusted, bypassing the limit.
    res = np.load(f, mmap_mode=mmap_mode, allow_pickle=True)
    assert_array_equal(res, arr)

    # A sufficiently large explicit limit succeeds.
    res = np.load(f, mmap_mode=mmap_mode, max_header_size=180000)
    assert_array_equal(res, arr)

def test_huge_header_npz(tmpdir):
    """Same header-size guard as ``test_huge_header``, but through the
    .npz (zip archive) path, where the check fires lazily on member access.
    """
    # Fix: plain string literal — the original used an f-string with no
    # placeholders (lint F541).
    f = os.path.join(tmpdir, 'large_header.npz')
    # A structured dtype with 10001 fields yields a header far above the
    # default limit (_MAX_HEADER_SIZE == 10000).
    arr = np.array(1, dtype="i,"*10000+"i")

    # The oversized header forces format version 2.0, which warns on save.
    with pytest.warns(UserWarning, match=".*format 2.0"):
        np.savez(f, arr=arr)

    # Opening the archive succeeds; only indexing into it reads the header.
    with pytest.raises(ValueError, match="Header.*large"):
        np.load(f)["arr"]

    # An explicit limit that is still too small must also be refused.
    with pytest.raises(ValueError, match="Header.*large"):
        np.load(f, max_header_size=20000)["arr"]

    # allow_pickle=True marks the file as trusted, bypassing the limit.
    res = np.load(f, allow_pickle=True)["arr"]
    assert_array_equal(res, arr)

    # A sufficiently large explicit limit succeeds.
    res = np.load(f, max_header_size=180000)["arr"]
    assert_array_equal(res, arr)

def test_write_version():
f = BytesIO()
Expand Down
6 changes: 6 additions & 0 deletions numpy/lib/utils.py
Expand Up @@ -971,6 +971,12 @@ def safe_eval(source):
Evaluate a string containing a Python literal expression without
allowing the execution of arbitrary non-literal code.
.. warning::
This function is identical to :py:meth:`ast.literal_eval` and
has the same security implications. It may not always be safe
to evaluate large input strings.
Parameters
----------
source : str
Expand Down

0 comments on commit 8cededd

Please sign in to comment.