MAINT: Ensure graceful handling of large header sizes #22592

Merged: 1 commit, merged Nov 15, 2022
59 changes: 50 additions & 9 deletions numpy/lib/format.py
@@ -186,6 +186,10 @@
(3, 0): ('<I', 'utf8'),
}

# Python's literal_eval is not actually safe for large inputs, since parsing
# may become slow or even cause interpreter crashes.
# This is an arbitrary, low limit which should make it safe in practice.
_MAX_HEADER_SIZE = 10000

def _check_version(version):
if version not in [(1, 0), (2, 0), (3, 0), None]:
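For context (not part of the diff): the limit guards the `ast.literal_eval` call that parses the header dictionary. A minimal sketch of the cost it cuts off, using a synthetic header-like literal similar in spirit to the structured-dtype headers the new tests construct:

```python
import ast
import time

# A large but flat dict literal. literal_eval handles it, but time and
# memory scale with input size, which is the resource-use vector the
# _MAX_HEADER_SIZE check guards against. Deeply *nested* literals are
# worse and can crash the interpreter outright.
header = "{" + ", ".join(f"'f{i}': ('<i4', {i})" for i in range(200_000)) + "}"

start = time.perf_counter()
parsed = ast.literal_eval(header)
print(f"parsed {len(header):,} chars in {time.perf_counter() - start:.3f}s")
```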
@@ -465,7 +469,7 @@ def write_array_header_2_0(fp, d):
"""
_write_array_header(fp, d, (2, 0))

def read_array_header_1_0(fp):
def read_array_header_1_0(fp, max_header_size=_MAX_HEADER_SIZE):
"""
Read an array header from a filelike object using the 1.0 file format
version.
@@ -487,16 +491,21 @@ def read_array_header_1_0(fp):
contiguous before writing it out.
dtype : dtype
The dtype of the file's data.
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
See :py:func:`ast.literal_eval` for details.

Raises
------
ValueError
If the data is invalid.

"""
return _read_array_header(fp, version=(1, 0))
return _read_array_header(
fp, version=(1, 0), max_header_size=max_header_size)

def read_array_header_2_0(fp):
def read_array_header_2_0(fp, max_header_size=_MAX_HEADER_SIZE):
"""
Read an array header from a filelike object using the 2.0 file format
version.
@@ -509,6 +518,10 @@ def read_array_header_2_0(fp):
----------
fp : filelike object
A file object or something with a `.read()` method like a file.
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
See :py:func:`ast.literal_eval` for details.

Returns
-------
@@ -527,7 +540,8 @@
If the data is invalid.

"""
return _read_array_header(fp, version=(2, 0))
return _read_array_header(
fp, version=(2, 0), max_header_size=max_header_size)


def _filter_header(s):
Expand Down Expand Up @@ -565,7 +579,7 @@ def _filter_header(s):
return tokenize.untokenize(tokens)


def _read_array_header(fp, version):
def _read_array_header(fp, version, max_header_size=_MAX_HEADER_SIZE):
"""
see read_array_header_1_0
"""
@@ -581,6 +595,14 @@ def _read_array_header(fp, version):
header_length = struct.unpack(hlength_type, hlength_str)[0]
header = _read_bytes(fp, header_length, "array header")
header = header.decode(encoding)
if len(header) > max_header_size:
raise ValueError(
f"Header info length ({len(header)}) is large and may not be safe "
"to load securely.\n"
"To allow loading, adjust `max_header_size` or fully trust "
"the `.npy` file using `allow_pickle=True`.\n"
"For safety against large resource use or crashes, sandboxing "
"may be necessary.")

# The header is a pretty-printed string representation of a literal
# Python dictionary with trailing newlines padded to an ARRAY_ALIGN byte
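For callers, the practical effect of the new ValueError is an early, explicit failure with two documented escape hatches: raising `max_header_size` or passing `allow_pickle=True`. A hedged sketch of the retry path (the file name is hypothetical):

```python
import numpy as np

try:
    arr = np.load("big_header.npy")  # hypothetical file with an oversized header
except ValueError:
    # After deciding the file is trusted, retry with an explicit limit.
    arr = np.load("big_header.npy", max_header_size=200_000)
```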
@@ -694,7 +716,8 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
fp.write(chunk.tobytes('C'))


def read_array(fp, allow_pickle=False, pickle_kwargs=None):
def read_array(fp, allow_pickle=False, pickle_kwargs=None, *,
max_header_size=_MAX_HEADER_SIZE):
"""
Read an array from an NPY file.

@@ -713,6 +736,12 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
Additional keyword arguments to pass to pickle.load. These are only
useful when loading object arrays saved on Python 2 when using
Python 3.
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
See :py:func:`ast.literal_eval` for details.
This option is ignored when `allow_pickle` is passed. In that case
the file is by definition trusted and the limit is unnecessary.

Returns
-------
@@ -726,9 +755,15 @@
an object array.

"""
if allow_pickle:
# Effectively ignore max_header_size, since `allow_pickle` indicates
# that the input is fully trusted.
max_header_size = 2**64

version = read_magic(fp)
_check_version(version)
shape, fortran_order, dtype = _read_array_header(fp, version)
shape, fortran_order, dtype = _read_array_header(
fp, version, max_header_size=max_header_size)
if len(shape) == 0:
count = 1
else:
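Rather than threading an "unlimited" sentinel through the call chain, `allow_pickle=True` simply raises the cap to 2**64, a value no real header reaches. For non-object arrays from a trusted source, the two calls below therefore behave the same as far as the header check goes (a sketch; the file name is hypothetical, and only the first call additionally permits pickled object arrays):

```python
from numpy.lib import format

# Both reads pass the header check for an arbitrarily large header.
with open("trusted.npy", "rb") as fp:
    arr1 = format.read_array(fp, allow_pickle=True)

with open("trusted.npy", "rb") as fp:
    arr2 = format.read_array(fp, max_header_size=2**64)
```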
@@ -788,7 +823,8 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):


def open_memmap(filename, mode='r+', dtype=None, shape=None,
fortran_order=False, version=None):
fortran_order=False, version=None, *,
max_header_size=_MAX_HEADER_SIZE):
"""
Open a .npy file as a memory-mapped array.

@@ -819,6 +855,10 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
If the mode is a "write" mode, then this is the version of the file
format used to create the file. None means use the oldest
supported version that is able to store the data. Default: None
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
See :py:func:`ast.literal_eval` for details.

Returns
-------
Expand Down Expand Up @@ -866,7 +906,8 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
version = read_magic(fp)
_check_version(version)

shape, fortran_order, dtype = _read_array_header(fp, version)
shape, fortran_order, dtype = _read_array_header(
fp, version, max_header_size=max_header_size)
if dtype.hasobject:
msg = "Array can't be memory-mapped: Python objects in dtype."
raise ValueError(msg)
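Memory maps go through the same header check. A sketch of opening a wide structured array whose header overflows the default limit (the dtype, file name, and limit value are illustrative):

```python
import numpy as np
from numpy.lib.format import open_memmap

# A one-element array with 10,000 fields; its header dict is huge.
arr = np.zeros(1, dtype=[(f"f{i}", "i4") for i in range(10_000)])
np.save("wide.npy", arr)  # emits the format-2.0 UserWarning

# The default cap of 10000 rejects this header; pass a larger limit.
mm = open_memmap("wide.npy", mode="r", max_header_size=500_000)
```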
32 changes: 26 additions & 6 deletions numpy/lib/npyio.py
@@ -139,6 +139,12 @@ class NpzFile(Mapping):
Additional keyword arguments to pass on to pickle.load.
These are only useful when loading object arrays saved on
Python 2 when using Python 3.
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
See :py:func:`ast.literal_eval` for details.
This option is ignored when `allow_pickle` is passed. In that case
the file is by definition trusted and the limit is unnecessary.

Parameters
----------
@@ -174,13 +180,15 @@ class NpzFile(Mapping):
fid = None

def __init__(self, fid, own_fid=False, allow_pickle=False,
pickle_kwargs=None):
pickle_kwargs=None, *,
max_header_size=format._MAX_HEADER_SIZE):
# Import is postponed to here since zipfile depends on gzip, an
# optional component of the so-called standard library.
_zip = zipfile_factory(fid)
self._files = _zip.namelist()
self.files = []
self.allow_pickle = allow_pickle
self.max_header_size = max_header_size
self.pickle_kwargs = pickle_kwargs
for x in self._files:
if x.endswith('.npy'):
@@ -244,7 +252,8 @@ def __getitem__(self, key):
bytes = self.zip.open(key)
return format.read_array(bytes,
allow_pickle=self.allow_pickle,
pickle_kwargs=self.pickle_kwargs)
pickle_kwargs=self.pickle_kwargs,
max_header_size=self.max_header_size)
else:
return self.zip.read(key)
else:
@@ -253,7 +262,7 @@

@set_module('numpy')
def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
encoding='ASCII'):
encoding='ASCII', *, max_header_size=format._MAX_HEADER_SIZE):
"""
Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files.

@@ -297,6 +306,12 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
npy/npz files containing object arrays. Values other than 'latin1',
'ASCII', and 'bytes' are not allowed, as they can corrupt numerical
data. Default: 'ASCII'
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
See :py:func:`ast.literal_eval` for details.
This option is ignored when `allow_pickle` is passed. In that case
the file is by definition trusted and the limit is unnecessary.

Returns
-------
@@ -403,15 +418,20 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
# Potentially transfer file ownership to NpzFile
stack.pop_all()
ret = NpzFile(fid, own_fid=own_fid, allow_pickle=allow_pickle,
pickle_kwargs=pickle_kwargs)
pickle_kwargs=pickle_kwargs,
max_header_size=max_header_size)
return ret
elif magic == format.MAGIC_PREFIX:
# .npy file
if mmap_mode:
return format.open_memmap(file, mode=mmap_mode)
if allow_pickle:
max_header_size = 2**64
return format.open_memmap(file, mode=mmap_mode,
max_header_size=max_header_size)
else:
return format.read_array(fid, allow_pickle=allow_pickle,
pickle_kwargs=pickle_kwargs)
pickle_kwargs=pickle_kwargs,
max_header_size=max_header_size)
else:
# Try a pickle
if not allow_pickle:
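All three dispatch paths in `load` now honor the keyword-only argument, with the mmap branch lifting the cap itself when `allow_pickle` is set. A usage sketch mirroring the tests (the paths are hypothetical; 180000 is the limit the tests use for this dtype):

```python
import numpy as np

arr = np.array(1, dtype="i," * 10000 + "i")  # large-header array from the tests
np.save("hdr.npy", arr)                      # warns: header needs format 2.0
np.savez("hdr.npz", arr=arr)

a = np.load("hdr.npy", max_header_size=180_000)                 # read_array path
b = np.load("hdr.npy", mmap_mode="r", max_header_size=180_000)  # open_memmap path
c = np.load("hdr.npz", max_header_size=180_000)["arr"]          # NpzFile path
```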
47 changes: 44 additions & 3 deletions numpy/lib/tests/test_format.py
@@ -459,6 +459,7 @@ def test_long_str():
assert_array_equal(long_str_arr, long_str_arr2)


@pytest.mark.slow
def test_memmap_roundtrip(tmpdir):
for i, arr in enumerate(basic_arrays + record_arrays):
if arr.dtype.hasobject:
@@ -667,7 +668,7 @@ def test_version_2_0():
assert_(len(header) % format.ARRAY_ALIGN == 0)

f.seek(0)
n = format.read_array(f)
n = format.read_array(f, max_header_size=200000)
assert_array_equal(d, n)

# 1.0 requested but data cannot be saved this way
Expand All @@ -689,7 +690,7 @@ def test_version_2_0_memmap(tmpdir):
shape=d.shape, version=(2, 0))
ma[...] = d
ma.flush()
ma = format.open_memmap(tf1, mode='r')
ma = format.open_memmap(tf1, mode='r', max_header_size=200000)
assert_array_equal(ma, d)

with warnings.catch_warnings(record=True) as w:
@@ -700,9 +701,49 @@
ma[...] = d
ma.flush()

ma = format.open_memmap(tf2, mode='r')
ma = format.open_memmap(tf2, mode='r', max_header_size=200000)

assert_array_equal(ma, d)

@pytest.mark.parametrize("mmap_mode", ["r", None])
def test_huge_header(tmpdir, mmap_mode):
f = os.path.join(tmpdir, 'large_header.npy')
arr = np.array(1, dtype="i,"*10000+"i")

with pytest.warns(UserWarning, match=".*format 2.0"):
np.save(f, arr)

with pytest.raises(ValueError, match="Header.*large"):
np.load(f, mmap_mode=mmap_mode)

with pytest.raises(ValueError, match="Header.*large"):
np.load(f, mmap_mode=mmap_mode, max_header_size=20000)

res = np.load(f, mmap_mode=mmap_mode, allow_pickle=True)
assert_array_equal(res, arr)

res = np.load(f, mmap_mode=mmap_mode, max_header_size=180000)
assert_array_equal(res, arr)

def test_huge_header_npz(tmpdir):
f = os.path.join(tmpdir, 'large_header.npz')
arr = np.array(1, dtype="i,"*10000+"i")

with pytest.warns(UserWarning, match=".*format 2.0"):
np.savez(f, arr=arr)

# Only getting the array from the file actually reads it
with pytest.raises(ValueError, match="Header.*large"):
np.load(f)["arr"]

with pytest.raises(ValueError, match="Header.*large"):
np.load(f, max_header_size=20000)["arr"]

res = np.load(f, allow_pickle=True)["arr"]
assert_array_equal(res, arr)

res = np.load(f, max_header_size=180000)["arr"]
assert_array_equal(res, arr)

def test_write_version():
f = BytesIO()
6 changes: 6 additions & 0 deletions numpy/lib/utils.py
@@ -971,6 +971,12 @@ def safe_eval(source):
Evaluate a string containing a Python literal expression without
allowing the execution of arbitrary non-literal code.

.. warning::

This function is identical to :py:func:`ast.literal_eval` and
has the same security implications. It may not always be safe
to evaluate large input strings.

Parameters
----------
source : str
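A one-line sketch of the documented equivalence, using a header-like literal:

```python
import ast
from numpy.lib.utils import safe_eval

src = "{'descr': '<i8', 'fortran_order': False, 'shape': (3, 4)}"
assert safe_eval(src) == ast.literal_eval(src)
```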