From 5b855cc8b94df719c64965f205c1355749fcd0e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 2 Feb 2022 16:36:10 +0100 Subject: [PATCH 01/29] wip --- joblib/numpy_pickle.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index cc593af22..e42d174cc 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -95,6 +95,14 @@ def write_array(self, array, pickler): # pickle protocol. pickle.dump(array, pickler.file_handle, protocol=2) else: + current_pos = pickler.file_handle.tell() + alignment = current_pos % 8 + # print('write', current_pos) + if alignment != 0: + padding = b' ' * (8 - alignment) + pickler.file_handle.write(padding) + # print('after padding write', pickler.file_handle.tell()) + for chunk in pickler.np.nditer(array, flags=['external_loop', 'buffered', @@ -121,6 +129,16 @@ def read_array(self, unpickler): # The array contained Python objects. We need to unpickle the data. array = pickle.load(unpickler.file_handle) else: + # print('read', unpickler.file_handle.tell()) + current_pos = unpickler.file_handle.tell() + alignment = current_pos % 8 + + if alignment != 0: + padding_length = 8 - alignment + unpickler.file_handle.seek(current_pos + padding_length) + + # print('read after padding', unpickler.file_handle.tell()) + # This is not a real file. We have to read it the # memory-intensive way. # crc32 module fails on reads greater than 2 ** 32 bytes, @@ -153,7 +171,11 @@ def read_array(self, unpickler): def read_mmap(self, unpickler): """Read an array using numpy memmap.""" - offset = unpickler.file_handle.tell() + current_pos = unpickler.file_handle.tell() + offset = current_pos + alignment = current_pos % 8 + if alignment != 0: + offset += 8 - alignment if unpickler.mmap_mode == 'w+': unpickler.mmap_mode = 'r+' From 232588b19410255802ce8af9ab98d88a3ea84bb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 2 Feb 2022 17:25:00 +0100 Subject: [PATCH 02/29] fix --- joblib/numpy_pickle.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index e42d174cc..7c8f5471c 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -133,7 +133,8 @@ def read_array(self, unpickler): current_pos = unpickler.file_handle.tell() alignment = current_pos % 8 - if alignment != 0: + current_byte = unpickler.file_handle.peek() + if alignment != 0 and current_byte == b' ': padding_length = 8 - alignment unpickler.file_handle.seek(current_pos + padding_length) @@ -350,6 +351,7 @@ def load_build(self): replace them directly in the stack of pickler. NDArrayWrapper is used for backward compatibility with joblib <= 0.9. """ + print('load_build', self.stack) Unpickler.load_build(self) # For backward compatibility, we support NDArrayWrapper objects. From 282d953f49cd1e73049364b9022e2a4abff44807 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 2 Feb 2022 17:46:43 +0100 Subject: [PATCH 03/29] test --- joblib/test/test_numpy_pickle.py | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index 34a9e231c..77fe735ef 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -1056,3 +1056,45 @@ def test_lz4_compression_without_lz4(tmpdir): with raises(ValueError) as excinfo: numpy_pickle.dump(data, fname + '.lz4') excinfo.match(msg) + + +@with_numpy +@parametrize('protocol', range(0, pickle.HIGHEST_PROTOCOL + 1)) +def test_memmap_alignment_padding(tmpdir, protocol): + # Test that memmaped arrays returned by numpy.load are correctly aligned + fname = tmpdir.join('test.mmap').strpath + + a = np.random.randn(2) + numpy_pickle.dump(a, fname, protocol=protocol) + memmap = numpy_pickle.load(fname, mmap_mode='r') + assert isinstance(memmap, np.memmap) + np.testing.assert_array_equal(a, memmap) + assert memmap.ctypes.data % 8 == 0 + assert memmap.flags.aligned + + l = [np.random.randn(2), np.random.randn(2), + np.random.randn(2), np.random.randn(2)] + + numpy_pickle.dump(l, fname, protocol=protocol) + l_reloaded = numpy_pickle.load(fname, mmap_mode='r') + + for idx, memmap in enumerate(l_reloaded): + assert isinstance(memmap, np.memmap) + np.testing.assert_array_equal(l[idx], memmap) + print("MODULO: {}".format(memmap.ctypes.data % 8)) + assert memmap.ctypes.data % 8 == 0 + assert memmap.flags.aligned + + d = {'a1': np.random.randn(100), + 'a2': np.random.randn(200), + 'a3': np.random.randn(300), + 'a4': np.random.randn(400)} + + numpy_pickle.dump(d, fname, protocol=protocol) + d_reloaded = numpy_pickle.load(fname, mmap_mode='r') + + for key, memmap in d_reloaded.items(): + assert isinstance(memmap, np.memmap) + np.testing.assert_array_equal(d[key], memmap) + assert memmap.ctypes.data % 8 == 0 + assert memmap.flags.aligned From 6b72bcfa40c59e8ae7370165dbd860873c794a43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 3 Feb 2022 09:24:51 +0100 Subject: [PATCH 04/29] Remove debug print --- joblib/numpy_pickle.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index 7c8f5471c..f09d4f135 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -97,11 +97,10 @@ def write_array(self, array, pickler): else: current_pos = pickler.file_handle.tell() alignment = current_pos % 8 - # print('write', current_pos) + if alignment != 0: padding = b' ' * (8 - alignment) pickler.file_handle.write(padding) - # print('after padding write', pickler.file_handle.tell()) for chunk in pickler.np.nditer(array, flags=['external_loop', @@ -129,7 +128,6 @@ def read_array(self, unpickler): # The array contained Python objects. We need to unpickle the data. array = pickle.load(unpickler.file_handle) else: - # print('read', unpickler.file_handle.tell()) current_pos = unpickler.file_handle.tell() alignment = current_pos % 8 @@ -138,8 +136,6 @@ def read_array(self, unpickler): padding_length = 8 - alignment unpickler.file_handle.seek(current_pos + padding_length) - # print('read after padding', unpickler.file_handle.tell()) - # This is not a real file. We have to read it the # memory-intensive way. # crc32 module fails on reads greater than 2 ** 32 bytes, @@ -351,7 +347,6 @@ def load_build(self): replace them directly in the stack of pickler. NDArrayWrapper is used for backward compatibility with joblib <= 0.9. """ - print('load_build', self.stack) Unpickler.load_build(self) # For backward compatibility, we support NDArrayWrapper objects. From ef5ddcd53f5d8089263f26c8f0e48bb38e242f4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 3 Feb 2022 12:05:56 +0100 Subject: [PATCH 05/29] fix (peek not supported in io.BytesIO) --- joblib/numpy_pickle.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index f09d4f135..9b1922e36 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -131,7 +131,10 @@ def read_array(self, unpickler): current_pos = unpickler.file_handle.tell() alignment = current_pos % 8 - current_byte = unpickler.file_handle.peek() + # peek not supported in io.BytesIO ... + current_byte = unpickler.file_handle.read(1) + unpickler.file_handle.seek(current_pos) + if alignment != 0 and current_byte == b' ': padding_length = 8 - alignment unpickler.file_handle.seek(current_pos + padding_length) From a851f5c7bb176a3cae4a1d7866582138406c9b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 3 Feb 2022 12:10:16 +0100 Subject: [PATCH 06/29] better variables --- joblib/numpy_pickle.py | 2 +- joblib/test/test_numpy_pickle.py | 24 ++++++++++++++---------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index 9b1922e36..31436c8be 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -97,7 +97,7 @@ def write_array(self, array, pickler): else: current_pos = pickler.file_handle.tell() alignment = current_pos % 8 - + if alignment != 0: padding = b' ' * (8 - alignment) pickler.file_handle.write(padding) diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index 77fe735ef..b1e138bf0 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -1072,29 +1072,33 @@ def test_memmap_alignment_padding(tmpdir, protocol): assert memmap.ctypes.data % 8 == 0 assert memmap.flags.aligned - l = [np.random.randn(2), np.random.randn(2), - np.random.randn(2), np.random.randn(2)] + array_list = [ + np.random.randn(2), np.random.randn(2), + np.random.randn(2), np.random.randn(2) + ] - numpy_pickle.dump(l, fname, protocol=protocol) + numpy_pickle.dump(array_list, fname, protocol=protocol) l_reloaded = numpy_pickle.load(fname, mmap_mode='r') for idx, memmap in enumerate(l_reloaded): assert isinstance(memmap, np.memmap) - np.testing.assert_array_equal(l[idx], memmap) + np.testing.assert_array_equal(array_list[idx], memmap) print("MODULO: {}".format(memmap.ctypes.data % 8)) assert memmap.ctypes.data % 8 == 0 assert memmap.flags.aligned - d = {'a1': np.random.randn(100), - 'a2': np.random.randn(200), - 'a3': np.random.randn(300), - 'a4': np.random.randn(400)} + array_dict = { + 'a1': np.random.randn(100), + 'a2': np.random.randn(200), + 'a3': np.random.randn(300), + 'a4': np.random.randn(400) + } - numpy_pickle.dump(d, fname, protocol=protocol) + numpy_pickle.dump(array_dict, fname, protocol=protocol) d_reloaded = numpy_pickle.load(fname, mmap_mode='r') for key, memmap in d_reloaded.items(): assert isinstance(memmap, np.memmap) - np.testing.assert_array_equal(d[key], memmap) + np.testing.assert_array_equal(array_dict[key], memmap) assert memmap.ctypes.data % 8 == 0 assert memmap.flags.aligned From 7cd90ed567a57b3176cb2a03738d6a267036b2a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 3 Feb 2022 12:31:33 +0100 Subject: [PATCH 07/29] handle case where .tell not supported --- joblib/numpy_pickle.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index 31436c8be..b1874add7 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -7,6 +7,8 @@ import pickle import os import warnings +import io + try: from pathlib import Path except ImportError: @@ -95,12 +97,16 @@ def write_array(self, array, pickler): # pickle protocol. pickle.dump(array, pickler.file_handle, protocol=2) else: - current_pos = pickler.file_handle.tell() - alignment = current_pos % 8 + try: + current_pos = pickler.file_handle.tell() + alignment = current_pos % 8 - if alignment != 0: - padding = b' ' * (8 - alignment) - pickler.file_handle.write(padding) + if alignment != 0: + padding = b' ' * (8 - alignment) + pickler.file_handle.write(padding) + except io.UnsupportedOperation: + # TODO log something somewhere? + pass for chunk in pickler.np.nditer(array, flags=['external_loop', @@ -128,16 +134,20 @@ def read_array(self, unpickler): # The array contained Python objects. We need to unpickle the data. array = pickle.load(unpickler.file_handle) else: - current_pos = unpickler.file_handle.tell() - alignment = current_pos % 8 + try: + current_pos = unpickler.file_handle.tell() + alignment = current_pos % 8 - # peek not supported in io.BytesIO ... - current_byte = unpickler.file_handle.read(1) - unpickler.file_handle.seek(current_pos) + # peek not supported in io.BytesIO ... + current_byte = unpickler.file_handle.read(1) + unpickler.file_handle.seek(current_pos) - if alignment != 0 and current_byte == b' ': - padding_length = 8 - alignment - unpickler.file_handle.seek(current_pos + padding_length) + if alignment != 0 and current_byte == b' ': + padding_length = 8 - alignment + unpickler.file_handle.seek(current_pos + padding_length) + except io.UnsupportedOperation: + # TODO log something somewhere? + pass # This is not a real file. We have to read it the # memory-intensive way. @@ -174,6 +184,7 @@ def read_mmap(self, unpickler): current_pos = unpickler.file_handle.tell() offset = current_pos alignment = current_pos % 8 + # Do I need to check whether current byte is b' '? if alignment != 0: offset += 8 - alignment if unpickler.mmap_mode == 'w+': From e0a994a04ddf9e9eaafe4d2fc23a2ef1b741d3c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 3 Feb 2022 15:59:34 +0100 Subject: [PATCH 08/29] Fix memmap with old pickles loaded in mmap mode --- joblib/numpy_pickle.py | 12 +++++++++--- joblib/test/test_numpy_pickle.py | 21 +++++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index b1874add7..1b0a0ab71 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -184,9 +184,15 @@ def read_mmap(self, unpickler): current_pos = unpickler.file_handle.tell() offset = current_pos alignment = current_pos % 8 - # Do I need to check whether current byte is b' '? - if alignment != 0: - offset += 8 - alignment + + # peek not supported in io.BytesIO ... + current_byte = unpickler.file_handle.read(1) + unpickler.file_handle.seek(current_pos) + + if alignment != 0 and current_byte == b' ': + padding_length = 8 - alignment + offset += padding_length + if unpickler.mmap_mode == 'w+': unpickler.mmap_mode = 'r+' diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index b1e138bf0..ee45f509e 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -462,6 +462,27 @@ def test_joblib_pickle_across_python_versions(): _check_pickle(fname, expected_list) +@with_numpy +def test_joblib_pickle_across_python_versions_with_mmap(): + expected_list = [np.arange(5, dtype=np.dtype(' Date: Thu, 3 Feb 2022 16:45:39 +0100 Subject: [PATCH 09/29] Use module variable for array bytes alignment --- joblib/numpy_pickle.py | 14 ++++++++------ joblib/test/test_numpy_pickle.py | 10 ++++++---- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index 1b0a0ab71..438616092 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -44,6 +44,8 @@ ############################################################################### # Utility objects for persistence. +NUMPY_ARRAY_ALIGNMENT_BYTES = 16 + class NumpyArrayWrapper(object): """An object to be persisted instead of numpy arrays. @@ -99,10 +101,10 @@ def write_array(self, array, pickler): else: try: current_pos = pickler.file_handle.tell() - alignment = current_pos % 8 + alignment = current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES if alignment != 0: - padding = b' ' * (8 - alignment) + padding = b' ' * (NUMPY_ARRAY_ALIGNMENT_BYTES - alignment) pickler.file_handle.write(padding) except io.UnsupportedOperation: # TODO log something somewhere? @@ -136,14 +138,14 @@ def read_array(self, unpickler): else: try: current_pos = unpickler.file_handle.tell() - alignment = current_pos % 8 + alignment = current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES # peek not supported in io.BytesIO ... current_byte = unpickler.file_handle.read(1) unpickler.file_handle.seek(current_pos) if alignment != 0 and current_byte == b' ': - padding_length = 8 - alignment + padding_length = NUMPY_ARRAY_ALIGNMENT_BYTES - alignment unpickler.file_handle.seek(current_pos + padding_length) except io.UnsupportedOperation: # TODO log something somewhere? @@ -183,14 +185,14 @@ def read_mmap(self, unpickler): """Read an array using numpy memmap.""" current_pos = unpickler.file_handle.tell() offset = current_pos - alignment = current_pos % 8 + alignment = current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES # peek not supported in io.BytesIO ... current_byte = unpickler.file_handle.read(1) unpickler.file_handle.seek(current_pos) if alignment != 0 and current_byte == b' ': - padding_length = 8 - alignment + padding_length = NUMPY_ARRAY_ALIGNMENT_BYTES - alignment offset += padding_length if unpickler.mmap_mode == 'w+': diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index ee45f509e..cdc22d285 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -1090,7 +1090,8 @@ def test_memmap_alignment_padding(tmpdir, protocol): memmap = numpy_pickle.load(fname, mmap_mode='r') assert isinstance(memmap, np.memmap) np.testing.assert_array_equal(a, memmap) - assert memmap.ctypes.data % 8 == 0 + assert ( + memmap.ctypes.data % numpy_pickle.NUMPY_ARRAY_ALIGNMENT_BYTES == 0) assert memmap.flags.aligned array_list = [ @@ -1104,8 +1105,8 @@ def test_memmap_alignment_padding(tmpdir, protocol): for idx, memmap in enumerate(l_reloaded): assert isinstance(memmap, np.memmap) np.testing.assert_array_equal(array_list[idx], memmap) - print("MODULO: {}".format(memmap.ctypes.data % 8)) - assert memmap.ctypes.data % 8 == 0 + assert ( + memmap.ctypes.data % numpy_pickle.NUMPY_ARRAY_ALIGNMENT_BYTES == 0) assert memmap.flags.aligned array_dict = { @@ -1121,5 +1122,6 @@ def test_memmap_alignment_padding(tmpdir, protocol): for key, memmap in d_reloaded.items(): assert isinstance(memmap, np.memmap) np.testing.assert_array_equal(array_dict[key], memmap) - assert memmap.ctypes.data % 8 == 0 + assert ( + memmap.ctypes.data % numpy_pickle.NUMPY_ARRAY_ALIGNMENT_BYTES == 0) assert memmap.flags.aligned From 495273252fafafcc8f64408f8b97277f116316d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 4 Feb 2022 14:48:02 +0100 Subject: [PATCH 10/29] Fix for Windows --- joblib/test/test_numpy_pickle.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index cdc22d285..4505f2f6b 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -1099,6 +1099,8 @@ def test_memmap_alignment_padding(tmpdir, protocol): np.random.randn(2), np.random.randn(2) ] + # On Windows OSError 22 if reusing the same path for memmap ... + fname = tmpdir.join('test1.mmap').strpath numpy_pickle.dump(array_list, fname, protocol=protocol) l_reloaded = numpy_pickle.load(fname, mmap_mode='r') @@ -1116,6 +1118,8 @@ def test_memmap_alignment_padding(tmpdir, protocol): 'a4': np.random.randn(400) } + # On Windows OSError 22 if reusing the same path for memmap ... + fname = tmpdir.join('test2.mmap').strpath numpy_pickle.dump(array_dict, fname, protocol=protocol) d_reloaded = numpy_pickle.load(fname, mmap_mode='r') From 513e0d176847f5f1c94ebf9ee239095294d8e27f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 4 Feb 2022 17:12:50 +0100 Subject: [PATCH 11/29] Fix test checking old non-aligned arrays with memmap --- joblib/test/test_numpy_pickle.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index 439517417..479cdaa46 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -368,7 +368,7 @@ def test_compressed_pickle_dump_and_load(tmpdir): assert result == expected -def _check_pickle(filename, expected_list): +def _check_pickle(filename, expected_list, mmap_mode=None): """Helper function to test joblib pickle content. Note: currently only pickles containing an iterable are supported @@ -388,7 +388,7 @@ def _check_pickle(filename, expected_list): warnings.filterwarnings( 'ignore', module='numpy', message='The compiler package is deprecated') - result_list = numpy_pickle.load(filename) + result_list = numpy_pickle.load(filename, mmap_mode=mmap_mode) filename_base = os.path.basename(filename) expected_nb_warnings = 1 if ("_0.9" in filename_base or "_0.8.4" in filename_base) else 0 @@ -483,7 +483,7 @@ def test_joblib_pickle_across_python_versions_with_mmap(): os.path.join(test_data_dir, fn) for fn in os.listdir(test_data_dir) if fn.endswith('.pkl')] for fname in pickle_filenames: - _check_pickle(fname, expected_list) + _check_pickle(fname, expected_list, mmap_mode='r') @with_numpy From d7cbd3f285003939de36ddb7037728bf16155b7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 8 Feb 2022 11:47:40 +0100 Subject: [PATCH 12/29] Add align_bytes_array attribute in NumpyArrayWrapper --- joblib/numpy_pickle.py | 60 +++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index b1c0b222f..ab4a5745e 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -73,13 +73,19 @@ class NumpyArrayWrapper(object): Default: False. """ - def __init__(self, subclass, shape, order, dtype, allow_mmap=False): + def __init__(self, subclass, shape, order, dtype, allow_mmap=False, + array_bytes_aligned=True): """Constructor. Store the useful information for later.""" self.subclass = subclass self.shape = shape self.order = order self.dtype = dtype self.allow_mmap = allow_mmap + self.array_bytes_aligned = array_bytes_aligned + + def _array_bytes_aligned(self): + # old pickles don't have the array_bytes_aligned attribute + return getattr(self, 'array_bytes_aligned', False) def write_array(self, array, pickler): """Write array bytes to pickler file handle. @@ -95,16 +101,12 @@ def write_array(self, array, pickler): # pickle protocol. pickle.dump(array, pickler.file_handle, protocol=2) else: - try: + if self._array_bytes_aligned(): current_pos = pickler.file_handle.tell() alignment = current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES - if alignment != 0: padding = b' ' * (NUMPY_ARRAY_ALIGNMENT_BYTES - alignment) pickler.file_handle.write(padding) - except io.UnsupportedOperation: - # TODO log something somewhere? - pass for chunk in pickler.np.nditer(array, flags=['external_loop', @@ -132,20 +134,17 @@ def read_array(self, unpickler): # The array contained Python objects. We need to unpickle the data. array = pickle.load(unpickler.file_handle) else: - try: - current_pos = unpickler.file_handle.tell() - alignment = current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES - - # peek not supported in io.BytesIO ... - current_byte = unpickler.file_handle.read(1) - unpickler.file_handle.seek(current_pos) - - if alignment != 0 and current_byte == b' ': - padding_length = NUMPY_ARRAY_ALIGNMENT_BYTES - alignment - unpickler.file_handle.seek(current_pos + padding_length) - except io.UnsupportedOperation: - # TODO log something somewhere? - pass + if self._array_bytes_aligned(): + try: + current_pos = unpickler.file_handle.tell() + alignment = current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES + if alignment != 0: + padding_length = NUMPY_ARRAY_ALIGNMENT_BYTES - alignment + unpickler.file_handle.seek(current_pos + padding_length) + except io.UnsupportedOperation as exc: + raise RuntimeError( + 'Trying to read a joblib pickle with bytes aligned ' + 'numpy arrays in a file_handle that does not support .tell') from exc # This is not a real file. We have to read it the # memory-intensive way. @@ -183,13 +182,12 @@ def read_mmap(self, unpickler): offset = current_pos alignment = current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES - # peek not supported in io.BytesIO ... - current_byte = unpickler.file_handle.read(1) - unpickler.file_handle.seek(current_pos) + if self._array_bytes_aligned(): + unpickler.file_handle.seek(current_pos) - if alignment != 0 and current_byte == b' ': - padding_length = NUMPY_ARRAY_ALIGNMENT_BYTES - alignment - offset += padding_length + if alignment != 0: + padding_length = NUMPY_ARRAY_ALIGNMENT_BYTES - alignment + offset += padding_length if unpickler.mmap_mode == 'w+': unpickler.mmap_mode = 'r+' @@ -279,9 +277,17 @@ def _create_array_wrapper(self, array): order = 'F' if (array.flags.f_contiguous and not array.flags.c_contiguous) else 'C' allow_mmap = not self.buffered and not array.dtype.hasobject + + try: + self.file_handle.tell() + array_bytes_aligned = True + except io.UnsupportedOperation: + array_bytes_aligned = False + wrapper = NumpyArrayWrapper(type(array), array.shape, order, array.dtype, - allow_mmap=allow_mmap) + allow_mmap=allow_mmap, + array_bytes_aligned=array_bytes_aligned) return wrapper From d66f2bda4c9b096aa963ff5c2214f4a3c246e23a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 8 Feb 2022 15:15:13 +0100 Subject: [PATCH 13/29] lint --- joblib/numpy_pickle.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index ab4a5745e..cc8b136ba 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -139,12 +139,15 @@ def read_array(self, unpickler): current_pos = unpickler.file_handle.tell() alignment = current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES if alignment != 0: - padding_length = NUMPY_ARRAY_ALIGNMENT_BYTES - alignment - unpickler.file_handle.seek(current_pos + padding_length) + padding_length = ( + NUMPY_ARRAY_ALIGNMENT_BYTES - alignment) + unpickler.file_handle.seek( + current_pos + padding_length) except io.UnsupportedOperation as exc: raise RuntimeError( 'Trying to read a joblib pickle with bytes aligned ' - 'numpy arrays in a file_handle that does not support .tell') from exc + 'numpy arrays in a file_handle ' + 'that does not support .tell') from exc # This is not a real file. We have to read it the # memory-intensive way. From 7dab61b3fed68fe9ec7e98b06091882dd373ea4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 9 Feb 2022 16:13:54 +0100 Subject: [PATCH 14/29] Tweak --- joblib/numpy_pickle.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index cc8b136ba..dd3a0e472 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -84,7 +84,8 @@ def __init__(self, subclass, shape, order, dtype, allow_mmap=False, self.array_bytes_aligned = array_bytes_aligned def _array_bytes_aligned(self): - # old pickles don't have the array_bytes_aligned attribute + # joblib <= 1.1 pickles NumpyArrayWrapper instances don't have an + # array_bytes_aligned attribute return getattr(self, 'array_bytes_aligned', False) def write_array(self, array, pickler): @@ -186,8 +187,6 @@ def read_mmap(self, unpickler): alignment = current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES if self._array_bytes_aligned(): - unpickler.file_handle.seek(current_pos) - if alignment != 0: padding_length = NUMPY_ARRAY_ALIGNMENT_BYTES - alignment offset += padding_length From f943214b3a52e54eb5b4426266e329f100c93566 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 10 Feb 2022 09:09:12 +0100 Subject: [PATCH 15/29] trigger CI From fe5036d54d97c577cdab4ff29a3a5985363660c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 17 Feb 2022 16:02:15 +0100 Subject: [PATCH 16/29] Trigger CI From 0522f2335d0f42954f23e0d1636dcb7c77e083b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 17 Feb 2022 16:25:36 +0100 Subject: [PATCH 17/29] Create a numpy_array_alignment_bytes attribute to allow for future change of alignment --- joblib/numpy_pickle.py | 49 ++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index dd3a0e472..805967bfb 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -40,6 +40,9 @@ ############################################################################### # Utility objects for persistence. +# For convenience, 16 bytes are used to be sure to cover all the possible +# dtypes' alignments. For reference, see: +# https://numpy.org/devdocs/dev/alignment.html NUMPY_ARRAY_ALIGNMENT_BYTES = 16 @@ -74,19 +77,22 @@ class NumpyArrayWrapper(object): """ def __init__(self, subclass, shape, order, dtype, allow_mmap=False, - array_bytes_aligned=True): + numpy_array_alignment_bytes=NUMPY_ARRAY_ALIGNMENT_BYTES): """Constructor. Store the useful information for later.""" self.subclass = subclass self.shape = shape self.order = order self.dtype = dtype self.allow_mmap = allow_mmap - self.array_bytes_aligned = array_bytes_aligned + # We make numpy_array_alignment_bytes an instance attribute to allow us + # to change our mind about the default alignment and still load the old + # pickles (with the previous alignment) correctly + self.numpy_array_alignment_bytes = numpy_array_alignment_bytes - def _array_bytes_aligned(self): - # joblib <= 1.1 pickles NumpyArrayWrapper instances don't have an - # array_bytes_aligned attribute - return getattr(self, 'array_bytes_aligned', False) + def safe_get_numpy_array_alignment_bytes(self): + # NumpyArrayWrapper instances loaded from joblib <= 1.1 pickles don't + # have an numpy_array_alignment_bytes attribute + return getattr(self, 'numpy_array_alignment_bytes', None) def write_array(self, array, pickler): """Write array bytes to pickler file handle. @@ -102,11 +108,13 @@ def write_array(self, array, pickler): # pickle protocol. pickle.dump(array, pickler.file_handle, protocol=2) else: - if self._array_bytes_aligned(): + numpy_array_alignment_bytes = \ + self.safe_get_numpy_array_alignment_bytes() + if numpy_array_alignment_bytes is not None: current_pos = pickler.file_handle.tell() - alignment = current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES + alignment = current_pos % numpy_array_alignment_bytes if alignment != 0: - padding = b' ' * (NUMPY_ARRAY_ALIGNMENT_BYTES - alignment) + padding = b' ' * (numpy_array_alignment_bytes - alignment) pickler.file_handle.write(padding) for chunk in pickler.np.nditer(array, @@ -135,13 +143,15 @@ def read_array(self, unpickler): # The array contained Python objects. We need to unpickle the data. array = pickle.load(unpickler.file_handle) else: - if self._array_bytes_aligned(): + numpy_array_alignment_bytes = \ + self.safe_get_numpy_array_alignment_bytes() + if numpy_array_alignment_bytes is not None: try: current_pos = unpickler.file_handle.tell() - alignment = current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES + alignment = current_pos % numpy_array_alignment_bytes if alignment != 0: padding_length = ( - NUMPY_ARRAY_ALIGNMENT_BYTES - alignment) + numpy_array_alignment_bytes - alignment) unpickler.file_handle.seek( current_pos + padding_length) except io.UnsupportedOperation as exc: @@ -184,11 +194,14 @@ def read_mmap(self, unpickler): """Read an array using numpy memmap.""" current_pos = unpickler.file_handle.tell() offset = current_pos - alignment = current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES + numpy_array_alignment_bytes = \ + self.safe_get_numpy_array_alignment_bytes() + + if numpy_array_alignment_bytes is not None: + alignment = current_pos % numpy_array_alignment_bytes - if self._array_bytes_aligned(): if alignment != 0: - padding_length = NUMPY_ARRAY_ALIGNMENT_BYTES - alignment + padding_length = self.numpy_array_alignment_bytes - alignment offset += padding_length if unpickler.mmap_mode == 'w+': @@ -280,16 +293,16 @@ def _create_array_wrapper(self, array): not array.flags.c_contiguous) else 'C' allow_mmap = not self.buffered and not array.dtype.hasobject + kwargs = {} try: self.file_handle.tell() - array_bytes_aligned = True except io.UnsupportedOperation: - array_bytes_aligned = False + kwargs = {'numpy_array_alignment_bytes': None} wrapper = NumpyArrayWrapper(type(array), array.shape, order, array.dtype, allow_mmap=allow_mmap, - array_bytes_aligned=array_bytes_aligned) + **kwargs) return wrapper From 41dc185335d3c9d7973bea3cb88df60633430c39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 17 Feb 2022 16:40:07 +0100 Subject: [PATCH 18/29] Add test for edge case where aligned numpy arrays are read in a file handle that does not support tell --- joblib/numpy_pickle.py | 2 +- joblib/test/test_numpy_pickle.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index 805967bfb..dde165266 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -157,7 +157,7 @@ def read_array(self, unpickler): except io.UnsupportedOperation as exc: raise RuntimeError( 'Trying to read a joblib pickle with bytes aligned ' - 'numpy arrays in a file_handle ' + 'numpy arrays in a file handle ' 'that does not support .tell') from exc # This is not a real file. We have to read it the diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index 479cdaa46..409eaabcd 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -935,6 +935,15 @@ def test_pickle_in_socket(): np.testing.assert_array_equal(array_reloaded, test_array) + bytes_to_send = io.BytesIO() + numpy_pickle.dump(test_array, bytes_to_send) + server.send(bytes_to_send.getvalue()) + + with client.makefile("rb") as cf: + match = 'bytes aligned numpy arrays.+does not support .tell' + with pytest.raises(RuntimeError, match=match): + array_reloaded = numpy_pickle.load(cf) + @with_numpy def test_load_memmap_with_big_offset(tmpdir): From 73eb719bea1af64b39cd2c8ffb1282faddda9297 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 24 Feb 2022 15:09:36 +0100 Subject: [PATCH 19/29] Tackled most comments --- joblib/numpy_pickle.py | 41 +++++++++++++++----------------- joblib/test/test_numpy_pickle.py | 14 +++++++---- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index 15d3ae2a0..cddba9a86 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -112,11 +112,18 @@ def write_array(self, array, pickler): self.safe_get_numpy_array_alignment_bytes() if numpy_array_alignment_bytes is not None: current_pos = pickler.file_handle.tell() - alignment = current_pos % numpy_array_alignment_bytes - if alignment != 0: - padding = b' ' * (numpy_array_alignment_bytes - alignment) + pos_after_padding_byte = current_pos + 1 + padding_length = numpy_array_alignment_bytes - ( + pos_after_padding_byte % numpy_array_alignment_bytes) + # TODO: byteorder is a required parameter but it does not matter since it is only one byte + padding_length_byte = int.to_bytes( + padding_length, length=1, byteorder='little') + pickler.file_handle.write(padding_length_byte) + + if padding_length != 0: + padding = b'\xff' * padding_length pickler.file_handle.write(padding) - + for chunk in pickler.np.nditer(array, flags=['external_loop', 'buffered', @@ -146,19 +153,10 @@ def read_array(self, unpickler): numpy_array_alignment_bytes = \ self.safe_get_numpy_array_alignment_bytes() if numpy_array_alignment_bytes is not None: - try: - current_pos = unpickler.file_handle.tell() - alignment = current_pos % numpy_array_alignment_bytes - if alignment != 0: - padding_length = ( - numpy_array_alignment_bytes - alignment) - unpickler.file_handle.seek( - current_pos + padding_length) - except io.UnsupportedOperation as exc: - raise RuntimeError( - 'Trying to read a joblib pickle with bytes aligned ' - 'numpy arrays in a file handle ' - 'that does not support .tell') from exc + padding_byte = unpickler.file_handle.read(1) + padding_length = int.from_bytes(padding_byte, byteorder='little') + if padding_length != 0: + unpickler.file_handle.read(padding_length) # This is not a real file. We have to read it the # memory-intensive way. @@ -198,11 +196,10 @@ def read_mmap(self, unpickler): self.safe_get_numpy_array_alignment_bytes() if numpy_array_alignment_bytes is not None: - alignment = current_pos % numpy_array_alignment_bytes - - if alignment != 0: - padding_length = self.numpy_array_alignment_bytes - alignment - offset += padding_length + padding_byte = unpickler.file_handle.read(1) + padding_length = int.from_bytes(padding_byte, byteorder='little') + # + 1 is for the padding byte + offset += padding_length + 1 if unpickler.mmap_mode == 'w+': unpickler.mmap_mode = 'r+' diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index 6aa8a0b7b..4a0cf2c40 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -937,14 +937,16 @@ def test_pickle_in_socket(): np.testing.assert_array_equal(array_reloaded, test_array) + # Check that a byte-aligned numpy array written in a file can be send over + # a socket and then read on the other side bytes_to_send = io.BytesIO() numpy_pickle.dump(test_array, bytes_to_send) server.send(bytes_to_send.getvalue()) with client.makefile("rb") as cf: - match = 'bytes aligned numpy arrays.+does not support .tell' - with pytest.raises(RuntimeError, match=match): - array_reloaded = numpy_pickle.load(cf) + array_reloaded = numpy_pickle.load(cf) + + np.testing.assert_array_equal(array_reloaded, test_array) @with_numpy @@ -1088,8 +1090,12 @@ def test_lz4_compression_without_lz4(tmpdir): excinfo.match(msg) +protocols = [pickle.DEFAULT_PROTOCOL] +if pickle.HIGHEST_PROTOCOL != pickle.DEFAULT_PROTOCOL: + protocols.append(pickle.HIGHEST_PROTOCOL) + @with_numpy -@parametrize('protocol', range(0, pickle.HIGHEST_PROTOCOL + 1)) +@parametrize('protocol', protocols) def test_memmap_alignment_padding(tmpdir, protocol): # Test that memmaped arrays returned by numpy.load are correctly aligned fname = tmpdir.join('test.mmap').strpath From e6409e9edb7992b8066a83e856d4baddf06f6825 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 24 Feb 2022 15:54:02 +0100 Subject: [PATCH 20/29] Add warning when loading an old pickle with non memory aligned array --- joblib/numpy_pickle.py | 16 +++++++++++++++- joblib/test/test_numpy_pickle.py | 26 ++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index cddba9a86..11b13b990 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -123,7 +123,7 @@ def write_array(self, array, pickler): if padding_length != 0: padding = b'\xff' * padding_length pickler.file_handle.write(padding) - + for chunk in pickler.np.nditer(array, flags=['external_loop', 'buffered', @@ -213,6 +213,20 @@ def read_mmap(self, unpickler): # update the offset so that it corresponds to the end of the read array unpickler.file_handle.seek(offset + marray.nbytes) + if (numpy_array_alignment_bytes is None and + current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES != 0): + message = ( + 'The memmapped array {marray} loaded from the file ' + f'{unpickler.file_handle.name} is not not bytes aligned. ' + 'This may cause segmentation faults if this memmapped array ' + 'is used in some libraries like BLAS or PyTorch. ' + 'To get rid of this warning, regenerate your pickle file ' + ' with joblib >= 1.2.0. ' + 'See https://github.com/joblib/joblib/issues/563 ' + 'for more details' + ) + warnings.warn(message) + return marray def read(self, unpickler): diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index 4a0cf2c40..54e15c395 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -392,15 +392,33 @@ def _check_pickle(filename, expected_list, mmap_mode=None): message='The compiler package is deprecated') result_list = numpy_pickle.load(filename, mmap_mode=mmap_mode) filename_base = os.path.basename(filename) - expected_nb_warnings = 1 if ("_0.9" in filename_base or - "_0.8.4" in filename_base) else 0 + expected_nb_deprecation_warnings = 1 if ( + "_0.9" in filename_base or "_0.8.4" in filename_base) else 0 + + expected_nb_user_warnings = 3 if ( + re.search("_0.1.+.pkl$", filename_base) and + mmap_mode is not None) else 0 + expected_nb_warnings = \ + expected_nb_deprecation_warnings + expected_nb_user_warnings assert len(warninfo) == expected_nb_warnings - for w in warninfo: - assert w.category == DeprecationWarning + + deprecation_warnings = [ + w for w in warninfo if issubclass( + w.category, DeprecationWarning)] + user_warnings = [ + w for w in warninfo if issubclass( + w.category, UserWarning)] + for w in deprecation_warnings: assert (str(w.message) == "The file '{0}' has been generated with a joblib " "version less than 0.10. Please regenerate this " "pickle file.".format(filename)) + + for w in user_warnings: + assert re.search( + f"memmapped.+{filename}.+segmentation fault", + str(w.message)) + for result, expected in zip(result_list, expected_list): if isinstance(expected, np.ndarray): expected = _ensure_native_byte_order(expected) From a36af7ca666eee5a88f7795f7ca1c28b9f96d0c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 24 Feb 2022 16:01:53 +0100 Subject: [PATCH 21/29] Add changelog entry. --- CHANGES.rst | 3 +++ joblib/numpy_pickle.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index c5c428a92..e0631f402 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -16,6 +16,9 @@ Development version worker processes in case of a crash. https://github.com/joblib/joblib/pull/1269 +- Fix memory alignment bug for pickles containing numpy arrays. + https://github.com/joblib/joblib/pull/1254/files + Release 1.1.0 -------------- diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index 11b13b990..35e250518 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -221,7 +221,7 @@ def read_mmap(self, unpickler): 'This may cause segmentation faults if this memmapped array ' 'is used in some libraries like BLAS or PyTorch. ' 'To get rid of this warning, regenerate your pickle file ' - ' with joblib >= 1.2.0. ' + 'with joblib >= 1.2.0. ' 'See https://github.com/joblib/joblib/issues/563 ' 'for more details' ) From 54c2773776f3e549e5b82416642189ca914c4dfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 24 Feb 2022 16:09:36 +0100 Subject: [PATCH 22/29] tweak --- joblib/numpy_pickle.py | 6 ++++-- joblib/test/test_numpy_pickle.py | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index 35e250518..928342fbb 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -115,7 +115,8 @@ def write_array(self, array, pickler): pos_after_padding_byte = current_pos + 1 padding_length = numpy_array_alignment_bytes - ( pos_after_padding_byte % numpy_array_alignment_bytes) - # TODO: byteorder is a required parameter but it does not matter since it is only one byte + # A single byte is written that contains the padding length in + # bytes padding_length_byte = int.to_bytes( padding_length, length=1, byteorder='little') pickler.file_handle.write(padding_length_byte) @@ -154,7 +155,8 @@ def read_array(self, unpickler): self.safe_get_numpy_array_alignment_bytes() if numpy_array_alignment_bytes is not None: padding_byte = unpickler.file_handle.read(1) - padding_length = int.from_bytes(padding_byte, byteorder='little') + padding_length = int.from_bytes( + padding_byte, byteorder='little') if padding_length != 0: unpickler.file_handle.read(padding_length) diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index 54e15c395..99294e147 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -1112,6 +1112,7 @@ def test_lz4_compression_without_lz4(tmpdir): if pickle.HIGHEST_PROTOCOL != pickle.DEFAULT_PROTOCOL: protocols.append(pickle.HIGHEST_PROTOCOL) + @with_numpy @parametrize('protocol', protocols) def test_memmap_alignment_padding(tmpdir, protocol): From dfccb94a58bb0f4a08a02ff58b0d4c587544869e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 24 Feb 2022 16:20:35 +0100 Subject: [PATCH 23/29] Forgotten f-string --- joblib/numpy_pickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index 928342fbb..fa450fbba 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -218,7 +218,7 @@ def read_mmap(self, unpickler): if (numpy_array_alignment_bytes is None and current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES != 0): message = ( - 'The memmapped array {marray} loaded from the file ' + f'The memmapped array {marray} loaded from the file ' f'{unpickler.file_handle.name} is not not bytes aligned. ' 'This may cause segmentation faults if this memmapped array ' 'is used in some libraries like BLAS or PyTorch. ' From 236e0421740fe49f1238c2ca269cf6f4b356dff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 24 Feb 2022 16:50:26 +0100 Subject: [PATCH 24/29] Windows fix --- joblib/test/test_numpy_pickle.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index 99294e147..2c7005c5b 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -415,8 +415,9 @@ def _check_pickle(filename, expected_list, mmap_mode=None): "pickle file.".format(filename)) for w in user_warnings: + escaped_filename = re.escape(filename) assert re.search( - f"memmapped.+{filename}.+segmentation fault", + f"memmapped.+{escaped_filename}.+segmentation fault", str(w.message)) for result, expected in zip(result_list, expected_list): From 3995ed4f7ea1cbb82b458f1135b16290a7433d98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 24 Feb 2022 18:05:57 +0100 Subject: [PATCH 25/29] Trigger CI From 4705aadb494436a9f654f0629fcb7038b19a4822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 25 Feb 2022 05:16:02 +0100 Subject: [PATCH 26/29] Apply suggestions from code review Co-authored-by: Olivier Grisel --- CHANGES.rst | 11 ++++++++++- joblib/test/test_numpy_pickle.py | 13 +++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index e0631f402..272880a57 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -17,7 +17,16 @@ Development version https://github.com/joblib/joblib/pull/1269 - Fix memory alignment bug for pickles containing numpy arrays. - https://github.com/joblib/joblib/pull/1254/files + This is especially important when loading the pkl with + `mmap_mode != None` as the resulting numpy.memmap object + would not be able to correct the misalignment without performing + a memory copy. + This bug would cause invalid computation and segmentation faults + with native code that would directly access the underlying data + buffer of a numpy array, for instance C/C++/Cython code compiled + with older GCC versions, some old OpenBLAS written in platform + specific assembly...). + https://github.com/joblib/joblib/pull/1254 Release 1.1.0 -------------- diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index 2c7005c5b..3174a00bc 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -1147,10 +1147,15 @@ def test_memmap_alignment_padding(tmpdir, protocol): assert memmap.flags.aligned array_dict = { - 'a1': np.random.randn(100), - 'a2': np.random.randn(200), - 'a3': np.random.randn(300), - 'a4': np.random.randn(400) + 'a0': np.arange(2, dtype=np.uint8), + 'a1': np.arange(3, dtype=np.uint8), + 'a2': np.arange(5, dtype=np.uint8), + 'a3': np.arange(7, dtype=np.uint8), + 'a4': np.arange(11, dtype=np.uint8), + 'a5': np.arange(13, dtype=np.uint8), + 'a6': np.arange(17, dtype=np.uint8), + 'a7': np.arange(19, dtype=np.uint8), + 'a8': np.arange(23, dtype=np.uint8), } # On Windows OSError 22 if reusing the same path for memmap ... From 2e380bb2a35bb4a6913730e5db356f5b47358b35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 25 Feb 2022 05:19:40 +0100 Subject: [PATCH 27/29] tweak --- CHANGES.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 272880a57..56c8e764b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -17,15 +17,15 @@ Development version https://github.com/joblib/joblib/pull/1269 - Fix memory alignment bug for pickles containing numpy arrays. - This is especially important when loading the pkl with - `mmap_mode != None` as the resulting numpy.memmap object + This is especially important when loading the pickle with + ``mmap_mode != None`` as the resulting ``numpy.memmap`` object would not be able to correct the misalignment without performing a memory copy. This bug would cause invalid computation and segmentation faults with native code that would directly access the underlying data buffer of a numpy array, for instance C/C++/Cython code compiled - with older GCC versions, some old OpenBLAS written in platform - specific assembly...). + with older GCC versions or some old OpenBLAS written in platform + specific assembly. https://github.com/joblib/joblib/pull/1254 Release 1.1.0 From eb698030c66c516d78d196ba180e375e6fc956a2 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 25 Feb 2022 15:49:05 +0100 Subject: [PATCH 28/29] Update joblib/test/test_numpy_pickle.py --- joblib/test/test_numpy_pickle.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py index 3174a00bc..cce16e381 100644 --- a/joblib/test/test_numpy_pickle.py +++ b/joblib/test/test_numpy_pickle.py @@ -1147,15 +1147,15 @@ def test_memmap_alignment_padding(tmpdir, protocol): assert memmap.flags.aligned array_dict = { - 'a0': np.arange(2, dtype=np.uint8), + 'a0': np.arange(2, dtype=np.uint8), 'a1': np.arange(3, dtype=np.uint8), 'a2': np.arange(5, dtype=np.uint8), 'a3': np.arange(7, dtype=np.uint8), 'a4': np.arange(11, dtype=np.uint8), - 'a5': np.arange(13, dtype=np.uint8), - 'a6': np.arange(17, dtype=np.uint8), - 'a7': np.arange(19, dtype=np.uint8), - 'a8': np.arange(23, dtype=np.uint8), + 'a5': np.arange(13, dtype=np.uint8), + 'a6': np.arange(17, dtype=np.uint8), + 'a7': np.arange(19, dtype=np.uint8), + 'a8': np.arange(23, dtype=np.uint8), } # On Windows OSError 22 if reusing the same path for memmap ... From 188c0898d55bc0aa30489b27e39290a8e29845a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 25 Feb 2022 17:31:02 +0100 Subject: [PATCH 29/29] Trigger CI