joblib · ogrisel · Feb 25, 2022 · Feb 2, 2022 · Feb 2, 2022 · Feb 2, 2022
diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py
@@ -7,6 +7,8 @@
 import pickle
 import os
 import warnings
+import io
+
 try:
     from pathlib import Path
 except ImportError:
@@ -95,6 +97,17 @@ def write_array(self, array, pickler):
             # pickle protocol.
             pickle.dump(array, pickler.file_handle, protocol=2)
         else:
+            try:
+                current_pos = pickler.file_handle.tell()
+                alignment = current_pos % 8
+
+                if alignment != 0:
+                    padding = b' ' * (8 - alignment)
+                    pickler.file_handle.write(padding)
+            except io.UnsupportedOperation:
+                # TODO: log that no alignment padding was written.
+                pass
+
             for chunk in pickler.np.nditer(array,
                                            flags=['external_loop',
                                                   'buffered',
@@ -121,6 +134,21 @@ def read_array(self, unpickler):
             # The array contained Python objects. We need to unpickle the data.
             array = pickle.load(unpickler.file_handle)
         else:
+            try:
+                current_pos = unpickler.file_handle.tell()
+                alignment = current_pos % 8
+
+                # io.BytesIO has no peek(): read one byte, then seek back.
+                current_byte = unpickler.file_handle.read(1)
+                unpickler.file_handle.seek(current_pos)
+
+                if alignment != 0 and current_byte == b' ':
+                    padding_length = 8 - alignment
+                    unpickler.file_handle.seek(current_pos + padding_length)
+            except io.UnsupportedOperation:
+                # TODO: log that padding detection was skipped.
+                pass
+
             # This is not a real file. We have to read it the
             # memory-intensive way.
             # crc32 module fails on reads greater than 2 ** 32 bytes,
@@ -153,7 +181,12 @@ def read_array(self, unpickler):
 
     def read_mmap(self, unpickler):
         """Read an array using numpy memmap."""
-        offset = unpickler.file_handle.tell()
+        current_pos = unpickler.file_handle.tell()
+        offset = current_pos
+        alignment = current_pos % 8
+        # TODO: confirm whether the current byte must be checked for b' '.
+        if alignment != 0:
+            offset += 8 - alignment
         if unpickler.mmap_mode == 'w+':
             unpickler.mmap_mode = 'r+'
 

diff --git a/joblib/test/test_numpy_pickle.py b/joblib/test/test_numpy_pickle.py
@@ -1056,3 +1056,49 @@ def test_lz4_compression_without_lz4(tmpdir):
     with raises(ValueError) as excinfo:
         numpy_pickle.dump(data, fname + '.lz4')
     excinfo.match(msg)
+
+
+@with_numpy
+@parametrize('protocol', range(0, pickle.HIGHEST_PROTOCOL + 1))
+def test_memmap_alignment_padding(tmpdir, protocol):
+    # Check that arrays memmapped by numpy_pickle.load start on an 8-byte
+    # boundary, whether dumped alone, inside a list or inside a dict.
+    fname = tmpdir.join('test.mmap').strpath
+
+    a = np.random.randn(2)
+    numpy_pickle.dump(a, fname, protocol=protocol)
+    memmap = numpy_pickle.load(fname, mmap_mode='r')
+    assert isinstance(memmap, np.memmap)
+    np.testing.assert_array_equal(a, memmap)
+    assert memmap.ctypes.data % 8 == 0
+    assert memmap.flags.aligned
+
+    array_list = [
+        np.random.randn(2), np.random.randn(2),
+        np.random.randn(2), np.random.randn(2)
+    ]
+
+    numpy_pickle.dump(array_list, fname, protocol=protocol)
+    l_reloaded = numpy_pickle.load(fname, mmap_mode='r')
+
+    for idx, memmap in enumerate(l_reloaded):
+        assert isinstance(memmap, np.memmap)
+        np.testing.assert_array_equal(array_list[idx], memmap)
+        assert memmap.ctypes.data % 8 == 0
+        assert memmap.flags.aligned
+
+    array_dict = {
+        'a1': np.random.randn(100),
+        'a2': np.random.randn(200),
+        'a3': np.random.randn(300),
+        'a4': np.random.randn(400)
+    }
+
+    numpy_pickle.dump(array_dict, fname, protocol=protocol)
+    d_reloaded = numpy_pickle.load(fname, mmap_mode='r')
+
+    for key, memmap in d_reloaded.items():
+        assert isinstance(memmap, np.memmap)
+        np.testing.assert_array_equal(array_dict[key], memmap)
+        assert memmap.ctypes.data % 8 == 0
+        assert memmap.flags.aligned