diff --git a/CHANGES.rst b/CHANGES.rst index 93144b324..ccb276e94 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,12 @@ Latest changes Development version ------------------- +- Fix byte order inconsistency issue during deserialization using joblib.load + in cross-endian environment: the numpy arrays are now always loaded to + use the system byte order, independently of the byte order of the system + that serialized the pickle. + https://github.com/joblib/joblib/pull/1181 + - Fix joblib.Memory bug with the ``ignore`` parameter when the cached function is a decorated function. https://github.com/joblib/joblib/pull/1165 diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py index 93e5537ea..cc593af22 100644 --- a/joblib/numpy_pickle.py +++ b/joblib/numpy_pickle.py @@ -20,6 +20,7 @@ from .numpy_pickle_utils import Unpickler, Pickler from .numpy_pickle_utils import _read_fileobject, _write_fileobject from .numpy_pickle_utils import _read_bytes, BUFFER_SIZE +from .numpy_pickle_utils import _ensure_native_byte_order from .numpy_pickle_compat import load_compatibility from .numpy_pickle_compat import NDArrayWrapper # For compatibility with old versions of joblib, we need ZNDArrayWrapper @@ -147,7 +148,8 @@ def read_array(self, unpickler): else: array.shape = self.shape - return array + # Detect byte order mis-match and swap as needed. 
def _is_numpy_array_byte_order_mismatch(array):
    """Check whether a numpy array is stored in the non-native byte order.

    Only ``array.dtype`` is inspected; the array data is never read.

    Parameters
    ----------
    array: numpy.ndarray
        The array to inspect.

    Returns
    -------
    bool
        True if the dtype is explicitly flagged with the byte order opposite
        to ``sys.byteorder``. For a structured dtype (whose own byteorder is
        ``'|'``), True only if *every* field is flagged with the opposite
        byte order; a dtype mixing foreign fields with native or
        byte-order-agnostic fields is not reported as a mismatch.
    """
    # numpy reports native-order dtypes as '=', so an explicit '<' or '>'
    # marker only shows up when the order differs from the host.
    foreign = '<' if sys.byteorder == 'big' else '>'
    dtype = array.dtype

    if dtype.byteorder == foreign:
        return True

    # From here on only structured dtypes can still be a mismatch.
    if dtype.byteorder != '|' or not dtype.fields:
        return False

    # Each value of dtype.fields is a tuple whose first item is the field's
    # own dtype.
    return all(field[0].byteorder == foreign
               for field in dtype.fields.values())


def _ensure_native_byte_order(array):
    """Use the byte order of the host while preserving values

    Does nothing (and returns the very same object) if array already uses
    the system byte order. Otherwise a byte-swapped copy relabelled with the
    native byte order is returned, so that element values are unchanged.
    """
    if _is_numpy_array_byte_order_mismatch(array):
        # ndarray.newbyteorder() was removed in NumPy 2.0. Viewing the
        # swapped data through dtype.newbyteorder('=') is the documented
        # equivalent and also works with older numpy versions.
        array = array.byteswap().view(array.dtype.newbyteorder('='))
    return array
+ be_arrays = [np.array([(1, 2.0), (3, 4.0)], + dtype=[('', '>i8'), ('', '>f8')]), + np.arange(3, dtype=np.dtype('>i8')), + np.arange(3, dtype=np.dtype('>f8'))] + + # Verify the byteorder mismatch is correctly detected. + for array in be_arrays: + if sys.byteorder == 'big': + assert not _is_numpy_array_byte_order_mismatch(array) + else: + assert _is_numpy_array_byte_order_mismatch(array) + converted = _ensure_native_byte_order(array) + if converted.dtype.fields: + for f in converted.dtype.fields.values(): + f[0].byteorder == '=' + else: + assert converted.dtype.byteorder == "=" + + # List of numpy arrays with little endian byteorder. + le_arrays = [np.array([(1, 2.0), (3, 4.0)], + dtype=[('', '