From afdea9ebf67981e967903e4df2993c1ffb4cb705 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 30 Jun 2021 12:45:39 +0200 Subject: [PATCH 1/5] Make function pickling deterministic by default Co-authored-by: Christopher J. Markiewicz --- CHANGES.md | 4 ++++ cloudpickle/cloudpickle.py | 7 +++++-- tests/cloudpickle_test.py | 33 +++++++++++++++++++++++++++++++-- tests/testutils.py | 30 ++++++++++++++++++++++++------ 4 files changed, 64 insertions(+), 10 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 91f57174c..0bc5eeb94 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -15,6 +15,10 @@ dev _is_parametrized_type_hint to limit false positives. ([PR #409](https://github.com/cloudpipe/cloudpickle/pull/409)) +- Suppressed a source of non-determinism when pickling dynamically defined + functions. + ([PR #428](https://github.com/cloudpipe/cloudpickle/pull/428)) + 1.6.0 ===== diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index 20e9a9550..07dc7617c 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -236,7 +236,10 @@ def _extract_code_globals(co): out_names = _extract_code_globals_cache.get(co) if out_names is None: names = co.co_names - out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} + # We use a dict with None values instead of a set to get a + # deterministic order (assuming Python 3.6+) and avoid introducing + # non-deterministic pickle bytes as a results. + out_names = {names[oparg]: None for _, oparg in _walk_global_ops(co)} # Declaring a function inside another one using the "def ..." # syntax generates a constant code object corresponding to the one @@ -247,7 +250,7 @@ def _extract_code_globals(co): if co.co_consts: for const in co.co_consts: if isinstance(const, types.CodeType): - out_names |= _extract_code_globals(const) + out_names.update(_extract_code_globals(const)) _extract_code_globals_cache[co] = out_names diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index 57a2ca0f4..4bb7b1604 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -10,6 +10,7 @@ import logging import math from operator import itemgetter, attrgetter +import pickletools import platform import random import shutil @@ -50,6 +51,7 @@ from cloudpickle.cloudpickle import _lookup_module_and_qualname from .testutils import subprocess_pickle_echo +from .testutils import subprocess_pickle_string from .testutils import assert_run_python_script from .testutils import subprocess_worker @@ -57,6 +59,7 @@ _TEST_GLOBAL_VARIABLE = "default_value" +_TEST_GLOBAL_VARIABLE2 = "another_value" class RaiserOnPickle(object): @@ -2095,8 +2098,8 @@ def inner_function(): return _TEST_GLOBAL_VARIABLE return inner_function - globals_ = cloudpickle.cloudpickle._extract_code_globals( - function_factory.__code__) + globals_ = set(cloudpickle.cloudpickle._extract_code_globals( + function_factory.__code__).keys()) assert globals_ == {'_TEST_GLOBAL_VARIABLE'} depickled_factory = pickle_depickle(function_factory, @@ -2330,6 +2333,32 @@ def __type__(self): o = MyClass() pickle_depickle(o, protocol=self.protocol) + @pytest.mark.skipif( + sys.version_info < (3, 7), + reason="Determinism can only be guaranteed for Python 3.7+" + ) + def test_deterministic_pickle_bytes_for_function(self): + # Ensure that functions with references to several global names are + # pickled to fixed bytes that do not depend on the PYTHONHASHSEED of + # the Python process. + vals = set() + + def func_with_globals(): + return _TEST_GLOBAL_VARIABLE + _TEST_GLOBAL_VARIABLE2 + + for i in range(5): + vals.add( + subprocess_pickle_string(func_with_globals, + protocol=self.protocol, + add_env={"PYTHONHASHSEED": str(i)})) + if len(vals) > 1: + # Print additional debug info on stdout with dis: + for val in vals: + pickletools.dis(val) + pytest.fail( + f"Expected a single deterministic payload, got {len(vals)}/5" + ) + class Protocol2CloudPickleTest(CloudPickleTest): diff --git a/tests/testutils.py b/tests/testutils.py index 6acc998d4..5b3784ff0 100644 --- a/tests/testutils.py +++ b/tests/testutils.py @@ -2,7 +2,6 @@ import os import os.path as op import tempfile -import base64 from subprocess import Popen, check_output, PIPE, STDOUT, CalledProcessError from cloudpickle.compat import pickle from contextlib import contextmanager @@ -38,15 +37,16 @@ def _make_cwd_env(): return cloudpickle_repo_folder, env -def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT): - """Echo function with a child Python process +def subprocess_pickle_string(input_data, protocol=None, timeout=TIMEOUT, + add_env=None): + """Retrieve pickle string of an object generated by a child Python process Pickle the input data into a buffer, send it to a subprocess via stdin, expect the subprocess to unpickle, re-pickle that data back and send it back to the parent process via stdout for final unpickling. - >>> subprocess_pickle_echo([1, 'a', None]) - [1, 'a', None] + >>> testutils.subprocess_pickle_string([1, 'a', None], protocol=2) + b'\x80\x02]q\x00(K\x01X\x01\x00\x00\x00aq\x01Ne.' """ # run then pickle_echo(protocol=protocol) in __main__: @@ -56,6 +56,8 @@ def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT): # which is deprecated in python 3.8 cmd = [sys.executable, '-W ignore', __file__, "--protocol", str(protocol)] cwd, env = _make_cwd_env() + if add_env: + env.update(add_env) proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=cwd, env=env, bufsize=4096) pickle_string = dumps(input_data, protocol=protocol) @@ -67,7 +69,7 @@ def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT): message = "Subprocess returned %d: " % proc.returncode message += err.decode('utf-8') raise RuntimeError(message) - return loads(out) + return out except TimeoutExpired as e: proc.kill() out, err = proc.communicate() @@ -75,6 +77,22 @@ def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT): raise RuntimeError(message) from e +def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT, + add_env=None): + """Echo function with a child Python process + Pickle the input data into a buffer, send it to a subprocess via + stdin, expect the subprocess to unpickle, re-pickle that data back + and send it back to the parent process via stdout for final unpickling. + >>> subprocess_pickle_echo([1, 'a', None]) + [1, 'a', None] + """ + out = subprocess_pickle_string(input_data, + protocol=protocol, + timeout=timeout, + add_env=add_env) + return loads(out) + + def _read_all_bytes(stream_in, chunk_size=4096): all_data = b"" while True: From d983e0dcf856b0624e40adc4bafd10d67460a595 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 30 Jun 2021 16:00:59 +0200 Subject: [PATCH 2/5] Temporarily restore Python 3.5 syntax compat --- tests/cloudpickle_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index 4bb7b1604..e2d84012c 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -2356,7 +2356,7 @@ def func_with_globals(): for val in vals: pickletools.dis(val) pytest.fail( - f"Expected a single deterministic payload, got {len(vals)}/5" + "Expected a single deterministic payload, got %d/5" % len(vals) ) From b33fdf1f1732f8f59f76a94334732148d439d6ab Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 30 Jun 2021 17:51:38 +0200 Subject: [PATCH 3/5] co_lnotab => co_linetable/co_exceptiontable --- cloudpickle/cloudpickle_fast.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index be1bb83c0..b39620a0e 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -244,7 +244,19 @@ def _enum_getstate(obj): def _code_reduce(obj): """codeobject reducer""" - if hasattr(obj, "co_posonlyargcount"): # pragma: no branch + if hasattr(obj, "co_linetable"): # pragma: no branch + # Python 3.10 and later: obj.co_lnotab is deprecated and constructor + # expects obj.co_linetable and obj.co_exceptiontable instead. + args = ( + obj.co_argcount, obj.co_posonlyargcount, + obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, + obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, + obj.co_varnames, obj.co_filename, obj.co_name, + obj.co_firstlineno, obj.co_linetable, obj.co_exceptiontable, + obj.co_freevars, obj.co_cellvars + ) + elif hasattr(obj, "co_posonlyargcount"): + # Backward compat for 3.9 and older args = ( obj.co_argcount, obj.co_posonlyargcount, obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, @@ -254,6 +266,7 @@ def _code_reduce(obj): obj.co_cellvars ) else: + # Backward compat for even older versions of Python args = ( obj.co_argcount, obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, obj.co_flags, obj.co_code, obj.co_consts, From 8e8ee4138a291ac604bdfec8704bdd9c3f95dae5 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 30 Jun 2021 18:33:20 +0200 Subject: [PATCH 4/5] no obj.co_exceptiontable in 3.10? --- cloudpickle/cloudpickle_fast.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index b39620a0e..c46914c66 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -246,14 +246,14 @@ def _code_reduce(obj): """codeobject reducer""" if hasattr(obj, "co_linetable"): # pragma: no branch # Python 3.10 and later: obj.co_lnotab is deprecated and constructor - # expects obj.co_linetable and obj.co_exceptiontable instead. + # expects obj.co_linetable instead. args = ( obj.co_argcount, obj.co_posonlyargcount, obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, obj.co_varnames, obj.co_filename, obj.co_name, - obj.co_firstlineno, obj.co_linetable, obj.co_exceptiontable, - obj.co_freevars, obj.co_cellvars + obj.co_firstlineno, obj.co_linetable, obj.co_freevars, + obj.co_cellvars ) elif hasattr(obj, "co_posonlyargcount"): # Backward compat for 3.9 and older From 5ac1728d67b2220ffad0b0a7ba9028c470b37871 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 30 Jun 2021 18:38:12 +0200 Subject: [PATCH 5/5] Document the handling of the deprecation of co_lnotab in the changelog --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 0bc5eeb94..211fdc874 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -16,7 +16,7 @@ dev ([PR #409](https://github.com/cloudpipe/cloudpickle/pull/409)) - Suppressed a source of non-determinism when pickling dynamically defined - functions. + functions and handles the deprecation of co_lnotab in Python 3.10+. ([PR #428](https://github.com/cloudpipe/cloudpickle/pull/428)) 1.6.0