From d003266b18336e1e603536bdbe6518bc2dcc00d3 Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Thu, 23 Nov 2023 18:38:33 +0100 Subject: [PATCH] ENH some steps to make cloudpickle dynamic function/classes more deterministic (#524) Co-authored-by: Olivier Grisel --- CHANGES.md | 4 + ci/install_coverage_subprocess_pth.py | 8 +- cloudpickle/cloudpickle.py | 86 ++++++++--- cloudpickle/cloudpickle_fast.py | 1 + tests/__init__.py | 3 + tests/cloudpickle_test.py | 146 +++++++++++++++++- .../_cloudpickle_testpkg/__init__.py | 5 +- .../_cloudpickle_testpkg/mod.py | 12 +- tests/cloudpickle_testpkg/setup.py | 16 +- tests/generate_old_pickles.py | 1 + tests/mock_local_folder/mod.py | 1 + tests/test_backward_compat.py | 1 + tests/testutils.py | 38 ++++- 13 files changed, 273 insertions(+), 49 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index c036cf1c9..ab86e7afb 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,10 @@ 3.1.0 (in development) ====================== +- Some improvements to make cloudpickle more deterministic when pickling + dynamic functions and classes. + ([PR #524](https://github.com/cloudpipe/cloudpickle/pull/524)) + 3.0.0 ===== diff --git a/ci/install_coverage_subprocess_pth.py b/ci/install_coverage_subprocess_pth.py index 927820b97..a842ce961 100644 --- a/ci/install_coverage_subprocess_pth.py +++ b/ci/install_coverage_subprocess_pth.py @@ -9,8 +9,8 @@ import coverage; coverage.process_startup() """ -filename = op.join(get_path('purelib'), 'coverage_subprocess.pth') -with open(filename, 'wb') as f: - f.write(FILE_CONTENT.encode('ascii')) +filename = op.join(get_path("purelib"), "coverage_subprocess.pth") +with open(filename, "wb") as f: + f.write(FILE_CONTENT.encode("ascii")) -print('Installed subprocess coverage support: %s' % filename) +print("Installed subprocess coverage support: %s" % filename) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index eb43a9676..ec61002e8 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -126,7 +126,7 @@ def _lookup_class_or_track(class_tracker_id, class_def): def register_pickle_by_value(module): - """Register a module to make it functions and classes picklable by value. + """Register a module to make its functions and classes picklable by value. By default, functions and classes that are attributes of an importable module are to be pickled by reference, that is relying on re-importing @@ -369,7 +369,7 @@ def func(): # sys.modules. if name is not None and name.startswith(prefix): # check whether the function can address the sub-module - tokens = set(name[len(prefix) :].split(".")) + tokens = set(name[len(prefix):].split(".")) if not tokens - set(code.co_names): subimports.append(sys.modules[name]) return subimports @@ -409,7 +409,10 @@ def _walk_global_ops(code): def _extract_class_dict(cls): """Retrieve a copy of the dict of a class without the inherited method.""" - clsdict = dict(cls.__dict__) # copy dict proxy to a dict + # Hack to circumvent non-predictable memoization caused by string interning. + # See the inline comment in _class_setstate for details. + clsdict = {"".join(k): cls.__dict__[k] for k in sorted(cls.__dict__)} + if len(cls.__bases__) == 1: inherited_dict = cls.__bases__[0].__dict__ else: @@ -533,9 +536,15 @@ class id will also reuse this class definition. The "extra" variable is meant to be a dict (or None) that can be used for forward compatibility shall the need arise. """ + # We need to intern the keys of the type_kwargs dict to avoid having + # different pickles for the same dynamic class depending on whether it was + # dynamically created or reconstructed from a pickled stream. + type_kwargs = {sys.intern(k): v for k, v in type_kwargs.items()} + skeleton_class = types.new_class( name, bases, {"metaclass": type_constructor}, lambda ns: ns.update(type_kwargs) ) + return _lookup_class_or_track(class_tracker_id, skeleton_class) @@ -694,7 +703,9 @@ def _function_getstate(func): # unpickling time by iterating over slotstate and calling setattr(func, # slotname, slotvalue) slotstate = { - "__name__": func.__name__, + # Hack to circumvent non-predictable memoization caused by string interning. + # See the inline comment in _class_setstate for details. + "__name__": "".join(func.__name__), "__qualname__": func.__qualname__, "__annotations__": func.__annotations__, "__kwdefaults__": func.__kwdefaults__, @@ -721,7 +732,9 @@ def _function_getstate(func): ) slotstate["__globals__"] = f_globals - state = func.__dict__ + # Hack to circumvent non-predictable memoization caused by string interning. + # See the inline comment in _class_setstate for details. + state = {"".join(k): v for k, v in func.__dict__.items()} return state, slotstate @@ -802,6 +815,19 @@ def _code_reduce(obj): # of the specific type from types, for example: # >>> from types import CodeType # >>> help(CodeType) + + # Hack to circumvent non-predictable memoization caused by string interning. + # See the inline comment in _class_setstate for details. + co_name = "".join(obj.co_name) + + # Create shallow copies of these tuple to make cloudpickle payload deterministic. + # When creating a code object during load, copies of these four tuples are + # created, while in the main process, these tuples can be shared. + # By always creating copies, we make sure the resulting payload is deterministic. + co_names = tuple(name for name in obj.co_names) + co_varnames = tuple(name for name in obj.co_varnames) + co_freevars = tuple(name for name in obj.co_freevars) + co_cellvars = tuple(name for name in obj.co_cellvars) if hasattr(obj, "co_exceptiontable"): # Python 3.11 and later: there are some new attributes # related to the enhanced exceptions. @@ -814,16 +840,16 @@ def _code_reduce(obj): obj.co_flags, obj.co_code, obj.co_consts, - obj.co_names, - obj.co_varnames, + co_names, + co_varnames, obj.co_filename, - obj.co_name, + co_name, obj.co_qualname, obj.co_firstlineno, obj.co_linetable, obj.co_exceptiontable, - obj.co_freevars, - obj.co_cellvars, + co_freevars, + co_cellvars, ) elif hasattr(obj, "co_linetable"): # Python 3.10 and later: obj.co_lnotab is deprecated and constructor @@ -837,14 +863,14 @@ def _code_reduce(obj): obj.co_flags, obj.co_code, obj.co_consts, - obj.co_names, - obj.co_varnames, + co_names, + co_varnames, obj.co_filename, - obj.co_name, + co_name, obj.co_firstlineno, obj.co_linetable, - obj.co_freevars, - obj.co_cellvars, + co_freevars, + co_cellvars, ) elif hasattr(obj, "co_nmeta"): # pragma: no cover # "nogil" Python: modified attributes from 3.9 @@ -859,15 +885,15 @@ def _code_reduce(obj): obj.co_flags, obj.co_code, obj.co_consts, - obj.co_varnames, + co_varnames, obj.co_filename, - obj.co_name, + co_name, obj.co_firstlineno, obj.co_lnotab, obj.co_exc_handlers, obj.co_jump_table, - obj.co_freevars, - obj.co_cellvars, + co_freevars, + co_cellvars, obj.co_free2reg, obj.co_cell2reg, ) @@ -882,14 +908,14 @@ def _code_reduce(obj): obj.co_flags, obj.co_code, obj.co_consts, - obj.co_names, - obj.co_varnames, + co_names, + co_varnames, obj.co_filename, - obj.co_name, + co_name, obj.co_firstlineno, obj.co_lnotab, - obj.co_freevars, - obj.co_cellvars, + co_freevars, + co_cellvars, ) return types.CodeType, args @@ -1127,6 +1153,18 @@ def _class_setstate(obj, state): if attrname == "_abc_impl": registry = attr else: + # Note: setting attribute names on a class automatically triggers their + # interning in CPython: + # https://github.com/python/cpython/blob/v3.12.0/Objects/object.c#L957 + # + # This means that to get deterministic pickling for a dynamic class that + # was initially defined in a different Python process, the pickler + # needs to ensure that dynamic class and function attribute names are + # systematically copied into a non-interned version to avoid + # unpredictable pickle payloads. + # + # Indeed the Pickler's memoizer relies on physical object identity to break + # cycles in the reference graph of the object being serialized. setattr(obj, attrname, attr) if registry is not None: for subclass in registry: diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 52d6732e4..20280f0ca 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -6,6 +6,7 @@ See: tests/test_backward_compat.py """ + from . import cloudpickle diff --git a/tests/__init__.py b/tests/__init__.py index e69de29bb..10cd20ec6 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,3 @@ +import pytest + +pytest.register_assert_rewrite("tests.testutils") diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index 4041bf724..ed18285cf 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -48,10 +48,11 @@ from cloudpickle.cloudpickle import _extract_class_dict, _whichmodule from cloudpickle.cloudpickle import _lookup_module_and_qualname +from .testutils import subprocess_worker from .testutils import subprocess_pickle_echo from .testutils import subprocess_pickle_string from .testutils import assert_run_python_script -from .testutils import subprocess_worker +from .testutils import check_deterministic_pickle _TEST_GLOBAL_VARIABLE = "default_value" @@ -108,7 +109,7 @@ def method_c(self): return "c" clsdict = _extract_class_dict(C) - assert sorted(clsdict.keys()) == ["C_CONSTANT", "__doc__", "method_c"] + assert list(clsdict.keys()) == ["C_CONSTANT", "__doc__", "method_c"] assert clsdict["C_CONSTANT"] == 43 assert clsdict["__doc__"] is None assert clsdict["method_c"](C()) == C().method_c() @@ -1951,7 +1952,6 @@ def lookup(obj_id): class A: '''Updated class definition''' - pass assert not w.run(lambda obj_id: isinstance(lookup(obj_id), A), id1) retrieved1 = w.run(lookup, id1) @@ -1983,6 +1983,146 @@ class A: """.format(protocol=self.protocol) assert_run_python_script(code) + def test_dynamic_func_deterministic_roundtrip(self): + # Check that the pickle serialization for a dynamic func is the same + # in two processes. + + def get_dynamic_func_pickle(): + def test_method(arg_1, arg_2): + pass + + return cloudpickle.dumps(test_method) + + with subprocess_worker(protocol=self.protocol) as w: + A_dump = w.run(get_dynamic_func_pickle) + check_deterministic_pickle(A_dump, get_dynamic_func_pickle()) + + def test_dynamic_class_deterministic_roundtrip(self): + # Check that the pickle serialization for a dynamic class is the same + # in two processes. + pytest.xfail("This test fails due to different tracker_id.") + + def get_dynamic_class_pickle(): + class A: + """Class with potential string interning issues.""" + + arg_1 = "class_value" + + def join(self): + pass + + def test_method(self, arg_1, join): + pass + + return cloudpickle.dumps(A) + + with subprocess_worker(protocol=self.protocol) as w: + A_dump = w.run(get_dynamic_class_pickle) + check_deterministic_pickle(A_dump, get_dynamic_class_pickle()) + + def test_deterministic_dynamic_class_attr_ordering_for_chained_pickling(self): + # Check that the pickle produced by pickling a reconstructed class definition + # in a remote process matches the pickle produced by pickling the original + # class definition. + # In particular, this test checks that the order of the class attributes is + # deterministic. + + with subprocess_worker(protocol=self.protocol) as w: + + class A: + """Simple class definition""" + + pass + + A_dump = w.run(cloudpickle.dumps, A) + check_deterministic_pickle(A_dump, cloudpickle.dumps(A)) + + # If the `__doc__` attribute is defined after some other class + # attribute, this can cause class attribute ordering changes due to + # the way we reconstruct the class definition in + # `_make_class_skeleton`, which creates the class and thus its + # `__doc__` attribute before populating the class attributes. + class A: + name = "A" + __doc__ = "Updated class definition" + + A_dump = w.run(cloudpickle.dumps, A) + check_deterministic_pickle(A_dump, cloudpickle.dumps(A)) + + # If a `__doc__` is defined on the `__init__` method, this can + # cause ordering changes due to the way we reconstruct the class + # with `_make_class_skeleton`. + class A: + def __init__(self): + """Class definition with explicit __init__""" + pass + + A_dump = w.run(cloudpickle.dumps, A) + check_deterministic_pickle(A_dump, cloudpickle.dumps(A)) + + def test_deterministic_str_interning_for_chained_dynamic_class_pickling(self): + # Check that the pickle produced by the unpickled instance is the same. + # This checks that there is no issue related to the string interning of + # the names of attributes of class definitions and names of attributes + # of the `__code__` objects of the methods. + + with subprocess_worker(protocol=self.protocol) as w: + # Due to interning of class attributes, check that this does not + # create issues with dynamic function definition. + class A: + """Class with potential string interning issues.""" + + arg_1 = "class_value" + + def join(self): + pass + + def test_method(self, arg_1, join): + pass + + A_dump = w.run(cloudpickle.dumps, A) + check_deterministic_pickle(A_dump, cloudpickle.dumps(A)) + + # Also check that memoization of string value inside the class does + # not cause non-deterministic pickle with interned method names. + class A: + """Class with potential string interning issues.""" + + arg_1 = "join" + + def join(self, arg_1): + pass + + # Set a custom method attribute that can potentially trigger + # undeterministic memoization depending on the interning state of + # the string used for the attribute name. + A.join.arg_1 = "join" + + A_dump = w.run(cloudpickle.dumps, A) + check_deterministic_pickle(A_dump, cloudpickle.dumps(A)) + + def test_dynamic_class_determinist_subworker_tuple_memoization(self): + # Check that the pickle produced by the unpickled instance is the same. + # This highlights some issues with tuple memoization. + + with subprocess_worker(protocol=self.protocol) as w: + # Arguments' tuple is memoized in the main process but not in the + # subprocess as the tuples do not share the same id in the loaded + # class. + + # XXX - this does not seem to work, and I am not sure there is an easy fix. + class A: + """Class with potential tuple memoization issues.""" + + def func1(self): + pass + + def func2(self): + pass + + A_dump = w.run(cloudpickle.dumps, A) + check_deterministic_pickle(A_dump, cloudpickle.dumps(A)) + @pytest.mark.skipif( platform.python_implementation() == "PyPy", reason="Skip PyPy because memory grows too much", diff --git a/tests/cloudpickle_testpkg/_cloudpickle_testpkg/__init__.py b/tests/cloudpickle_testpkg/_cloudpickle_testpkg/__init__.py index 58eb18673..2051e4c06 100644 --- a/tests/cloudpickle_testpkg/_cloudpickle_testpkg/__init__.py +++ b/tests/cloudpickle_testpkg/_cloudpickle_testpkg/__init__.py @@ -27,19 +27,22 @@ def relative_imports_factory(): Relative import of functions living both inside modules and packages are tested. """ + def f(): # module_function belongs to _cloudpickle_testpkg.mod, which is a # module from .mod import module_function + return module_function() def g(): # package_function belongs to _cloudpickle_testpkg, which is a package from . import package_function + return package_function() return f, g some_singleton = _SingletonClass() -T = typing.TypeVar('T') +T = typing.TypeVar("T") diff --git a/tests/cloudpickle_testpkg/_cloudpickle_testpkg/mod.py b/tests/cloudpickle_testpkg/_cloudpickle_testpkg/mod.py index e8225a4ca..f38cdb701 100644 --- a/tests/cloudpickle_testpkg/_cloudpickle_testpkg/mod.py +++ b/tests/cloudpickle_testpkg/_cloudpickle_testpkg/mod.py @@ -20,7 +20,7 @@ # module. The following lines emulate such a behavior without being a compiled # extension module. -submodule_name = '_cloudpickle_testpkg.mod.dynamic_submodule' +submodule_name = "_cloudpickle_testpkg.mod.dynamic_submodule" dynamic_submodule = types.ModuleType(submodule_name) # This line allows the dynamic_module to be imported using either one of: @@ -32,7 +32,7 @@ # so this dynamic module could be binded to another name. This behavior is # demonstrated with `dynamic_submodule_two` -submodule_name_two = '_cloudpickle_testpkg.mod.dynamic_submodule_two' +submodule_name_two = "_cloudpickle_testpkg.mod.dynamic_submodule_two" # Notice the inconsistent name binding, breaking attribute lookup-based import # attempts. another_submodule = types.ModuleType(submodule_name_two) @@ -41,9 +41,7 @@ # In this third case, the module is not added to sys.modules, and can only be # imported using attribute lookup-based imports. -submodule_three = types.ModuleType( - '_cloudpickle_testpkg.mod.dynamic_submodule_three' -) +submodule_three = types.ModuleType("_cloudpickle_testpkg.mod.dynamic_submodule_three") code = """ def f(x): return x @@ -53,9 +51,7 @@ def f(x): # What about a dynamic submodule inside a dynamic submodule inside an # importable module? -subsubmodule_name = ( - '_cloudpickle_testpkg.mod.dynamic_submodule.dynamic_subsubmodule' -) +subsubmodule_name = "_cloudpickle_testpkg.mod.dynamic_submodule.dynamic_subsubmodule" dynamic_subsubmodule = types.ModuleType(subsubmodule_name) dynamic_submodule.dynamic_subsubmodule = dynamic_subsubmodule sys.modules[subsubmodule_name] = dynamic_subsubmodule diff --git a/tests/cloudpickle_testpkg/setup.py b/tests/cloudpickle_testpkg/setup.py index 5cb49f907..403b69f76 100644 --- a/tests/cloudpickle_testpkg/setup.py +++ b/tests/cloudpickle_testpkg/setup.py @@ -5,12 +5,12 @@ setup( - name='cloudpickle_testpkg', - version='0.0.0', - description='Package used only for cloudpickle testing purposes', - author='Cloudpipe', - author_email='cloudpipe@googlegroups.com', - license='BSD 3-Clause License', - packages=['_cloudpickle_testpkg'], - python_requires='>=3.8', + name="cloudpickle_testpkg", + version="0.0.0", + description="Package used only for cloudpickle testing purposes", + author="Cloudpipe", + author_email="cloudpipe@googlegroups.com", + license="BSD 3-Clause License", + packages=["_cloudpickle_testpkg"], + python_requires=">=3.8", ) diff --git a/tests/generate_old_pickles.py b/tests/generate_old_pickles.py index d91aad6ef..ff8077704 100644 --- a/tests/generate_old_pickles.py +++ b/tests/generate_old_pickles.py @@ -8,6 +8,7 @@ active cloudpickle branch to make sure that cloudpickle is able to depickle old cloudpickle files. """ + import sys from pathlib import Path diff --git a/tests/mock_local_folder/mod.py b/tests/mock_local_folder/mod.py index 1a1c1da46..517d5013d 100644 --- a/tests/mock_local_folder/mod.py +++ b/tests/mock_local_folder/mod.py @@ -5,6 +5,7 @@ reference should instead flagged to cloudpickle for pickling by value: this is done using the register_pickle_by_value api exposed by cloudpickle. """ + import typing diff --git a/tests/test_backward_compat.py b/tests/test_backward_compat.py index 1abb3da4f..1d1d70a92 100644 --- a/tests/test_backward_compat.py +++ b/tests/test_backward_compat.py @@ -9,6 +9,7 @@ few canonical use cases. Cloudpicke backward-compatitibility support remains a best-effort initiative. """ + import pickle import pytest diff --git a/tests/testutils.py b/tests/testutils.py index 24bef9e57..bf2d3bcad 100644 --- a/tests/testutils.py +++ b/tests/testutils.py @@ -1,9 +1,12 @@ import sys import os -import os.path as op +import io +import difflib import tempfile +import os.path as op from subprocess import Popen, check_output, PIPE, STDOUT, CalledProcessError import pickle +import pickletools from contextlib import contextmanager from concurrent.futures import ProcessPoolExecutor @@ -213,6 +216,39 @@ def assert_run_python_script(source_code, timeout=TIMEOUT): os.unlink(source_file) +def check_deterministic_pickle(a, b): + """Check that two pickle output are bitwise equal. + + If it is not the case, print the diff between the disassembled pickle + payloads. + + This helper is useful to investigate non-deterministic pickling. + """ + if a != b: + with io.StringIO() as out: + pickletools.dis(pickletools.optimize(a), out) + a_out = out.getvalue() + # Remove the 11 first characters of each line to remove the bytecode offset + # of each object, which is different on each line for very small differences, + # making the diff very hard to read. + a_out = "\n".join(ll[11:] for ll in a_out.splitlines()) + with io.StringIO() as out: + pickletools.dis(pickletools.optimize(b), out) + b_out = out.getvalue() + b_out = "\n".join(ll[11:] for ll in b_out.splitlines()) + assert a_out == b_out + full_diff = difflib.context_diff( + a_out.splitlines(keepends=True), b_out.splitlines(keepends=True) + ) + full_diff = "".join(full_diff) + if len(full_diff) > 1500: + full_diff = full_diff[:1494] + " [...]" + raise AssertionError( + "Pickle payloads are not bitwise equal:\n" + + full_diff + ) + + if __name__ == "__main__": protocol = int(sys.argv[sys.argv.index("--protocol") + 1]) pickle_echo(protocol=protocol)