Skip to content

Commit

Permalink
ENH some steps to make cloudpickle dynamic function/classes more dete…
Browse files Browse the repository at this point in the history
…rministic (#524)


Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
  • Loading branch information
tomMoral and ogrisel committed Nov 23, 2023
1 parent 25aef95 commit d003266
Show file tree
Hide file tree
Showing 13 changed files with 273 additions and 49 deletions.
4 changes: 4 additions & 0 deletions CHANGES.md
@@ -1,6 +1,10 @@
3.1.0 (in development)
======================

- Some improvements to make cloudpickle more deterministic when pickling
dynamic functions and classes.
([PR #524](https://github.com/cloudpipe/cloudpickle/pull/524))


3.0.0
=====
Expand Down
8 changes: 4 additions & 4 deletions ci/install_coverage_subprocess_pth.py
Expand Up @@ -9,8 +9,8 @@
import coverage; coverage.process_startup()
"""

filename = op.join(get_path('purelib'), 'coverage_subprocess.pth')
with open(filename, 'wb') as f:
f.write(FILE_CONTENT.encode('ascii'))
filename = op.join(get_path("purelib"), "coverage_subprocess.pth")
with open(filename, "wb") as f:
f.write(FILE_CONTENT.encode("ascii"))

print('Installed subprocess coverage support: %s' % filename)
print("Installed subprocess coverage support: %s" % filename)
86 changes: 62 additions & 24 deletions cloudpickle/cloudpickle.py
Expand Up @@ -126,7 +126,7 @@ def _lookup_class_or_track(class_tracker_id, class_def):


def register_pickle_by_value(module):
"""Register a module to make it functions and classes picklable by value.
"""Register a module to make its functions and classes picklable by value.
By default, functions and classes that are attributes of an importable
module are to be pickled by reference, that is relying on re-importing
Expand Down Expand Up @@ -369,7 +369,7 @@ def func():
# sys.modules.
if name is not None and name.startswith(prefix):
# check whether the function can address the sub-module
tokens = set(name[len(prefix) :].split("."))
tokens = set(name[len(prefix):].split("."))
if not tokens - set(code.co_names):
subimports.append(sys.modules[name])
return subimports
Expand Down Expand Up @@ -409,7 +409,10 @@ def _walk_global_ops(code):

def _extract_class_dict(cls):
"""Retrieve a copy of the dict of a class without the inherited method."""
clsdict = dict(cls.__dict__) # copy dict proxy to a dict
# Hack to circumvent non-predictable memoization caused by string interning.
# See the inline comment in _class_setstate for details.
clsdict = {"".join(k): cls.__dict__[k] for k in sorted(cls.__dict__)}

if len(cls.__bases__) == 1:
inherited_dict = cls.__bases__[0].__dict__
else:
Expand Down Expand Up @@ -533,9 +536,15 @@ class id will also reuse this class definition.
The "extra" variable is meant to be a dict (or None) that can be used for
forward compatibility shall the need arise.
"""
# We need to intern the keys of the type_kwargs dict to avoid having
# different pickles for the same dynamic class depending on whether it was
# dynamically created or reconstructed from a pickled stream.
type_kwargs = {sys.intern(k): v for k, v in type_kwargs.items()}

skeleton_class = types.new_class(
name, bases, {"metaclass": type_constructor}, lambda ns: ns.update(type_kwargs)
)

return _lookup_class_or_track(class_tracker_id, skeleton_class)


Expand Down Expand Up @@ -694,7 +703,9 @@ def _function_getstate(func):
# unpickling time by iterating over slotstate and calling setattr(func,
# slotname, slotvalue)
slotstate = {
"__name__": func.__name__,
# Hack to circumvent non-predictable memoization caused by string interning.
# See the inline comment in _class_setstate for details.
"__name__": "".join(func.__name__),
"__qualname__": func.__qualname__,
"__annotations__": func.__annotations__,
"__kwdefaults__": func.__kwdefaults__,
Expand All @@ -721,7 +732,9 @@ def _function_getstate(func):
)
slotstate["__globals__"] = f_globals

state = func.__dict__
# Hack to circumvent non-predictable memoization caused by string interning.
# See the inline comment in _class_setstate for details.
state = {"".join(k): v for k, v in func.__dict__.items()}
return state, slotstate


Expand Down Expand Up @@ -802,6 +815,19 @@ def _code_reduce(obj):
# of the specific type from types, for example:
# >>> from types import CodeType
# >>> help(CodeType)

# Hack to circumvent non-predictable memoization caused by string interning.
# See the inline comment in _class_setstate for details.
co_name = "".join(obj.co_name)

# Create shallow copies of these tuple to make cloudpickle payload deterministic.
# When creating a code object during load, copies of these four tuples are
# created, while in the main process, these tuples can be shared.
# By always creating copies, we make sure the resulting payload is deterministic.
co_names = tuple(name for name in obj.co_names)
co_varnames = tuple(name for name in obj.co_varnames)
co_freevars = tuple(name for name in obj.co_freevars)
co_cellvars = tuple(name for name in obj.co_cellvars)
if hasattr(obj, "co_exceptiontable"):
# Python 3.11 and later: there are some new attributes
# related to the enhanced exceptions.
Expand All @@ -814,16 +840,16 @@ def _code_reduce(obj):
obj.co_flags,
obj.co_code,
obj.co_consts,
obj.co_names,
obj.co_varnames,
co_names,
co_varnames,
obj.co_filename,
obj.co_name,
co_name,
obj.co_qualname,
obj.co_firstlineno,
obj.co_linetable,
obj.co_exceptiontable,
obj.co_freevars,
obj.co_cellvars,
co_freevars,
co_cellvars,
)
elif hasattr(obj, "co_linetable"):
# Python 3.10 and later: obj.co_lnotab is deprecated and constructor
Expand All @@ -837,14 +863,14 @@ def _code_reduce(obj):
obj.co_flags,
obj.co_code,
obj.co_consts,
obj.co_names,
obj.co_varnames,
co_names,
co_varnames,
obj.co_filename,
obj.co_name,
co_name,
obj.co_firstlineno,
obj.co_linetable,
obj.co_freevars,
obj.co_cellvars,
co_freevars,
co_cellvars,
)
elif hasattr(obj, "co_nmeta"): # pragma: no cover
# "nogil" Python: modified attributes from 3.9
Expand All @@ -859,15 +885,15 @@ def _code_reduce(obj):
obj.co_flags,
obj.co_code,
obj.co_consts,
obj.co_varnames,
co_varnames,
obj.co_filename,
obj.co_name,
co_name,
obj.co_firstlineno,
obj.co_lnotab,
obj.co_exc_handlers,
obj.co_jump_table,
obj.co_freevars,
obj.co_cellvars,
co_freevars,
co_cellvars,
obj.co_free2reg,
obj.co_cell2reg,
)
Expand All @@ -882,14 +908,14 @@ def _code_reduce(obj):
obj.co_flags,
obj.co_code,
obj.co_consts,
obj.co_names,
obj.co_varnames,
co_names,
co_varnames,
obj.co_filename,
obj.co_name,
co_name,
obj.co_firstlineno,
obj.co_lnotab,
obj.co_freevars,
obj.co_cellvars,
co_freevars,
co_cellvars,
)
return types.CodeType, args

Expand Down Expand Up @@ -1127,6 +1153,18 @@ def _class_setstate(obj, state):
if attrname == "_abc_impl":
registry = attr
else:
# Note: setting attribute names on a class automatically triggers their
# interning in CPython:
# https://github.com/python/cpython/blob/v3.12.0/Objects/object.c#L957
#
# This means that to get deterministic pickling for a dynamic class that
# was initially defined in a different Python process, the pickler
# needs to ensure that dynamic class and function attribute names are
# systematically copied into a non-interned version to avoid
# unpredictable pickle payloads.
#
# Indeed the Pickler's memoizer relies on physical object identity to break
# cycles in the reference graph of the object being serialized.
setattr(obj, attrname, attr)
if registry is not None:
for subclass in registry:
Expand Down
1 change: 1 addition & 0 deletions cloudpickle/cloudpickle_fast.py
Expand Up @@ -6,6 +6,7 @@
See: tests/test_backward_compat.py
"""

from . import cloudpickle


Expand Down
3 changes: 3 additions & 0 deletions tests/__init__.py
@@ -0,0 +1,3 @@
import pytest

pytest.register_assert_rewrite("tests.testutils")

0 comments on commit d003266

Please sign in to comment.