Skip to content

Commit

Permalink
FIX avoid problematic string memoization
Browse files Browse the repository at this point in the history
  • Loading branch information
tomMoral committed Nov 18, 2023
1 parent 252989c commit 9510a0e
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 18 deletions.
23 changes: 15 additions & 8 deletions cloudpickle/cloudpickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,10 @@ def _walk_global_ops(code):
def _extract_class_dict(cls):
"""Retrieve a copy of the dict of a class without the inherited method."""
# copy dict proxy to a dict. Sort the keys to make the pickle deterministic
clsdict = {k: cls.__dict__[k] for k in sorted(cls.__dict__)}
# Also create a copy of the dict's keys, to avoid its memoization.
# This is necessary as memoization happens only if all string are interned,
# which is not the case in reconstructed dynamic classes.
clsdict = {"".join(k): cls.__dict__[k] for k in sorted(cls.__dict__)}

if len(cls.__bases__) == 1:
inherited_dict = cls.__bases__[0].__dict__
Expand Down Expand Up @@ -702,9 +705,11 @@ def _function_getstate(func):
# unpickling time by iterating over slotstate and calling setattr(func,
# slotname, slotvalue)
slotstate = {
# Intern the function names to be consistent with the method names that are
# automatically interned with `setattr`. This is only the case for cpython.
"__name__": sys.intern(func.__name__) if not PYPY else func.__name__,
# Create a copy of the function name, to avoid memoization. This is necessary
# to ensure deterministic pickles, that depends wheter the name is interned
# or not. This is not guaranteed for reconstructed dynamic function, so we make
# sure it is never interned.
"__name__": "".join(func.__name__),
"__qualname__": func.__qualname__,
"__annotations__": func.__annotations__,
"__kwdefaults__": func.__kwdefaults__,
Expand Down Expand Up @@ -813,9 +818,11 @@ def _code_reduce(obj):
# >>> from types import CodeType
# >>> help(CodeType)

# We need to intern the function names to be consistent with the method name,
# which are interned automatically with `setattr`. This is only the case for cpython.
co_name = sys.intern(obj.co_name) if not PYPY else obj.co_name
# Create a copy of the object name, to avoid memoization. This is necessary
# to ensure deterministic pickles, that depends wheter the name is interned
# or not. This is not guaranteed for reconstructed dynamic classes, so we make
# sure it is never interned.
co_name = "".join(obj.co_name)

# Create copies of these tuple to make cloudpickle payload deterministic.
# When creating a code object during load, copies of these four tuples are
Expand Down Expand Up @@ -1151,7 +1158,7 @@ def _class_setstate(obj, state):
registry = attr
else:
# Note: setting attribute names on a class automatically triggers their
# interning in CPython:
# interning in CPython:
# https://github.com/python/cpython/blob/v3.12.0/Objects/object.c#L957
#
# This means that to get deterministic pickling for a dynamic class that
Expand Down
10 changes: 2 additions & 8 deletions tests/cloudpickle_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2081,7 +2081,8 @@ def test_method(self, arg_1, join):
A_dump = w.run(cloudpickle.dumps, A)
check_deterministic_pickle(A_dump, cloudpickle.dumps(A))

# XXX - this does not seem to work, and I am not sure there is an easy fix.
# Also check that memoization of string value inside the class does not cause
# non-deterministic pickle with interned method names.
class A:
"""Class with potential string interning issues."""

Expand All @@ -2091,13 +2092,6 @@ def join(self, arg_1):
pass

A_dump = w.run(cloudpickle.dumps, A)
pytest.xfail(
"This test is expected to fail due to a string interning difference: "
"the first payloads shares the same string instance both for the "
"'join' method name and the literal value in 'arg_1' while this is "
"not the case for the payload generated by the reconstruction of the "
"class definition on the remote Python process."
)
check_deterministic_pickle(A_dump, cloudpickle.dumps(A))

def test_dynamic_class_determinist_subworker_tuple_memoization(self):
Expand Down
5 changes: 3 additions & 2 deletions tests/testutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,12 +236,13 @@ def check_deterministic_pickle(a, b):
pickletools.dis(pickletools.optimize(b), out)
b_out = out.getvalue()
b_out = "\n".join(ll[11:] for ll in b_out.splitlines())
assert a_out == b_out
full_diff = difflib.context_diff(
a_out.splitlines(keepends=True), b_out.splitlines(keepends=True)
)
full_diff = "".join(full_diff)
if len(full_diff) > 1000:
full_diff = full_diff[:994] + " [...]"
if len(full_diff) > 1500:
full_diff = full_diff[:1494] + " [...]"
raise AssertionError(
"Pickle payloads are not bitwise equal:\n"
+ full_diff
Expand Down

0 comments on commit 9510a0e

Please sign in to comment.