WIP refactor func globals to make external delegation possible #430

Open
wants to merge 2 commits into master
61 changes: 61 additions & 0 deletions cloudpickle/cloudpickle.py
@@ -637,6 +637,67 @@ def __reduce__(cls):
        return cls.__name__


def _noop(x):
    """Identity function"""
    return x


class _FuncMetadataGlobals:
    """Extracts the base metadata of func.__globals__

    This wrapper makes it possible to customize the serialization of the
    function globals by subclassing the pickler and overriding persistent_id /
    persistent_load:

    https://docs.python.org/3/library/pickle.html#persistence-of-external-objects
    """

    _common_func_keys = ("__package__", "__name__", "__path__", "__file__")

    def __init__(self, func, shared_namespace):
        self.func = func
        func_globals = func.__globals__
        if not shared_namespace:
            # The shared namespace is empty, meaning that it's the first time
            # this function is pickled by the CloudPickler instance: let's
            # populate it with the base globals of the function.
            shared_namespace.update({
                k: func_globals[k] for k in self._common_func_keys
                if k in func_globals
            })
        self.func_metadata_globals = shared_namespace

    def __reduce__(self):
        # By default, only pickle the core metadata of the globals dict of
        # the function. The actual symbols referenced in func.__code__ are
        # pickled separately in _FuncCodeGlobals.
        return _noop, (self.func_metadata_globals,)


class _FuncCodeGlobals:
    """Extracts entries of func.__globals__ actually referenced in func.__code__

    This wrapper makes it possible to customize the serialization of the
    globals by subclassing the pickler and overriding persistent_id /
    persistent_load:

    https://docs.python.org/3/library/pickle.html#persistence-of-external-objects
    """

    def __init__(self, func):
        self.func = func
        code_global_names = _extract_code_globals(func.__code__)
        func_globals = func.__globals__
        self.func_code_globals = {
            k: func_globals[k] for k in code_global_names if k in func_globals
        }

    def __reduce__(self):
        # By default, only pickle the Python objects actually referenced by
        # the code of the function.
        return _noop, (self.func_code_globals,)


def _fill_function(*args):
"""Fills in the rest of function data into the skeleton function object

26 changes: 11 additions & 15 deletions cloudpickle/cloudpickle_fast.py
@@ -36,6 +36,7 @@
    parametrized_type_hint_getinitargs, _create_parametrized_type_hint,
    builtin_code_type,
    _make_dict_keys, _make_dict_values, _make_dict_items,
    _FuncMetadataGlobals, _FuncCodeGlobals,
)


@@ -153,11 +154,7 @@ def _function_getstate(func):
"__doc__": func.__doc__,
"__closure__": func.__closure__,
}

f_globals_ref = _extract_code_globals(func.__code__)
f_globals = {k: func.__globals__[k] for k in f_globals_ref if k in
func.__globals__}

f_globals = _FuncCodeGlobals(func)
closure_values = (
list(map(_get_cell_contents, func.__closure__))
if func.__closure__ is not None else ()
@@ -168,7 +165,12 @@ def _function_getstate(func):
    # trigger the side effect of importing these modules at unpickling time
    # (which is necessary for func to work correctly once depickled)
    slotstate["_cloudpickle_submodules"] = _find_imported_submodules(
        func.__code__, itertools.chain(f_globals.values(), closure_values))
        func.__code__,
        itertools.chain(
            f_globals.func_code_globals.values(),
            closure_values,
        ),
    )
    slotstate["__globals__"] = f_globals

    state = func.__dict__
@@ -577,15 +579,9 @@ def _function_getnewargs(self, func):
        # same invocation of cloudpickle.dump/cloudpickle.dumps (for example:
        # cloudpickle.dumps([f1, f2])). There is no such limitation when using
        # CloudPickler.dump, as long as the multiple invocations are bound to
        # the same CloudPickler.
        base_globals = self.globals_ref.setdefault(id(func.__globals__), {})

        if base_globals == {}:
            # Add module attributes used to resolve relative imports
            # instructions inside func.
            for k in ["__package__", "__name__", "__path__", "__file__"]:
                if k in func.__globals__:
                    base_globals[k] = func.__globals__[k]
        # the same CloudPickler instance.
        shared_ns = self.globals_ref.setdefault(id(func.__globals__), {})
        base_globals = _FuncMetadataGlobals(func, shared_namespace=shared_ns)

        # Do not bind the free variables before the function is created to
        # avoid infinite recursion.
40 changes: 40 additions & 0 deletions tests/cloudpickle_test.py
@@ -49,6 +49,7 @@
from cloudpickle.cloudpickle import _make_empty_cell, cell_set
from cloudpickle.cloudpickle import _extract_class_dict, _whichmodule
from cloudpickle.cloudpickle import _lookup_module_and_qualname
from cloudpickle.cloudpickle import _FuncMetadataGlobals, _FuncCodeGlobals
Contributor Author commented:
Do we want to make those directly importable from the top level cloudpickle package even if they are private API?


from .testutils import subprocess_pickle_echo
from .testutils import subprocess_pickle_string
@@ -2377,6 +2378,45 @@ def func_with_globals():
"Expected a single deterministic payload, got %d/5" % len(vals)
)

def test_externally_managed_function_globals(self):
common_globals = {"a": "foo"}

class CustomPickler(cloudpickle.CloudPickler):
@staticmethod
def persistent_id(obj):
if (
isinstance(obj, _FuncMetadataGlobals)
@ogrisel (Contributor Author) commented on Jul 1, 2021:
_FuncMetadataGlobals and _FuncCodeGlobals are still private API so the subclassing pickler would still be impacted by changes in those constructs. However it feels maybe less brittle than overriding the save_function or _dynamic_function_reduce methods.

@faucct commented on Jul 1, 2021:
I feel like this is worse than my solution:

  • it is less intuitive as it forces you to use the private API
  • it also brings unnecessary wrapping of all dicts, so you will have to maintain these wrappers forever

My solution does not bring any new APIs as it follows the official Python docs. It also does not complicate serialization for people who don't use persistent_id for func.__globals__.

@ogrisel (Contributor Author) commented on Jul 2, 2021:
But I don't like your solution because it's calling persistent_id with the global dict at a point where it's not supposed to be called: the full func globals dict is never pickled when persistent_id is not overridden, only the derived base_globals and f_globals are. Therefore your solution is a breach/abuse of the persistent_id protocol as defined in the pickle module documentation.

If we don't like the wrapping, we could instead inject a specific key inside the dict with the id of the full func.__globals__. But it would make it hard to write code to access it and it would make it impossible to know which function is being pickled (but maybe users don't need that info).
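A hypothetical sketch of that alternative (nothing below is part of this PR; the _tag_derived_globals helper and the __cloudpickle_globals_id__ key are made up for illustration):

import cloudpickle

shared_globals = {"a": "foo"}  # the externally managed globals dict


def _tag_derived_globals(derived_globals, func):
    # Hypothetical: instead of wrapping the derived dicts in dedicated
    # classes, record the id of the original func.__globals__ inside them.
    derived_globals["__cloudpickle_globals_id__"] = id(func.__globals__)
    return derived_globals


class DelegatingPickler(cloudpickle.CloudPickler):
    def persistent_id(self, obj):
        # The override matches on the injected key rather than on a wrapper
        # type, so it cannot tell which function is currently being pickled.
        if (
            isinstance(obj, dict)
            and obj.get("__cloudpickle_globals_id__") == id(shared_globals)
        ):
            return "shared_globals"
        return None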

@faucct commented on Jul 2, 2021:
> the full func globals dict is never pickled when persistent_id is not overridden, only the derived base_globals and f_globals are.

Are you talking about cloudpickle or pickle? Because pickle pickles full func globals dict and we want to be able to tell cloudpickle that we want the same behaviour here.

> Therefore your solution is a breach/abuse of the persistent_id protocol as defined in the pickle module documentation.

I don't see how it is. When you replace cloudpickle.CloudPickler with pickle.Pickler in my example you receive the desired behavior:

import io
import pickle

__globals__ = {"a": "foo"}


class Pickler(pickle.Pickler):
    @staticmethod
    def persistent_id(obj):
        if id(obj) == id(__globals__):
            return "__globals__"


class Unpickler(pickle.Unpickler):
    @staticmethod
    def persistent_load(pid):
        return {"__globals__": __globals__}[pid]


exec("""
def get():
    return a
""", __globals__)
get = __globals__['get']
file = io.BytesIO()
Pickler(file).dump(get)
dumped = file.getvalue()
if b'foo' in dumped:
    raise Exception(repr(dumped))
get = Unpickler(io.BytesIO(dumped)).load()
if id(__globals__) != id(get.__globals__):
    raise Exception(f"id(__globals__) != id(get.__globals__)")
print(f"get() == {get()}")
__globals__['a'] = 'bar'
print(f"get() == {get()}")

> But it would make it hard to write code to access it and it would make it impossible to know which function is being pickled (but maybe users don't need that info).

That's right, users don't need that info.

Member commented:
> When you replace cloudpickle.CloudPickler with pickle.Pickler in my example you receive the desired behavior

Your example here works without defining persistent_id and persistent_load, so I am not sure it is relevant to why your changes are "abusing" the persistent_id protocol.

In [9]: import io
   ...: import pickle
   ...:
   ...: __globals__ = {"a": "foo"}
   ...:
   ...:
   ...: class Pickler(pickle.Pickler):
   ...:     pass
   ...:
   ...: class Unpickler(pickle.Unpickler):
   ...:     pass
   ...:
   ...: exec("""
   ...: def get():
   ...:     return a
   ...: """, __globals__)
   ...: get = __globals__['get']
   ...: file = io.BytesIO()
   ...: Pickler(file).dump(get)
   ...: dumped = file.getvalue()
   ...: if b'foo' in dumped:
   ...:     raise Exception(repr(dumped))
   ...: get = Unpickler(io.BytesIO(dumped)).load()
   ...: if id(__globals__) != id(get.__globals__):
   ...:     raise Exception(f"id(__globals__) != id(get.__globals__)")
   ...: print(f"get() == {get()}")
   ...: __globals__['a'] = 'bar'
   ...: print(f"get() == {get()}")

I'll try to explain one more time why your PR does not use persistent_id as it is intended to be used. According to the pickle documentation:

> To pickle objects that have an external persistent ID, the pickler must have a custom persistent_id() method that takes an object as an argument and returns either None or the persistent ID for that object. When None is returned, the pickler simply pickles the object as normal. When a persistent ID string is returned, the pickler will pickle that object, along with a marker so that the unpickler will recognize it as a persistent ID.

In other words, persistent_id(obj) only has an impact on which obj is pickled. In your PR, persistent_id(func.__globals__) has an impact on the content of two different objects that are NOT func.__globals__: f_globals and base_globals. Such a case is never advertised by the pickle docs, and thus is an abuse of the persistent_id protocol.
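For comparison, the intended persistent_id pattern from the pickle docs looks roughly like this (a minimal sketch; the Record class and the in-memory db are made up for illustration):

import io
import pickle


class Record:
    """A stand-in for an externally managed object (e.g. a DB row)."""

    def __init__(self, key):
        self.key = key


class DBPickler(pickle.Pickler):
    def persistent_id(self, obj):
        # Only the externally managed object itself gets a persistent ID;
        # returning None means "pickle this object as usual".
        if isinstance(obj, Record):
            return ("Record", obj.key)
        return None


class DBUnpickler(pickle.Unpickler):
    def __init__(self, file, db):
        super().__init__(file)
        self.db = db

    def persistent_load(self, pid):
        type_tag, key = pid
        if type_tag == "Record":
            return self.db[key]
        raise pickle.UnpicklingError("unsupported persistent object")


db = {1: Record(1)}
buf = io.BytesIO()
DBPickler(buf).dump([db[1], "other data"])
restored = DBUnpickler(io.BytesIO(buf.getvalue()), db).load()
assert restored[0] is db[1]

Here only the Record instance is replaced by an ID in the stream; the surrounding list and string are pickled normally.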

Member commented:
> Because pickle pickles full func globals dict and we want to be able to tell cloudpickle that we want the same behaviour here

"pickles full globals dict" in which setting?

import pickle


global_var = 1


def f():
    return global_var


pickle.dumps(f)

does not pickle any global variable, or any dictionary containing global variables. It pickles f by reference.
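A quick check along those lines (a minimal, self-contained sketch restating the snippet above):

import pickle

global_var = 1


def f():
    return global_var


payload = pickle.dumps(f)
# Only the module and qualified name of f are serialized,
# not the value of global_var.
assert b"global_var" not in payload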

                    and obj.func.__globals__ is common_globals
                ):
                    return "common_globals"
                elif (
                    isinstance(obj, _FuncCodeGlobals)
                    and obj.func.__globals__ is common_globals
                ):
                    return "empty_dict"

        class CustomUnpickler(pickle.Unpickler):
            @staticmethod
            def persistent_load(pid):
                return {
                    "common_globals": common_globals,
                    "empty_dict": {}
                }[pid]

        lookup_a = eval('lambda: a', common_globals)
        assert lookup_a() == "foo"

        file = io.BytesIO()
        CustomPickler(file).dump(lookup_a)
        dumped = file.getvalue()
        assert b'foo' not in dumped

        lookup_a_cloned = CustomUnpickler(io.BytesIO(dumped)).load()
        assert lookup_a_cloned() == "foo"
        assert lookup_a_cloned.__globals__ is common_globals
        common_globals['a'] = 'bar'
        assert lookup_a_cloned() == "bar"


class Protocol2CloudPickleTest(CloudPickleTest):
