From 8d63a2e4f583bacb2a21e3afccc7ae8d7e556cf5 Mon Sep 17 00:00:00 2001 From: "Christopher J. Markiewicz" Date: Wed, 21 Apr 2021 10:35:08 -0400 Subject: [PATCH 1/5] TST: Refactor subprocess_pickle_echo for string inspection Separate out the retrieval of the pickle string from the subprocess with the unpickling, so the string can be directly compared. This patch also adds an add_env parameter to allow subprocesses to be passed environment variables. The main use case under consideration is setting PYTHONHASHSEED, allowing these functions to be used to confirm (in)consistency of strings under different initial conditions. --- tests/testutils.py | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/tests/testutils.py b/tests/testutils.py index 6acc998d4..9c7d5c47d 100644 --- a/tests/testutils.py +++ b/tests/testutils.py @@ -38,24 +38,25 @@ def _make_cwd_env(): return cloudpickle_repo_folder, env -def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT): - """Echo function with a child Python process +def subprocess_pickle_string(input_data, protocol=None, timeout=TIMEOUT, + add_env=None): + """Retrieve pickle string of an object generated by a child Python process Pickle the input data into a buffer, send it to a subprocess via stdin, expect the subprocess to unpickle, re-pickle that data back - and send it back to the parent process via stdout for final unpickling. + and send it back to the parent process via stdout. - >>> subprocess_pickle_echo([1, 'a', None]) - [1, 'a', None] + >>> testutils.subprocess_pickle_string([1, 'a', None], protocol=2) + b'\x80\x02]q\x00(K\x01X\x01\x00\x00\x00aq\x01Ne.' """ - # run then pickle_echo(protocol=protocol) in __main__: - # Protect stderr from any warning, as we will assume an error will happen # if it is not empty. A concrete example is pytest using the imp module, # which is deprecated in python 3.8 cmd = [sys.executable, '-W ignore', __file__, "--protocol", str(protocol)] cwd, env = _make_cwd_env() + if add_env: + env.update(add_env) proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=cwd, env=env, bufsize=4096) pickle_string = dumps(input_data, protocol=protocol) @@ -67,7 +68,7 @@ def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT): message = "Subprocess returned %d: " % proc.returncode message += err.decode('utf-8') raise RuntimeError(message) - return loads(out) + return out except TimeoutExpired as e: proc.kill() out, err = proc.communicate() @@ -75,6 +76,25 @@ def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT): raise RuntimeError(message) from e +def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT, + add_env=None): + """Echo function with a child Python process + + Pickle the input data into a buffer, send it to a subprocess via + stdin, expect the subprocess to unpickle, re-pickle that data back + and send it back to the parent process via stdout for final unpickling. + + >>> subprocess_pickle_echo([1, 'a', None]) + [1, 'a', None] + + """ + out = subprocess_pickle_string(input_data, + protocol=protocol, + timeout=timeout, + add_env=add_env) + return loads(out) + + def _read_all_bytes(stream_in, chunk_size=4096): all_data = b"" while True: From b9d2bdc4ad07961de4aa0d28b990d78e7e064c4b Mon Sep 17 00:00:00 2001 From: "Christopher J. Markiewicz" Date: Wed, 21 Apr 2021 10:39:52 -0400 Subject: [PATCH 2/5] TST: Verify the same pickle string is produced with global variables (fails) --- tests/cloudpickle_test.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index 845f27962..c9b631078 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -49,6 +49,7 @@ from cloudpickle.cloudpickle import _extract_class_dict, _whichmodule from cloudpickle.cloudpickle import _lookup_module_and_qualname +from .testutils import subprocess_pickle_string from .testutils import subprocess_pickle_echo from .testutils import assert_run_python_script from .testutils import subprocess_worker @@ -57,6 +58,7 @@ _TEST_GLOBAL_VARIABLE = "default_value" +_TEST_GLOBAL_VARIABLE2 = "another_value" class RaiserOnPickle(object): @@ -2321,6 +2323,19 @@ def __type__(self): o = MyClass() pickle_depickle(o, protocol=self.protocol) + def test_sorted_globals(self): + vals = set() + + def func_with_globals(): + return _TEST_GLOBAL_VARIABLE + _TEST_GLOBAL_VARIABLE2 + + for i in range(5): + vals.add( + subprocess_pickle_string(func_with_globals, + protocol=self.protocol, + add_env={"PYTHONHASHSEED": str(i)})) + assert len(vals) == 1 + class Protocol2CloudPickleTest(CloudPickleTest): From ca7b0e81e055eab4ce6cc28bafadc0b2a0fc1c69 Mon Sep 17 00:00:00 2001 From: "Christopher J. Markiewicz" Date: Wed, 21 Apr 2021 10:41:42 -0400 Subject: [PATCH 3/5] ENH: Sort __globals__ dict to make pickle string more deterministic --- cloudpickle/cloudpickle_fast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 46a9540ec..e14e4710d 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -155,8 +155,8 @@ def _function_getstate(func): } f_globals_ref = _extract_code_globals(func.__code__) - f_globals = {k: func.__globals__[k] for k in f_globals_ref if k in - func.__globals__} + f_globals = {k: func.__globals__[k] for k in sorted(f_globals_ref) + if k in func.__globals__} closure_values = ( list(map(_get_cell_contents, func.__closure__)) From 1491be99bed851addc8dadd809277375b194629f Mon Sep 17 00:00:00 2001 From: "Christopher J. Markiewicz" Date: Wed, 21 Apr 2021 12:17:42 -0400 Subject: [PATCH 4/5] TST: Skip determinism check in Py3.5 --- tests/cloudpickle_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index c9b631078..cbce00c4d 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -2323,6 +2323,9 @@ def __type__(self): o = MyClass() pickle_depickle(o, protocol=self.protocol) + @pytest.mark.skipif( + sys.version_info < (3, 6, 0), + reason="Dict determinism is a lost cause in Python < 3.6") def test_sorted_globals(self): vals = set() From b92c2965bee1d901ca63723b741c36c757957713 Mon Sep 17 00:00:00 2001 From: "Christopher J. Markiewicz" Date: Tue, 22 Jun 2021 17:08:08 -0400 Subject: [PATCH 5/5] TEST: Add a non-regression test to ensure sorting takes a reasonably short time --- tests/cloudpickle_test.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index cbce00c4d..7784f5769 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -24,6 +24,7 @@ import enum import typing from functools import wraps +import time import pytest @@ -60,6 +61,8 @@ _TEST_GLOBAL_VARIABLE = "default_value" _TEST_GLOBAL_VARIABLE2 = "another_value" +exec("def _TEST_BIG_GLOBAL_SPACE():\n return %s" % ", ".join([f"a{i}" for i in range(1000)])) + class RaiserOnPickle(object): @@ -2339,6 +2342,16 @@ def func_with_globals(): add_env={"PYTHONHASHSEED": str(i)})) assert len(vals) == 1 + def test_efficient_sorted_globals(self): + # Non regression test to demonstrate that large numbers of globals + # do not cause slowdown + gvars = set(f"a{i}" for i in range(1000)) + assert cloudpickle.cloudpickle._extract_code_globals( + _TEST_BIG_GLOBAL_SPACE.__code__) == gvars + tic = time.time() + subprocess_pickle_string(_TEST_BIG_GLOBAL_SPACE, protocol=self.protocol) + assert time.time() - tic < 0.5 + class Protocol2CloudPickleTest(CloudPickleTest):