diff --git a/README.md b/README.md index 53fc8fd..866de89 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,11 @@ to: - hash is computed once for lookup and reused to store the copy - keepalive is a lightweight vector of pointers instead of a `list` - memo object is not tracked in GC, unless stolen in custom `__deepcopy__` +- #### Native __reduce__ handling + When type's `__reduce__` strictly follows the protocol, `copium` handles returned values natively, + without interpreter overhead, the same way CPython pickle implementation does. + + [What if there's type mismatch?](#pickle-protocol) - #### Cached memo Rather than creating a new memo object for each `deepcopy` and discarding it after, copium stores one per thread and reuses it. Referenced objects are cleared, but some amount of memory stays @@ -118,50 +123,14 @@ still there are minor deviations from stdlib you should be aware of. ### Pickle protocol -`copium` is stricter than `copy` for some malformed `__reduce__` implementations. +stdlib's `copy` tolerates some deviations from the pickle protocol that `pickle` itself reject +(see https://github.com/python/cpython/issues/141757). -stdlib's `copy` tolerates some deviations from the pickle protocol that `pickle` (and `copium`) reject (see https://github.com/python/cpython/issues/141757). - -
-Example - -```python-repl ->>> import copy -... import pickle -... -... import copium -... -... class BadReduce: -... def __reduce__(self): -... return BadReduce, [] -... ->>> copy.deepcopy(BadReduce()) # copy doesn't require exact types in __reduce__ -<__main__.BadReduce object at 0x1026d7b10> ->>> copium.deepcopy(BadReduce()) # copium is stricter -Traceback (most recent call last): - File "", line 1, in - copium.deepcopy(BadReduce()) - ~~~~~~~~~~~~~~~^^^^^^^^^^^^^ -TypeError: second item of the tuple returned by __reduce__ must be a tuple, not list - ->>> pickle.dumps(BadReduce()) # so is pickle -Traceback (most recent call last): - File "", line 1, in - pickle.dumps(BadReduce()) - ~~~~~~~~~~~~^^^^^^^^^^^^^ -_pickle.PicklingError: second item of the tuple returned by __reduce__ must be a tuple, not list -when serializing BadReduce object -``` - -
- -If `copium` raises `TypeError` while `copy` does not, see if `pickle.dumps(obj)` works. -If it doesn't, the fix is easy: make your object comply with pickle protocol. - -[Tracking issue](https://github.com/Bobronium/copium/issues/32) - -> [!NOTE] -> If this becomes a real blocker for adoption, `copium` might mimic stdlib's behavior in the future releases while still being fast. +`copium` strictly follows stdlib semantics: if `__reduce__` +returns a list instead of a tuple for args, or a mapping instead of a dict for kwargs, +`copium` will coerce them the same way stdlib would +(via `*args` unpacking, `**kwargs` merging, `.items()` iteration, etc.). +Errors from malformed `__reduce__` results match what `copy.deepcopy` produces. ### Memo handling diff --git a/datamodelzoo b/datamodelzoo index b8e653b..0b96935 160000 --- a/datamodelzoo +++ b/datamodelzoo @@ -1 +1 @@ -Subproject commit b8e653bf1bd5e23264c116f5b158d11bc374438d +Subproject commit 0b96935482e01b060e8bbd28d6156021cbd415fe diff --git a/pyproject.toml b/pyproject.toml index d85af31..bbd3c7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,7 +77,7 @@ typecheck = [ test = [ "pytest>=8", "pytest-assert-type>=0.1.5", - "indifference>=0.1.0", + "indifference>=0.2.0", "typing-extensions; python_version < '3.12'", "datamodelzoo", "pytest-codspeed>=4.2.0", diff --git a/src/_deepcopy.c b/src/_deepcopy.c index 6037668..18dc8a3 100644 --- a/src/_deepcopy.c +++ b/src/_deepcopy.c @@ -481,28 +481,39 @@ static PyObject* reconstruct_newobj_ex(PyObject* argtup, PyMemoObject* memo) { ); return NULL; } + + PyObject* coerced_args = NULL; + PyObject* coerced_kwargs = NULL; + if (!PyTuple_Check(args)) { - PyErr_Format( - PyExc_TypeError, - "__newobj_ex__ arg 2 must be a tuple, not %.200s", - Py_TYPE(args)->tp_name - ); - return NULL; + coerced_args = PySequence_Tuple(args); + if (!coerced_args) + return NULL; + args = coerced_args; } if (!PyDict_Check(kwargs)) { - PyErr_Format( - PyExc_TypeError, - "__newobj_ex__ arg 3 must be a dict, not %.200s", - Py_TYPE(kwargs)->tp_name - ); - return NULL; + coerced_kwargs = PyDict_New(); + if (!coerced_kwargs) { + Py_XDECREF(coerced_args); + return NULL; + } + if (PyDict_Merge(coerced_kwargs, kwargs, 1) < 0) { + Py_XDECREF(coerced_args); + Py_DECREF(coerced_kwargs); + return NULL; + } + kwargs = coerced_kwargs; } PyObject* copied_args = deepcopy(args, memo); - if (!copied_args) + Py_XDECREF(coerced_args); + if (!copied_args) { + Py_XDECREF(coerced_kwargs); return NULL; + } PyObject* copied_kwargs = deepcopy(kwargs, memo); + Py_XDECREF(coerced_kwargs); if (!copied_kwargs) { Py_DECREF(copied_args); return NULL; @@ -569,15 +580,22 @@ static int apply_dict_state(PyObject* instance, PyObject* dict_state, PyMemoObje if (!dict_state || dict_state == Py_None) return 0; - if (!PyDict_Check(dict_state)) { - PyErr_SetString(PyExc_TypeError, "state must be a dict"); - return -1; - } - PyObject* copied = deepcopy(dict_state, memo); if (!copied) return -1; + if (UNLIKELY(!PyDict_Check(copied))) { + PyObject* instance_dict = PyObject_GetAttr(instance, module_state.s__dict__); + if (!instance_dict) { + Py_DECREF(copied); + return -1; + } + int ret = PyDict_Merge(instance_dict, copied, 1); + Py_DECREF(instance_dict); + Py_DECREF(copied); + return ret; + } + PyObject* instance_dict = PyObject_GetAttr(instance, module_state.s__dict__); if (!instance_dict) { Py_DECREF(copied); @@ -604,15 +622,48 @@ static int apply_slot_state(PyObject* instance, PyObject* slotstate, PyMemoObjec if (!slotstate || slotstate == Py_None) return 0; - if (!PyDict_Check(slotstate)) { - PyErr_SetString(PyExc_TypeError, "slot state is not a dictionary"); - return -1; - } - PyObject* copied = deepcopy(slotstate, memo); if (!copied) return -1; + if (UNLIKELY(!PyDict_Check(copied))) { + PyObject* items = PyObject_CallMethod(copied, "items", NULL); + Py_DECREF(copied); + if (!items) + return -1; + + PyObject* iterator = PyObject_GetIter(items); + Py_DECREF(items); + if (!iterator) + return -1; + + int ret = 0; + PyObject* pair; + while ((pair = PyIter_Next(iterator))) { + PyObject* seq = PySequence_Fast(pair, "items() must return pairs"); + Py_DECREF(pair); + if (!seq || PySequence_Fast_GET_SIZE(seq) != 2) { + Py_XDECREF(seq); + if (!PyErr_Occurred()) + PyErr_SetString(PyExc_ValueError, "not enough values to unpack"); + ret = -1; + break; + } + int set_ret = PyObject_SetAttr( + instance, PySequence_Fast_GET_ITEM(seq, 0), PySequence_Fast_GET_ITEM(seq, 1) + ); + Py_DECREF(seq); + if (set_ret < 0) { + ret = -1; + break; + } + } + if (ret == 0 && PyErr_Occurred()) + ret = -1; + Py_DECREF(iterator); + return ret; + } + PyObject *key, *value; Py_ssize_t pos = 0; int ret = 0; @@ -704,17 +755,42 @@ static int apply_dictitems(PyObject* instance, PyObject* dictitems, PyMemoObject PyObject* pair; while ((pair = PyIter_Next(iterator))) { - if (!PyTuple_Check(pair) || PyTuple_GET_SIZE(pair) != 2) { - Py_DECREF(pair); - PyErr_SetString(PyExc_ValueError, "dictiter must yield (key, value) pairs"); - ret = -1; - break; - } + PyObject* key; + PyObject* value; - PyObject* key = PyTuple_GET_ITEM(pair, 0); - PyObject* value = PyTuple_GET_ITEM(pair, 1); - Py_INCREF(key); - Py_INCREF(value); + if (LIKELY(PyTuple_Check(pair) && PyTuple_GET_SIZE(pair) == 2)) { + key = Py_NewRef(PyTuple_GET_ITEM(pair, 0)); + value = Py_NewRef(PyTuple_GET_ITEM(pair, 1)); + } else { + PyObject* seq = PySequence_Fast(pair, "cannot unpack non-sequence"); + if (!seq) { + Py_DECREF(pair); + ret = -1; + break; + } + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + if (n != 2) { + Py_DECREF(seq); + Py_DECREF(pair); + if (n < 2) + PyErr_Format( + PyExc_ValueError, + "not enough values to unpack (expected 2, got %zd)", + n + ); + else + PyErr_Format( + PyExc_ValueError, + "too many values to unpack (expected 2, got %zd)", + n + ); + ret = -1; + break; + } + key = Py_NewRef(PySequence_Fast_GET_ITEM(seq, 0)); + value = Py_NewRef(PySequence_Fast_GET_ITEM(seq, 1)); + Py_DECREF(seq); + } Py_DECREF(pair); Py_SETREF(key, deepcopy(key, memo)); diff --git a/src/_deepcopy_legacy.c b/src/_deepcopy_legacy.c index aefd382..f1b7ecf 100644 --- a/src/_deepcopy_legacy.c +++ b/src/_deepcopy_legacy.c @@ -522,28 +522,39 @@ static PyObject* reconstruct_newobj_ex_legacy( ); return NULL; } + + PyObject* coerced_args = NULL; + PyObject* coerced_kwargs = NULL; + if (!PyTuple_Check(args)) { - PyErr_Format( - PyExc_TypeError, - "__newobj_ex__ arg 2 must be a tuple, not %.200s", - Py_TYPE(args)->tp_name - ); - return NULL; + coerced_args = PySequence_Tuple(args); + if (!coerced_args) + return NULL; + args = coerced_args; } if (!PyDict_Check(kwargs)) { - PyErr_Format( - PyExc_TypeError, - "__newobj_ex__ arg 3 must be a dict, not %.200s", - Py_TYPE(kwargs)->tp_name - ); - return NULL; + coerced_kwargs = PyDict_New(); + if (!coerced_kwargs) { + Py_XDECREF(coerced_args); + return NULL; + } + if (PyDict_Merge(coerced_kwargs, kwargs, 1) < 0) { + Py_XDECREF(coerced_args); + Py_DECREF(coerced_kwargs); + return NULL; + } + kwargs = coerced_kwargs; } PyObject* copied_args = deepcopy_legacy(args, memo, keepalive_pointer); - if (!copied_args) + Py_XDECREF(coerced_args); + if (!copied_args) { + Py_XDECREF(coerced_kwargs); return NULL; + } PyObject* copied_kwargs = deepcopy_legacy(kwargs, memo, keepalive_pointer); + Py_XDECREF(coerced_kwargs); if (!copied_kwargs) { Py_DECREF(copied_args); return NULL; @@ -616,15 +627,22 @@ static int apply_dict_state_legacy( if (!dict_state || dict_state == Py_None) return 0; - if (!PyDict_Check(dict_state)) { - PyErr_SetString(PyExc_TypeError, "state must be a dict"); - return -1; - } - PyObject* copied = deepcopy_legacy(dict_state, memo, keepalive_pointer); if (!copied) return -1; + if (UNLIKELY(!PyDict_Check(copied))) { + PyObject* instance_dict = PyObject_GetAttr(instance, module_state.s__dict__); + if (!instance_dict) { + Py_DECREF(copied); + return -1; + } + int ret = PyDict_Merge(instance_dict, copied, 1); + Py_DECREF(instance_dict); + Py_DECREF(copied); + return ret; + } + PyObject* instance_dict = PyObject_GetAttr(instance, module_state.s__dict__); if (!instance_dict) { Py_DECREF(copied); @@ -653,15 +671,48 @@ static int apply_slot_state_legacy( if (!slotstate || slotstate == Py_None) return 0; - if (!PyDict_Check(slotstate)) { - PyErr_SetString(PyExc_TypeError, "slot state is not a dictionary"); - return -1; - } - PyObject* copied = deepcopy_legacy(slotstate, memo, keepalive_pointer); if (!copied) return -1; + if (UNLIKELY(!PyDict_Check(copied))) { + PyObject* items = PyObject_CallMethod(copied, "items", NULL); + Py_DECREF(copied); + if (!items) + return -1; + + PyObject* iterator = PyObject_GetIter(items); + Py_DECREF(items); + if (!iterator) + return -1; + + int ret = 0; + PyObject* pair; + while ((pair = PyIter_Next(iterator))) { + PyObject* seq = PySequence_Fast(pair, "items() must return pairs"); + Py_DECREF(pair); + if (!seq || PySequence_Fast_GET_SIZE(seq) != 2) { + Py_XDECREF(seq); + if (!PyErr_Occurred()) + PyErr_SetString(PyExc_ValueError, "not enough values to unpack"); + ret = -1; + break; + } + int set_ret = PyObject_SetAttr( + instance, PySequence_Fast_GET_ITEM(seq, 0), PySequence_Fast_GET_ITEM(seq, 1) + ); + Py_DECREF(seq); + if (set_ret < 0) { + ret = -1; + break; + } + } + if (ret == 0 && PyErr_Occurred()) + ret = -1; + Py_DECREF(iterator); + return ret; + } + PyObject *key, *value; Py_ssize_t pos = 0; int ret = 0; @@ -759,17 +810,42 @@ static int apply_dictitems_legacy( PyObject* pair; while ((pair = PyIter_Next(iterator))) { - if (!PyTuple_Check(pair) || PyTuple_GET_SIZE(pair) != 2) { - Py_DECREF(pair); - PyErr_SetString(PyExc_ValueError, "dictiter must yield (key, value) pairs"); - ret = -1; - break; - } + PyObject* key; + PyObject* value; - PyObject* key = PyTuple_GET_ITEM(pair, 0); - PyObject* value = PyTuple_GET_ITEM(pair, 1); - Py_INCREF(key); - Py_INCREF(value); + if (LIKELY(PyTuple_Check(pair) && PyTuple_GET_SIZE(pair) == 2)) { + key = Py_NewRef(PyTuple_GET_ITEM(pair, 0)); + value = Py_NewRef(PyTuple_GET_ITEM(pair, 1)); + } else { + PyObject* seq = PySequence_Fast(pair, "cannot unpack non-sequence"); + if (!seq) { + Py_DECREF(pair); + ret = -1; + break; + } + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + if (n != 2) { + Py_DECREF(seq); + Py_DECREF(pair); + if (n < 2) + PyErr_Format( + PyExc_ValueError, + "not enough values to unpack (expected 2, got %zd)", + n + ); + else + PyErr_Format( + PyExc_ValueError, + "too many values to unpack (expected 2, got %zd)", + n + ); + ret = -1; + break; + } + key = Py_NewRef(PySequence_Fast_GET_ITEM(seq, 0)); + value = Py_NewRef(PySequence_Fast_GET_ITEM(seq, 1)); + Py_DECREF(seq); + } Py_DECREF(pair); Py_SETREF(key, deepcopy_legacy(key, memo, keepalive_pointer)); diff --git a/src/_reduce_helpers.c b/src/_reduce_helpers.c index 3b95f1b..9cd49b7 100644 --- a/src/_reduce_helpers.c +++ b/src/_reduce_helpers.c @@ -95,40 +95,14 @@ static int validate_reduce_tuple( PyObject* listitems = (size >= 4) ? PyTuple_GET_ITEM(reduce_result, 3) : Py_None; PyObject* dictitems = (size == 5) ? PyTuple_GET_ITEM(reduce_result, 4) : Py_None; - if (!PyCallable_Check(callable)) { - PyErr_Format( - PyExc_TypeError, - "first item of the tuple returned by __reduce__ must be callable, not %.200s", - Py_TYPE(callable)->tp_name - ); - return REDUCE_ERROR; - } - if (!PyTuple_Check(argtup)) { - PyErr_Format( - PyExc_TypeError, - "second item of the tuple returned by __reduce__ must be a tuple, not %.200s", - Py_TYPE(argtup)->tp_name - ); - return REDUCE_ERROR; - } - - if (listitems != Py_None && !PyIter_Check(listitems)) { - PyErr_Format( - PyExc_TypeError, - "fourth item of the tuple returned by __reduce__ must be an iterator, not %.200s", - Py_TYPE(listitems)->tp_name - ); - return REDUCE_ERROR; - } - - if (dictitems != Py_None && !PyIter_Check(dictitems)) { - PyErr_Format( - PyExc_TypeError, - "fifth item of the tuple returned by __reduce__ must be an iterator, not %.200s", - Py_TYPE(dictitems)->tp_name - ); - return REDUCE_ERROR; + PyObject* coerced = PySequence_Tuple(argtup); + if (!coerced) + return REDUCE_ERROR; + PyObject* old = argtup; + PyTuple_SET_ITEM(reduce_result, 1, coerced); + Py_DECREF(old); + argtup = coerced; } *out_callable = callable; diff --git a/tests/test_copium.py b/tests/test_copium.py index e11bdae..18031e7 100644 --- a/tests/test_copium.py +++ b/tests/test_copium.py @@ -72,21 +72,15 @@ def __deepcopy__(self, memo: MutableMapping): EXPECTED_ERROR_DIVERGENCES = { - repr(TypeError("EvilReduceArgs() takes no arguments")): repr( - TypeError("second item of the tuple returned by __reduce__ must be a tuple, not str") - ), - repr(TypeError("'int' object is not callable")): repr( - TypeError("first item of the tuple returned by __reduce__ must be callable, not int") - ), + # Recreating these errors messages seems unnecessary. + # It's highly unlikely that anyone would depend on it being this exact message + # the type of exception is preserved and that's enough. repr(TypeError("_reconstruct() missing 1 required positional argument: 'args'")): repr( TypeError("tuple returned by __reduce__ must contain 2 through 5 elements") ), - repr(AttributeError("'EvilStateSlotsMapping' object has no attribute 'items'")): repr( - TypeError("slot state is not a dictionary") - ), - repr(ValueError("not enough values to unpack (expected 2, got 1)")): repr( - ValueError("dictiter must yield (key, value) pairs") - ), + repr(ValueError("too many values to unpack (expected 2)")): repr( + ValueError("too many values to unpack (expected 2, got 3)") + ) }