Skip to content

Commit

Permalink
Merge pull request #1531 from rstudio/disable-subclassed-dict-list-co…
Browse files Browse the repository at this point in the history
…nversion

disable auto conversion of subclassed dicts and lists
  • Loading branch information
t-kalinowski authored Jan 30, 2024
2 parents 866ff9a + a12f30e commit c6b601f
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 70 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# reticulate (development version)

- Subclassed Python list and dict objects are no longer automatically converted
to R vectors. Additionally, the S3 R `class` attribute for Python objects is
now constructed using the Python `type(object)` directly, rather than from the
`object.__class__` attribute. See #1531 for details and context.

- The knitr python engine now formats captured python exceptions to include the
exception type and any exception notes when chunk options
`error = TRUE` is set (reported in #1520, fixed in #1527).
Expand Down
141 changes: 78 additions & 63 deletions src/python.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -458,9 +458,15 @@ std::string as_r_class(PyObject* classPtr) {

std::vector<std::string> py_class_names(PyObject* object) {

// class
PyObjectPtr classPtr(PyObject_GetAttrString(object, "__class__"));
if (classPtr.is_null())
// Py_TYPE() usually returns a borrowed reference to object.__class__
// but can differ if __class__ was modified after the object was created.
// (e.g., wrapt.ObjectProxy(dict()), as encountered in
// tensorflow.python.trackable.data_structures._DictWrapper)
// In CPython, the definition of Py_TYPE() changed in Python 3.10
// from a macro with no return type to a inline static function returning PyTypeObject*.
// for back compat, we continue to define Py_TYPE as a macro in reticulate/src/libpython.h
PyObject* type = (PyObject*) Py_TYPE(object);
if (type == NULL)
throw PythonException(py_fetch_error());

// call inspect.getmro to get the class and it's bases in
Expand All @@ -476,7 +482,7 @@ std::vector<std::string> py_class_names(PyObject* object) {
throw PythonException(py_fetch_error());
}

PyObjectPtr classes(PyObject_CallFunctionObjArgs(getmro, classPtr.get(), NULL));
PyObjectPtr classes(PyObject_CallFunctionObjArgs(getmro, type, NULL));
if (classes.is_null())
throw PythonException(py_fetch_error());

Expand Down Expand Up @@ -1305,65 +1311,74 @@ SEXP py_to_r(PyObject* x, bool convert) {

}

else if (PyList_Check(x)) {
// didn't pass PyList_CheckExact(), but does pass PyList_Check()
// so it's an object that subclasses list.
// (This type of subclassed list is used by tensorflow for lists of layers
// attached to a keras model, tensorflow.python.training.tracking.data_structures.List,
// https://github.com/rstudio/reticulate/issues/1226 )
// if needed, consider changing this check from PyList_Check(x) to either:
// - PySequence_Check(x), which just checks for existence of __getitem__ and __len__ methods,
// - PyObject_IsInstance(x, Py_ListClass) for wrapt.ProxyObject wrapping a list.

// Since it's a subclassed list.
// We can't depend on the the PyList_* API working,
// and must instead fallback to the generic PyObject_* API or PySequence_API.
// (PyList_*() function do not work for tensorflow.python.training.tracking.data_structures.List)
long len = PyObject_Size(x);
Rcpp::List list(len);
for (long i = 0; i < len; i++) {
PyObject *pi = PyLong_FromLong(i);
list[i] = py_to_r(PyObject_GetItem(x, pi), convert);
Py_DecRef(pi);
}
return list;
}

else if (PyObject_IsInstance(x, Py_DictClass)) {
// This check is kind of slow since it calls back into evaluating Python code instead of
// merely consulting the object header, but it is the only reliable way that works
// for tensorflow._DictWrapper,
// which in actually is a wrapt.ProxyObject pretending to be a dict.
// ProxyObject goes to great lenghts to pretend to be the underlying object,
// to the point that x.__class__ is __builtins__.dict,
// but it fails PyDict_CheckExact(x) and PyDict_Check(x).
// Registering a custom S3 r_to_py() method here isn't straighforward either,
// since the object presents as a plain dict when inspecting __class__,
// despite the fact that none of the PyDict_* C API functions work with it.

// PyMapping_Items returns a list of (key, value) tuples.
PyObjectPtr items(PyMapping_Items(x));

Py_ssize_t size = PyObject_Size(items);
std::vector<std::string> names(size);
Rcpp::List list(size);

for (Py_ssize_t idx = 0; idx < size; idx++) {
PyObjectPtr item(PySequence_GetItem(items, idx));
PyObject *key = PyTuple_GetItem(item, 0); // borrowed ref
PyObject *value = PyTuple_GetItem(item, 1); // borrowed ref

if (is_python_str(key)) {
names[idx] = as_utf8_r_string(key);
} else {
PyObjectPtr str(PyObject_Str(key));
names[idx] = as_utf8_r_string(str);
}
list[idx] = py_to_r(value, convert);
}
list.names() = names;
return list;
}
// These two commented-out ifelse branches convert a subclassed
// python list or dict object as if it was a simple list or dict. This is turning out
// to be a bad idea, as we encounter more user-facing subclassed dicts and
// list with additional methods or items that are discarded during conversion,
// and that users are expecting to persist. e.g., in huggingface
// https://github.com/rstudio/reticulate/issues/1510
// https://github.com/rstudio/reticulate/issues/1429#issuecomment-1658499679
// https://github.com/rstudio/reticulate/issues/1360#issuecomment-1680413674
//
// else if (PyList_Check(x)) {
// // didn't pass PyList_CheckExact(), but does pass PyList_Check()
// // so it's an object that subclasses list.
// // (This type of subclassed list is used by tensorflow for lists of layers
// // attached to a keras model, tensorflow.python.training.tracking.data_structures.List,
// // https://github.com/rstudio/reticulate/issues/1226 )
// // if needed, consider changing this check from PyList_Check(x) to either:
// // - PySequence_Check(x), which just checks for existence of __getitem__ and __len__ methods,
// // - PyObject_IsInstance(x, Py_ListClass) for wrapt.ProxyObject wrapping a list.
//
// // Since it's a subclassed list.
// // We can't depend on the the PyList_* API working,
// // and must instead fallback to the generic PyObject_* API or PySequence_API.
// // (PyList_*() function do not work for tensorflow.python.training.tracking.data_structures.List)
// long len = PyObject_Size(x);
// Rcpp::List list(len);
// for (long i = 0; i < len; i++) {
// PyObject *pi = PyLong_FromLong(i);
// list[i] = py_to_r(PyObject_GetItem(x, pi), convert);
// Py_DecRef(pi);
// }
// return list;
// }
//
// else if (PyObject_IsInstance(x, Py_DictClass)) {
// // This check is kind of slow since it calls back into evaluating Python code instead of
// // merely consulting the object header, but it is the only reliable way that works
// // for tensorflow._DictWrapper,
// // which in actually is a wrapt.ProxyObject pretending to be a dict.
// // ProxyObject goes to great lenghts to pretend to be the underlying object,
// // to the point that x.__class__ is __builtins__.dict,
// // but it fails PyDict_CheckExact(x) and PyDict_Check(x).
// // Registering a custom S3 r_to_py() method here isn't straighforward either,
// // since the object presents as a plain dict when inspecting __class__,
// // despite the fact that none of the PyDict_* C API functions work with it.
//
// // PyMapping_Items returns a list of (key, value) tuples.
// PyObjectPtr items(PyMapping_Items(x));
//
// Py_ssize_t size = PyObject_Size(items);
// std::vector<std::string> names(size);
// Rcpp::List list(size);
//
// for (Py_ssize_t idx = 0; idx < size; idx++) {
// PyObjectPtr item(PySequence_GetItem(items, idx));
// PyObject *key = PyTuple_GetItem(item, 0); // borrowed ref
// PyObject *value = PyTuple_GetItem(item, 1); // borrowed ref
//
// if (is_python_str(key)) {
// names[idx] = as_utf8_r_string(key);
// } else {
// PyObjectPtr str(PyObject_Str(key));
// names[idx] = as_utf8_r_string(str);
// }
// list[idx] = py_to_r(value, convert);
// }
// list.names() = names;
// return list;
// }

// callable
else if (py_is_callable(x)) {
Expand Down
23 changes: 16 additions & 7 deletions tests/testthat/test-python-objects.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ class M:
})



test_that("py_id() returns unique strings; #1216", {
skip_if_no_python()

Expand All @@ -53,8 +52,6 @@ test_that("py_id() returns unique strings; #1216", {





test_that("subclassed lists can be converted", {
skip_if_no_python()

Expand All @@ -75,7 +72,14 @@ class List(Sequence, list):
return len(self._storage)
")$List

expect_identical(List(1,2,3), list(1,2,3))
expect_contains(class(List(1,2,3)),
c("__main__.List",
"collections.abc.Sequence",
"python.builtin.list",
"python.builtin.object"))

py_bt_list <- import_builtins()$list
expect_identical(py_bt_list(List(1, 2, "3")), list(1, 2, "3"))

})

Expand All @@ -96,11 +100,16 @@ assert isinstance(Dict({}), dict)
")$Dict

expect_identical(Dict(dict()), structure(list(), names = character(0)))
expect_identical(Dict(list("abc" = 1:3)), list("abc" = 1:3))
expect_contains(class(Dict(dict())),
c("__main__.Dict",
"python.builtin.ObjectProxy",
"python.builtin.object"))

})
x <- list("abc" = 1:3)
py_bt_dict <- import_builtins()$dict
expect_identical(py_bt_dict(Dict(x)), x)

})


test_that("capsules can be freed by other threads", {
Expand Down

0 comments on commit c6b601f

Please sign in to comment.