
"maximum recursion depth reached" when submitting ~200 workchains #4876

Closed

ltalirz opened this issue Apr 28, 2021 · 72 comments

@ltalirz
Member

ltalirz commented Apr 28, 2021

@danieleongari reports the following error when submitting ~200 workchains with aiida-core 1.6.1:

2021-04-27 16:41:13 [26387 | ERROR]: Traceback (most recent call last):
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/manage/external/rmq.py", line 208, in _continue
    result = await super()._continue(communicator, pid, nowait, tag)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/plumpy/process_comms.py", line 607, in _continue
    proc = cast('Process', saved_state.unbundle(self._load_context))
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/plumpy/persistence.py", line 60, in unbundle
    return Savable.load(self, load_context)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/plumpy/persistence.py", line 452, in load
    return load_cls.recreate_from(saved_state, load_context)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/plumpy/processes.py", line 238, in recreate_from
    process = cast(Process, super().recreate_from(saved_state, load_context))
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/plumpy/persistence.py", line 477, in recreate_from
    call_with_super_check(obj.load_instance_state, saved_state, load_context)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/plumpy/base/utils.py", line 29, in call_with_super_check
    wrapped(*args, **kwargs)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/engine/processes/workchains/workchain.py", line 105, in load_instance_state
    super().load_instance_state(saved_state, load_context)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/engine/processes/process.py", line 284, in load_instance_state
    super().load_instance_state(saved_state, load_context)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/plumpy/processes.py", line 620, in load_instance_state
    decoded = self.decode_input_args(saved_state[BundleKeys.INPUTS_RAW])
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/engine/processes/process.py", line 607, in decode_input_args
    return serialize.deserialize(encoded)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/utils/serialize.py", line 230, in deserialize
    return yaml.load(serialized, Loader=AiiDALoader)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/yaml/__init__.py", line 114, in load
    return loader.get_single_data()
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/yaml/constructor.py", line 43, in get_single_data
    return self.construct_document(node)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/yaml/constructor.py", line 47, in construct_document
    data = self.construct_object(node)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/yaml/constructor.py", line 92, in construct_object
    data = constructor(self, node)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/utils/serialize.py", line 131, in mapping_constructor
    yaml_node = loader.construct_mapping(mapping, deep=True)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/yaml/constructor.py", line 210, in construct_mapping
    return super().construct_mapping(node, deep=deep)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/yaml/constructor.py", line 135, in construct_mapping
    value = self.construct_object(value_node, deep=deep)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/yaml/constructor.py", line 99, in construct_object
    for dummy in generator:
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/yaml/constructor.py", line 404, in construct_yaml_map
    value = self.construct_mapping(node)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/yaml/constructor.py", line 210, in construct_mapping
    return super().construct_mapping(node, deep=deep)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/yaml/constructor.py", line 135, in construct_mapping
    value = self.construct_object(value_node, deep=deep)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/yaml/constructor.py", line 92, in construct_object
    data = constructor(self, node)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/utils/serialize.py", line 56, in node_constructor
    return orm.load_node(uuid=yaml_node)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/utils/__init__.py", line 197, in load_node
    return load_entity(
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/utils/__init__.py", line 77, in load_entity
    return entity_loader.load_entity(
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/utils/loaders.py", line 213, in load_entity
    entity = builder.one()[0]
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/querybuilder.py", line 2179, in one
    res = self.all()
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/querybuilder.py", line 2252, in all
    matches = list(self.iterall(batch_size=batch_size))
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/querybuilder.py", line 2209, in iterall
    query = self.get_query()
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/querybuilder.py", line 2088, in get_query
    query = self._build()
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/querybuilder.py", line 1938, in _build
    self._query = self._query.filter(self._build_filters(alias, filter_specs))
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/querybuilder.py", line 1373, in _build_filters
    self._impl.get_filter_expr(
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/implementation/django/querybuilder.py", line 212, in get_filter_expr
    self.get_filter_expr(
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/implementation/django/querybuilder.py", line 237, in get_filter_expr
    expr = self.get_filter_expr_from_column(operator, value, column)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/aiida/orm/implementation/querybuilder.py", line 217, in get_filter_expr_from_column
    expr = database_entity.cast(String).like(value)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/orm/attributes.py", line 236, in __getattr__
    return getattr(self.comparator, key)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/util/langhelpers.py", line 987, in __getattr__
    return self._fallback_getattr(key)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/orm/properties.py", line 364, in _fallback_getattr
    return getattr(self.__clause_element__(), key)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/util/langhelpers.py", line 974, in oneshot
    result = fn(*args, **kw)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/orm/properties.py", line 316, in _memoized_method___clause_element__
    return self.adapter(self.prop.columns[0])
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/orm/util.py", line 680, in _adapt_element
    return self._adapter.traverse(elem)._annotate(
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/util.py", line 936, in traverse
    return self.columns[obj]
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/util/_collections.py", line 745, in __missing__
    self[key] = val = self.creator(self.weakself(), key)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/util.py", line 943, in _locate_col
    c = ClauseAdapter.traverse(self, col)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/visitors.py", line 240, in traverse
    return replacement_traverse(obj, self.__traverse_options__, replace)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/visitors.py", line 484, in replacement_traverse
    obj = clone(obj, **opts)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/visitors.py", line 473, in clone
    newelem = replace(elem)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/visitors.py", line 236, in replace
    e = v.replace(elem)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/util.py", line 848, in replace
    return self._corresponding_column(col, True)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/util.py", line 820, in _corresponding_column
    newcol = self.selectable.corresponding_column(
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/selectable.py", line 560, in corresponding_column
    if self.c.contains_column(column):
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/util/langhelpers.py", line 893, in __get__
    obj.__dict__[self.__name__] = result = self.fget(obj)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/selectable.py", line 647, in columns
    self._populate_column_collection()
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/selectable.py", line 1393, in _populate_column_collection
    col._make_proxy(self)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/schema.py", line 1802, in _make_proxy
    c = self._constructor(
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/schema.py", line 1568, in __init__
    self._init_items(*args)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/schema.py", line 121, in _init_items
    spwd(self)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/base.py", line 461, in _set_parent_with_dispatch
    self._set_parent(parent)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/schema.py", line 2282, in _set_parent
    self.parent._on_table_attach(self._set_table)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/sql/schema.py", line 1722, in _on_table_attach
    event.listen(self, "after_parent_attach", fn)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/event/api.py", line 102, in listen
    _event_key(target, identifier, fn).listen(*args, **kw)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/event/api.py", line 25, in _event_key
    tgt = evt_cls._accept_with(target)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/event/base.py", line 232, in _accept_with
    if hasattr(target, "dispatch"):
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/event/base.py", line 298, in __get__
    obj.__dict__["dispatch"] = disp = self.dispatch._for_instance(obj)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/event/base.py", line 121, in _for_instance
    return self._for_class(instance_cls)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/event/base.py", line 117, in _for_class
    return self.__class__(self, instance_cls)
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/site-packages/sqlalchemy/event/base.py", line 83, in __init__
    self._empty_listeners = self._empty_listener_reg[instance_cls]
  File "/home/daniele/anaconda3/envs/aiida_py38/lib/python3.8/weakref.py", line 383, in __getitem__
    return self.data[ref(key)]
RecursionError: maximum recursion depth exceeded while calling a Python object

The error goes away when adding a `time.sleep(2)` between submissions.
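
For reference, a minimal sketch of such a staggered submission loop (the workchain entry point and inputs below are hypothetical placeholders, not taken from the report):

import time

from aiida import load_profile, orm
from aiida.engine import submit
from aiida.plugins import WorkflowFactory

load_profile()
MyWorkChain = WorkflowFactory('my.workchain')  # hypothetical entry point

for value in range(200):
    node = submit(MyWorkChain, x=orm.Int(value))  # hypothetical inputs
    time.sleep(2)  # the reported workaround: pause between submissions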

Your environment

  • Operating system: Ubuntu
  • Python version: 3.8
  • aiida-core version: 1.6.1
@sphuber
Contributor

sphuber commented Apr 28, 2021

Does this come from the submission script or the daemon? Looks to be on the daemon side, correct?

@ltalirz
Member Author

ltalirz commented Apr 28, 2021

Sorry, forgot to mention: it comes from the submission script, not from the daemon logs

@chrisjsewell
Member

It may also be helpful to provide the output of `pip freeze`, i.e. to check the version of sqlalchemy.

@sphuber
Contributor

sphuber commented Apr 28, 2021

Sorry, forgot to mention: it comes from the submission script

Are you sure? Is he running the workchains in that script or sending them to the daemon? The reason I am asking is that the stack trace shows the problem originates from the `ProcessLauncher._continue` call, which should not be called during submission. This is the hook that is called on a daemon runner when it receives a task to continue a process from RabbitMQ.
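
Schematically, that daemon-side flow looks roughly like the following (a simplified sketch reconstructed from the stack trace above, not the actual plumpy/aiida-core source):

async def _continue(self, communicator, pid, nowait, tag=None):
    # Runs on a daemon worker when RabbitMQ delivers a 'continue' task for pid
    saved_state = self._persister.load_checkpoint(pid, tag)  # checkpoint is a YAML bundle
    process = saved_state.unbundle(self._load_context)       # deserializes the inputs, calling
                                                             # orm.load_node(...) for each one
    await process.step_until_terminated()                    # then runs the process to completion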

@danieleongari

danieleongari commented Apr 28, 2021

@sphuber The error comes from inspecting the report (verdi process report). The workchains are submitted from a Jupyter notebook using the builder. Of the ca. 200 calculations, the first ten or so run fine, and then most of them become excepted with the error that Leo reported: it looks like a flood of processes is causing problems.

@chrisjsewell is this enough?

~ pip freeze | egrep sql
SQLAlchemy @ file:///home/conda/feedstock_root/build_artifacts/sqlalchemy_1612225077951/work
SQLAlchemy-Utils @ file:///home/conda/feedstock_root/build_artifacts/sqlalchemy-utils_1614043858099/work
sqlparse @ file:///home/conda/feedstock_root/build_artifacts/sqlparse_1602142927465/work

Thank you for the help!

@sphuber
Contributor

sphuber commented Apr 28, 2021

Thanks for the additional info @danieleongari. It would be very useful if we could actually get the version of sqlalchemy. Maybe you can do the following in a shell or notebook:

In [1]: import sqlalchemy

In [2]: sqlalchemy.__version__
Out[2]: '1.3.23'

So as expected, the problem is on the daemon worker side and not the submit script. What is happening is that the daemon worker receives a task from the process queue in RabbitMQ: it loads the corresponding node from the database and then uses the YAML dump in the checkpoint attribute to reconstruct the Process instance in memory from that serialized version. This includes the entire set of inputs of the process, which were also serialized, so each of those is reloaded from the database. This happens in the line:

return orm.load_node(uuid=yaml_node)

from the aiida.orm.utils.serialize module. The load_node call uses the QueryBuilder underneath, which then goes into sqlalchemy, which accounts for a large part of the stack trace. This is where I lose the thread, because apparently something in the sqlalchemy code that is invoked when we call load_node leads to this infinite recursion. I have no idea why, or why this should be related to many processes being run.
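
To make the mechanism concrete: the checkpoint stores each input node as a tagged UUID. A minimal sketch of that round trip, assuming the aiida-core 1.6 serialize API that appears in the trace:

from aiida import load_profile, orm
from aiida.orm.utils import serialize

load_profile()

node = orm.Int(5).store()
encoded = serialize.serialize({'x': node})  # the node is dumped as a YAML tag wrapping its UUID
decoded = serialize.deserialize(encoded)    # each tagged UUID triggers orm.load_node(uuid=...)
assert decoded['x'].uuid == node.uuid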

The only hint is that the final method in the stack trace comes from the built-in weakref module. At this point I have to start speculating, but it could be that the daemon worker is trying to load a node that is already loaded in its memory (as part of the inputs of another process it is working on; they could share the same Code as input, for example) and there might be some caching mechanism in place that couples the node to the existing ref when the same node gets loaded again. I am really spitballing here and have no idea how to debug this further or try to reproduce it.

@ltalirz
Member Author

ltalirz commented Apr 28, 2021

In [1]: import sqlalchemy

In [2]: sqlalchemy.__version__
Out[2]: '1.3.23'

Sorry for the hiccup with the submission script vs. daemon; there was a miscommunication on our side.

@mbercx
Member

mbercx commented May 3, 2021

I'm having a somewhat similar issue, so I'll add my case to this one. When submitting 100 PwBaseWorkChains in a loop (without any pause in between 😬), half of them got stuck in the "Running" state:

$ verdi process list -S running
   PK  Created    Process label    Process State    Process status
-----  ---------  ---------------  ---------------  ----------------
36774  1h ago     PwBaseWorkChain  ⏵ Running
36783  1h ago     PwBaseWorkChain  ⏵ Running
...
37166  1h ago     PwBaseWorkChain  ⏵ Running

Total results: 49

(Note that I've already deleted and restarted one manually.) The logs for these are completely empty:

$ verdi process report 37156
No log messages recorded for this entry

But a bit of digging through the daemon logs gives me the following trace:

05/03/2021 08:52:17 PM <16823> kiwipy.rmq.tasks: [ERROR] Exception occurred while processing task.
Traceback (most recent call last):
  File "/home/mbercx/.virtualenvs/aiida-sirius/lib/python3.8/site-packages/plumpy/utils.py", line 128, in __getattr__
    return self[attr]
  File "/home/mbercx/.virtualenvs/aiida-sirius/lib/python3.8/site-packages/plumpy/utils.py", line 85, in __getitem__
    return self._dict[key]
KeyError: 'kpoints'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mbercx/envs/aiida-sirius/code/aiida-quantumespresso/aiida_quantumespresso/workflows/pw/base.py", line 247, in validate_kpoints
    kpoints = self.inputs.kpoints
  File "/home/mbercx/.virtualenvs/aiida-sirius/lib/python3.8/site-packages/plumpy/utils.py", line 131, in __getattr__
    raise AttributeError(errmsg)
AttributeError: 'AttributesFrozendict' object has no attribute 'kpoints'

These two are basically repeated until a RecursionError is raised:

Traceback (most recent call last):
  File "/home/mbercx/.virtualenvs/aiida-sirius/lib/python3.8/site-packages/kiwipy/rmq/tasks.py", line 166, in _on_task
    result = await result
  File "/usr/lib/python3.8/asyncio/futures.py", line 257, in __await__
    yield self  # This tells Task to wait for completion.
  File "/usr/lib/python3.8/asyncio/tasks.py", line 349, in __wakeup
    future.result()
  File "/usr/lib/python3.8/asyncio/futures.py", line 175, in result
    raise self._exception
  File "/home/mbercx/.virtualenvs/aiida-sirius/lib/python3.8/site-packages/kiwipy/rmq/threadcomms.py", line 253, in done
    result = kiwi_future.result()
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 432, in result
    return self.__get_result()
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 388, in __get_result
    raise self._exception
  File "/home/mbercx/.virtualenvs/aiida-sirius/lib/python3.8/site-packages/kiwipy/futures.py", line 54, in capture_exceptions
    yield
  File "/home/mbercx/.virtualenvs/aiida-sirius/lib/python3.8/site-packages/plumpy/communications.py", line 48, in on_done
    result = plum_future.result()
  File "/usr/lib/python3.8/asyncio/futures.py", line 175, in result
    raise self._exception
  File "/home/mbercx/.virtualenvs/aiida-sirius/lib/python3.8/site-packages/kiwipy/futures.py", line 54, in capture_exceptions
    yield
  File "/home/mbercx/.virtualenvs/aiida-sirius/lib/python3.8/site-packages/plumpy/futures.py", line 73, in run_task
    res = await coro()
  File "/home/mbercx/.virtualenvs/aiida-sirius/lib/python3.8/site-packages/plumpy/process_comms.py", line 539, in __call__
    return await self._continue(communicator, **task.get(TASK_ARGS, {}))
  File "/home/mbercx/envs/aiida-sirius/code/aiida-core/aiida/manage/external/rmq.py", line 219, in _continue
    self.handle_continue_exception(node, exception, message)
  File "/home/mbercx/envs/aiida-sirius/code/aiida-core/aiida/manage/external/rmq.py", line 158, in handle_continue_exception
    node.logger.exception(message)
  File "/usr/lib/python3.8/logging/__init__.py", line 1814, in exception
    self.log(ERROR, msg, *args, exc_info=exc_info, **kwargs)
  File "/usr/lib/python3.8/logging/__init__.py", line 1829, in log
    self.logger.log(level, msg, *args, **kwargs)
  File "/usr/lib/python3.8/logging/__init__.py", line 1500, in log
    self._log(level, msg, args, **kwargs)
  File "/usr/lib/python3.8/logging/__init__.py", line 1577, in _log
    self.handle(record)
  File "/usr/lib/python3.8/logging/__init__.py", line 1587, in handle
    self.callHandlers(record)
  File "/usr/lib/python3.8/logging/__init__.py", line 1649, in callHandlers
    hdlr.handle(record)
  File "/usr/lib/python3.8/logging/__init__.py", line 950, in handle
    self.emit(record)
  File "/usr/lib/python3.8/logging/__init__.py", line 1081, in emit
    msg = self.format(record)
  File "/usr/lib/python3.8/logging/__init__.py", line 925, in format
    return fmt.format(record)
  File "/usr/lib/python3.8/logging/__init__.py", line 672, in format
    record.exc_text = self.formatException(record.exc_info)
  File "/usr/lib/python3.8/logging/__init__.py", line 622, in formatException
    traceback.print_exception(ei[0], ei[1], tb, None, sio)
  File "/usr/lib/python3.8/traceback.py", line 103, in print_exception
    for line in TracebackException(
  File "/usr/lib/python3.8/traceback.py", line 493, in __init__
    context = TracebackException(
  File "/usr/lib/python3.8/traceback.py", line 493, in __init__
    context = TracebackException(
  File "/usr/lib/python3.8/traceback.py", line 493, in __init__
    context = TracebackException(
  [Previous line repeated 34 more times]
  File "/usr/lib/python3.8/traceback.py", line 476, in __init__
    _seen.add(id(exc_value))
RecursionError: maximum recursion depth exceeded while calling a Python object
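
Note that the tail of this trace is no longer the original error but Python's own traceback machinery recursing through the exception __context__ chain. A standalone sketch of that effect (Python 3.8 behaviour, where TracebackException builds the chain recursively; the low recursion limit is only there to trigger it quickly):

import sys
import traceback

# Build a long __context__ chain iteratively, mimicking the repeated
# KeyError/AttributeError pairs in the daemon log above.
exc = None
for level in range(500):
    try:
        raise RuntimeError(f'level {level}')
    except RuntimeError as new_exc:
        new_exc.__context__ = exc
        exc = new_exc

sys.setrecursionlimit(100)  # artificially low, to show the effect quickly
try:
    traceback.TracebackException.from_exception(exc)  # recurses once per chained exception
except RecursionError:
    print('formatting the traceback itself exceeded the recursion limit')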

After deleting one and restarting it, it ran just fine. I've also started ~35 work chains like this without problems. I then tried deleting the remaining 49 and restarting them all at once, and wound up with 7 PwBaseWorkChains stuck in Running, with the same error trace as above, followed by a whole range of reports of the PwCalculations being launched:

RecursionError: maximum recursion depth exceeded while calling a Python object
05/03/2021 10:28:13 PM <17394> aiida.orm.nodes.process.workflow.workchain.WorkChainNode: [REPORT] [37947|PwBaseWorkChain|run_process]: launching PwCalculation<37961> iteration #1
05/03/2021 10:28:14 PM <17394> aiida.orm.nodes.process.workflow.workchain.WorkChainNode: [REPORT] [37849|PwBaseWorkChain|run_process]: launching PwCalculation<37964> iteration #1
...

Afterwards, I deleted all 100 work chains and restarted them with a 5-second pause in between. Now all work chains were able to create the PwCalculations without issue.

I'm running:

OS: Ubuntu 18.04.5 LTS
Python: 3.8
aiida-core: 1.6.3 (but I think I've also spotted this issue when running v1.6.1; I just couldn't pin it down at the time)
plumpy: 0.19.0
kiwipy: 0.7.4
sqlalchemy: 1.3.23

@dev-zero
Contributor

Seeing the same thing, also when submitting a large number of workchains (whether they are submitted one-by-one in a script or launched by a parent workchain does not matter, but when launching them via a parent workchain the issue is triggered sooner).

The backtrace is a bit different, though:

2021-05-14 10:55:09 [1739 | REPORT]: [6047|Cp2kEosWorkChain|on_except]: Traceback (most recent call last):
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/process_states.py", line 230, in execute
    result = self.run_fn(*self.args, **self.kwargs)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/workchains/workchain.py", line 214, in _do_step
    finished, stepper_result = self._stepper.step()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/workchains.py", line 299, in step
    finished, result = self._child_stepper.step()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/workchains.py", line 250, in step
    return True, self._fn(self._workchain)
  File "/scratch/tiziano/work/aiida/aiida-cp2k/aiida_cp2k/workchains/eos.py", line 170, in run_calculations
    builder.cp2k.structure = get_rescaled_structure(self.inputs.structure, Float(scale))
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/functions.py", line 179, in decorated_function
    result, _ = run_get_node(*args, **kwargs)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/functions.py", line 131, in run_get_node
    process = process_class(inputs=inputs, runner=runner)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/state_machine.py", line 193, in __call__
    inst.transition_to(inst.create_initial_state())
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/state_machine.py", line 335, in transition_to
    self.transition_failed(initial_state_label, label, *sys.exc_info()[1:])
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/state_machine.py", line 351, in transition_failed
    raise exception.with_traceback(trace)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/state_machine.py", line 320, in transition_to
    self._enter_next_state(new_state)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/state_machine.py", line 382, in _enter_next_state
    self._fire_state_event(StateEventHook.ENTERING_STATE, next_state)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/state_machine.py", line 299, in _fire_state_event
    callback(self, hook, state)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/processes.py", line 324, in <lambda>
    lambda _s, _h, state: self.on_entering(cast(process_states.State, state)),
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/process.py", line 380, in on_entering
    super().on_entering(state)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/processes.py", line 669, in on_entering
    call_with_super_check(self.on_create)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/utils.py", line 29, in call_with_super_check
    wrapped(*args, **kwargs)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/process.py", line 376, in on_create
    self._pid = self._create_and_setup_db_record()  # pylint: disable=attribute-defined-outside-init
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/process.py", line 563, in _create_and_setup_db_record
    self._setup_db_record()
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/functions.py", line 362, in _setup_db_record
    super()._setup_db_record()
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/process.py", line 672, in _setup_db_record
    self._setup_inputs()
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/process.py", line 709, in _setup_inputs
    self.node.add_incoming(node, LinkType.INPUT_CALC, name)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/orm/nodes/node.py", line 417, in add_incoming
    self.validate_incoming(source, link_type, link_label)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/orm/nodes/process/process.py", line 472, in validate_incoming
    super().validate_incoming(source, link_type, link_label)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/orm/utils/mixins.py", line 148, in validate_incoming
    super().validate_incoming(source, link_type=link_type, link_label=link_label)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/orm/nodes/node.py", line 449, in validate_incoming
    if builder.count() > 0:
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/orm/querybuilder.py", line 2164, in count
    return self._impl.count(query)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/orm/implementation/querybuilder.py", line 288, in count
    return query.count()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 3803, in count
    return self.from_self(col).scalar()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 3523, in scalar
    ret = self.one()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 3490, in one
    ret = self.one_or_none()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 3459, in one_or_none
    ret = list(self)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 3535, in __iter__
    return self._execute_and_instances(context)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 3560, in _execute_and_instances
    result = conn.execute(querycontext.statement, self._params)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/engine/base.py", line 1011, in execute
    return meth(self, multiparams, params)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/elements.py", line 298, in _execute_on_connection
    return connection._execute_clauseelement(self, multiparams, params)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/engine/base.py", line 1115, in _execute_clauseelement
    compiled_sql = elem.compile(
  File "<string>", line 1, in <lambda>
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/elements.py", line 481, in compile
    return self._compiler(dialect, bind=bind, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/elements.py", line 487, in _compiler
    return dialect.statement_compiler(dialect, self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 592, in __init__
    Compiled.__init__(self, dialect, statement, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 322, in __init__
    self.string = self.process(self.statement, **compile_kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 352, in process
    return obj._compiler_dispatch(self, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2201, in visit_select
    text = self._compose_select_body(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2292, in _compose_select_body
    [
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2293, in <listcomp>
    f._compiler_dispatch(self, asfrom=True, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1801, in visit_alias
    ret = alias.original._compiler_dispatch(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2201, in visit_select
    text = self._compose_select_body(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2292, in _compose_select_body
    [
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2293, in <listcomp>
    f._compiler_dispatch(self, asfrom=True, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2435, in visit_join
    join.left._compiler_dispatch(self, asfrom=True, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2437, in visit_join
    + join.right._compiler_dispatch(self, asfrom=True, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1727, in visit_cte
    self.visit_cte(pre_alias_cte, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1758, in visit_cte
    cte.original._compiler_dispatch(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1216, in visit_compound_select
    text = (" " + keyword + " ").join(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1218, in <genexpr>
    c._compiler_dispatch(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2201, in visit_select
    text = self._compose_select_body(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2301, in _compose_select_body
    t = select._whereclause._compiler_dispatch(self, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1040, in visit_clauselist
    text = sep.join(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1040, in <genexpr>
    text = sep.join(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1043, in <genexpr>
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1040, in visit_clauselist
    text = sep.join(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1040, in <genexpr>
    text = sep.join(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1043, in <genexpr>
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1354, in visit_binary
    return disp(binary, operator_, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1471, in visit_like_op_binary
    binary.left._compiler_dispatch(self, **kw),
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1075, in visit_cast
    cast.clause._compiler_dispatch(self, **kwargs),
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/annotation.py", line 79, in _compiler_dispatch
    return self.__element.__class__._compiler_dispatch(self, visitor, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 912, in visit_column
    if not is_literal and isinstance(name, elements._truncated_label):
RecursionError: maximum recursion depth exceeded while calling a Python object

aiida-core: 403f7e7
plumpy: 0.19.0
kiwipy: 0.7.4
sqlalchemy: 1.3.24
python: 3.9.5

@zhubonan
Contributor

Just want to say that I previously did a similar thing, submitting work chains (~100 VaspRelaxWorkChain) in a loop without delay (aiida-core < 1.6.1), and found no issue. In my case the submission process itself is rather slow, though: each call to submit takes about 1.5 seconds.

@dev-zero
Contributor

And another one, again at a different place (but this one comes from the sub-workchain called by the primary one):

$ verdi process report 15370
2021-05-17 10:26:29 [7359 | REPORT]: [15370|Cp2kBaseWorkChain|on_except]: Traceback (most recent call last):
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/process_states.py", line 230, in execute
    result = self.run_fn(*self.args, **self.kwargs)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/workchains/workchain.py", line 214, in _do_step
    finished, stepper_result = self._stepper.step()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/workchains.py", line 299, in step
    finished, result = self._child_stepper.step()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/workchains.py", line 532, in step
    finished, result = self._child_stepper.step()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/workchains.py", line 299, in step
    finished, result = self._child_stepper.step()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/workchains.py", line 250, in step
    return True, self._fn(self._workchain)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/workchains/restart.py", line 183, in run_process
    node = self.submit(self.process_class, **inputs)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/process.py", line 498, in submit
    return self.runner.submit(process, *args, **kwargs)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/runners.py", line 184, in submit
    process_inited = self.instantiate_process(process, *args, **inputs)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/runners.py", line 170, in instantiate_process
    return instantiate_process(self, process, *args, **inputs)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/utils.py", line 65, in instantiate_process
    process = process_class(runner=runner, inputs=inputs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/state_machine.py", line 193, in __call__
    inst.transition_to(inst.create_initial_state())
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/state_machine.py", line 335, in transition_to
    self.transition_failed(initial_state_label, label, *sys.exc_info()[1:])
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/state_machine.py", line 351, in transition_failed
    raise exception.with_traceback(trace)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/state_machine.py", line 320, in transition_to
    self._enter_next_state(new_state)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/state_machine.py", line 382, in _enter_next_state
    self._fire_state_event(StateEventHook.ENTERING_STATE, next_state)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/state_machine.py", line 299, in _fire_state_event
    callback(self, hook, state)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/processes.py", line 324, in <lambda>
    lambda _s, _h, state: self.on_entering(cast(process_states.State, state)),
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/process.py", line 380, in on_entering
    super().on_entering(state)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/processes.py", line 669, in on_entering
    call_with_super_check(self.on_create)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/plumpy/base/utils.py", line 29, in call_with_super_check
    wrapped(*args, **kwargs)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/process.py", line 376, in on_create
    self._pid = self._create_and_setup_db_record()  # pylint: disable=attribute-defined-outside-init
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/process.py", line 563, in _create_and_setup_db_record
    self._setup_db_record()
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/process.py", line 672, in _setup_db_record
    self._setup_inputs()
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/engine/processes/process.py", line 709, in _setup_inputs
    self.node.add_incoming(node, LinkType.INPUT_CALC, name)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/orm/nodes/node.py", line 417, in add_incoming
    self.validate_incoming(source, link_type, link_label)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/orm/nodes/process/process.py", line 472, in validate_incoming
    super().validate_incoming(source, link_type, link_label)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/orm/utils/mixins.py", line 148, in validate_incoming
    super().validate_incoming(source, link_type=link_type, link_label=link_label)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/orm/nodes/node.py", line 449, in validate_incoming
    if builder.count() > 0:
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/orm/querybuilder.py", line 2164, in count
    return self._impl.count(query)
  File "/scratch/tiziano/work/aiida/aiida-core/aiida/orm/implementation/querybuilder.py", line 288, in count
    return query.count()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 3803, in count
    return self.from_self(col).scalar()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 3523, in scalar
    ret = self.one()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 3490, in one
    ret = self.one_or_none()
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 3459, in one_or_none
    ret = list(self)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 3535, in __iter__
    return self._execute_and_instances(context)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 3560, in _execute_and_instances
    result = conn.execute(querycontext.statement, self._params)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/engine/base.py", line 1011, in execute
    return meth(self, multiparams, params)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/elements.py", line 298, in _execute_on_connection
    return connection._execute_clauseelement(self, multiparams, params)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/engine/base.py", line 1115, in _execute_clauseelement
    compiled_sql = elem.compile(
  File "<string>", line 1, in <lambda>
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/elements.py", line 481, in compile
    return self._compiler(dialect, bind=bind, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/elements.py", line 487, in _compiler
    return dialect.statement_compiler(dialect, self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 592, in __init__
    Compiled.__init__(self, dialect, statement, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 322, in __init__
    self.string = self.process(self.statement, **compile_kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 352, in process
    return obj._compiler_dispatch(self, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2201, in visit_select
    text = self._compose_select_body(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2292, in _compose_select_body
    [
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2293, in <listcomp>
    f._compiler_dispatch(self, asfrom=True, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1801, in visit_alias
    ret = alias.original._compiler_dispatch(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2201, in visit_select
    text = self._compose_select_body(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2292, in _compose_select_body
    [
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2293, in <listcomp>
    f._compiler_dispatch(self, asfrom=True, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2435, in visit_join
    join.left._compiler_dispatch(self, asfrom=True, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2437, in visit_join
    + join.right._compiler_dispatch(self, asfrom=True, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1727, in visit_cte
    self.visit_cte(pre_alias_cte, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1758, in visit_cte
    cte.original._compiler_dispatch(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1216, in visit_compound_select
    text = (" " + keyword + " ").join(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1218, in <genexpr>
    c._compiler_dispatch(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2201, in visit_select
    text = self._compose_select_body(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 2301, in _compose_select_body
    t = select._whereclause._compiler_dispatch(self, **kwargs)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1040, in visit_clauselist
    text = sep.join(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1040, in <genexpr>
    text = sep.join(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1043, in <genexpr>
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1040, in visit_clauselist
    text = sep.join(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1040, in <genexpr>
    text = sep.join(
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1043, in <genexpr>
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1354, in visit_binary
    return disp(binary, operator_, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1471, in visit_like_op_binary
    binary.left._compiler_dispatch(self, **kw),
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/scratch/tiziano/virtualenvs/aiida/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1075, in visit_cast
    cast.clause._compiler_dispatch(self, **kwargs),
RecursionError: maximum recursion depth exceeded in comparison

@dev-zero
Contributor

I took the liberty of adding the important label, because the mess this leaves behind is considerable (one also has to check all subprocesses).

@dev-zero
Contributor

@danieleongari is the computer used to run the calculations configured with an SSH proxy command?

@giovannipizzi
Member

Just to report the same issue when submitting many common workflows. I see two different behaviours (probably because of the different places where the recursion limit occurs):

2021-06-11 07:03:21 [1133 | REPORT]: [6664|EquationOfStateWorkChain|run_init]: submitting `QuantumEspressoCommonRelaxWorkChain` for scale_factor `uuid: 8c5be874-7345-478e-9139-e44f3a6e00cc (pk: 6758) value: 0.94`
2021-06-11 07:04:02 [1193 | REPORT]:     [7525|PwRelaxWorkChain|setup]: No change in volume possible for the provided base input parameters. Meta convergence is turned off.
2021-06-11 07:04:02 [1194 | REPORT]:     [7525|PwRelaxWorkChain|setup]: Work chain will not run final SCF when `calculation` is set to `scf` for the relaxation `PwBaseWorkChain`.
2021-06-11 07:04:03 [1199 | REPORT]:     [7525|PwRelaxWorkChain|run_relax]: launching PwBaseWorkChain<7628>
2021-06-11 07:05:01 [1451 | REPORT]:       [7628|PwBaseWorkChain|run_process]: launching PwCalculation<9311> iteration #1
2021-06-11 07:48:44 [4902 | REPORT]:       [7628|PwBaseWorkChain|results]: work chain completed after 1 iterations
2021-06-11 07:48:44 [4903 | REPORT]:       [7628|PwBaseWorkChain|on_terminated]: remote folders will not be cleaned
2021-06-11 07:49:26 [4924 | REPORT]:     [7525|PwRelaxWorkChain|results]: workchain completed after 1 iterations
2021-06-11 07:49:29 [4928 | REPORT]:     [7525|PwRelaxWorkChain|on_terminated]: cleaned remote folders of calculations: 9311
2021-06-11 07:50:03 [4960 | REPORT]: [6664|EquationOfStateWorkChain|on_except]: Traceback (most recent call last):
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/process_states.py", line 230, in execute
    result = self.run_fn(*self.args, **self.kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/workchains/workchain.py", line 214, in _do_step
    finished, stepper_result = self._stepper.step()
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/workchains.py", line 299, in step
    finished, result = self._child_stepper.step()
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/workchains.py", line 250, in step
    return True, self._fn(self._workchain)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-common-workflows/aiida_common_workflows/workflows/eos.py", line 196, in run_eos
    builder, structure = self.get_sub_workchain_builder(scale_factor, reference_workchain=reference_workchain)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-common-workflows/aiida_common_workflows/workflows/eos.py", line 160, in get_sub_workchain_builder
    structure = scale_structure(self.inputs.structure, scale_factor)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/functions.py", line 179, in decorated_function
    result, _ = run_get_node(*args, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/functions.py", line 131, in run_get_node
    process = process_class(inputs=inputs, runner=runner)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/state_machine.py", line 193, in __call__
    inst.transition_to(inst.create_initial_state())
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/state_machine.py", line 335, in transition_to
    self.transition_failed(initial_state_label, label, *sys.exc_info()[1:])
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/state_machine.py", line 351, in transition_failed
    raise exception.with_traceback(trace)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/state_machine.py", line 320, in transition_to
    self._enter_next_state(new_state)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/state_machine.py", line 382, in _enter_next_state
    self._fire_state_event(StateEventHook.ENTERING_STATE, next_state)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/state_machine.py", line 299, in _fire_state_event
    callback(self, hook, state)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/processes.py", line 324, in <lambda>
    lambda _s, _h, state: self.on_entering(cast(process_states.State, state)),
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/process.py", line 380, in on_entering
    super().on_entering(state)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/processes.py", line 669, in on_entering
    call_with_super_check(self.on_create)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/utils.py", line 29, in call_with_super_check
    wrapped(*args, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/process.py", line 376, in on_create
    self._pid = self._create_and_setup_db_record()  # pylint: disable=attribute-defined-outside-init
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/process.py", line 563, in _create_and_setup_db_record
    self._setup_db_record()
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/functions.py", line 362, in _setup_db_record
    super()._setup_db_record()
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/process.py", line 672, in _setup_db_record
    self._setup_inputs()
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/process.py", line 709, in _setup_inputs
    self.node.add_incoming(node, LinkType.INPUT_CALC, name)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/orm/nodes/node.py", line 802, in add_incoming
    self.validate_incoming(source, link_type, link_label)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/orm/nodes/process/process.py", line 472, in validate_incoming
    super().validate_incoming(source, link_type, link_label)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/orm/utils/mixins.py", line 139, in validate_incoming
    super().validate_incoming(source, link_type=link_type, link_label=link_label)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/orm/nodes/node.py", line 834, in validate_incoming
    if builder.count() > 0:
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/orm/querybuilder.py", line 2193, in count
    return self._impl.count(query)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/orm/implementation/querybuilder.py", line 290, in count
    return query.count()
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/orm/query.py", line 3803, in count
    return self.from_self(col).scalar()
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/orm/query.py", line 3523, in scalar
    ret = self.one()
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/orm/query.py", line 3490, in one
    ret = self.one_or_none()
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/orm/query.py", line 3459, in one_or_none
    ret = list(self)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/orm/query.py", line 3535, in __iter__
    return self._execute_and_instances(context)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/orm/query.py", line 3560, in _execute_and_instances
    result = conn.execute(querycontext.statement, self._params)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1011, in execute
    return meth(self, multiparams, params)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/elements.py", line 298, in _execute_on_connection
    return connection._execute_clauseelement(self, multiparams, params)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1121, in _execute_clauseelement
    else None,
  File "<string>", line 1, in <lambda>
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/elements.py", line 481, in compile
    return self._compiler(dialect, bind=bind, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/elements.py", line 487, in _compiler
    return dialect.statement_compiler(dialect, self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 592, in __init__
    Compiled.__init__(self, dialect, statement, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 322, in __init__
    self.string = self.process(self.statement, **compile_kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 352, in process
    return obj._compiler_dispatch(self, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2202, in visit_select
    text, select, inner_columns, froms, byfrom, kwargs
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2294, in _compose_select_body
    for f in froms
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2294, in <listcomp>
    for f in froms
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1802, in visit_alias
    self, asfrom=True, **kwargs
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2202, in visit_select
    text, select, inner_columns, froms, byfrom, kwargs
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2294, in _compose_select_body
    for f in froms
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2294, in <listcomp>
    for f in froms
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2439, in visit_join
    + join.onclause._compiler_dispatch(self, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2439, in visit_join
    + join.onclause._compiler_dispatch(self, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1727, in visit_cte
    self.visit_cte(pre_alias_cte, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1759, in visit_cte
    self, asfrom=True, **kwargs
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1225, in visit_compound_select
    for i, c in enumerate(cs.selects)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1225, in <genexpr>
    for i, c in enumerate(cs.selects)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2202, in visit_select
    text, select, inner_columns, froms, byfrom, kwargs
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2301, in _compose_select_body
    t = select._whereclause._compiler_dispatch(self, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1043, in visit_clauselist
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1041, in <genexpr>
    s
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1043, in <genexpr>
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1043, in visit_clauselist
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1041, in <genexpr>
    s
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1043, in <genexpr>
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1354, in visit_binary
    return disp(binary, operator_, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1471, in visit_like_op_binary
    binary.left._compiler_dispatch(self, **kw),
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1076, in visit_cast
    cast.typeclause._compiler_dispatch(self, **kwargs),
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 955, in visit_typeclause
    return self.dialect.type_compiler.process(typeclause.type, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 402, in process
    return type_._compiler_dispatch(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 89, in _compiler_dispatch
    meth = getter(visitor)
RecursionError: maximum recursion depth exceeded while calling a Python object
2021-06-11 07:04:01 [1190 | REPORT]: [6733|EquationOfStateWorkChain|run_init]: submitting `QuantumEspressoCommonRelaxWorkChain` for scale_factor `uuid: 41b191e3-c9f5-4bb1-b038-0099d99123eb (pk: 6963) value: 0.94`
2021-06-11 07:04:21 [1296 | REPORT]:     [8136|PwRelaxWorkChain|setup]: No change in volume possible for the provided base input parameters. Meta convergence is turned off.
2021-06-11 07:04:21 [1297 | REPORT]:     [8136|PwRelaxWorkChain|setup]: Work chain will not run final SCF when `calculation` is set to `scf` for the relaxation `PwBaseWorkChain`.
2021-06-11 07:04:22 [1301 | REPORT]:     [8136|PwRelaxWorkChain|run_relax]: launching PwBaseWorkChain<8252>
2021-06-11 07:04:28 [1335 | REPORT]:       [8252|PwBaseWorkChain|run_process]: launching PwCalculation<8449> iteration #1
2021-06-11 07:49:38 [4940 | REPORT]:       [8252|PwBaseWorkChain|results]: work chain completed after 1 iterations
2021-06-11 07:49:39 [4941 | REPORT]:       [8252|PwBaseWorkChain|on_terminated]: remote folders will not be cleaned
2021-06-11 07:49:52 [4950 | REPORT]:     [8136|PwRelaxWorkChain|results]: workchain completed after 1 iterations
2021-06-11 07:49:56 [4951 | REPORT]:     [8136|PwRelaxWorkChain|on_terminated]: cleaned remote folders of calculations: 8449
2021-06-11 07:50:01 [4955 | REPORT]:   [7611|QuantumEspressoCommonRelaxWorkChain|on_except]: Traceback (most recent call last):
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/process_states.py", line 230, in execute
    result = self.run_fn(*self.args, **self.kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/workchains/workchain.py", line 214, in _do_step
    finished, stepper_result = self._stepper.step()
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/workchains.py", line 299, in step
    finished, result = self._child_stepper.step()
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/workchains.py", line 250, in step
    return True, self._fn(self._workchain)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-common-workflows/aiida_common_workflows/workflows/relax/quantum_espresso/workchain.py", line 49, in convert_outputs
    result = extract_from_parameters(outputs.output_parameters).values()
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/functions.py", line 179, in decorated_function
    result, _ = run_get_node(*args, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/functions.py", line 131, in run_get_node
    process = process_class(inputs=inputs, runner=runner)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/state_machine.py", line 193, in __call__
    inst.transition_to(inst.create_initial_state())
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/state_machine.py", line 335, in transition_to
    self.transition_failed(initial_state_label, label, *sys.exc_info()[1:])
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/state_machine.py", line 351, in transition_failed
    raise exception.with_traceback(trace)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/state_machine.py", line 320, in transition_to
    self._enter_next_state(new_state)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/state_machine.py", line 382, in _enter_next_state
    self._fire_state_event(StateEventHook.ENTERING_STATE, next_state)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/state_machine.py", line 299, in _fire_state_event
    callback(self, hook, state)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/processes.py", line 324, in <lambda>
    lambda _s, _h, state: self.on_entering(cast(process_states.State, state)),
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/process.py", line 380, in on_entering
    super().on_entering(state)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/processes.py", line 669, in on_entering
    call_with_super_check(self.on_create)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/base/utils.py", line 29, in call_with_super_check
    wrapped(*args, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/process.py", line 376, in on_create
    self._pid = self._create_and_setup_db_record()  # pylint: disable=attribute-defined-outside-init
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/process.py", line 563, in _create_and_setup_db_record
    self._setup_db_record()
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/functions.py", line 362, in _setup_db_record
    super()._setup_db_record()
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/process.py", line 672, in _setup_db_record
    self._setup_inputs()
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/engine/processes/process.py", line 709, in _setup_inputs
    self.node.add_incoming(node, LinkType.INPUT_CALC, name)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/orm/nodes/node.py", line 802, in add_incoming
    self.validate_incoming(source, link_type, link_label)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/orm/nodes/process/process.py", line 472, in validate_incoming
    super().validate_incoming(source, link_type, link_label)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/orm/utils/mixins.py", line 139, in validate_incoming
    super().validate_incoming(source, link_type=link_type, link_label=link_label)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/orm/nodes/node.py", line 834, in validate_incoming
    if builder.count() > 0:
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/orm/querybuilder.py", line 2193, in count
    return self._impl.count(query)
  File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-core/aiida/orm/implementation/querybuilder.py", line 290, in count
    return query.count()
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/orm/query.py", line 3803, in count
    return self.from_self(col).scalar()
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/orm/query.py", line 3523, in scalar
    ret = self.one()
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/orm/query.py", line 3490, in one
    ret = self.one_or_none()
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/orm/query.py", line 3459, in one_or_none
    ret = list(self)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/orm/query.py", line 3535, in __iter__
    return self._execute_and_instances(context)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/orm/query.py", line 3560, in _execute_and_instances
    result = conn.execute(querycontext.statement, self._params)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1011, in execute
    return meth(self, multiparams, params)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/elements.py", line 298, in _execute_on_connection
    return connection._execute_clauseelement(self, multiparams, params)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1121, in _execute_clauseelement
    else None,
  File "<string>", line 1, in <lambda>
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/elements.py", line 481, in compile
    return self._compiler(dialect, bind=bind, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/elements.py", line 487, in _compiler
    return dialect.statement_compiler(dialect, self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 592, in __init__
    Compiled.__init__(self, dialect, statement, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 322, in __init__
    self.string = self.process(self.statement, **compile_kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 352, in process
    return obj._compiler_dispatch(self, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2202, in visit_select
    text, select, inner_columns, froms, byfrom, kwargs
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2294, in _compose_select_body
    for f in froms
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2294, in <listcomp>
    for f in froms
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1802, in visit_alias
    self, asfrom=True, **kwargs
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2202, in visit_select
    text, select, inner_columns, froms, byfrom, kwargs
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2294, in _compose_select_body
    for f in froms
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2294, in <listcomp>
    for f in froms
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2439, in visit_join
    + join.onclause._compiler_dispatch(self, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2439, in visit_join
    + join.onclause._compiler_dispatch(self, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1727, in visit_cte
    self.visit_cte(pre_alias_cte, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1759, in visit_cte
    self, asfrom=True, **kwargs
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1225, in visit_compound_select
    for i, c in enumerate(cs.selects)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1225, in <genexpr>
    for i, c in enumerate(cs.selects)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2202, in visit_select
    text, select, inner_columns, froms, byfrom, kwargs
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 2301, in _compose_select_body
    t = select._whereclause._compiler_dispatch(self, **kwargs)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1043, in visit_clauselist
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1041, in <genexpr>
    s
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1043, in <genexpr>
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1043, in visit_clauselist
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1041, in <genexpr>
    s
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1043, in <genexpr>
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1354, in visit_binary
    return disp(binary, operator_, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1471, in visit_like_op_binary
    binary.left._compiler_dispatch(self, **kw),
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 1076, in visit_cast
    cast.typeclause._compiler_dispatch(self, **kwargs),
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 955, in visit_typeclause
    return self.dialect.type_compiler.process(typeclause.type, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/compiler.py", line 402, in process
    return type_._compiler_dispatch(self, **kw)
  File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
RecursionError: maximum recursion depth exceeded in comparison

2021-06-11 07:50:57 [5002 | REPORT]: [6733|EquationOfStateWorkChain|inspect_init]: Initial sub process did not finish successful so aborting the workchain.

The validate_incoming function of a work function seems (?) to be a common denominator of many of these reports (and perhaps of others too)
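
For context, the check that all of these tracebacks funnel into is a graph-cycle test built with the QueryBuilder. Below is a paraphrased sketch of that logic (the real code lives in aiida/orm/nodes/node.py; the exact query may differ):

from aiida.orm import Node, QueryBuilder

def would_create_cycle(source: Node, target: Node) -> bool:
    # Paraphrased from the validate_incoming frames above: linking
    # source -> target closes a cycle exactly when source is already
    # a descendant of target.
    builder = QueryBuilder()
    builder.append(Node, filters={'id': target.pk}, tag='target')
    builder.append(Node, filters={'id': source.pk}, with_ancestors='target')
    return builder.count() > 0  # the builder.count() seen in every traceback

The with_ancestors join compiles to a recursive CTE, which is the visit_cte frame visible in the SQLAlchemy portion of the tracebacks; note that the RecursionError is raised while compiling that statement, not while executing it.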

@giovannipizzi
Member

@chrisjsewell can you please look into this? This seems like a blocker and important issue...

@chrisjsewell
Member

> @chrisjsewell can you please look into this? This seems like a blocker and important issue...

Indeed 👍 I’m going to do a bunch of aiida-core stuff next week

@sphuber
Contributor

sphuber commented Jun 11, 2021

> The validate_incoming function of a work function seems (?) to be a common denominator of many of these reports

Not sure. Looking at the original example in the OP, it happens when a Process gets deserialized from a process checkpoint. The examples by @dev-zero are in one case a calcfunction that is called, and in the other the submission of a process. Both of those cases point to the instantiation of a process, but they are still different from the original case, which excepts while loading a simple node from the database.

@giovannipizzi
Member

BTW:

  • I confirm that I'm using SSH with a proxy_command; looking at the log, the recursion error sometimes (but not always) seems to be associated with transport issues
  • I note that I was running with 8 workers, and that I increased the number of slots (daemon.worker_process_slots) to 1200
  • I have restarted the daemon a few times today (but I cannot say whether the errors are correlated with the daemon shutdowns)
  • in a few cases there seems to be a very long loop of chained exceptions like the one below; maybe this is (partially) related? (see the sketch after this comment)
    Traceback (most recent call last):
    File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-quantumespresso/aiida_quantumespresso/workflows/pw/base.py", line 265, in validate_kpoints
        kpoints = self.inputs.kpoints
    File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/utils.py", line 131, in __getattr__
        raise AttributeError(errmsg)
    AttributeError: 'AttributesFrozendict' object has no attribute 'kpoints'
    
    During handling of the above exception, another exception occurred:
    
    Traceback (most recent call last):
    File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/utils.py", line 128, in __getattr__
        return self[attr]
    File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/utils.py", line 85, in __getitem__
        return self._dict[key]
    KeyError: 'kpoints'
    
    During handling of the above exception, another exception occurred:
    
    Traceback (most recent call last):
    File "/home/pizzi/.virtualenvs/aiida-prod/codes/aiida-quantumespresso/aiida_quantumespresso/workflows/pw/base.py", line 265, in validate_kpoints
        kpoints = self.inputs.kpoints
    File "/home/pizzi/.virtualenvs/aiida-prod/lib/python3.7/site-packages/plumpy/utils.py", line 131, in __getattr__
        raise AttributeError(errmsg)
    AttributeError: 'AttributesFrozendict' object has no attribute 'kpoints'
    
    During handling of the above exception, another exception occurred:
    ...
    
  • I continue to see these crashes even long after the original submission (hours later), and they are quite negatively impacting the results: of the work chains that terminated, 217 finished OK and 141 failed, and from a quick look the vast majority failed because of this bug

Since I don't think there is any sensitive information, I am attaching the full log (I started running from a clean profile yesterday, so the log should contain all the relevant info without being cluttered by much else): daemon.log.zip
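
A side note on the alternating KeyError / AttributeError cascade quoted in the comment above: the pattern by itself is ordinary Python exception chaining from a __getattr__ that falls back to item access, as in this minimal stand-in (illustrative, not the actual plumpy source):

class AttributesFrozendictLike:
    # Minimal stand-in for plumpy's AttributesFrozendict.
    def __init__(self, data):
        self._dict = dict(data)

    def __getitem__(self, key):
        return self._dict[key]  # missing key raises KeyError

    def __getattr__(self, attr):
        try:
            return self[attr]
        except KeyError:
            # The KeyError becomes the AttributeError's __context__, which
            # prints as "During handling of the above exception, another
            # exception occurred".
            raise AttributeError(f'object has no attribute {attr!r}')

inputs = AttributesFrozendictLike({'structure': object()})
try:
    inputs.kpoints
except AttributeError as exc:
    print(type(exc.__context__).__name__)  # KeyError

What makes the daemon log pathological is that the pair repeats many times in a single chain, suggesting the lookup is re-attempted while the previous exception is still being handled.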

@RobinHilg

While performing a FLEUR scandium relaxation I ran into the error that Vasily described yesterday on the mailing list. Here is the full traceback:

2021-08-10 13:44:18 [34571 | REPORT]: [55184|FleurCreateMagneticWorkChain|start]: INFO: started Create Magnetic Film workflow version 0.2.0

2021-08-10 13:44:18 [34572 | REPORT]: [55184|FleurCreateMagneticWorkChain|start]: INFO: EOS workchain will be submitted
2021-08-10 13:44:18 [34573 | REPORT]: [55184|FleurCreateMagneticWorkChain|run_eos]: INFO: submit EOS WorkChain
2021-08-10 13:44:18 [34574 | REPORT]: [55184|FleurCreateMagneticWorkChain|on_except]: Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/plumpy/process_states.py", line 230, in execute
    result = self.run_fn(*self.args, **self.kwargs)
  File "/opt/aiida-core/aiida/engine/processes/workchains/workchain.py", line 214, in _do_step
    finished, stepper_result = self._stepper.step()
  File "/usr/local/lib/python3.8/dist-packages/plumpy/workchains.py", line 299, in step
    finished, result = self._child_stepper.step()
  File "/usr/local/lib/python3.8/dist-packages/plumpy/workchains.py", line 432, in step
    finished, retval = self._child_stepper.step()
  File "/usr/local/lib/python3.8/dist-packages/plumpy/workchains.py", line 299, in step
    finished, result = self._child_stepper.step()
  File "/usr/local/lib/python3.8/dist-packages/plumpy/workchains.py", line 250, in step
    return True, self._fn(self._workchain)
  File "/opt/aiida-fleur/aiida_fleur/workflows/create_magnetic_film.py", line 124, in run_eos
    inputs, error = self.prepare_eos()
  File "/opt/aiida-fleur/aiida_fleur/workflows/create_magnetic_film.py", line 111, in prepare_eos
    inputs.structure = create_substrate_bulk(Dict(dict=self.ctx.wf_dict))
  File "/opt/aiida-core/aiida/engine/processes/functions.py", line 179, in decorated_function
    result, _ = run_get_node(*args, **kwargs)
  File "/opt/aiida-core/aiida/engine/processes/functions.py", line 131, in run_get_node
    process = process_class(inputs=inputs, runner=runner)
  File "/usr/local/lib/python3.8/dist-packages/plumpy/base/state_machine.py", line 193, in __call__
    inst.transition_to(inst.create_initial_state())
  File "/usr/local/lib/python3.8/dist-packages/plumpy/base/state_machine.py", line 335, in transition_to
    self.transition_failed(initial_state_label, label, *sys.exc_info()[1:])
  File "/usr/local/lib/python3.8/dist-packages/plumpy/base/state_machine.py", line 351, in transition_failed
    raise exception.with_traceback(trace)
  File "/usr/local/lib/python3.8/dist-packages/plumpy/base/state_machine.py", line 320, in transition_to
    self._enter_next_state(new_state)
  File "/usr/local/lib/python3.8/dist-packages/plumpy/base/state_machine.py", line 382, in _enter_next_state
    self._fire_state_event(StateEventHook.ENTERING_STATE, next_state)
  File "/usr/local/lib/python3.8/dist-packages/plumpy/base/state_machine.py", line 299, in _fire_state_event
    callback(self, hook, state)
  File "/usr/local/lib/python3.8/dist-packages/plumpy/processes.py", line 324, in <lambda>
    lambda _s, _h, state: self.on_entering(cast(process_states.State, state)),
  File "/opt/aiida-core/aiida/engine/processes/process.py", line 380, in on_entering
    super().on_entering(state)
  File "/usr/local/lib/python3.8/dist-packages/plumpy/processes.py", line 669, in on_entering
    call_with_super_check(self.on_create)
  File "/usr/local/lib/python3.8/dist-packages/plumpy/base/utils.py", line 29, in call_with_super_check
    wrapped(*args, **kwargs)
  File "/opt/aiida-core/aiida/engine/processes/process.py", line 376, in on_create
    self._pid = self._create_and_setup_db_record()  # pylint: disable=attribute-defined-outside-init
  File "/opt/aiida-core/aiida/engine/processes/process.py", line 563, in _create_and_setup_db_record
    self._setup_db_record()
  File "/opt/aiida-core/aiida/engine/processes/functions.py", line 362, in _setup_db_record
    super()._setup_db_record()
  File "/opt/aiida-core/aiida/engine/processes/process.py", line 672, in _setup_db_record
    self._setup_inputs()
  File "/opt/aiida-core/aiida/engine/processes/process.py", line 709, in _setup_inputs
    self.node.add_incoming(node, LinkType.INPUT_CALC, name)
  File "/opt/aiida-core/aiida/orm/nodes/node.py", line 802, in add_incoming
    self.validate_incoming(source, link_type, link_label)
  File "/opt/aiida-core/aiida/orm/nodes/process/process.py", line 472, in validate_incoming
    super().validate_incoming(source, link_type, link_label)
  File "/opt/aiida-core/aiida/orm/utils/mixins.py", line 139, in validate_incoming
    super().validate_incoming(source, link_type=link_type, link_label=link_label)
  File "/opt/aiida-core/aiida/orm/nodes/node.py", line 834, in validate_incoming
    if builder.count() > 0:
  File "/opt/aiida-core/aiida/orm/querybuilder.py", line 2193, in count
    return self._impl.count(query)
  File "/opt/aiida-core/aiida/orm/implementation/querybuilder.py", line 290, in count
    return query.count()
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/orm/query.py", line 3803, in count
    return self.from_self(col).scalar()
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/orm/query.py", line 3523, in scalar
    ret = self.one()
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/orm/query.py", line 3490, in one
    ret = self.one_or_none()
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/orm/query.py", line 3459, in one_or_none
    ret = list(self)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/orm/query.py", line 3535, in __iter__
    return self._execute_and_instances(context)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/orm/query.py", line 3560, in _execute_and_instances
    result = conn.execute(querycontext.statement, self._params)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/engine/base.py", line 1011, in execute
    return meth(self, multiparams, params)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/elements.py", line 298, in _execute_on_connection
    return connection._execute_clauseelement(self, multiparams, params)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/engine/base.py", line 1115, in _execute_clauseelement
    compiled_sql = elem.compile(
  File "<string>", line 1, in <lambda>
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/elements.py", line 481, in compile
    return self._compiler(dialect, bind=bind, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/elements.py", line 487, in _compiler
    return dialect.statement_compiler(dialect, self, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 592, in __init__
    Compiled.__init__(self, dialect, statement, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 322, in __init__
    self.string = self.process(self.statement, **compile_kwargs)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 352, in process
    return obj._compiler_dispatch(self, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 2201, in visit_select
    text = self._compose_select_body(
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 2292, in _compose_select_body
    [
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 2293, in <listcomp>
    f._compiler_dispatch(self, asfrom=True, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1801, in visit_alias
    ret = alias.original._compiler_dispatch(
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 2201, in visit_select
    text = self._compose_select_body(
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 2292, in _compose_select_body
    [
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 2293, in <listcomp>
    f._compiler_dispatch(self, asfrom=True, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 2435, in visit_join
    join.left._compiler_dispatch(self, asfrom=True, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 2437, in visit_join
    + join.right._compiler_dispatch(self, asfrom=True, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1727, in visit_cte
    self.visit_cte(pre_alias_cte, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1758, in visit_cte
    cte.original._compiler_dispatch(
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1216, in visit_compound_select
    text = (" " + keyword + " ").join(
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1218, in <genexpr>
    c._compiler_dispatch(
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 2201, in visit_select
    text = self._compose_select_body(
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 2301, in _compose_select_body
    t = select._whereclause._compiler_dispatch(self, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1040, in visit_clauselist
    text = sep.join(
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1040, in <genexpr>
    text = sep.join(
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1043, in <genexpr>
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1040, in visit_clauselist
    text = sep.join(
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1040, in <genexpr>
    text = sep.join(
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1043, in <genexpr>
    c._compiler_dispatch(self, **kw) for c in clauselist.clauses
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1354, in visit_binary
    return disp(binary, operator_, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1471, in visit_like_op_binary
    binary.left._compiler_dispatch(self, **kw),
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/compiler.py", line 1075, in visit_cast
    cast.clause._compiler_dispatch(self, **kwargs),
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/annotation.py", line 79, in _compiler_dispatch
    return self.__element.__class__._compiler_dispatch(self, visitor, **kw)
  File "/usr/local/lib/python3.8/dist-packages/sqlalchemy/sql/visitors.py", line 89, in _compiler_dispatch
    meth = getter(visitor)
RecursionError: maximum recursion depth exceeded while calling a Python object

FleurCreateMagneticWorkChain<55184> Excepted [1:if_(eos_needed)]

This and similar tracebacks occur at various points of the work chain.

@chrisjsewell
Member

chrisjsewell commented Aug 26, 2021

Thanks, I think this error may be different from the one this issue was opened for.
For this one, at least for initial debugging, I would consider changing:

if builder.count() > 0:
    raise ValueError('the link you are attempting to create would generate a cycle in the graph')

to something like:

try:
    count = builder.count()
except Exception as exc:
    raise ValueError(f'the link ({source} -> {self}) would result in an erroneous query') from exc
if count > 0:
    raise ValueError(f'the link you are attempting to create ({source} -> {self}) would generate a cycle in the graph')

at least then we could see which nodes are being linked when the query fails

@unkcpz
Member

unkcpz commented Jan 19, 2022

I got a chance to reproduce this issue and added the exception trace as @chrisjsewell suggested. Here are the traces I got. It is basically many kpoints KeyErrors followed by the RecursionError described in this issue:

Traceback (most recent call last):
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/plumpy/utils.py", line 128, in __getattr__
    return self[attr]
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/plumpy/utils.py", line 85, in __getitem__
    return self._dict[key]
KeyError: 'kpoints'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/jyu/Projects/WP-SSSP/aiida-quantumespresso/aiida_quantumespresso/workflows/pw/base.py", line 273, in validate_kpoints
    kpoints = self.inputs.kpoints
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/plumpy/utils.py", line 131, in __getattr__
    raise AttributeError(errmsg)
AttributeError: 'AttributesFrozendict' object has no attribute 'kpoints'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/aiida/orm/nodes/node.py", line 835, in validate_incoming
    count = builder.count()
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/aiida/orm/querybuilder.py", line 2193, in count
    return self._impl.count(query)
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/aiida/orm/implementation/querybuilder.py", line 290, in count
    return query.count()
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 3803, in count
...

  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/sqlalchemy/sql/visitors.py", line 96, in _compiler_dispatch
    return meth(self, **kw)
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/sqlalchemy/sql/compiler.py", line 1075, in visit_cast
    cast.clause._compiler_dispatch(self, **kwargs),
RecursionError: maximum recursion depth exceeded in comparison

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/plumpy/process_states.py", line 231, in execute
    result = self.run_fn(*self.args, **self.kwargs)
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/aiida/engine/processes/workchains/workchain.py", line 214, in _do_step
    finished, stepper_result = self._stepper.step()
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/plumpy/workchains.py", line 299, in step
    finished, result = self._child_stepper.step()
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/plumpy/workchains.py", line 532, in step
    finished, result = self._child_stepper.step()
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/plumpy/workchains.py", line 299, in step
    finished, result = self._child_stepper.step()
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/plumpy/workchains.py", line 250, in step
    return True, self._fn(self._workchain)

...

  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/aiida/engine/processes/process.py", line 709, in _setup_inputs
    self.node.add_incoming(node, LinkType.INPUT_CALC, name)
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/aiida/orm/nodes/node.py", line 802, in add_incoming
    self.validate_incoming(source, link_type, link_label)
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/aiida/orm/nodes/process/process.py", line 472, in validate_incoming
    super().validate_incoming(source, link_type, link_label)
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/aiida/orm/utils/mixins.py", line 139, in validate_incoming
    super().validate_incoming(source, link_type=link_type, link_label=link_label)
  File "/home/jyu/miniconda3/envs/aiida-sssp-dev/lib/python3.9/site-packages/aiida/orm/nodes/node.py", line 837, in validate_incoming
    raise ValueError(f'the link ({source} -> {self}) would result in an erroneous query') from exc
ValueError: the link (Remote code 'pw-6.8' on eiger-hq, pk: 1, uuid: dc38cb79-defb-4fc9-a859-58e44aabefe7 -> uuid: 799e0295-a8b7-4c81-98b1-5511c87d0c21 (unstored) (aiida.calculations:quantumespresso.pw)) would result in an erroneous query  

@chrisjsewell
Copy link
Member

chrisjsewell commented Jun 2, 2023

But then how does that explain that we only see the problem when we run a certain number of processes.

Oh exactly, that is what I can't understand 😅
It certainly feels like there is some correlation, but currently I can't reproduce such "shared frames" in minimal examples

@chrisjsewell
Copy link
Member

[image]

@sphuber
Copy link
Contributor

sphuber commented Jun 2, 2023

Sure, but what the hell does ChatGPT know? It will tell all sorts of nonsense:

[screenshot]

@sphuber
Copy link
Contributor

sphuber commented Jun 2, 2023

I think I may have a lead. Thinking about why the asyncio tasks seem to be sharing stacks even though they shouldn't, I remembered that plumpy actually hacks asyncio. By default, event loops are not reentrant. This posed a problem for process functions when @muhrin developed them. To get around this limitation, plumpy uses nest_asyncio to patch the event loop and make it reentrant. Without this patch process functions can not run and will hit the exception:

RuntimeError: This event loop is already running

In the examples from myself, @mbercx and @unkcpz, which all run PwBaseWorkChains, the create_kpoints_from_distance calcfunction is called. If I pass explicit kpoints in the input instead, this function is not called, and then I no longer see any problems. I think this very strongly hints at the nest_asyncio patch needed for process functions being the culprit. It seems plausible that the stack is abused by making the loop reentrant and that frames are not properly popped off the stack after a process function finishes.

Think this is a promising avenue to look into further, although I am not quite sure exactly how to debug the frames that the process function execution adds. Ideally, if we find out this is the problem, we should really find a way to refactor process functions such that they no longer need nest_asyncio. It is a nasty hack and I think it could cause other problems, if not now, then maybe in the future.
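
For reference, the reentrancy limitation is easy to reproduce with vanilla asyncio alone (a minimal sketch, no AiiDA or plumpy involved):

import asyncio

async def coro():
    # try to re-enter the loop that is already driving this coroutine
    asyncio.get_event_loop().run_until_complete(asyncio.sleep(0))

asyncio.run(coro())  # RuntimeError: This event loop is already running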

@sphuber
Copy link
Contributor

sphuber commented Jun 4, 2023

I tried running the PwBaseWorkChain with the calcfunction enabled, but with the nest_asyncio disabled. I had to make a quick hack for it to run without the nested loop, and now I no longer see the RecursionError, independent of how many workchains I launch. I think it is almost certain that the reentrant loop is the cause of the problem.

@unkcpz
Copy link
Member

unkcpz commented Jun 5, 2023

but with the nest_asyncio disabled. I had to make a quick hack for it to run without the nested loop, and now I no longer see the RecursionError

I am very curious to see how you make this happen. 🤔

@chrisjsewell
Copy link
Member

Note, I can't reproduce any issue when adding nest_asyncio to my previous minimal example (#4876 (comment)), i.e. this does not affect the recursion limit per task

import nest_asyncio
nest_asyncio.apply()

import asyncio
import sys

sys.setrecursionlimit(70)

number_of_tasks = 10
recursion_depth = 100

async def recursive_task(task_name, counter):
    if counter <= 0:
        return

    print(f"Task {task_name} - Counter: {counter}")

    # Simulating some asynchronous work
    await asyncio.sleep(.1)

    # Recursive call
    await recursive_task(task_name, counter - 1)

async def main():
    tasks = [
        asyncio.create_task(recursive_task(f"Task {i}", recursion_depth))
        for i in range(number_of_tasks)
    ]

    # Wait for all tasks to complete
    await asyncio.gather(*tasks)

asyncio.run(main())

This is certainly not to say that nest-asyncio is not the problem, but still, it would be nice to find some minimal code that demonstrates the "stack sharing" issue and allows it to be understood.

@chrisjsewell
Copy link
Member

Without this patch process functions can not run and will hit the exception

@sphuber can you provide the stack trace for this, to see the code path that leads up to "re-entry"

@sphuber
Copy link
Contributor

sphuber commented Jun 6, 2023

@chrisjsewell but that is because you are not actually using the functionality of nest_asyncio to reenter the loop. This is what is done with the process functions: as soon as one is called, it will essentially do get_event_loop().run_until_complete(Process.step_until_terminated()). This run_until_complete is attempting to start a new loop, but typically a loop is already running and so it raises the RuntimeError: This event loop is already running exception. The exception goes away by enabling nest_asyncio as it will make the current loop reentrant.

Take the following example that simulates what AiiDA is doing:

#!/usr/bin/env python
import nest_asyncio
nest_asyncio.apply()

import asyncio
import sys

sys.setrecursionlimit(70)

number_of_tasks = 2
recursion_depth = 10

async def recursive_task(task_name, counter):
    if counter <= 0:
        return

    print(f"Task {task_name} - Counter: {counter}")

    asyncio.get_event_loop().run_until_complete(recursive_task(task_name, counter - 1))


async def main():
    tasks = [
        asyncio.create_task(recursive_task(f"Task {i}", recursion_depth))
        for i in range(number_of_tasks)
    ]

    # Wait for all tasks to complete
    await asyncio.gather(*tasks)

asyncio.run(main())

This will except with the recursion error almost instantly.

@sphuber
Copy link
Contributor

sphuber commented Jun 6, 2023

I am very curious to see how you make this happen. 🤔

What you do is remove the application of nest_asyncio in plumpy.events and then in aiida-core you replace line 177 of aiida.engine.processes.functions:

result = process.execute()

with

runner.loop.create_task(process.step_until_terminated())

and line 188

return result, process.node

with

return process.node.get_outgoing().all_nodes()[0], process.node

You should now be able to run a PwBaseWorkChain as follows:

from ase.build import bulk
from aiida import engine, orm, plugins
code = orm.load_code('pw@localhost')  # assuming an installed pw.x code with this label
structure = orm.StructureData(ase=bulk('Si', 'diamond', a=5.43))
builder = plugins.WorkflowFactory('quantumespresso.pw.base').get_builder_from_protocol(code, structure, protocol='fast')
results, node = engine.run_get_node(builder)

If you only disable the nest_asyncio application, the workchain will except as soon as it calls the create_kpoints_from_distance calcfunction, because there the code will call loop.run_until_complete again to run the function in a reentrant loop, which is not allowed in vanilla asyncio. This is done in the process function code by calling self.execute. By replacing this with creating a task on the current loop instead, we avoid this. Somehow, and I don't fully understand yet how this works, it does execute the function body. However, since create_task returns a Task and not the actual return value of the function, we have to manually recreate the output, which is the created KpointsData, by retrieving it from the node's outputs.

As I said, this is a real hack and not a real solution, but at least I think it demonstrates that nest-asyncio is the problem, because with this hack I can run any number of workchains over the daemon without hitting the recursion error.
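
To illustrate the point about create_task: it only schedules the coroutine and hands back a Task object, not the function's return value, which is why the output had to be recreated manually (a minimal sketch in plain asyncio):

import asyncio

async def calc():
    return 'kpoints'

async def caller():
    task = asyncio.create_task(calc())
    print(task.done())      # False: the task has only been scheduled, not executed
    await asyncio.sleep(0)  # yield to the loop so it can run the task
    print(task.result())    # 'kpoints' is only available after the loop ran it

asyncio.run(caller())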

@sphuber
Copy link
Contributor

sphuber commented Jun 6, 2023

I don't think there is really a "bug" in the stack handling of nest_asyncio. It simply looks like, when a large number of workchains are submitted where each workchain contains one or more process functions, the daemon worker starts all workchains and launches each process function, but instead of finishing a process function it moves on to the next workchain and does the same. Each process function call reenters the loop and adds a bunch of frames to the stack that are only resolved when the function finishes. However, it seems the event loop preferentially starts tasks of new workchains, adding new process functions to the stack and eventually leading to the recursion error.

In summary, I only see the following possible solutions:

  1. Remove nest_asyncio altogether and find a way to be able to run process functions within a running loop without using a nested loop and without blocking itself.
  2. Find a way to tell the event loop to finish the tasks of the process functions preferentially and run it to completion before starting other tasks.

The second option seems counter to the concept of asyncio though, and I am not sure it is even possible. Essentially what we want there is for the calcfunction to be completed synchronously.

@sphuber
Copy link
Contributor

sphuber commented Jun 6, 2023

Here is another nasty workaround that we could consider putting in place in case we cannot solve this problem properly. We could increase the recursion limit dynamically whenever a process function is about to be executed and the current frame count comes close to the current limit. Of course, if this were to go on without end, we would eventually hit a stack overflow. We could add an additional hard limit above which we won't go, and maybe at that point, instead of executing the process function, we could relinquish control to the event loop so it can try to do other stuff, ideally finishing other process functions that had already been started, thereby freeing up frames on the stack.
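
A rough sketch of what such a guard could look like (names and thresholds are illustrative, not an actual aiida-core API):

import sys

def ensure_stack_headroom(margin: int = 200, delta: int = 1000, hard_limit: int = 100_000) -> bool:
    """Raise the recursion limit if the current stack depth comes within ``margin`` frames of it.

    Returns False when the ``hard_limit`` would be exceeded, signalling the caller to yield
    control to the event loop instead of executing the process function.
    """
    depth, frame = 0, sys._getframe()
    while frame is not None:  # count the frames currently on the stack
        depth += 1
        frame = frame.f_back
    limit = sys.getrecursionlimit()
    if depth > limit - margin:
        if limit + delta > hard_limit:
            return False
        sys.setrecursionlimit(limit + delta)
    return True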

@giovannipizzi
Copy link
Member

Hi all, first of all huge thanks to all of you!
Great example of teamwork and group debugging!

Now, since I wasn't deep into this, I tried to give it an external look from a different point of view to understand the problem, and while reading the docs of nest-asyncio I found the link to the Python bug report where it says that by design loops cannot be nested.

Reading a bit further I was lucky to find a (very simple!) solution by the developer of SQLAlchemy: a workaround to have a coroutine call a normal function that in turn would like to await a coroutine. It's very simple (<30 lines) and requires minimal change to the code, see below or the linked GitHub gist in the comment of the code below.

I report below a modified version of the code you guys wrote above, that crashes with a few (not too many, only 20) not-too-deep recursive calls:

  • I changed the parameters to show that it fails indeed not because recursive functions are deep, but because we are calling many of them.
  • I made it explicit that we have a recursive coroutine that calls itself via a standard function (i.e. coroutine -> function -> coroutine -> ...). @sphuber do you confirm that this is the issue we are trying to solve in AiiDA? (Maybe I'm just completely missing the point...)

I then rewrote it using the suggested trick and it works fine, even if I run 20000 different tasks!

Here is the code of the failing one using nest-asyncio:

#!/usr/bin/env python
import nest_asyncio
nest_asyncio.apply()

import asyncio
import sys

sys.setrecursionlimit(70)

# Running already 20 not-too-deep recursions crashes badly
# Note: we are calling a recursive co-routine that calls a routine that internally calls again the initial coroutine
number_of_tasks = 20
recursion_depth = 5


async def recursive_task(task_name, counter):
    if counter <= 0:
        return
    print(f"Task {task_name} - Counter: {counter}")
    intermediate_function(task_name, counter)

def intermediate_function(task_name, counter):
    asyncio.get_event_loop().run_until_complete(recursive_task(task_name, counter - 1))

async def main():
    tasks = [
        asyncio.create_task(recursive_task(f"Task {i}", recursion_depth))
        for i in range(number_of_tasks)
    ]
    # Wait for all tasks to complete
    await asyncio.gather(*tasks)

asyncio.run(main())

And here is the code that works (I put 200 tasks because the printing is slow, but if you comment out the printing you'll see it works even with 20000 and more):

#!/usr/bin/env python
import asyncio
import sys

########################################################################
### START SOLUTION FROM zzzeek ###
# From https://gist.github.com/zzzeek/4e89ce6226826e7a8df13e1b573ad354
import greenlet

def await_(coroutine):
    current = greenlet.getcurrent()

    if not isinstance(current, AsyncIoGreenlet):
        raise Exception(
            "not running inside a greenlet right now, "
            "can't use await_() function"
        )

    return current.driver.switch(coroutine)


class AsyncIoGreenlet(greenlet.greenlet):
    def __init__(self, driver, fn):
        greenlet.greenlet.__init__(self, fn, driver)
        self.driver = driver


async def greenlet_spawn(__fn, *args, **kw):
    target = AsyncIoGreenlet(greenlet.getcurrent(), __fn)

    target_return = target.switch(*args, **kw)

    while target:
        try:
            result = await target_return
        except:
            target_return = target.throw(*sys.exc_info())
        else:
            target_return = target.switch(result)

    # clean up cycle for the common case
    # (gc can do the exception case)
    del target.driver
    return target_return
### END SOLUTION FROM zzzeek ###
########################################################################


sys.setrecursionlimit(70)

# Here I set a lot of concurrent tasks. This never creates problems.
number_of_tasks = 200
recursion_depth = 5

async def recursive_task(task_name, counter):
    if counter <= 0:
        return

    #print(f"Task {task_name} - Counter: {counter}")
    await greenlet_spawn(intermediate_function, task_name, counter)

def intermediate_function(task_name, counter):
    await_(recursive_task(task_name, counter - 1))

async def main():
    tasks = [
        asyncio.create_task(recursive_task(f"Task {i}", recursion_depth))
        for i in range(number_of_tasks)
    ]

    # Wait for all tasks to complete
    await asyncio.gather(*tasks)

asyncio.run(main())

I'd be curious to know if this can solve our issue with minimal changes to the AiiDA code, or if this is not something we can use to fix this issue.

@sphuber
Copy link
Contributor

sphuber commented Jun 10, 2023

Interesting, @giovannipizzi, thanks for that. I will give this a go soon with the test I was running and see if it works. One concern I have is that, reading the docs of greenlet, it seems they are working with threads, and normally that is a no-go for AiiDA as it is currently written. The SqlA engine is not set up correctly for that in aiida-core, in any case, and attempting to offload something to another thread will result in DetachedInstanceErrors. But maybe greenlet does it slightly differently such that it can still work.

@giovannipizzi
Copy link
Member

From a brief read of https://greenlet.readthedocs.io/en/latest/, they just compare to threads to explain the difference, but they say that greenlets are lightweight coroutines that do not require Python language support, can also work with C code, and do cooperative scheduling, so I don't think we have thread issues
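
For illustration, a minimal (non-AiiDA) greenlet example of this cooperative scheduling: everything runs in a single thread, and control only moves between the two stacks at the explicit switch() calls:

import greenlet

def child(msg):
    print('child received:', msg)
    value = parent.switch('pong')  # cooperatively hand control back to the parent
    print('child resumed with:', value)

parent = greenlet.getcurrent()
g = greenlet.greenlet(child)
print('parent received:', g.switch('ping'))  # runs child until its switch(), yielding 'pong'
print('parent received:', g.switch('done'))  # resumes child; it finishes and returns None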

@chrisjsewell
Copy link
Member

Thanks @giovannipizzi, although yeh I would certainly be cautious here; although it looks like only a few lines of code, you are literally adding a whole new concurrency model (one that is not "native" to Python), which could obviously come with its own issues and can be difficult to debug.

I made it explicit that we have a recursive coroutine that calls itself via a standard function (i.e. coroutine -> function -> coroutine -> ...). sphuber do you confirm that this is the issue we are trying to solve in AiiDA

Indeed we should be very clear on what we are trying to achieve, in order to understand the tradeoffs in solutions

@sphuber
Copy link
Contributor

sphuber commented Jun 11, 2023

I made it explicit that we have a recursive coroutine that calls itself via a standard function (i.e. coroutine -> function -> coroutine -> ...). @sphuber do you confirm that this is the issue we are trying to solve in AiiDA? (Maybe I'm just completely missing the point...)

I think this is the problem: sometimes this is what we are doing, but not always. We can sum the situation up as follows:

  • AiiDA processes are implemented to run asynchronously
  • The process function API needs to be synchronous since we do not want the user to have to work with the asynchronous Python API
  • A process function needs to be able to (1) be run directly by a user in an interactive shell, and (2) be run identically as part of another AiiDA process, e.g., within a step of a workchain.

This leads to the following scenarios:

(1) sync -> async
(2) async -> sync -> async

In (1) the outer call context is synchronous: the user simply calls the process function as a normal synchronous Python function. In scenario (2), however, the outer context, for example a workchain that is run by a daemon runner, is called asynchronously. This is also where the need for nest_asyncio comes from. As soon as a workchain is stepped (which is an asynchronous call) and the step contains a call to a process function, that call will launch a new Process instance that then needs to be run asynchronously, but it needs to be done on a new event loop, because that is the "only" way to run asynchronous code from a synchronous stack. But since the event loop is already running (it is the loop running the outside workchain of (2)), the loop needs to be reentered, which is not allowed in vanilla asyncio.
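
In code, scenario (2) boils down to the following sandwich (a minimal sketch with illustrative names):

import asyncio

# import nest_asyncio   # uncommenting these two lines is effectively what plumpy does:
# nest_asyncio.apply()  # the patched loop then allows the run_until_complete below

async def inner_process():     # the process launched by the process function
    await asyncio.sleep(0)
    return 'result'

def process_function():        # the synchronous user-facing API
    # this re-enters the already-running loop -> RuntimeError in vanilla asyncio
    return asyncio.get_event_loop().run_until_complete(inner_process())

async def workchain_step():    # the outer process, already running on the loop
    return process_function()

asyncio.run(workchain_step())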

This leads to the problem with trying to implement the greenlet workaround. If I understand correctly, it works by wrapping the synchronous code in the greenlet_spawn coroutine and awaiting it. But I think this means that the user would have to do this themselves; I don't see how we could do this inside the process function execution.

Essentially what happens currently when a process function is called is:

  • The Process instance (which is the FunctionProcess class that is created dynamically on the fly) is constructed with the provided inputs
  • Process.execute() is called, which is a synchronous call
  • This calls self.loop.run_until_complete(self.step_until_terminated()) where step_until_terminated is a coroutine.

To me it is not clear where the call to greenlet_spawn should be added. I think the only possibility is to have the actual call to the process function be wrapped in that coroutine and await it, but that is the user code and not something we can do.

@giovannipizzi
Copy link
Member

I would certainly be cautious here; although it looks like only a few lines of code, you are literally adding a whole new concurrency model (one that is not "native" to Python), which could obviously come with its own issues and can be difficult to debug.
Indeed we should be very clear on what we are trying to achieve, in order to understand the tradeoffs in solutions

@chrisjsewell you are totally right - here I'm just trying to see if this might work. If it does, we should discuss whether we should do it. However, the current design of the engine requires features not supported by native asyncio, so one way or another we need to do something different (currently, nest_asyncio), so I feel that the solution might work, especially since (to be double checked) some of our libraries are already using it (e.g. SQLAlchemy).

@giovannipizzi
Copy link
Member

@sphuber I had indeed thought about this after posting my comment.
I made a more convoluted example that shows how to run a "calcf" both from the main code (so the user never sees any async, only the @calcf decorator) and from a "fake" daemon - here I keep the list of tasks in a JSON file and pop them FIFO (instead of DB/RMQ). I also simulate in the daemon loop that after 1 second new tasks are submitted elsewhere and appear at the bottom of the queue.

Each recursive task has a random small (~0.01s) slowdown to allow potential concurrent execution, to prove that things are not just running serially.

Not sure if this really reproduces what we do in AiiDA... but it seems to work? (Again, good to have a double check that I'm not doing something very stupid)

#!/usr/bin/env python
import asyncio
import sys
import json

########################################################################
### START SOLUTION FROM zzzeek ###
# From https://gist.github.com/zzzeek/4e89ce6226826e7a8df13e1b573ad354
import greenlet

def await_(coroutine):
    current = greenlet.getcurrent()

    if not isinstance(current, AsyncIoGreenlet):
        raise Exception(
            "not running inside a greenlet right now, "
            "can't use await_() function"
        )

    return current.driver.switch(coroutine)


class AsyncIoGreenlet(greenlet.greenlet):
    def __init__(self, driver, fn):
        greenlet.greenlet.__init__(self, fn, driver)
        self.driver = driver


async def greenlet_spawn(__fn, *args, **kw):
    target = AsyncIoGreenlet(greenlet.getcurrent(), __fn)

    target_return = target.switch(*args, **kw)

    while target:
        try:
            result = await target_return
        except:
            target_return = target.throw(*sys.exc_info())
        else:
            target_return = target.switch(result)

    # clean up cycle for the common case
    # (gc can do the exception case)
    del target.driver
    return target_return
### END SOLUTION FROM zzzeek ###
########################################################################

def append_task(name, val):
    try:
        with open('tasks.json') as f:
            tasks = json.load(f)
    except IOError:
        tasks = []
    tasks.append([name, val])
    with open('tasks.json', 'w') as f:
        json.dump(tasks, f)

def pop_task():
    try:
        with open('tasks.json') as f:
            tasks = json.load(f)
    except IOError:
        return None
    try:
        name, val = tasks.pop(0) # FIFO
    except IndexError:
        return None
    with open('tasks.json', 'w') as f:
        json.dump(tasks, f)
    return name, val


def await_or_new_loop(coroutine):
    current = greenlet.getcurrent()

    if not isinstance(current, AsyncIoGreenlet):
        print("creating main loop")
        return asyncio.get_event_loop().run_until_complete(coroutine)
    else:
        print("reentring with greenlets")
        return current.driver.switch(coroutine)



sys.setrecursionlimit(70)

# Here I set a lot of concurrent tasks. This never creates problems.
number_of_tasks = 3
recursion_depth = 4


async def coro_executor(f, args, kwargs):
    # give a chance to switch context
    await asyncio.sleep(0)
    await greenlet_spawn(f, *args, **kwargs)

def calcf(func):
    def inner(*args, **kwargs):
        await_or_new_loop(coro_executor(func, args, kwargs))
    return inner

@calcf
def recursive(task_name, counter):
    import time
    import random

    # Very small delay to give a chance to other steps to happen at the same time
    time.sleep(random.random() * 0.01) # Random time between 0 and 0.01 s
    print(f"Task {task_name} - Counter: {counter}")
    if counter <= 0:
        return
    return recursive(task_name, counter - 1)



async def daemon_run():
    daemon_loop_counter = 0
    sleep_counter = 0

    futures = []

    while True:
        daemon_loop_counter += 1

        name_val = pop_task()
        if name_val is None:
            sleep_counter += 1
            if sleep_counter < 5:
                print("No tasks, daemon waiting")

                if sleep_counter == 1:
                     # At some point, "randomly" (here after ~1 sec), new tasks arrive
                     for i in range(number_of_tasks):
                         append_task(f"New task {i}", recursion_depth)

                await asyncio.sleep(1) # No tasks, wait 1 sec
                continue
            else:
                print("Stopping deamon after 5 sec, I now await all")
                await asyncio.gather(*futures)
                break
        # Run task
        name, val = name_val
        print(f"Deamon task to run: {name} {val}")
        futures.append(asyncio.create_task(coro_executor(recursive, [name, val], {})))



print("Running in main code, no coroutines")
recursive('test task', 5)
print("Running again")
recursive('test task', 6)


# "Submit" tasks (DB/RMQ replaced with JSON file, enough for here, not good for
# multiprocessing with many daemons)
for i in range(number_of_tasks):
    append_task(f"Task {i}", recursion_depth)

print()
print("Running in parallel from a 'daemon'")
asyncio.run(daemon_run())

Output:

Running in main code, no coroutines
creating main loop
Task test task - Counter: 5
reentring with greenlets
Task test task - Counter: 4
reentring with greenlets
Task test task - Counter: 3
reentring with greenlets
Task test task - Counter: 2
reentring with greenlets
Task test task - Counter: 1
reentring with greenlets
Task test task - Counter: 0
Running again
creating main loop
Task test task - Counter: 6
reentring with greenlets
Task test task - Counter: 5
reentring with greenlets
Task test task - Counter: 4
reentring with greenlets
Task test task - Counter: 3
reentring with greenlets
Task test task - Counter: 2
reentring with greenlets
Task test task - Counter: 1
reentring with greenlets
Task test task - Counter: 0

Running in parallel from a 'daemon'
Deamon task to run: Task 0 4
Deamon task to run: Task 1 4
Deamon task to run: Task 2 4
No tasks, daemon waiting
reentring with greenlets
reentring with greenlets
reentring with greenlets
Task Task 0 - Counter: 4
reentring with greenlets
Task Task 1 - Counter: 4
reentring with greenlets
Task Task 2 - Counter: 4
reentring with greenlets
Task Task 0 - Counter: 3
reentring with greenlets
Task Task 1 - Counter: 3
reentring with greenlets
Task Task 2 - Counter: 3
reentring with greenlets
Task Task 0 - Counter: 2
reentring with greenlets
Task Task 1 - Counter: 2
reentring with greenlets
Task Task 2 - Counter: 2
reentring with greenlets
Task Task 0 - Counter: 1
reentring with greenlets
Task Task 1 - Counter: 1
reentring with greenlets
Task Task 2 - Counter: 1
reentring with greenlets
Task Task 0 - Counter: 0
Task Task 1 - Counter: 0
Task Task 2 - Counter: 0
Deamon task to run: New task 0 4
Deamon task to run: New task 1 4
Deamon task to run: New task 2 4
No tasks, daemon waiting
reentring with greenlets
reentring with greenlets
reentring with greenlets
Task New task 0 - Counter: 4
reentring with greenlets
Task New task 1 - Counter: 4
reentring with greenlets
Task New task 2 - Counter: 4
reentring with greenlets
Task New task 0 - Counter: 3
reentring with greenlets
Task New task 1 - Counter: 3
reentring with greenlets
Task New task 2 - Counter: 3
reentring with greenlets
Task New task 0 - Counter: 2
reentring with greenlets
Task New task 1 - Counter: 2
reentring with greenlets
Task New task 2 - Counter: 2
reentring with greenlets
Task New task 0 - Counter: 1
reentring with greenlets
Task New task 1 - Counter: 1
reentring with greenlets
Task New task 2 - Counter: 1
reentring with greenlets
Task New task 0 - Counter: 0
Task New task 1 - Counter: 0
Task New task 2 - Counter: 0
No tasks, daemon waiting
No tasks, daemon waiting
Stopping deamon after 5 sec, I now await all

@giovannipizzi
Copy link
Member

OK - this does not work in Jupyter; there we would still need to keep, at the top:

import nest_asyncio
nest_asyncio.apply()

but just for this use case, and this should happen only once, at the very top-level call of a calcfunction in a script (i.e. not in a daemon) - the way I wrote the code above, getting the outer loop should happen only once.

If you just add the two nest-asyncio lines, the script above will fail with a RecursionError, but no worries! This is simply because nest-asyncio adds quite a few calls. If you replace the limit with a more reasonable one:

sys.setrecursionlimit(500)

then you can also run with

number_of_tasks = 600
recursion_depth = 4

(so even more tasks than the recursion limit), and things work, and from the output it looks to me that things are happening in parallel.

@giovannipizzi
Copy link
Member

Some additional reporting, to try to convince you that the greenlet approach is better than the current nest_asyncio+increase_of_stack_size workaround (which of course was fine to get a quick improvement) - I still advocate that we should do some additional testing and move to the greenlet approach.

Point 1: supporting both approaches We can implement it in a way that the greenlet approach is optional, as I show below. In this way, if we are not really sure, we can put it in the code optionally and ask people to activate it if they get stack overflow errors (or, I would say, I'd prefer the opposite: we turn it on by default and can revert to the current behaviour with some configuration option).

#!/usr/bin/env python
import asyncio
import sys
import itertools
import logging
#logging.basicConfig(level=logging.INFO)

import nest_asyncio
nest_asyncio.apply()

# Here I set a lot of concurrent tasks. This never creates problems.
START_RECURSION_LIMIT=2000
number_of_tasks = 40000
recursion_depth = 20
INCREASE_RECURSION_LIMIT = False
USE_GREENLET = True


sys.setrecursionlimit(START_RECURSION_LIMIT)

if USE_GREENLET:
    import greenlet

    class AsyncIoGreenlet(greenlet.greenlet):
        def __init__(self, driver, fn):
            greenlet.greenlet.__init__(self, fn, driver)
            self.driver = driver


def await_or_new_loop(coroutine):
    if USE_GREENLET:
        current = greenlet.getcurrent()

        if not isinstance(current, AsyncIoGreenlet):
            logging.info("creating main loop")
            return asyncio.get_event_loop().run_until_complete(coroutine)
        else:
            logging.info("reentring with greenlets")
            return current.driver.switch(coroutine)
    else:
        return asyncio.get_event_loop().run_until_complete(coroutine)


async def greenlet_spawn(__fn, *args, **kw):
    target = AsyncIoGreenlet(greenlet.getcurrent(), __fn)

    target_return = target.switch(*args, **kw)

    while target:
        try:
            result = await target_return
        except:
            target_return = target.throw(*sys.exc_info())
        else:
            target_return = target.switch(result)

    # clean up cycle for the common case
    # (gc can do the exception case)
    del target.driver
    return target_return
### END SOLUTION FROM zzzeek ###
########################################################################

async def spawn_as_coroutine(f, *args, **kwargs):
    if USE_GREENLET:
        await greenlet_spawn(f, *args, **kwargs)
    else:
        f(*args, **kwargs)

def get_stack_size(size: int = 2) -> int:  # type: ignore[return]
     frame = sys._getframe(size)  # pylint: disable=protected-access
     try:
         for size in itertools.count(size, 8):  # pylint: disable=redefined-argument-from-local
             frame = frame.f_back.f_back.f_back.f_back.f_back.f_back.f_back.f_back  # type: ignore[assignment,union-attr]
     except AttributeError:
         while frame:
             frame = frame.f_back  # type: ignore[assignment]
             size += 1
         return size - 1


async def recursive_task(task_name, counter):
    if counter <= 0:
        return

    logging.info(f"Task {task_name} - Counter: {counter}")
    await spawn_as_coroutine(intermediate_function, task_name, counter)

def intermediate_function(task_name, counter):
    if INCREASE_RECURSION_LIMIT:
        frame_delta = 1000
        frame_count = get_stack_size()
        stack_limit = sys.getrecursionlimit()

        # If the current frame count is more than 80% of the stack limit, or comes within 200 frames, increase the
        # stack limit by ``frame_delta``.
        if frame_count > min(0.8 * stack_limit, stack_limit - 200):
            logging.info(f"Old recursion limit = {stack_limit}, new = {stack_limit + frame_delta}")
            sys.setrecursionlimit(stack_limit + frame_delta)

    await_or_new_loop(recursive_task(task_name, counter - 1))

async def main():
    tasks = [
        asyncio.create_task(recursive_task(f"Task {i}", recursion_depth))
        for i in range(number_of_tasks)
    ]

    # Wait for all tasks to complete
    await asyncio.gather(*tasks)


logging.info("Running in main code, no coroutines")
intermediate_function('test task 1', 5)
logging.info("Running again")
intermediate_function('test task 2', 6)

logging.info("")
logging.info("Running in parallel from a 'daemon'")
import time
t = time.monotonic()
asyncio.run(main())
print(f"{number_of_tasks=}; {recursion_depth=}; {START_RECURSION_LIMIT=}; {INCREASE_RECURSION_LIMIT=}; {USE_GREENLET=}; Elapsed time: {time.monotonic() - t} s")

Point 2: clarity of the stack in case of an exception In the original discussion by zzzeek, they clearly mention that exceptions properly bubble up. Also, I'd say that the current approach with nest_asyncio is worse, as we noted while debugging this specific issue.

Point 3: performance As the test results of the code above show (reported below), the greenlet code is roughly 10x faster than nest_asyncio even without the automatic increase of the recursion limit, and roughly 50x faster than nest_asyncio with auto-increase of the limit.

number_of_tasks=200; recursion_depth=10; START_RECURSION_LIMIT=70; INCREASE_RECURSION_LIMIT=False; USE_GREENLET=True; Elapsed time: 0.016110462000000006 s

number_of_tasks=200; recursion_depth=10; START_RECURSION_LIMIT=25000; INCREASE_RECURSION_LIMIT=False; USE_GREENLET=False; Elapsed time: 0.19436318800000002 s

number_of_tasks=200; recursion_depth=10; START_RECURSION_LIMIT=2000; INCREASE_RECURSION_LIMIT=True; USE_GREENLET=False; Elapsed time: 0.800523038 s

The greenlet approach works well even with 40000 tasks:

number_of_tasks=40000; recursion_depth=20; START_RECURSION_LIMIT=2000; INCREASE_RECURSION_LIMIT=False; USE_GREENLET=True; Elapsed time: 4.664176696 s

Point 4: robustness The greenlet solution is being used by robust libraries like SQLAlchemy, so I think it is quite robust (in the end we are just using the greenlet library, which is designed exactly for this).

In addition, without greenlets, running these tasks in parallel (with a stack depth of only 10 each, quite reasonable: number_of_tasks=200; recursion_depth=10; START_RECURSION_LIMIT=70; INCREASE_RECURSION_LIMIT=False; USE_GREENLET=False) crashes the whole Python process badly with a Segmentation fault: 11, very bad...

Note: if we activate greenlets, we need to disable (at least the fast version of) the automatic increase of the recursion limit, as accessing sys._getframe does not work properly in combination with greenlet: one gets:

frame = sys._getframe(size)
    ValueError: call stack is not deep enough

So, in summary, I think we should give the greenlet approach a go. Opinions?

@sphuber
Copy link
Contributor

sphuber commented Jun 20, 2023

If we can get rid of nest_asyncio entirely by switching to greenlets, that would be great. What I understood from your previous post was that even with greenlets we would still need nest_asyncio and that it didn't work in notebooks without manual activation, which I think would have been unacceptable.

I will give this implementation a test run with an actual workload to look at performance and robustness. Note that if it works and we can get rid of nest_asyncio, this will have to be done in plumpy as it is activated there.

I propose then that we go ahead with the v2.4 release, which is ready with the workaround that works sufficiently well for the time being, and then we work towards the better solution. Note that I anyway have modern releases of plumpy and kiwipy with supposed improvements in RabbitMQ connection robustness, so we could test those in parallel with the greenlets change. If all works well, we can release 2.5 soon.

@sphuber
Copy link
Contributor

sphuber commented Jun 20, 2023

@giovannipizzi I tried running with nest_asyncio disabled, and it seems that this is not working. When it gets to the await_or_new_loop call, it calls it multiple times (each time a process function is called) before the new greenlet is spawned. This results in asyncio.get_event_loop().run_until_complete(coroutine) being called multiple times, which raises the exception that the loop is already running.

I then tried running the greenlet solution with nest_asyncio enabled in plumpy as it is now, and here I am also seeing some problems. When launching 25 PwBaseWorkChains, at some point it excepts with the following:

Traceback (most recent call last):
  File "/home/sph/.mambaforge/envs/aiida-py39/lib/python3.9/site-packages/plumpy/process_states.py", line 228, in execute
    result = self.run_fn(*self.args, **self.kwargs)
  File "/home/sph/code/aiida/env/dev/aiida-core/aiida/engine/processes/workchains/workchain.py", line 314, in _do_step
    finished, stepper_result = self._stepper.step()
  File "/home/sph/.mambaforge/envs/aiida-py39/lib/python3.9/site-packages/plumpy/workchains.py", line 295, in step
    finished, result = self._child_stepper.step()
  File "/home/sph/.mambaforge/envs/aiida-py39/lib/python3.9/site-packages/plumpy/workchains.py", line 246, in step
    return True, self._fn(self._workchain)
  File "/home/sph/code/aiida/env/dev/aiida-quantumespresso/src/aiida_quantumespresso/workflows/pw/base.py", line 271, in validate_kpoints
    kpoints = create_kpoints_from_distance(**inputs)  # pylint: disable=unexpected-keyword-arg
  File "/home/sph/code/aiida/env/dev/aiida-core/aiida/engine/processes/functions.py", line 301, in decorated_function
    result, _ = run_get_node(*args, **kwargs)
  File "/home/sph/code/aiida/env/dev/aiida-core/aiida/engine/processes/functions.py", line 274, in run_get_node
    result = await_or_new_loop(coro_executor(process.execute))
  File "/home/sph/code/aiida/env/dev/aiida-core/aiida/engine/processes/functions.py", line 139, in await_or_new_loop
    return current.driver.switch(coroutine)
  File "/home/sph/code/aiida/env/dev/aiida-core/aiida/engine/processes/functions.py", line 116, in greenlet_spawn
    result = await target_return
  File "/home/sph/code/aiida/env/dev/aiida-core/aiida/engine/processes/functions.py", line 145, in coro_executor
    result = await greenlet_spawn(f, *args, **kwargs)
  File "/home/sph/code/aiida/env/dev/aiida-core/aiida/engine/processes/functions.py", line 112, in greenlet_spawn
    target_return = target.switch(*args, **kw)
  File "/home/sph/code/aiida/env/dev/aiida-core/aiida/engine/processes/functions.py", line 116, in greenlet_spawn
    result = await target_return
TypeError: object KpointsData can't be used in 'await' expression

I noticed that when launching the workflows, I see a lot of "creating main loop" messages in the daemon log, coming from await_or_new_loop. It is essentially the same behavior as before, where multiple process functions are started and each starts a new loop using nest_asyncio instead of relying on greenlet. I can see the stack size increasing as it does without greenlet. But at some point it does start using greenlets.

I am not quite sure what the logic is, but it might just be how tasks get scheduled on the loop. It seems that when I comment out the asyncio.sleep(0) call from coro_executor, it doesn't have this problem. But I have the feeling this is just "hiding" the problem, and under some different conditions it might try to start the loop more than once.
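
As an aside, the scheduling effect of that asyncio.sleep(0) is easy to see in isolation: it yields control to the loop, so every ready task gets started before any of them resumes (a minimal sketch, plain asyncio):

import asyncio

async def step(name):
    print(name, 'started')
    await asyncio.sleep(0)  # yield to the event loop; other ready tasks run first
    print(name, 'finished')

async def main():
    await asyncio.gather(*(step(f'task {i}') for i in range(3)))

asyncio.run(main())  # prints all three 'started' lines before any 'finished' line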

@giovannipizzi
Copy link
Member

Thanks @sphuber - do you have the changed code somewhere we can look at? Maybe the best is to find some time to look at this together (maybe even in person, it might be faster). Here it seems that target_return is a KpointsData, which is a bit strange; it should be a function/coroutine? Maybe sometimes we are calling a function and passing the result - it might be something simple. Also, we need to be 100% sure that we never call asyncio.get_event_loop().run_until_complete(coroutine) anywhere except in the function above, and also that this function is called only once. Indeed one test is what you are doing (no nest_asyncio, without Jupyter): if it all works, we put nest_asyncio back just to make it work in Jupyter. Anyway, converting my quick proof of concept to the actual code might indeed require a bit more work.

Anyway, we can indeed release the workaround of increasing the stack size, and already schedule some time later to discuss this.

@giovannipizzi
Copy link
Member

BTW, I think my test code was done quickly and I didn't bother returning the results of awaited coroutines - so it was working with printing but not with getting function return values. Not sure if this is the problem, but it definitely might be if you just used my example above.

Here is an updated code that also properly passes around return values (I think, from quick testing...)

#!/usr/bin/env python
import asyncio
import sys
import json

########################################################################
### START SOLUTION FROM zzzeek ###
# From https://gist.github.com/zzzeek/4e89ce6226826e7a8df13e1b573ad354
import greenlet

def await_(coroutine):
    current = greenlet.getcurrent()

    if not isinstance(current, AsyncIoGreenlet):
        raise Exception(
            "not running inside a greenlet right now, "
            "can't use await_() function"
        )

    return current.driver.switch(coroutine)


class AsyncIoGreenlet(greenlet.greenlet):
    def __init__(self, driver, fn):
        greenlet.greenlet.__init__(self, fn, driver)
        self.driver = driver


async def greenlet_spawn(__fn, *args, **kw):
    target = AsyncIoGreenlet(greenlet.getcurrent(), __fn)

    target_return = target.switch(*args, **kw)

    while target:
        try:
            result = await target_return
        except:
            target_return = target.throw(*sys.exc_info())
        else:
            target_return = target.switch(result)

    # clean up cycle for the common case
    # (gc can do the exception case)
    del target.driver
    return target_return
### END SOLUTION FROM zzzeek ###
########################################################################

def append_task(name, val):
    print(f">>> Appending new task '{name}' with val {val}")
    try:
        with open('tasks.json') as f:
            tasks = json.load(f)
    except IOError:
        tasks = []
    tasks.append([name, val])
    with open('tasks.json', 'w') as f:
        json.dump(tasks, f)

def pop_task():
    try:
        with open('tasks.json') as f:
            tasks = json.load(f)
    except IOError:
        return None
    try:
        name, val = tasks.pop(0) # FIFO
    except IndexError:
        return None
    with open('tasks.json', 'w') as f:
        json.dump(tasks, f)
    return name, val


def await_or_new_loop(coroutine):
    current = greenlet.getcurrent()

    if not isinstance(current, AsyncIoGreenlet):
        print("creating main loop")
        return asyncio.get_event_loop().run_until_complete(coroutine)
    else:
        print("reentring with greenlets")
        return current.driver.switch(coroutine)



sys.setrecursionlimit(70)

# Here I set a lot of concurrent tasks. This never creates problems.
number_of_tasks = 3
recursion_depth = 4


async def coro_executor(f, args, kwargs):
    # give a chance to switch context
    await asyncio.sleep(0)
    return await greenlet_spawn(f, *args, **kwargs)

def calcf(func):
    def inner(*args, **kwargs):
        return await_or_new_loop(coro_executor(func, args, kwargs))
    return inner

@calcf
def recursive(task_name, counter):
    import time
    import random

    # Very small delay to give a chance to other steps to happen at the same time
    time.sleep(random.random() * 0.01) # Random time between 0 and 0.01 s
    print(f"Task {task_name} - Counter: {counter}")
    if counter <= 0:
        return 0
    return recursive(task_name, counter - 1) + 2



async def daemon_run():
    daemon_loop_counter = 0
    sleep_counter = 0

    futures = []
    in_vals = []

    while True:
        daemon_loop_counter += 1

        name_val = pop_task()
        if name_val is None:
            sleep_counter += 1
            if sleep_counter < 5:
                print("No tasks, daemon waiting")

                if sleep_counter == 1:
                     # At some point, "randomly" (here after ~1 sec), new tasks arrive
                     for i in range(number_of_tasks):
                         append_task(f"New task {i}", recursion_depth + 2 * i)

                await asyncio.sleep(1) # No tasks, wait 1 sec
                continue
            else:
                print("Stopping deamon after 5 sec, I now await all")
                # We are waiting for all of them at the very end;
                # in a real daemon this must be done better.
                results = await asyncio.gather(*futures)
                for (name, in_val), result in zip(in_vals, results):
                    print(f'-> [{name}] {in_val=}, {result=} (expected: {2*in_val})')
                break
        # Run task
        name, val = name_val
        print(f"<<< Deamon task to run: {name} {val}")
        
        in_vals.append([name, val])
        futures.append(asyncio.create_task(coro_executor(recursive, [name, val], {})))





print("Running in main code, no coroutines")
for in_val in [5, 6]:
    out_val = recursive('test task', in_val)
    print(f"  -> {in_val=}, {out_val=} (expected: {2 * in_val})")

# "Submit" tasks (DB/RMQ replaced with JSON file, enough for here, not good for
# multiprocessing with many daemons)
for i in range(number_of_tasks):
    append_task(f"Task {i}", recursion_depth + i)

print()
print("Running in parallel from a 'daemon'")
asyncio.run(daemon_run())

@unkcpz
Copy link
Member

unkcpz commented Sep 25, 2023

I have not seen the issue anymore after #6052, despite quite an intensive load on the daemon over the last two months, so I think it is safe to close this for the moment.

@sphuber
Copy link
Contributor

sphuber commented Jul 22, 2024

Here is a diff of an attempt to use greenlets integrated into aiida-core. I am not sure it is worth adding it now since the current workaround seems to be working just fine, but posting it here for posterity as I will delete the branch:

commit c905ef63b9e285049d3dc553e0a8595d8437aea1
Author: Sebastiaan Huber <mail@sphuber.net>
Date:   Wed Jun 14 08:31:59 2023 -0700

    Fix using greenlets

diff --git a/aiida/engine/processes/functions.py b/aiida/engine/processes/functions.py
index cb6fb52b1..4123fc723 100644
--- a/aiida/engine/processes/functions.py
+++ b/aiida/engine/processes/functions.py
@@ -15,6 +15,7 @@ import functools
 import inspect
 import logging
 import signal
+import sys
 import types
 import typing as t
 from typing import TYPE_CHECKING
@@ -61,6 +62,65 @@ LOGGER = logging.getLogger(__name__)
 
 FunctionType = t.TypeVar('FunctionType', bound=t.Callable[..., t.Any])
 
+import greenlet
+
+
+def await_(coroutine):
+    current = greenlet.getcurrent()
+
+    if not isinstance(current, AsyncIoGreenlet):
+        raise Exception('not running inside a greenlet right now, '
+                        "can't use await_() function")
+
+    return current.driver.switch(coroutine)
+
+
+class AsyncIoGreenlet(greenlet.greenlet):
+
+    def __init__(self, driver, fn):
+        greenlet.greenlet.__init__(self, fn, driver)
+        self.driver = driver
+
+
+async def greenlet_spawn(__fn, *args, **kw):
+    target = AsyncIoGreenlet(greenlet.getcurrent(), __fn)
+
+    target_return = target.switch(*args, **kw)
+
+    while target:
+        try:
+            result = await target_return
+        except:
+            target_return = target.throw(*sys.exc_info())
+        else:
+            target_return = target.switch(result)
+
+    # clean up cycle for the common case
+    # (gc can do the exception case)
+    del target.driver
+    return target_return
+
+
+import asyncio
+
+
+def await_or_new_loop(coroutine):
+    current = greenlet.getcurrent()
+
+    if not isinstance(current, AsyncIoGreenlet):
+        print('creating main loop')
+        return asyncio.get_event_loop().run_until_complete(coroutine)
+    else:
+        print('reentring with greenlets')
+        return current.driver.switch(coroutine)
+
+
+async def coro_executor(f, *args, **kwargs):
+    # give a chance to switch context
+    await asyncio.sleep(0)
+    result = await greenlet_spawn(f, *args, **kwargs)
+    return result
+
 
 def calcfunction(function: FunctionType) -> FunctionType:
     """
@@ -174,7 +234,7 @@ def process_function(node_class: t.Type['ProcessNode']) -> t.Callable[[FunctionT
                 signal.signal(kill_signal, kill_process)
 
             try:
-                result = process.execute()
+                result = await_or_new_loop(coro_executor(process.execute))
             finally:
                 # If the `original_handler` is set, that means the `kill_process` was bound, which needs to be reset
                 if original_handler:
