diff --git a/lib/ramble/ramble/application.py b/lib/ramble/ramble/application.py index 0f569b30f..85cc275a7 100644 --- a/lib/ramble/ramble/application.py +++ b/lib/ramble/ramble/application.py @@ -67,7 +67,18 @@ experiment_status = Enum( "experiment_status", - ["UNKNOWN", "SETUP", "SUBMITTED", "RUNNING", "COMPLETE", "SUCCESS", "FAILED", "CANCELLED"], + [ + "UNKNOWN", + # unresolved means the status is not fetched successfully + "UNRESOLVED", + "SETUP", + "SUBMITTED", + "RUNNING", + "COMPLETE", + "SUCCESS", + "FAILED", + "CANCELLED", + ], ) _NULL_CONTEXT = "null" diff --git a/var/ramble/repos/builtin/workflow_managers/slurm/workflow_manager.py b/var/ramble/repos/builtin/workflow_managers/slurm/workflow_manager.py index 63d1e0acb..4a747ea24 100644 --- a/var/ramble/repos/builtin/workflow_managers/slurm/workflow_manager.py +++ b/var/ramble/repos/builtin/workflow_managers/slurm/workflow_manager.py @@ -172,7 +172,7 @@ def get_status(self, workspace): expander = self.app_inst.expander run_dir = expander.expand_var_name("experiment_run_dir") job_id_file = os.path.join(run_dir, ".slurm_job") - status = experiment_status.UNKNOWN + status = experiment_status.UNRESOLVED if not os.path.isfile(job_id_file): logger.warn("job_id file is missing") return status @@ -228,7 +228,13 @@ def get_status(self, job_id): if not status_out: self._ensure_runner("sacct") sacct_args = ["-o", "state", "-X", "-n", "-j", job_id] - status_out = self.sacct_runner.command(*sacct_args, output=str) + try: + status_out = self.sacct_runner.command(*sacct_args, output=str) + except ProcessError as e: + status_out = "" + logger.debug( + f"sacct returns error {e}. The status is not resolved correctly." + ) return status_out.strip() def get_partitions(self):