easybuilders · lexming · Nov 8, 2023 · Oct 18, 2023 · Oct 23, 2023 · Nov 6, 2023
diff --git a/easybuild/tools/modules.py b/easybuild/tools/modules.py
@@ -312,7 +312,7 @@ def check_module_function(self, allow_mismatch=False, regex=None):
                 output, exit_code = None, 1
         else:
             cmd = "type module"
-            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=False, hidden=True)
+            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=False, hidden=True, output_file=False)
             output, exit_code = res.output, res.exit_code
 
         if regex is None:
@@ -821,7 +821,7 @@ def run_module(self, *args, **kwargs):
         cmd = ' '.join(cmd_list)
         # note: module commands are always run in dry mode, and are kept hidden in trace and dry run output
         res = run_shell_cmd(cmd_list, env=environ, fail_on_error=False, shell=False, split_stderr=True,
-                            hidden=True, in_dry_run=True)
+                            hidden=True, in_dry_run=True, output_file=False)
 
         # stdout will contain python code (to change environment etc)
         # stderr will contain text (just like the normal module command)

diff --git a/easybuild/tools/run.py b/easybuild/tools/run.py
@@ -76,19 +76,23 @@
 ]
 
 
-RunShellCmdResult = namedtuple('RunShellCmdResult', ('cmd', 'exit_code', 'output', 'stderr', 'work_dir'))
+RunShellCmdResult = namedtuple('RunShellCmdResult', ('cmd', 'exit_code', 'output', 'stderr', 'work_dir',
+                                                     'out_file', 'err_file'))
 
 
 class RunShellCmdError(BaseException):
 
-    def __init__(self, cmd, exit_code, work_dir, output, stderr, caller_info, *args, **kwargs):
+    def __init__(self, cmd_result, caller_info, *args, **kwargs):
         """Constructor for RunShellCmdError."""
-        self.cmd = cmd
-        self.cmd_name = cmd.split(' ')[0]
-        self.exit_code = exit_code
-        self.work_dir = work_dir
-        self.output = output
-        self.stderr = stderr
+        self.cmd = cmd_result.cmd
+        self.cmd_name = os.path.basename(self.cmd.split(' ')[0])
+        self.exit_code = cmd_result.exit_code
+        self.work_dir = cmd_result.work_dir
+        self.output = cmd_result.output
+        self.out_file = cmd_result.out_file
+        self.stderr = cmd_result.stderr
+        self.err_file = cmd_result.err_file
+
         self.caller_info = caller_info
 
         msg = f"Shell command '{self.cmd_name}' failed!"
@@ -110,21 +114,13 @@ def pad_4_spaces(msg):
             pad_4_spaces(f"working directory         ->  {self.work_dir}"),
         ]
 
-        tmpdir = tempfile.mkdtemp(prefix='shell-cmd-error-')
-        output_fp = os.path.join(tmpdir, f"{self.cmd_name}.out")
-        with open(output_fp, 'w') as fp:
-            fp.write(self.output or '')
+        if self.out_file is not None:
+            # if there's no separate file for error/warnings, then out_file includes both stdout + stderr
+            out_info_msg = "output (stdout + stderr)" if self.err_file is None else "output (stdout)         "
+            error_info.append(pad_4_spaces(f"{out_info_msg}  ->  {self.out_file}"))
 
-        if self.stderr is None:
-            error_info.append(pad_4_spaces(f"output (stdout + stderr)  ->  {output_fp}"))
-        else:
-            stderr_fp = os.path.join(tmpdir, f"{self.cmd_name}.err")
-            with open(stderr_fp, 'w') as fp:
-                fp.write(self.stderr)
-            error_info.extend([
-                pad_4_spaces(f"output (stdout)           ->  {output_fp}"),
-                pad_4_spaces(f"error/warnings (stderr)   ->  {stderr_fp}"),
-            ])
+        if self.err_file is not None:
+            error_info.append(pad_4_spaces(f"error/warnings (stderr)   ->  {self.err_file}"))
 
         caller_file_name, caller_line_nr, caller_function_name = self.caller_info
         called_from_info = f"'{caller_function_name}' function in {caller_file_name} (line {caller_line_nr})"
@@ -136,9 +132,9 @@ def pad_4_spaces(msg):
         sys.stderr.write('\n'.join(error_info) + '\n')
 
 
-def raise_run_shell_cmd_error(cmd, exit_code, work_dir, output, stderr):
+def raise_run_shell_cmd_error(cmd_res):
     """
-    Raise RunShellCmdError for failing shell command, after collecting additional caller info
+    Raise RunShellCmdError for failed shell command, after collecting additional caller info
     """
 
     # figure out where failing command was run
@@ -150,7 +146,7 @@ def raise_run_shell_cmd_error(cmd, exit_code, work_dir, output, stderr):
     frameinfo = inspect.getouterframes(inspect.currentframe())[3]
     caller_info = (frameinfo.filename, frameinfo.lineno, frameinfo.function)
 
-    raise RunShellCmdError(cmd, exit_code, work_dir, output, stderr, caller_info)
+    raise RunShellCmdError(cmd_res, caller_info)
 
 
 def run_cmd_cache(func):
@@ -185,7 +181,7 @@ def cache_aware_func(cmd, *args, **kwargs):
 @run_shell_cmd_cache
 def run_shell_cmd(cmd, fail_on_error=True, split_stderr=False, stdin=None, env=None,
                   hidden=False, in_dry_run=False, verbose_dry_run=False, work_dir=None, shell=True,
-                  output_file=False, stream_output=False, asynchronous=False, with_hooks=True,
+                  output_file=True, stream_output=False, asynchronous=False, with_hooks=True,
                   qa_patterns=None, qa_wait_patterns=None):
     """
     Run specified (interactive) shell command, and capture output + exit code.
@@ -234,16 +230,23 @@ def to_cmd_str(cmd):
     if work_dir is None:
         work_dir = os.getcwd()
 
-    # temporary output file for command output, if requested
-    if output_file or not hidden:
-        # collect output of running command in temporary log file, if desired
-        fd, cmd_out_fp = tempfile.mkstemp(suffix='.log', prefix='easybuild-run-')
-        os.close(fd)
-        _log.info(f'run_cmd: Output of "{cmd}" will be logged to {cmd_out_fp}')
-    else:
-        cmd_out_fp = None
-
     cmd_str = to_cmd_str(cmd)
+    cmd_name = os.path.basename(cmd_str.split(' ')[0])
+
+    # temporary output file(s) for command output
+    if output_file:
+        toptmpdir = os.path.join(tempfile.gettempdir(), 'run-shell-cmd-output')
+        os.makedirs(toptmpdir, exist_ok=True)
+        tmpdir = tempfile.mkdtemp(dir=toptmpdir, prefix=f'{cmd_name}-')
+        cmd_out_fp = os.path.join(tmpdir, 'out.txt')
+        _log.info(f'run_cmd: Output of "{cmd_str}" will be logged to {cmd_out_fp}')
+        if split_stderr:
+            cmd_err_fp = os.path.join(tmpdir, 'err.txt')
+            _log.info(f'run_cmd: Errors and warnings of "{cmd_str}" will be logged to {cmd_err_fp}')
+        else:
+            cmd_err_fp = None
+    else:
+        cmd_out_fp, cmd_err_fp = None, None
 
     # early exit in 'dry run' mode, after printing the command that would be run (unless 'hidden' is enabled)
     if not in_dry_run and build_option('extended_dry_run'):
@@ -253,11 +256,12 @@ def to_cmd_str(cmd):
             msg += f"  (in {work_dir})"
             dry_run_msg(msg, silent=silent)
 
-        return RunShellCmdResult(cmd=cmd_str, exit_code=0, output='', stderr=None, work_dir=work_dir)
+        return RunShellCmdResult(cmd=cmd_str, exit_code=0, output='', stderr=None, work_dir=work_dir,
+                                 out_file=cmd_out_fp, err_file=cmd_err_fp)
 
     start_time = datetime.now()
     if not hidden:
-        cmd_trace_msg(cmd_str, start_time, work_dir, stdin, cmd_out_fp)
+        cmd_trace_msg(cmd_str, start_time, work_dir, stdin, cmd_out_fp, cmd_err_fp)
 
     if stdin:
         # 'input' value fed to subprocess.run must be a byte sequence
@@ -286,7 +290,19 @@ def to_cmd_str(cmd):
     output = proc.stdout.decode('utf-8', 'ignore')
     stderr = proc.stderr.decode('utf-8', 'ignore') if split_stderr else None
 
-    res = RunShellCmdResult(cmd=cmd_str, exit_code=proc.returncode, output=output, stderr=stderr, work_dir=work_dir)
+    # store command output to temporary file(s)
+    if output_file:
+        try:
+            with open(cmd_out_fp, 'w') as fp:
+                fp.write(output)
+            if split_stderr:
+                with open(cmd_err_fp, 'w') as fp:
+                    fp.write(stderr)
+        except IOError as err:
+            raise EasyBuildError(f"Failed to dump command output to temporary file: {err}")
+
+    res = RunShellCmdResult(cmd=cmd_str, exit_code=proc.returncode, output=output, stderr=stderr, work_dir=work_dir,
+                            out_file=cmd_out_fp, err_file=cmd_err_fp)
 
     # always log command output
     cmd_name = cmd_str.split(' ')[0]
@@ -301,7 +317,7 @@ def to_cmd_str(cmd):
     else:
         _log.warning(f"Shell command FAILED (exit code {res.exit_code}, see output above): {cmd_str}")
         if fail_on_error:
-            raise_run_shell_cmd_error(res.cmd, res.exit_code, res.work_dir, output=res.output, stderr=res.stderr)
+            raise_run_shell_cmd_error(res)
 
     if with_hooks:
         run_hook_kwargs = {
@@ -319,15 +335,16 @@ def to_cmd_str(cmd):
     return res
 
 
-def cmd_trace_msg(cmd, start_time, work_dir, stdin, cmd_out_fp):
+def cmd_trace_msg(cmd, start_time, work_dir, stdin, cmd_out_fp, cmd_err_fp):
     """
     Helper function to construct and print trace message for command being run
 
     :param cmd: command being run
     :param start_time: datetime object indicating when command was started
     :param work_dir: path of working directory in which command is run
     :param stdin: stdin input value for command
-    :param cmd_out_fp: path to output log file for command
+    :param cmd_out_fp: path to output file for command
+    :param cmd_err_fp: path to errors/warnings output file for command
     """
     start_time = start_time.strftime('%Y-%m-%d %H:%M:%S')
 
@@ -339,7 +356,9 @@ def cmd_trace_msg(cmd, start_time, work_dir, stdin, cmd_out_fp):
     if stdin:
         lines.append(f"\t[input: {stdin}]")
     if cmd_out_fp:
-        lines.append(f"\t[output logged in {cmd_out_fp}]")
+        lines.append(f"\t[output saved to {cmd_out_fp}]")
+    if cmd_err_fp:
+        lines.append(f"\t[errors/warnings saved to {cmd_err_fp}]")
 
     lines.append('\t' + cmd)
 

diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py
@@ -273,7 +273,7 @@ def get_avail_core_count():
         core_cnt = int(sum(sched_getaffinity()))
     else:
         # BSD-type systems
-        res = run_shell_cmd('sysctl -n hw.ncpu', in_dry_run=True, hidden=True, with_hooks=False)
+        res = run_shell_cmd('sysctl -n hw.ncpu', in_dry_run=True, hidden=True, with_hooks=False, output_file=False)
         try:
             if int(res.output) > 0:
                 core_cnt = int(res.output)
@@ -310,7 +310,7 @@ def get_total_memory():
     elif os_type == DARWIN:
         cmd = "sysctl -n hw.memsize"
         _log.debug("Trying to determine total memory size on Darwin via cmd '%s'", cmd)
-        res = run_shell_cmd(cmd, in_dry_run=True, hidden=True, with_hooks=False)
+        res = run_shell_cmd(cmd, in_dry_run=True, hidden=True, with_hooks=False, output_file=False)
         if res.exit_code == 0:
             memtotal = int(res.output.strip()) // (1024**2)
 
@@ -392,14 +392,15 @@ def get_cpu_vendor():
 
     elif os_type == DARWIN:
         cmd = "sysctl -n machdep.cpu.vendor"
-        res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True, with_hooks=False)
+        res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True, with_hooks=False, output_file=False)
         out = res.output.strip()
         if res.exit_code == 0 and out in VENDOR_IDS:
             vendor = VENDOR_IDS[out]
             _log.debug("Determined CPU vendor on DARWIN as being '%s' via cmd '%s" % (vendor, cmd))
         else:
             cmd = "sysctl -n machdep.cpu.brand_string"
-            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True, with_hooks=False)
+            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True, with_hooks=False,
+                                output_file=False)
             out = res.output.strip().split(' ')[0]
             if res.exit_code == 0 and out in CPU_VENDORS:
                 vendor = out
@@ -502,7 +503,7 @@ def get_cpu_model():
 
     elif os_type == DARWIN:
         cmd = "sysctl -n machdep.cpu.brand_string"
-        res = run_shell_cmd(cmd, in_dry_run=True, hidden=True, with_hooks=False)
+        res = run_shell_cmd(cmd, in_dry_run=True, hidden=True, with_hooks=False, output_file=False)
         if res.exit_code == 0:
             model = res.output.strip()
             _log.debug("Determined CPU model on Darwin using cmd '%s': %s" % (cmd, model))
@@ -547,7 +548,7 @@ def get_cpu_speed():
     elif os_type == DARWIN:
         cmd = "sysctl -n hw.cpufrequency_max"
         _log.debug("Trying to determine CPU frequency on Darwin via cmd '%s'" % cmd)
-        res = run_shell_cmd(cmd, in_dry_run=True, hidden=True, with_hooks=False)
+        res = run_shell_cmd(cmd, in_dry_run=True, hidden=True, with_hooks=False, output_file=False)
         out = res.output.strip()
         cpu_freq = None
         if res.exit_code == 0 and out:
@@ -595,7 +596,8 @@ def get_cpu_features():
         for feature_set in ['extfeatures', 'features', 'leaf7_features']:
             cmd = "sysctl -n machdep.cpu.%s" % feature_set
             _log.debug("Trying to determine CPU features on Darwin via cmd '%s'", cmd)
-            res = run_shell_cmd(cmd, in_dry_run=True, hidden=True, fail_on_error=False, with_hooks=False)
+            res = run_shell_cmd(cmd, in_dry_run=True, hidden=True, fail_on_error=False, with_hooks=False,
+                                output_file=False)
             if res.exit_code == 0:
                 cpu_feat.extend(res.output.strip().lower().split())
 
@@ -622,7 +624,8 @@ def get_gpu_info():
         try:
             cmd = "nvidia-smi --query-gpu=gpu_name,driver_version --format=csv,noheader"
             _log.debug("Trying to determine NVIDIA GPU info on Linux via cmd '%s'", cmd)
-            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True, with_hooks=False)
+            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True, with_hooks=False,
+                                output_file=False)
             if res.exit_code == 0:
                 for line in res.output.strip().split('\n'):
                     nvidia_gpu_info = gpu_info.setdefault('NVIDIA', {})
@@ -640,13 +643,15 @@ def get_gpu_info():
         try:
             cmd = "rocm-smi --showdriverversion --csv"
             _log.debug("Trying to determine AMD GPU driver on Linux via cmd '%s'", cmd)
-            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True, with_hooks=False)
+            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True, with_hooks=False,
+                                output_file=False)
             if res.exit_code == 0:
                 amd_driver = res.output.strip().split('\n')[1].split(',')[1]
 
             cmd = "rocm-smi --showproductname --csv"
             _log.debug("Trying to determine AMD GPU info on Linux via cmd '%s'", cmd)
-            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True, with_hooks=False)
+            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True, with_hooks=False,
+                                output_file=False)
             if res.exit_code == 0:
                 for line in res.output.strip().split('\n')[1:]:
                     amd_card_series = line.split(',')[1]
@@ -865,7 +870,7 @@ def check_os_dependency(dep):
                 pkg_cmd_flag.get(pkg_cmd),
                 dep,
             ])
-            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True)
+            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True, output_file=False)
             found = res.exit_code == 0
             if found:
                 break
@@ -877,7 +882,7 @@ def check_os_dependency(dep):
         # try locate if it's available
         if not found and which('locate'):
             cmd = 'locate -c --regexp "/%s$"' % dep
-            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True)
+            res = run_shell_cmd(cmd, fail_on_error=False, in_dry_run=True, hidden=True, output_file=False)
             try:
                 found = (res.exit_code == 0 and int(res.output.strip()) > 0)
             except ValueError:
@@ -893,7 +898,7 @@ def get_tool_version(tool, version_option='--version', ignore_ec=False):
     Output is returned as a single-line string (newlines are replaced by '; ').
     """
     res = run_shell_cmd(' '.join([tool, version_option]), fail_on_error=False, in_dry_run=True,
-                        hidden=True, with_hooks=False)
+                        hidden=True, with_hooks=False, output_file=False)
     if not ignore_ec and res.exit_code:
         _log.warning("Failed to determine version of %s using '%s %s': %s" % (tool, tool, version_option, res.output))
         return UNKNOWN
@@ -905,7 +910,7 @@ def get_gcc_version():
     """
     Process `gcc --version` and return the GCC version.
     """
-    res = run_shell_cmd('gcc --version', fail_on_error=False, in_dry_run=True, hidden=True)
+    res = run_shell_cmd('gcc --version', fail_on_error=False, in_dry_run=True, hidden=True, output_file=False)
     gcc_ver = None
     if res.exit_code:
         _log.warning("Failed to determine the version of GCC: %s", res.output)
@@ -961,7 +966,7 @@ def get_linked_libs_raw(path):
     or None for other types of files.
     """
 
-    res = run_shell_cmd("file %s" % path, fail_on_error=False, hidden=True)
+    res = run_shell_cmd("file %s" % path, fail_on_error=False, hidden=True, output_file=False)
     if res.exit_code:
         fail_msg = "Failed to run 'file %s': %s" % (path, res.output)
         _log.warning(fail_msg)
@@ -996,7 +1001,7 @@ def get_linked_libs_raw(path):
     # take into account that 'ldd' may fail for strange reasons,
     # like printing 'not a dynamic executable' when not enough memory is available
     # (see also https://bugzilla.redhat.com/show_bug.cgi?id=1817111)
-    res = run_shell_cmd(linked_libs_cmd, fail_on_error=False, hidden=True)
+    res = run_shell_cmd(linked_libs_cmd, fail_on_error=False, hidden=True, output_file=False)
     if res.exit_code == 0:
         linked_libs_out = res.output
     else:
@@ -1178,7 +1183,7 @@ def get_default_parallelism():
             # No cache -> Calculate value from current system values
             par = get_avail_core_count()
             # determine max user processes via ulimit -u
-            res = run_shell_cmd("ulimit -u", in_dry_run=True, hidden=True)
+            res = run_shell_cmd("ulimit -u", in_dry_run=True, hidden=True, output_file=False)
             try:
                 if res.output.startswith("unlimited"):
                     maxuserproc = 2 ** 32 - 1