From 853bea191901c269cf4d47d7c7934d4216ab8533 Mon Sep 17 00:00:00 2001
From: Chris Tomkins-Tinch
Date: Fri, 7 Jun 2024 14:59:29 -0400
Subject: [PATCH] In cmd.py argparse common_args(), set default number of
 threads to all available if --threads is unspecified (#104)

* In cmd.py argparse common_args(), set default number of threads to all
  available if --threads is unspecified

In cmd.py argparse common_args(), set the default number of threads to all
available cores if `--threads` is unspecified; previously, if the threads
arg was None, it was left to the consuming function to set the thread count
to all available cores. With this change, the default is to use all
available cores. Additionally, the requested thread count is sanitized via
util.misc.sanitize_thread_count() when a value is specified.

This was already the behavior in most multi-threaded functions, via separate
calls to util.misc.sanitize_thread_count() where a threads arg is consumed;
those calls could potentially be refactored out if we rely solely on the
argparse interface, though they should be preserved for Python import usage
of the same functions (including some test cases). Changing the default
causes no change in behavior where existing separate sanitize_thread_count()
calls are used.

This also corrects a call to count_and_sort_barcodes() where the threads arg
was not being passed.

* add pandas to python dependencies
---
 illumina.py   |  4 ++--
 read_utils.py |  6 +++---
 util/cmd.py   | 11 +++++------
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/illumina.py b/illumina.py
index 6a64cb92..35b7c049 100755
--- a/illumina.py
+++ b/illumina.py
@@ -453,7 +453,7 @@ def parser_common_barcodes(parser=argparse.ArgumentParser()):
     parser.add_argument('--JVMmemory',
                         help='JVM virtual memory size (default: %(default)s)',
                         default=tools.picard.ExtractIlluminaBarcodesTool.jvmMemDefault)
-    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
+    util.cmd.common_args(parser, (('threads',None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
     util.cmd.attach_main(parser, main_common_barcodes)
     return parser
 
@@ -506,7 +506,7 @@ def main_common_barcodes(args):
     except IndexError:
         barcode2_len = 0
 
-    count_and_sort_barcodes(barcodes_tmpdir, args.outSummary, barcode1_len, barcode2_len, args.truncateToLength, args.includeNoise, args.omitHeader)
+    count_and_sort_barcodes(barcodes_tmpdir, args.outSummary, barcode1_len, barcode2_len, args.truncateToLength, args.includeNoise, args.omitHeader, args.threads)
 
     # clean up
     os.unlink(barcode_file)
diff --git a/read_utils.py b/read_utils.py
index 44e357d9..7fc3cd86 100755
--- a/read_utils.py
+++ b/read_utils.py
@@ -919,7 +919,7 @@ def _merge_fastqs_and_mvicuna(lb, files):
     return readList
 
 
-def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None):
+def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None, threads=None):
     ''' Remove duplicate reads from BAM file using M-Vicuna.
        The primary advantage to this approach over Picard's MarkDuplicates tool
        is that Picard requires that input reads are aligned to a reference,
@@ -943,7 +943,7 @@ def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None):
     # For each library, merge FASTQs and run rmdup for entire library
     readListAll = mkstempfname('.keep_reads_all.txt')
     per_lb_read_lists = []
-    with concurrent.futures.ProcessPoolExecutor(max_workers=util.misc.available_cpu_count()) as executor:
+    with concurrent.futures.ProcessPoolExecutor(max_workers=threads or util.misc.available_cpu_count()) as executor:
         futures = [executor.submit(_merge_fastqs_and_mvicuna, lb, files) for lb, files in lb_to_files.items()]
         for future in concurrent.futures.as_completed(futures):
             log.info("mvicuna finished processing library")
@@ -972,7 +972,7 @@ def parser_rmdup_mvicuna_bam(parser=argparse.ArgumentParser()):
         default=tools.picard.FilterSamReadsTool.jvmMemDefault,
         help='JVM virtual memory size (default: %(default)s)'
     )
-    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
+    util.cmd.common_args(parser, (('threads',None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
     util.cmd.attach_main(parser, rmdup_mvicuna_bam, split_args=True)
     return parser
 
diff --git a/util/cmd.py b/util/cmd.py
index 35eb803e..5ab52ec9 100644
--- a/util/cmd.py
+++ b/util/cmd.py
@@ -17,6 +17,7 @@
 
 import util.version
 import util.file
+import util.misc
 
 __author__ = "dpark@broadinstitute.org"
 __version__ = util.version.get_version()
@@ -76,15 +77,13 @@ def common_args(parser, arglist=(('tmp_dir', None), ('loglevel', None))):
                     the end, even if there's a failure.""",
                                 default=False)
         elif k == 'threads':
-            if v is None:
-                text_default = "all available cores"
-            else:
-                text_default = v
+            # if v is None, sanitize_thread_count() sets count to all available cores
+            thread_count = util.misc.sanitize_thread_count(v)
             parser.add_argument('--threads',
                                 dest="threads",
                                 type=int,
-                                help="Number of threads (default: {})".format(text_default),
-                                default=v)
+                                help="Number of threads; by default all cores are used",
+                                default=thread_count)
         elif k == 'version':
             if not v:
                 v = __version__
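
For context on the behavior the patch relies on, here is a minimal, self-contained
sketch of how a sanitized --threads default can work, assuming that
sanitize_thread_count() falls back to all available cores when given None and caps
explicit requests at the available core count. The helper functions below are
illustrative stand-ins, not the actual util.misc implementations, which may differ
in detail.

    # Illustrative stand-ins for util.misc helpers; assumptions, not the real code.
    import argparse
    import os


    def available_cpu_count():
        # Prefer the CPUs this process may actually run on (honors affinity/cgroup
        # limits on Linux); fall back to the machine-wide count elsewhere.
        try:
            return len(os.sched_getaffinity(0))
        except AttributeError:
            return os.cpu_count() or 1


    def sanitize_thread_count(threads=None):
        # None means "use all available cores"; otherwise clamp the request to a
        # positive value no larger than the available core count.
        if threads is None:
            return available_cpu_count()
        return max(1, min(int(threads), available_cpu_count()))


    # Mirrors the new common_args() behavior: the default is resolved to a concrete
    # core count when the parser is built, so args.threads is never None.
    parser = argparse.ArgumentParser()
    parser.add_argument('--threads',
                        dest="threads",
                        type=int,
                        help="Number of threads; by default all cores are used",
                        default=sanitize_thread_count(None))

    print(parser.parse_args([]).threads)                   # all available cores
    print(parser.parse_args(['--threads', '2']).threads)   # 2

With a concrete default in place, code paths such as the
`ProcessPoolExecutor(max_workers=threads or util.misc.available_cpu_count())` call in
rmdup_mvicuna_bam() still behave sensibly for direct Python imports, where callers
may continue to pass threads=None.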