Merge pull request #97 from BIMSBbioinfo/snakemake-resources

add snakemake resources + memory requirements per rule
BIMSBbioinfo · Jun 7, 2021 · 3ac5261 · 3ac5261
2 parents e8da601 + 18fd78c
commit 3ac5261
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 4 deletions.
diff --git a/etc/settings.yaml.in b/etc/settings.yaml.in
@@ -59,6 +59,7 @@ execution:
   submit-to-cluster: no
   jobs: 6
   nice: 19
+  mem_mb: 4000
   cluster:
     missing-file-timeout: 120
     memory: 8G

diff --git a/pigx-rnaseq.in b/pigx-rnaseq.in
@@ -323,6 +323,7 @@ command = [
     "--configfile={}".format(args.configfile),
     "--directory={}".format(config['locations']['output-dir']),
     "--jobs={}".format(config['execution']['jobs']),
+    "--resources", "mem_mb={}".format(config['execution']['mem_mb']),
 ]
 
 if config['execution']['submit-to-cluster']:

diff --git a/snakefile.py b/snakefile.py
@@ -271,7 +271,6 @@ def trim_reads_input(args):
   sample = args[0]
   return [os.path.join(READS_DIR, f) for f in lookup('name', sample, ['reads', 'reads2']) if f]
 
-
 # fastp both trims/filters reads and outputs QC reports in html/json format
 rule trim_qc_reads_pe:
   input: trim_reads_input
@@ -297,6 +296,8 @@ def trim_reads_input(args):
     input: GENOME_FASTA
     output:
         star_index_file = os.path.join(OUTPUT_DIR, 'star_index', "SAindex")
+    resources:
+      mem_mb = 32000
     params:
         star_index_dir = os.path.join(OUTPUT_DIR, 'star_index')
     log: os.path.join(LOG_DIR, 'star_index.log')
@@ -306,6 +307,8 @@ def trim_reads_input(args):
     input: GENOME_FASTA
     output:
         [os.path.join(OUTPUT_DIR, "hisat2_index", f"{GENOME_BUILD}_index.{n}.ht2l") for n in [1, 2, 3, 4, 5, 6, 7, 8]]
+    resources:
+      mem_mb = 21000
     params:
         index_directory = os.path.join(OUTPUT_DIR, "hisat2_index"),
     log: os.path.join(LOG_DIR, 'hisat2_index.log')
@@ -336,6 +339,8 @@ def hisat2_file_arguments(args):
     reads = map_input
   output:
     os.path.join(MAPPED_READS_DIR, 'star', '{sample}_Aligned.sortedByCoord.out.bam')
+  resources:
+    mem_mb = 16000
   params:
     index_dir = rules.star_index.params.star_index_dir,
     output_prefix=os.path.join(MAPPED_READS_DIR, 'star', '{sample}_')
@@ -348,6 +353,8 @@ def hisat2_file_arguments(args):
     reads = map_input
   output:
     os.path.join(MAPPED_READS_DIR, 'hisat2', '{sample}_Aligned.sortedByCoord.out.bam')
+  resources:
+    mem_mb = 8000
   params:
     samfile = lambda wildcards: os.path.join(MAPPED_READS_DIR, 'hisat2', "_".join([wildcards.sample, 'Aligned.out.sam'])),
     index_dir = rules.hisat2_index.params.index_directory,
@@ -361,10 +368,12 @@ def hisat2_file_arguments(args):
     {SAMTOOLS_EXEC} view -bh {params.samfile} | {SAMTOOLS_EXEC} sort -o {output} >> {log[1]} 2>&1
     rm {params.samfile}
     """
-
+    
 rule index_bam:
   input: os.path.join(MAPPED_READS_DIR, MAPPER, '{sample}_Aligned.sortedByCoord.out.bam')
   output: os.path.join(MAPPED_READS_DIR, MAPPER, '{sample}_Aligned.sortedByCoord.out.bam.bai')
+  resources:
+    mem_mb = 100
   log: os.path.join(LOG_DIR, 'samtools_index_{sample}.log')
   shell: "{SAMTOOLS_EXEC} index {input} {output} >> {log} 2>&1"
 
@@ -373,6 +382,8 @@ def hisat2_file_arguments(args):
       CDNA_FASTA
   output:
       salmon_index_file = os.path.join(OUTPUT_DIR, 'salmon_index', "sa.bin")
+  resources:
+      mem_mb = 5000
   params:
       salmon_index_dir = os.path.join(OUTPUT_DIR, 'salmon_index')
   log: os.path.join(LOG_DIR, "salmon", 'salmon_index.log')
@@ -388,6 +399,8 @@ def hisat2_file_arguments(args):
   output:
       os.path.join(SALMON_DIR, "{sample}", "quant.sf"),
       os.path.join(SALMON_DIR, "{sample}", "quant.genes.sf")
+  resources:
+      mem_mb = 6000
   params:
       index_dir = rules.salmon_index.params.salmon_index_dir,
       outfolder = os.path.join(SALMON_DIR, "{sample}")
@@ -409,6 +422,8 @@ def hisat2_file_arguments(args):
       os.path.join(COUNTS_DIR, "raw_counts", "salmon","counts_from_SALMON.genes.tsv"),
       os.path.join(COUNTS_DIR, "normalized", "salmon", "TPM_counts_from_SALMON.transcripts.tsv"),
       os.path.join(COUNTS_DIR, "normalized", "salmon", "TPM_counts_from_SALMON.genes.tsv")
+  resources:
+      mem_mb = 1000
   log: os.path.join(LOG_DIR, "salmon", 'salmon_import_counts.log')
   shell: "{RSCRIPT_EXEC} {SCRIPTS_DIR}/counts_matrix_from_SALMON.R {SALMON_DIR} {COUNTS_DIR} {input.colDataFile} >> {log} 2>&1"
 
@@ -425,8 +440,8 @@ def hisat2_file_arguments(args):
     os.path.join(LOG_DIR, MAPPER, 'genomeCoverage.forward.{sample}.log'),
     os.path.join(LOG_DIR, MAPPER, 'genomeCoverage.reverse.{sample}.log'),
     os.path.join(LOG_DIR, MAPPER, 'genomeCoverage.{sample}.log')
-  params:
-    outdir = os.path.join(BIGWIG_DIR, MAPPER)
+  resources:
+    mem_mb = 4000
   shell:
     """
     {BAMCOVERAGE_EXEC} -b {input.bam} -o {output[0]} --filterRNAstrand forward >> {log[0]} 2>&1
@@ -439,6 +454,8 @@ def hisat2_file_arguments(args):
     salmon_output=expand(os.path.join(SALMON_DIR, "{sample}", "quant.sf"), sample = SAMPLES),
     mapping_output=expand(os.path.join(MAPPED_READS_DIR, MAPPER, '{sample}_Aligned.sortedByCoord.out.bam'), sample=SAMPLES)
   output: os.path.join(MULTIQC_DIR, 'multiqc_report.html')
+  resources:
+    mem_mb = 200
   log: os.path.join(LOG_DIR, f'multiqc.{MAPPER}.log')
   shell: "{MULTIQC_EXEC} -o {MULTIQC_DIR} {OUTPUT_DIR} >> {log} 2>&1"
 
@@ -448,6 +465,8 @@ def hisat2_file_arguments(args):
     bai = os.path.join(MAPPED_READS_DIR, MAPPER, "{sample}_Aligned.sortedByCoord.out.bam.bai")
   output:
     os.path.join(MAPPED_READS_DIR, MAPPER, "{sample}.read_counts.csv")
+  resources:
+    mem_mb = 5000
   log: os.path.join(LOG_DIR, MAPPER, "{sample}.count_reads.log")
   params:
     single_end = isSingleEnd,
@@ -467,6 +486,8 @@ def hisat2_file_arguments(args):
     expand(os.path.join(MAPPED_READS_DIR, MAPPER, "{sample}.read_counts.csv"), sample = SAMPLES)
   output:
     os.path.join(COUNTS_DIR, "raw_counts", MAPPER, "counts.tsv")
+  resources:
+    mem_mb = 200
   log: os.path.join(LOG_DIR, MAPPER, "collate_read_counts.log")
   params:
     mapped_dir = os.path.join(MAPPED_READS_DIR, MAPPER),
@@ -484,6 +505,8 @@ def hisat2_file_arguments(args):
     output:
         size_factors = os.path.join(COUNTS_DIR, "normalized", MAPPER, "deseq_size_factors.txt"),
         norm_counts = os.path.join(COUNTS_DIR, "normalized", MAPPER, "deseq_normalized_counts.tsv")
+    resources:
+      mem_mb = 1000
     log:
         os.path.join(LOG_DIR, MAPPER, "norm_counts_deseq.log")
     params:
@@ -507,6 +530,8 @@ def hisat2_file_arguments(args):
   log: os.path.join(LOG_DIR, MAPPER, "{analysis}.report.log")
   output:
     os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq.report.html')
+  resources:
+    mem_mb = 4000
   shell:
     "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}'  --workdir={params.outdir} --organism='{ORGANISM}'  >> {log} 2>&1"
 
@@ -525,6 +550,8 @@ def hisat2_file_arguments(args):
   log: os.path.join(LOG_DIR, "salmon", "{analysis}.report.salmon.transcripts.log")
   output:
     os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html')
+  resources:
+    mem_mb = 4000
   shell: "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}.salmon.transcripts' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}' --workdir={params.outdir} --organism='{ORGANISM}' >> {log} 2>&1"
 
 rule report3:
@@ -542,4 +569,6 @@ def hisat2_file_arguments(args):
   log: os.path.join(LOG_DIR, "salmon", "{analysis}.report.salmon.genes.log")
   output:
     os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html')
+  resources:
+    mem_mb = 4000
   shell: "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}.salmon.genes' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}' --workdir={params.outdir} --organism='{ORGANISM}' >> {log} 2>&1"