From e68dcb38c75bb7fc4091efdfc759d5c7db013828 Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Tue, 22 Oct 2024 15:00:43 +0100 Subject: [PATCH 01/55] add --errfree argument --- mess/__main__.py | 2 +- mess/config/config.yaml | 3 ++- mess/test_data/minimal_test.tsv | 1 - mess/util.py | 7 +++++++ mess/workflow/Snakefile | 1 + mess/workflow/rules/simulate/short_reads.smk | 13 ++++++++----- mess/workflow/simulate.smk | 1 + minimal_test.tsv | 2 ++ profiles/slurm/slurm-jobscript.sh | 0 profiles/slurm/slurm-sidecar.py | 0 profiles/slurm/slurm-status.py | 0 profiles/slurm/slurm-submit.py | 0 12 files changed, 22 insertions(+), 8 deletions(-) create mode 100644 minimal_test.tsv mode change 100755 => 100644 profiles/slurm/slurm-jobscript.sh mode change 100755 => 100644 profiles/slurm/slurm-sidecar.py mode change 100755 => 100644 profiles/slurm/slurm-status.py mode change 100755 => 100644 profiles/slurm/slurm-submit.py diff --git a/mess/__main__.py b/mess/__main__.py index 9e2a51a..d1e383d 100644 --- a/mess/__main__.py +++ b/mess/__main__.py @@ -64,7 +64,7 @@ }, { "name": "art_illumina options", - "options": ["--custom-err", "--paired", "--frag-len", "--frag-sd"], + "options": ["--custom-err", "--paired", "--frag-len", "--frag-sd", "--errfree"], }, { "name": "pbsim3 options", diff --git a/mess/config/config.yaml b/mess/config/config.yaml index 1c16f56..36e8155 100644 --- a/mess/config/config.yaml +++ b/mess/config/config.yaml @@ -11,7 +11,8 @@ args: configfile: custom_err: dist: - error: + error: + errfree: fasta: frag_len: frag_sd: diff --git a/mess/test_data/minimal_test.tsv b/mess/test_data/minimal_test.tsv index d533b26..f958f96 100644 --- a/mess/test_data/minimal_test.tsv +++ b/mess/test_data/minimal_test.tsv @@ -1,3 +1,2 @@ taxon nb cov_sim sample staphylococcus_aureus 1 0.1 sample1 -1290 1 0.1 sample2 diff --git a/mess/util.py b/mess/util.py index 15054ff..f1813da 100644 --- a/mess/util.py +++ b/mess/util.py @@ -425,6 +425,13 @@ def sim_options(func): default=1, show_default=True, ), + click.option( + "--errfree", + help="Generate a zero sequencing errors SAM file", + is_flag=True, + default=True, + show_default=True, + ), ] for option in reversed(options): diff --git a/mess/workflow/Snakefile b/mess/workflow/Snakefile index 2fe43b8..0612fe5 100644 --- a/mess/workflow/Snakefile +++ b/mess/workflow/Snakefile @@ -109,6 +109,7 @@ include: os.path.join("rules", "processing", "fastas.smk") CUSTOM_ERR = config.args.custom_err ERROR = config.args.error BAM = config.args.bam +ERRFREE = config.args.errfree MIN_LEN = config.args.min_len MAX_LEN = config.args.max_len SD_LEN = config.args.sd_len diff --git a/mess/workflow/rules/simulate/short_reads.smk b/mess/workflow/rules/simulate/short_reads.smk index 7d60e2a..68517b7 100644 --- a/mess/workflow/rules/simulate/short_reads.smk +++ b/mess/workflow/rules/simulate/short_reads.smk @@ -16,17 +16,18 @@ if PAIRED: if BAM: - art_args += "-sam -M" + art_args += "-sam -M " + +if ERRFREE: + art_args += "-ef " fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}") if ROTATE > 1: fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}_{n}") -sam_out = temp(fq_prefix + ".txt") -if BAM: - sam_out = temp(temp(fq_prefix + ".sam")) - +sam_out = temp(fq_prefix + ".sam") if BAM else temp(fq_prefix + ".txt") +sam_ef_out = temp(fq_prefix + "_ef.sam") if ERRFREE else temp(fq_prefix + "_ef.txt") fastq_out = [ temp(fq_prefix + "1.fq"), @@ -48,6 +49,7 @@ rule art_illumina: output: sam=sam_out, fastqs=fastq_out, + sam_ef=sam_ef_out, params: args=art_args, read_len=MEAN_LEN, @@ -75,4 +77,5 @@ rule art_illumina: -f {params.cov} -na {params.args} \\ -o {params.prefix} &> {log} touch {output.sam} + touch {output.sam_ef} """ diff --git a/mess/workflow/simulate.smk b/mess/workflow/simulate.smk index 5e0a8cb..4f31bbd 100644 --- a/mess/workflow/simulate.smk +++ b/mess/workflow/simulate.smk @@ -90,6 +90,7 @@ include: os.path.join("rules", "processing", "fastas.smk") CUSTOM_ERR = config.args.custom_err ERROR = config.args.error BAM = config.args.bam +ERRFREE = config.args.errfree MIN_LEN = config.args.min_len MAX_LEN = config.args.max_len SD_LEN = config.args.sd_len diff --git a/minimal_test.tsv b/minimal_test.tsv new file mode 100644 index 0000000..4f38550 --- /dev/null +++ b/minimal_test.tsv @@ -0,0 +1,2 @@ +taxon nb cov_sim sample +1290 1 0.1 sample1 diff --git a/profiles/slurm/slurm-jobscript.sh b/profiles/slurm/slurm-jobscript.sh old mode 100755 new mode 100644 diff --git a/profiles/slurm/slurm-sidecar.py b/profiles/slurm/slurm-sidecar.py old mode 100755 new mode 100644 diff --git a/profiles/slurm/slurm-status.py b/profiles/slurm/slurm-status.py old mode 100755 new mode 100644 diff --git a/profiles/slurm/slurm-submit.py b/profiles/slurm/slurm-submit.py old mode 100755 new mode 100644 From 7caf3451e3bfa54a00ab93f6e42dc43be0693cec Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Tue, 22 Oct 2024 15:45:15 +0100 Subject: [PATCH 02/55] remove minimal_test.tsv --- minimal_test.tsv | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 minimal_test.tsv diff --git a/minimal_test.tsv b/minimal_test.tsv deleted file mode 100644 index 4f38550..0000000 --- a/minimal_test.tsv +++ /dev/null @@ -1,2 +0,0 @@ -taxon nb cov_sim sample -1290 1 0.1 sample1 From 987a8795f4ea8ae4fa968f499340715ca615441b Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Tue, 22 Oct 2024 16:24:47 +0100 Subject: [PATCH 03/55] update default value for --errfree --- mess/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mess/util.py b/mess/util.py index f1813da..638e350 100644 --- a/mess/util.py +++ b/mess/util.py @@ -429,7 +429,7 @@ def sim_options(func): "--errfree", help="Generate a zero sequencing errors SAM file", is_flag=True, - default=True, + default=False, show_default=True, ), ] From b55a0c44cca87a412e3df21365ab088c36cea1ff Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Thu, 24 Oct 2024 10:25:48 +0100 Subject: [PATCH 04/55] add directory to store error-free bam files --- mess/workflow/rules/preflight/directories.smk | 1 + 1 file changed, 1 insertion(+) diff --git a/mess/workflow/rules/preflight/directories.smk b/mess/workflow/rules/preflight/directories.smk index fdcd5cc..ec2f5d1 100644 --- a/mess/workflow/rules/preflight/directories.smk +++ b/mess/workflow/rules/preflight/directories.smk @@ -34,6 +34,7 @@ dir.out.short = os.path.join(dir.out.processing, "short") dir.out.long = os.path.join(dir.out.processing, "long") dir.out.fastq = os.path.join(dir.out.base, "fastq") dir.out.bam = os.path.join(dir.out.base, "bam") +dir.out.ef = os.path.join(dir.out.base, "ef") dir.out.tax = os.path.join(dir.out.base, "tax") From 393a4b9da77a4a020f5a8a17b6216fd94721383c Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Thu, 24 Oct 2024 10:26:51 +0100 Subject: [PATCH 05/55] add targets for errfree bam files --- mess/workflow/rules/preflight/functions.smk | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index edf0a4a..85aca9e 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -38,6 +38,13 @@ def list_reads(wildcards): ) reads = reads + bams + tax + if ERRFREE: + bams_ef = expand( + os.path.join(dir.out.ef, "{sample}.{bam}"), + sample=SAMPLES, + bam=["bam", "bam.bai"], + ) + reads = reads + bams_ef return reads From 796f7bbc6b246f43f44c85a26b01d07dd27fc7fe Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Thu, 24 Oct 2024 10:27:44 +0100 Subject: [PATCH 06/55] add separate rules for processing errfree bam --- mess/workflow/rules/processing/reads.smk | 143 +++++++++++++++++++++++ 1 file changed, 143 insertions(+) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index 60d0c8e..bdcec1f 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -3,9 +3,12 @@ contig = "{contig}" if ROTATE > 1: contig = "{contig}_{n}" sam_in = os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".sam") +sam_in_ef = os.path.join(dir.out.ef, "{sample}", "{fasta}", contig + ".sam") + if SEQ_TECH == "illumina": fastq_dir = dir.out.short sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed") + sam_in_ef = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed") fastq = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq") fastq_gz = temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq.gz")) @@ -537,6 +540,146 @@ if not SKIP_SHUFFLE: paste -d '\t' <(seqkit seq -n {output}) <(seqkit seq -n {input}) > {log[1]} """ +if ERRFREE: + rule fix_art_sam_ef: + """ + rule to replace SAM cigar string with read length + M + Fixes truncated art_illumina SAM files with some genomes + """ + input: + os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_ef.sam"), + output: + temp(os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_ef.fixed")), + resources: + mem_mb=config.resources.sml.mem, + mem=str(config.resources.sml.mem) + "MB", + time=config.resources.sml.time, + params: + maxlen=MEAN_LEN, + shell: + """ + awk 'BEGIN {{OFS="\t"}} {{ if ($1 ~ /^@/) {{ print $0 }} \\ + else {{ $6 = "{params.maxlen}M"; print $0 }} }}' \\ + {input} > {output} + """ + + rule convert_sam_to_bam_ef: + input: + sam_in_ef, + output: + temp(os.path.join(dir.out.ef, "{sample}", "{fasta}", contig + ".bam")), + log: + os.path.join( + dir.out.logs, + "bioconvert", + "sam2bam", + "{sample}", + "{fasta}" + contig + "_ef.log", + ), + resources: + mem_mb=config.resources.sml.mem, + mem=str(config.resources.sml.mem) + "MB", + time=config.resources.sml.time, + threads: config.resources.sml.cpu + conda: + os.path.join(dir.conda, "bioconvert.yml") + container: + containers.bioconvert + shell: + """ + bioconvert sam2bam {input} {output} -t {threads} 2> {log} + """ + + rule merge_contig_bams_ef: + input: + lambda wildcards: aggregate(wildcards, dir.out.ef, "contig", "bam"), + output: + temp(os.path.join(dir.out.ef, "{sample}", "{fasta}.bam")), + benchmark: + os.path.join(dir.out.bench, "samtools", "merge", "{sample}", "{fasta}_ef.txt") + log: + os.path.join(dir.out.logs, "samtools", "merge", "{sample}", "{fasta}_ef.log"), + resources: + mem_mb=config.resources.sml.mem, + mem=str(config.resources.sml.mem) + "MB", + time=config.resources.sml.time, + threads: config.resources.sml.cpu + conda: + os.path.join(dir.conda, "bioconvert.yml") + container: + containers.bioconvert + shell: + """ + samtools merge -@ {threads} -o {output} {input} 2> {log} + """ + + rule merge_sample_bams_ef: + input: + lambda wildcards: aggregate(wildcards, dir.out.ef, "fasta", "bam"), + output: + temp(os.path.join(dir.out.ef, "{sample}.unsorted")), + benchmark: + os.path.join(dir.out.bench, "samtools", "merge", "{sample}_ef.txt") + log: + os.path.join(dir.out.logs, "samtools", "merge", "{sample}_ef.log"), + resources: + mem_mb=config.resources.sml.mem, + mem=str(config.resources.sml.mem) + "MB", + time=config.resources.sml.time, + threads: config.resources.norm.cpu + conda: + os.path.join(dir.conda, "bioconvert.yml") + container: + containers.bioconvert + shell: + """ + samtools merge -@ {threads} -o {output} {input} 2> {log} + """ + + rule sort_bams_ef: + input: + os.path.join(dir.out.ef, "{sample}.unsorted"), + output: + os.path.join(dir.out.ef, "{sample}.bam"), + benchmark: + os.path.join(dir.out.bench, "samtools", "sort", "{sample}_ef.txt"), + log: + os.path.join(dir.out.logs, "samtools", "sort", "{sample}_ef.log"), + resources: + mem_mb=config.resources.sml.mem, + mem=str(config.resources.sml.mem) + "MB", + time=config.resources.sml.time, + threads: config.resources.norm.cpu + conda: + os.path.join(dir.conda, "bioconvert.yml") + container: + containers.bioconvert + shell: + """ + samtools sort -@ {threads} {input} -o {output} 2> {log} + """ + + rule index_bams_ef: + input: + os.path.join(dir.out.ef, "{sample}.bam"), + output: + os.path.join(dir.out.ef, "{sample}.bam.bai"), + benchmark: + os.path.join(dir.out.bench, "samtools", "index", "{sample}_ef.txt") + resources: + mem_mb=config.resources.sml.mem, + mem=str(config.resources.sml.mem) + "MB", + time=config.resources.norm.time, + threads: config.resources.norm.cpu + conda: + os.path.join(dir.conda, "bioconvert.yml") + container: + containers.bioconvert + shell: + """ + samtools index -@ {threads} {input} + """ + rule cleanup_files: input: From b18bebd624984e94c9f5e02a57574712cc14363f Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Thu, 24 Oct 2024 10:28:35 +0100 Subject: [PATCH 07/55] update output file suffix for errfree sam file --- mess/workflow/rules/simulate/short_reads.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mess/workflow/rules/simulate/short_reads.smk b/mess/workflow/rules/simulate/short_reads.smk index 68517b7..21fb633 100644 --- a/mess/workflow/rules/simulate/short_reads.smk +++ b/mess/workflow/rules/simulate/short_reads.smk @@ -27,7 +27,7 @@ if ROTATE > 1: fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}_{n}") sam_out = temp(fq_prefix + ".sam") if BAM else temp(fq_prefix + ".txt") -sam_ef_out = temp(fq_prefix + "_ef.sam") if ERRFREE else temp(fq_prefix + "_ef.txt") +sam_ef_out = temp(fq_prefix + "_errFree.sam") if ERRFREE else temp(fq_prefix + "_errFree.txt") fastq_out = [ temp(fq_prefix + "1.fq"), From acfb62061b1b9256adeceed0fbd26c9f146c9009 Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Thu, 24 Oct 2024 12:11:05 +0100 Subject: [PATCH 08/55] update errfree sam file name --- mess/workflow/rules/processing/reads.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index bdcec1f..d2a7c69 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -547,7 +547,7 @@ if ERRFREE: Fixes truncated art_illumina SAM files with some genomes """ input: - os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_ef.sam"), + os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_errFree.sam"), output: temp(os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_ef.fixed")), resources: From 90a0af48f9b0df3ea98507d589542fa962a917ca Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Fri, 25 Oct 2024 09:27:55 +0100 Subject: [PATCH 09/55] restore original minimal_test --- mess/test_data/minimal_test.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/mess/test_data/minimal_test.tsv b/mess/test_data/minimal_test.tsv index f958f96..437d863 100644 --- a/mess/test_data/minimal_test.tsv +++ b/mess/test_data/minimal_test.tsv @@ -1,2 +1,3 @@ taxon nb cov_sim sample staphylococcus_aureus 1 0.1 sample1 +1290 1 0.1 sample2 \ No newline at end of file From be262d39d918f570c47e2d7a3b13d560acf886b1 Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Fri, 25 Oct 2024 09:28:31 +0100 Subject: [PATCH 10/55] fix targets --- mess/workflow/rules/preflight/functions.smk | 1 + 1 file changed, 1 insertion(+) diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index 85aca9e..994b062 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -45,6 +45,7 @@ def list_reads(wildcards): bam=["bam", "bam.bai"], ) reads = reads + bams_ef + return reads From 19643659a3b634edba0e4875726e4cbe934d28ea Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Fri, 25 Oct 2024 09:29:03 +0100 Subject: [PATCH 11/55] fix path for error free reads --- mess/workflow/rules/processing/reads.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index d2a7c69..b673047 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -8,7 +8,7 @@ sam_in_ef = os.path.join(dir.out.ef, "{sample}", "{fasta}", contig + ".sam") if SEQ_TECH == "illumina": fastq_dir = dir.out.short sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed") - sam_in_ef = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed") + sam_in_ef = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_ef.fixed") fastq = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq") fastq_gz = temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq.gz")) From ae730f98105dca5edac7767ae4ede5f15100a47e Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Fri, 25 Oct 2024 15:17:07 +0100 Subject: [PATCH 12/55] resolve conflicts --- mess/workflow/rules/simulate/short_reads.smk | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mess/workflow/rules/simulate/short_reads.smk b/mess/workflow/rules/simulate/short_reads.smk index bd9604e..9d2e28e 100644 --- a/mess/workflow/rules/simulate/short_reads.smk +++ b/mess/workflow/rules/simulate/short_reads.smk @@ -26,15 +26,8 @@ fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}") if CIRCULAR: fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}_{n}") -<<<<<<< HEAD sam_out = temp(fq_prefix + ".sam") if BAM else temp(fq_prefix + ".txt") sam_ef_out = temp(fq_prefix + "_errFree.sam") if ERRFREE else temp(fq_prefix + "_errFree.txt") -======= -sam_out = temp(fq_prefix + ".txt") -if BAM: - sam_out = temp(fq_prefix + ".sam") - ->>>>>>> upstream/main fastq_out = [ temp(fq_prefix + "1.fq"), From 1253a4fce507c1d40c53f70f49aa6dafbe9b36ae Mon Sep 17 00:00:00 2001 From: farchaab Date: Sat, 26 Oct 2024 23:55:45 +0200 Subject: [PATCH 13/55] added sambamba env --- mess/workflow/envs/conda/sambamba.yml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 mess/workflow/envs/conda/sambamba.yml diff --git a/mess/workflow/envs/conda/sambamba.yml b/mess/workflow/envs/conda/sambamba.yml new file mode 100644 index 0000000..83189eb --- /dev/null +++ b/mess/workflow/envs/conda/sambamba.yml @@ -0,0 +1,7 @@ +name: sambamba +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - sambamba =1.0.1 \ No newline at end of file From 939d5ecb6f126253e6bcd0f0caf10102626c611b Mon Sep 17 00:00:00 2001 From: farchaab Date: Sat, 26 Oct 2024 23:55:54 +0200 Subject: [PATCH 14/55] added sambamba container --- mess/workflow/envs/containers.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mess/workflow/envs/containers.yml b/mess/workflow/envs/containers.yml index fa9f89e..7b03c7e 100644 --- a/mess/workflow/envs/containers.yml +++ b/mess/workflow/envs/containers.yml @@ -7,3 +7,4 @@ pbccs: docker://quay.io/biocontainers/pbccs:6.4.0--h9ee0642_0 pbsim3: docker://quay.io/biocontainers/pbsim3:3.0.4--h4ac6f70_0 seqkit: docker://quay.io/biocontainers/seqkit:2.8.2--h9ee0642_0 taxonkit: docker://quay.io/biocontainers/taxonkit:0.17.0--h9ee0642_1 +sambamba: docker://quay.io/biocontainers/sambamba:1.0.1--h6f6fda4_2 \ No newline at end of file From 24928f530fbc499e804f9b6ac9f7ec4eb72b180e Mon Sep 17 00:00:00 2001 From: farchaab Date: Sun, 27 Oct 2024 00:45:05 +0200 Subject: [PATCH 15/55] added samtools --- mess/workflow/envs/conda/samtools.yml | 7 +++++++ mess/workflow/envs/containers.yml | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 mess/workflow/envs/conda/samtools.yml diff --git a/mess/workflow/envs/conda/samtools.yml b/mess/workflow/envs/conda/samtools.yml new file mode 100644 index 0000000..448f1d0 --- /dev/null +++ b/mess/workflow/envs/conda/samtools.yml @@ -0,0 +1,7 @@ +name: samtools +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - samtools =1.21 \ No newline at end of file diff --git a/mess/workflow/envs/containers.yml b/mess/workflow/envs/containers.yml index 7b03c7e..a0714f3 100644 --- a/mess/workflow/envs/containers.yml +++ b/mess/workflow/envs/containers.yml @@ -7,4 +7,5 @@ pbccs: docker://quay.io/biocontainers/pbccs:6.4.0--h9ee0642_0 pbsim3: docker://quay.io/biocontainers/pbsim3:3.0.4--h4ac6f70_0 seqkit: docker://quay.io/biocontainers/seqkit:2.8.2--h9ee0642_0 taxonkit: docker://quay.io/biocontainers/taxonkit:0.17.0--h9ee0642_1 -sambamba: docker://quay.io/biocontainers/sambamba:1.0.1--h6f6fda4_2 \ No newline at end of file +sambamba: docker://quay.io/biocontainers/sambamba:1.0.1--h6f6fda4_2 +samtools: docker://quay.io/biocontainers/samtools:1.21--h50ea8bc_0 \ No newline at end of file From cae1017c9ae319780f9964029372c0ba677ad95c Mon Sep 17 00:00:00 2001 From: farchaab Date: Sun, 27 Oct 2024 00:45:23 +0200 Subject: [PATCH 16/55] simplified aggregate --- mess/workflow/rules/preflight/functions.smk | 107 +++++++------------- 1 file changed, 37 insertions(+), 70 deletions(-) diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index 74655e1..f336d37 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -144,78 +144,45 @@ def is_circular(): return False -def aggregate(wildcards, outdir, level, ext): +def aggregate(wildcards, outdir, ext): table = checkpoints.split_contigs.get(**wildcards).output[0] - df = pd.read_csv(table, sep="\t", index_col=["samplename", "fasta"]).sort_index() - if level == "contig": - contigs = list( - df.loc[(wildcards.sample, wildcards.fasta)]["contig"].drop_duplicates() - ) - if "rotate" in df.columns: - rotates = int( - df.loc[(wildcards.sample, wildcards.fasta)]["rotate"] - .drop_duplicates() - .values - ) - - if PAIRED and ext != "bam": - if "rotate" in df.columns: - return expand( - os.path.join( - outdir, "{sample}", "{fasta}", "{contig}_{n}{p}.{ext}" - ), - sample=wildcards.sample, - fasta=wildcards.fasta, - n=list(range(1, rotates + 1)), - p=wildcards.p, - contig=contigs, - ext=ext, - ) - else: - return expand( - os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}"), - sample=wildcards.sample, - fasta=wildcards.fasta, - p=wildcards.p, - contig=contigs, - ext=ext, - ) + df = pd.read_csv( + table, + sep="\t", + index_col=["samplename", "fasta"], + ).sort_index() + fastas = list(set(df.loc[wildcards.sample].index)) + contigs = list( + chain(*[list(df.loc[(wildcards.sample, fasta), "contig"]) for fasta in fastas]) + ) - else: - if "rotate" in df.columns: - return expand( - os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}.{ext}"), - sample=wildcards.sample, - fasta=wildcards.fasta, - contig=contigs, - n=list(range(1, rotates + 1)), - ext=ext, - ) - else: - return expand( - os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}"), - sample=wildcards.sample, - fasta=wildcards.fasta, - contig=contigs, - ext=ext, - ) - if level == "fasta": - fastas = list(set(df.loc[wildcards.sample].index)) - if PAIRED and ext != "bam": - return expand( - os.path.join(outdir, "{sample}", "{fasta}{p}.{ext}"), - sample=wildcards.sample, - fasta=fastas, - p=wildcards.p, - ext=ext, - ) - else: - return expand( - os.path.join(outdir, "{sample}", "{fasta}.{ext}"), - sample=wildcards.sample, - fasta=fastas, - ext=ext, - ) + collect_args = { + "sample": wildcards.sample, + "fasta": fastas, + "contig": contigs, + "ext": ext, + } + path = os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}") + if CIRCULAR: + path = os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}.{ext}") + rotates = list( + chain(*[list(df.loc[(wildcards.sample, fasta), "n"]) for fasta in fastas]) + ) + collect_args.update( + { + "n": rotates, + } + ) + if PAIRED and ext != "bam": + path = os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}") + collect_args.update( + { + "p": wildcards.p, + } + ) + if CIRCULAR: + path = os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}{p}.{ext}") + return collect(path, **collect_args) def get_header(fa): From 4db4d0318567019382ca2065187f7cab9fefd502 Mon Sep 17 00:00:00 2001 From: farchaab Date: Sun, 27 Oct 2024 00:46:13 +0200 Subject: [PATCH 17/55] removed redundant cat rules, added sambamba --- mess/workflow/rules/processing/reads.smk | 146 +++++------------------ 1 file changed, 28 insertions(+), 118 deletions(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index d625f53..3746c21 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -5,7 +5,7 @@ if CIRCULAR: sam_in = os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".sam") if SEQ_TECH == "illumina": fastq_dir = dir.out.short - sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed") + sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".sam") fastq = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq") fastq_gz = temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq.gz")) @@ -37,13 +37,13 @@ if PASSES > 1: time=config.resources.norm.time, threads: config.resources.norm.cpu conda: - os.path.join(dir.conda, "bioconvert.yml") + os.path.join(dir.conda, "sambamba.yml") container: - containers.bioconvert + containers.sambamba shell: """ - samtools view -@ {threads} -bS {input} | \ - samtools sort -@ {threads} > {output} 2> {log} + sambamba view -t {threads} -S -f bam {input} | \ + sambamba sort -t {threads} > {output} 2> {log} """ rule ccs_bam_to_fastq: @@ -118,7 +118,6 @@ if BAM: log: os.path.join( dir.out.logs, - "bioconvert", "maf2sam", "{sample}", "{fasta}" + "_" + contig + ".log", @@ -138,29 +137,6 @@ if BAM: """ -rule fix_art_sam: - """ - rule to replace SAM cigar string with read length + M - Fixes truncated art_illumina SAM files with some genomes - """ - input: - os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".sam"), - output: - temp(os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed")), - resources: - mem_mb=config.resources.sml.mem, - mem=str(config.resources.sml.mem) + "MB", - time=config.resources.sml.time, - params: - maxlen=MEAN_LEN, - shell: - """ - awk 'BEGIN {{OFS="\t"}} {{ if ($1 ~ /^@/) {{ print $0 }} \\ - else {{ $6 = "{params.maxlen}M"; print $0 }} }}' \\ - {input} > {output} - """ - - rule convert_sam_to_bam: input: sam_in, @@ -169,7 +145,6 @@ rule convert_sam_to_bam: log: os.path.join( dir.out.logs, - "bioconvert", "sam2bam", "{sample}", "{fasta}" + contig + ".log", @@ -180,84 +155,37 @@ rule convert_sam_to_bam: time=config.resources.sml.time, threads: config.resources.sml.cpu conda: - os.path.join(dir.conda, "bioconvert.yml") - container: - containers.bioconvert - shell: - """ - bioconvert sam2bam {input} {output} -t {threads} 2> {log} - """ - - -rule merge_contig_bams: - input: - lambda wildcards: aggregate(wildcards, dir.out.bam, "contig", "bam"), - output: - temp(os.path.join(dir.out.bam, "{sample}", "{fasta}.bam")), - benchmark: - os.path.join(dir.out.bench, "samtools", "merge", "{sample}", "{fasta}.txt") - log: - os.path.join(dir.out.logs, "samtools", "merge", "{sample}", "{fasta}.log"), - resources: - mem_mb=config.resources.sml.mem, - mem=str(config.resources.sml.mem) + "MB", - time=config.resources.sml.time, - threads: config.resources.sml.cpu - conda: - os.path.join(dir.conda, "bioconvert.yml") + os.path.join(dir.conda, "sambamba.yml") container: - containers.bioconvert + containers.sambamba shell: """ - samtools merge -@ {threads} -o {output} {input} 2> {log} + sambamba view -S -f bam {input} 2> {log} | \\ + sambamba sort -o {output} /dev/stdin 2>> {log} """ -rule merge_sample_bams: +rule merge_bams: input: - lambda wildcards: aggregate(wildcards, dir.out.bam, "fasta", "bam"), + lambda wildcards: aggregate(wildcards, dir.out.bam, "bam"), output: - temp(os.path.join(dir.out.bam, "{sample}.unsorted")), + temp(os.path.join(dir.out.bam, "{sample}.bam")), benchmark: - os.path.join(dir.out.bench, "samtools", "merge", "{sample}.txt") + os.path.join(dir.out.bench, "merge", "{sample}.txt") log: - os.path.join(dir.out.logs, "samtools", "merge", "{sample}.log"), + os.path.join(dir.out.logs, "merge", "{sample}.log"), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.sml.time, threads: config.resources.norm.cpu conda: - os.path.join(dir.conda, "bioconvert.yml") + os.path.join(dir.conda, "sambamba.yml") container: - containers.bioconvert + containers.sambamba shell: """ - samtools merge -@ {threads} -o {output} {input} 2> {log} - """ - - -rule sort_bams: - input: - os.path.join(dir.out.bam, "{sample}.unsorted"), - output: - os.path.join(dir.out.bam, "{sample}.bam"), - benchmark: - os.path.join(dir.out.bench, "samtools", "sort", "{sample}.txt") - log: - os.path.join(dir.out.logs, "samtools", "sort", "{sample}.log"), - resources: - mem_mb=config.resources.sml.mem, - mem=str(config.resources.sml.mem) + "MB", - time=config.resources.sml.time, - threads: config.resources.norm.cpu - conda: - os.path.join(dir.conda, "bioconvert.yml") - container: - containers.bioconvert - shell: - """ - samtools sort -@ {threads} {input} -o {output} 2> {log} + sambamba merge -t {threads} {output} {input} 2> {log} """ @@ -267,19 +195,19 @@ rule get_bam_coverage: output: temp(os.path.join(dir.out.bam, "{sample}.txt")), log: - os.path.join(dir.out.logs, "samtools", "coverage", "{sample}.log"), + os.path.join(dir.out.logs, "coverage", "{sample}.log"), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.sml.time, threads: config.resources.sml.cpu conda: - os.path.join(dir.conda, "bioconvert.yml") + os.path.join(dir.conda, "samtools.yml") container: - containers.bioconvert + containers.samtools shell: """ - samtools coverage {input} > {output} 2> {log} + samtools coverage {input} 1> {output} 2> {log} """ @@ -374,20 +302,20 @@ rule index_bams: os.path.join(dir.out.bam, "{sample}.bam"), output: os.path.join(dir.out.bam, "{sample}.bam.bai"), - benchmark: - os.path.join(dir.out.bench, "samtools", "index", "{sample}.txt") + log: + os.path.join(dir.out.logs, "index", "{sample}.txt"), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.norm.time, threads: config.resources.norm.cpu conda: - os.path.join(dir.conda, "bioconvert.yml") + os.path.join(dir.conda, "sambamba.yml") container: - containers.bioconvert + containers.sambamba shell: """ - samtools index -@ {threads} {input} + sambamba index -t {threads} {input} {output} 2> {log} """ @@ -411,24 +339,6 @@ rule compress_contig_fastqs: """ -rule cat_contig_fastqs: - input: - flag=get_cov_table, - fq=lambda wildcards: aggregate(wildcards, fastq_dir, "contig", "fq.gz"), - output: - temp(os.path.join(fastq_dir, "{sample}", "{fasta}{p}.fq.gz")) - if PAIRED - else temp(os.path.join(fastq_dir, "{sample}", "{fasta}.fq.gz")), - resources: - mem_mb=config.resources.sml.mem, - mem=str(config.resources.sml.mem) + "MB", - time=config.resources.sml.time, - shell: - """ - cat {input.fq} > {output} - """ - - sample_fastq_out = [] if SKIP_SHUFFLE: if PAIRED: @@ -442,9 +352,9 @@ else: sample_fastq_out = temp(os.path.join(dir.out.cat, "{sample}.fq.gz")) -rule cat_sample_fastqs: +rule cat_fastqs: input: - lambda wildcards: aggregate(wildcards, fastq_dir, "fasta", "fq.gz"), + lambda wildcards: aggregate(wildcards, fastq_dir, "fq.gz"), output: sample_fastq_out, resources: From 4872558cde8e0bbd7383b2091aae9bc7c6944441 Mon Sep 17 00:00:00 2001 From: farchaab Date: Sun, 27 Oct 2024 00:46:53 +0200 Subject: [PATCH 18/55] added default flag for 0 lenth indels --- mess/workflow/rules/simulate/short_reads.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mess/workflow/rules/simulate/short_reads.smk b/mess/workflow/rules/simulate/short_reads.smk index 6d83d11..9e6e886 100644 --- a/mess/workflow/rules/simulate/short_reads.smk +++ b/mess/workflow/rules/simulate/short_reads.smk @@ -1,4 +1,4 @@ -art_args = "" +art_args = "-k 0 " if CUSTOM_ERR == None: art_args += f"-ss {ERROR} " if CUSTOM_ERR: @@ -16,7 +16,7 @@ if PAIRED: if BAM: - art_args += "-sam -M" + art_args += "-sam" fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}") From 023de5bf3d7cafca22e6333218d811daaadfd76a Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 28 Oct 2024 10:05:50 +0100 Subject: [PATCH 19/55] improved functions --- mess/workflow/rules/preflight/functions.smk | 69 ++++++++++----------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index f336d37..2031724 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -67,32 +67,43 @@ def parse_samples(indir, replicates): fasta_cache = {} -def fasta_input(wildcards): - table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0] +def get_fasta_table(wildcards): + fa_table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0] + if fa_table not in fasta_cache: + fa_df = pd.read_csv(fa_table, sep="\t", index_col="fasta") + fasta_cache[fa_table] = fa_df + fa_df = fasta_cache[fa_table] + return fa_df + - df = pd.read_csv(table, sep="\t", index_col="fasta") +def fasta_input(wildcards): + df = get_fasta_table(wildcards) try: return df.loc[wildcards.fasta]["path"].drop_duplicates() except AttributeError: return df.loc[wildcards.fasta]["path"] - # some samples use the same genome path, drop duplicates to avoid duplicate paths when processing fasta def list_fastas(wildcards): - table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0] - if table not in fasta_cache: - df = pd.read_csv(table, sep="\t") - fasta_cache[table] = df - df = fasta_cache[table] - fastas = list(set(df["fasta"])) - return expand(os.path.join(dir.out.processing, "{fasta}.fasta"), fasta=fastas) + df = get_fasta_table(wildcards) + return expand( + os.path.join(dir.out.processing, "{fasta}.fasta"), fasta=list(set(df.index)) + ) table_cache = {} -def get_value(value, wildcards): +def get_cov_table(wildcards, key, idx_col): + cov_table = checkpoints.split_contigs.get(**wildcards).output[0] + if cov_table not in table_cache: + cov_df = pd.read_csv(cov_table, sep="\t", index_col=idx_col).sort_index() + table_cache[key] = cov_df + cov_df = table_cache[key] + return cov_df + +def get_value(value, wildcards): vals = ( f"{wildcards.sample}", f"{wildcards.fasta}", @@ -103,17 +114,8 @@ def get_value(value, wildcards): if CIRCULAR: idx_col += ["n"] vals += (int(wildcards.n),) - - table = checkpoints.split_contigs.get(**wildcards).output[0] - if table not in table_cache: - df = pd.read_csv( - table, - sep="\t", - index_col=idx_col, - ).sort_index() - table_cache[table] = df - df = table_cache[table] - return df.loc[vals, value] + val_df = get_cov_table(wildcards, "values", idx_col) + return val_df.loc[vals, value] def get_asm_summary(wildcards): @@ -128,10 +130,6 @@ def get_asm_summary(wildcards): return table -def get_cov_table(wildcards): - return checkpoints.split_contigs.get(**wildcards).output[0] - - def is_circular(): if os.path.isfile(INPUT): files = [INPUT] @@ -145,15 +143,12 @@ def is_circular(): def aggregate(wildcards, outdir, ext): - table = checkpoints.split_contigs.get(**wildcards).output[0] - df = pd.read_csv( - table, - sep="\t", - index_col=["samplename", "fasta"], - ).sort_index() - fastas = list(set(df.loc[wildcards.sample].index)) + agg_df = get_cov_table(wildcards, "aggregate", ["samplename", "fasta"]) + fastas = list(set(agg_df.loc[wildcards.sample].index)) contigs = list( - chain(*[list(df.loc[(wildcards.sample, fasta), "contig"]) for fasta in fastas]) + chain( + *[list(agg_df.loc[(wildcards.sample, fasta), "contig"]) for fasta in fastas] + ) ) collect_args = { @@ -166,7 +161,9 @@ def aggregate(wildcards, outdir, ext): if CIRCULAR: path = os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}.{ext}") rotates = list( - chain(*[list(df.loc[(wildcards.sample, fasta), "n"]) for fasta in fastas]) + chain( + *[list(agg_df.loc[(wildcards.sample, fasta), "n"]) for fasta in fastas] + ) ) collect_args.update( { From c95b1743a29003181f0299ffcace64e010b7345e Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 28 Oct 2024 11:24:39 +0100 Subject: [PATCH 20/55] replaced sambamba and bioconvert with samtools --- mess/workflow/rules/processing/reads.smk | 68 ++++++++++++------------ 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index 3746c21..c5653d1 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -37,13 +37,13 @@ if PASSES > 1: time=config.resources.norm.time, threads: config.resources.norm.cpu conda: - os.path.join(dir.conda, "sambamba.yml") + os.path.join(dir.conda, "samtools.yml") container: - containers.sambamba + containers.samtools shell: """ - sambamba view -t {threads} -S -f bam {input} | \ - sambamba sort -t {threads} > {output} 2> {log} + samtools view -@ {threads} -bS {input} | \\ + samtools sort -@ {threads} > {output} 2> {log} """ rule ccs_bam_to_fastq: @@ -155,13 +155,13 @@ rule convert_sam_to_bam: time=config.resources.sml.time, threads: config.resources.sml.cpu conda: - os.path.join(dir.conda, "sambamba.yml") + os.path.join(dir.conda, "samtools.yml") container: - containers.sambamba + containers.samtools shell: """ - sambamba view -S -f bam {input} 2> {log} | \\ - sambamba sort -o {output} /dev/stdin 2>> {log} + samtools view -@ {threads} -bS {input} | \\ + samtools sort -@ {threads} > {output} 2> {log} """ @@ -169,7 +169,7 @@ rule merge_bams: input: lambda wildcards: aggregate(wildcards, dir.out.bam, "bam"), output: - temp(os.path.join(dir.out.bam, "{sample}.bam")), + os.path.join(dir.out.bam, "{sample}.bam"), benchmark: os.path.join(dir.out.bench, "merge", "{sample}.txt") log: @@ -180,12 +180,32 @@ rule merge_bams: time=config.resources.sml.time, threads: config.resources.norm.cpu conda: - os.path.join(dir.conda, "sambamba.yml") + os.path.join(dir.conda, "samtools.yml") + container: + containers.samtools + shell: + """ + samtools merge -@ {threads} -o {output} {input} 2> {log} + """ + + +rule index_bams: + input: + os.path.join(dir.out.bam, "{sample}.bam"), + output: + os.path.join(dir.out.bam, "{sample}.bam.bai"), + resources: + mem_mb=config.resources.sml.mem, + mem=str(config.resources.sml.mem) + "MB", + time=config.resources.norm.time, + threads: config.resources.norm.cpu + conda: + os.path.join(dir.conda, "samtools.yml") container: - containers.sambamba + containers.samtools shell: """ - sambamba merge -t {threads} {output} {input} 2> {log} + samtools index -@ {threads} {input} """ @@ -214,7 +234,7 @@ rule get_bam_coverage: rule get_tax_profile: input: cov=os.path.join(dir.out.bam, "{sample}.txt"), - tax=get_cov_table, + tax=os.path.join(dir.out.processing, "cov.tsv"), output: counts=os.path.join(dir.out.tax, "{sample}.tsv"), seq_abundance=temp(os.path.join(dir.out.tax, "{sample}_seq.tsv")), @@ -297,28 +317,6 @@ rule tax_profile_to_biobox: """ -rule index_bams: - input: - os.path.join(dir.out.bam, "{sample}.bam"), - output: - os.path.join(dir.out.bam, "{sample}.bam.bai"), - log: - os.path.join(dir.out.logs, "index", "{sample}.txt"), - resources: - mem_mb=config.resources.sml.mem, - mem=str(config.resources.sml.mem) + "MB", - time=config.resources.norm.time, - threads: config.resources.norm.cpu - conda: - os.path.join(dir.conda, "sambamba.yml") - container: - containers.sambamba - shell: - """ - sambamba index -t {threads} {input} {output} 2> {log} - """ - - rule compress_contig_fastqs: input: fastq, From 8e5db9ea929ee92a37ba2de1b2acf2bee9c1ab03 Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 28 Oct 2024 11:39:00 +0100 Subject: [PATCH 21/55] added rustybam and wgatools --- mess/workflow/envs/conda/rustybam.yml | 7 +++++ mess/workflow/envs/conda/wgatools.yml | 7 +++++ mess/workflow/envs/containers.yml | 4 ++- mess/workflow/rules/processing/reads.smk | 38 ++++++++++++++++++++---- 4 files changed, 49 insertions(+), 7 deletions(-) create mode 100644 mess/workflow/envs/conda/rustybam.yml create mode 100644 mess/workflow/envs/conda/wgatools.yml diff --git a/mess/workflow/envs/conda/rustybam.yml b/mess/workflow/envs/conda/rustybam.yml new file mode 100644 index 0000000..2ed9837 --- /dev/null +++ b/mess/workflow/envs/conda/rustybam.yml @@ -0,0 +1,7 @@ +name: rustybam +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - rustybam =0.1.33 \ No newline at end of file diff --git a/mess/workflow/envs/conda/wgatools.yml b/mess/workflow/envs/conda/wgatools.yml new file mode 100644 index 0000000..e7741ce --- /dev/null +++ b/mess/workflow/envs/conda/wgatools.yml @@ -0,0 +1,7 @@ +name: wgatools +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - wgatools =0.1.1 \ No newline at end of file diff --git a/mess/workflow/envs/containers.yml b/mess/workflow/envs/containers.yml index a0714f3..53ee4d0 100644 --- a/mess/workflow/envs/containers.yml +++ b/mess/workflow/envs/containers.yml @@ -8,4 +8,6 @@ pbsim3: docker://quay.io/biocontainers/pbsim3:3.0.4--h4ac6f70_0 seqkit: docker://quay.io/biocontainers/seqkit:2.8.2--h9ee0642_0 taxonkit: docker://quay.io/biocontainers/taxonkit:0.17.0--h9ee0642_1 sambamba: docker://quay.io/biocontainers/sambamba:1.0.1--h6f6fda4_2 -samtools: docker://quay.io/biocontainers/samtools:1.21--h50ea8bc_0 \ No newline at end of file +samtools: docker://quay.io/biocontainers/samtools:1.21--h50ea8bc_0 +wgatools: docker://quay.io/biocontainers/wgatools:0.1.1--h7c767d4_0 +rustybam: docker://quay.io/biocontainers/rustybam:0.1.33--h0d9f037_1 \ No newline at end of file diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index c5653d1..6660928 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -110,15 +110,41 @@ if BAM: sed 's/ref/{params.seqname}/g' {input.maf} > {output} """ - rule convert_maf_to_sam: + rule convert_maf_to_paf: input: os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".maf"), + output: + temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".paf")), + log: + os.path.join( + dir.out.logs, + "maf2paf", + "{sample}", + "{fasta}" + "_" + contig + ".log", + ), + resources: + mem_mb=config.resources.sml.mem, + mem=str(config.resources.sml.mem) + "MB", + time=config.resources.sml.time, + threads: config.resources.sml.cpu + conda: + os.path.join(dir.conda, "wgatools.yml") + container: + containers.wgatools + shell: + """ + wgatools maf2paf {input} > {output} 2> {log} + """ + + rule convert_paf_to_sam: + input: + os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".paf"), output: temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".sam")), log: os.path.join( dir.out.logs, - "maf2sam", + "paf2sam", "{sample}", "{fasta}" + "_" + contig + ".log", ), @@ -128,12 +154,12 @@ if BAM: time=config.resources.sml.time, threads: config.resources.sml.cpu conda: - os.path.join(dir.conda, "bioconvert.yml") + os.path.join(dir.conda, "rustybam.yml") container: - containers.bioconvert + containers.rustybam shell: """ - bioconvert {input} {output} 2> {log} + rustybam paf2sam {input} > {output} 2> {log} """ @@ -227,7 +253,7 @@ rule get_bam_coverage: containers.samtools shell: """ - samtools coverage {input} 1> {output} 2> {log} + samtools coverage {input} > {output} 2> {log} """ From dc73f88d4829e52a44237a82a04cec7feb91fabd Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 28 Oct 2024 11:52:36 +0100 Subject: [PATCH 22/55] added bioconvert back --- mess/workflow/rules/processing/reads.smk | 36 ++++-------------------- 1 file changed, 5 insertions(+), 31 deletions(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index 6660928..d41e343 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -110,41 +110,15 @@ if BAM: sed 's/ref/{params.seqname}/g' {input.maf} > {output} """ - rule convert_maf_to_paf: + rule convert_maf_to_sam: input: os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".maf"), - output: - temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".paf")), - log: - os.path.join( - dir.out.logs, - "maf2paf", - "{sample}", - "{fasta}" + "_" + contig + ".log", - ), - resources: - mem_mb=config.resources.sml.mem, - mem=str(config.resources.sml.mem) + "MB", - time=config.resources.sml.time, - threads: config.resources.sml.cpu - conda: - os.path.join(dir.conda, "wgatools.yml") - container: - containers.wgatools - shell: - """ - wgatools maf2paf {input} > {output} 2> {log} - """ - - rule convert_paf_to_sam: - input: - os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".paf"), output: temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".sam")), log: os.path.join( dir.out.logs, - "paf2sam", + "maf2paf", "{sample}", "{fasta}" + "_" + contig + ".log", ), @@ -154,12 +128,12 @@ if BAM: time=config.resources.sml.time, threads: config.resources.sml.cpu conda: - os.path.join(dir.conda, "rustybam.yml") + os.path.join(dir.conda, "bioconvert.yml") container: - containers.rustybam + containers.bioconvert shell: """ - rustybam paf2sam {input} > {output} 2> {log} + bioconvert {input} {output} 2> {log} """ From 04d56ccc634bedb3dd679ae26dae4e125bcc2289 Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 28 Oct 2024 11:53:22 +0100 Subject: [PATCH 23/55] env cleanup --- mess/workflow/envs/conda/rustybam.yml | 7 ------- mess/workflow/envs/conda/sambamba.yml | 7 ------- mess/workflow/envs/conda/wgatools.yml | 7 ------- 3 files changed, 21 deletions(-) delete mode 100644 mess/workflow/envs/conda/rustybam.yml delete mode 100644 mess/workflow/envs/conda/sambamba.yml delete mode 100644 mess/workflow/envs/conda/wgatools.yml diff --git a/mess/workflow/envs/conda/rustybam.yml b/mess/workflow/envs/conda/rustybam.yml deleted file mode 100644 index 2ed9837..0000000 --- a/mess/workflow/envs/conda/rustybam.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: rustybam -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - rustybam =0.1.33 \ No newline at end of file diff --git a/mess/workflow/envs/conda/sambamba.yml b/mess/workflow/envs/conda/sambamba.yml deleted file mode 100644 index 83189eb..0000000 --- a/mess/workflow/envs/conda/sambamba.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: sambamba -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - sambamba =1.0.1 \ No newline at end of file diff --git a/mess/workflow/envs/conda/wgatools.yml b/mess/workflow/envs/conda/wgatools.yml deleted file mode 100644 index e7741ce..0000000 --- a/mess/workflow/envs/conda/wgatools.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: wgatools -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - wgatools =0.1.1 \ No newline at end of file From f4c4f96abbcf0cd1f26f0d8ce21466dba5e2ef0d Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 28 Oct 2024 11:53:29 +0100 Subject: [PATCH 24/55] containers cleanup --- mess/workflow/envs/containers.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/mess/workflow/envs/containers.yml b/mess/workflow/envs/containers.yml index 53ee4d0..eb97a5f 100644 --- a/mess/workflow/envs/containers.yml +++ b/mess/workflow/envs/containers.yml @@ -7,7 +7,4 @@ pbccs: docker://quay.io/biocontainers/pbccs:6.4.0--h9ee0642_0 pbsim3: docker://quay.io/biocontainers/pbsim3:3.0.4--h4ac6f70_0 seqkit: docker://quay.io/biocontainers/seqkit:2.8.2--h9ee0642_0 taxonkit: docker://quay.io/biocontainers/taxonkit:0.17.0--h9ee0642_1 -sambamba: docker://quay.io/biocontainers/sambamba:1.0.1--h6f6fda4_2 samtools: docker://quay.io/biocontainers/samtools:1.21--h50ea8bc_0 -wgatools: docker://quay.io/biocontainers/wgatools:0.1.1--h7c767d4_0 -rustybam: docker://quay.io/biocontainers/rustybam:0.1.33--h0d9f037_1 \ No newline at end of file From 09efdefa54421a4f66dcb6a112f186ae1501b8a8 Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 28 Oct 2024 12:39:10 +0100 Subject: [PATCH 25/55] added zip to avoid all fasta and contigs combinations --- mess/workflow/rules/preflight/functions.smk | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index 2031724..8d32b34 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -150,7 +150,6 @@ def aggregate(wildcards, outdir, ext): *[list(agg_df.loc[(wildcards.sample, fasta), "contig"]) for fasta in fastas] ) ) - collect_args = { "sample": wildcards.sample, "fasta": fastas, @@ -179,7 +178,7 @@ def aggregate(wildcards, outdir, ext): ) if CIRCULAR: path = os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}{p}.{ext}") - return collect(path, **collect_args) + return collect(path, zip, **collect_args) def get_header(fa): From c8a4ad9f58a73ca8eea38ef8e80666309755cad3 Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 28 Oct 2024 16:36:49 +0100 Subject: [PATCH 26/55] fixed aggregate function --- mess/workflow/rules/preflight/functions.smk | 50 +++++++-------------- 1 file changed, 15 insertions(+), 35 deletions(-) diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index 8d32b34..5303e5d 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -143,42 +143,22 @@ def is_circular(): def aggregate(wildcards, outdir, ext): - agg_df = get_cov_table(wildcards, "aggregate", ["samplename", "fasta"]) - fastas = list(set(agg_df.loc[wildcards.sample].index)) - contigs = list( - chain( - *[list(agg_df.loc[(wildcards.sample, fasta), "contig"]) for fasta in fastas] - ) - ) - collect_args = { - "sample": wildcards.sample, - "fasta": fastas, - "contig": contigs, - "ext": ext, - } - path = os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}") - if CIRCULAR: - path = os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}.{ext}") - rotates = list( - chain( - *[list(agg_df.loc[(wildcards.sample, fasta), "n"]) for fasta in fastas] - ) - ) - collect_args.update( - { - "n": rotates, - } - ) - if PAIRED and ext != "bam": - path = os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}") - collect_args.update( - { - "p": wildcards.p, - } - ) + df = get_cov_table(wildcards, "aggregate", ["samplename"]) + files = [ + os.path.join(outdir, wildcards.sample, row.fasta, f"{row.contig}.{ext}") + for row in df.loc[wildcards.sample].itertuples() + ] + files = [] + for row in df.loc[wildcards.sample].itertuples(): + prefix = f"{row.contig}" if CIRCULAR: - path = os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}{p}.{ext}") - return collect(path, zip, **collect_args) + prefix += f"_{row.n}" + if PAIRED and ext != "bam": + prefix += f"{wildcards.p}" + files.append( + os.path.join(outdir, wildcards.sample, row.fasta, f"{prefix}.{ext}") + ) + return files def get_header(fa): From 4fc53f493c39e0e2144ac431e8fcbdf33a8096a3 Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 28 Oct 2024 17:02:43 +0100 Subject: [PATCH 27/55] removed benchmarks --- .../rules/download/assembly_finder.smk | 2 -- mess/workflow/rules/processing/reads.smk | 33 ++----------------- 2 files changed, 2 insertions(+), 33 deletions(-) diff --git a/mess/workflow/rules/download/assembly_finder.smk b/mess/workflow/rules/download/assembly_finder.smk index 9b06676..0d7ac6f 100644 --- a/mess/workflow/rules/download/assembly_finder.smk +++ b/mess/workflow/rules/download/assembly_finder.smk @@ -65,8 +65,6 @@ checkpoint download_assemblies: args=af_args, taxonkit=TAXONKIT, out=os.path.join(dir.out.base, "assembly_finder"), - benchmark: - os.path.join(dir.out.bench, "assembly_finder.txt") log: os.path.join(dir.out.logs, "assembly_finder.log"), resources: diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index d41e343..13760a9 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -34,7 +34,7 @@ if PASSES > 1: resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", - time=config.resources.norm.time, + time=config.resources.sml.time, threads: config.resources.norm.cpu conda: os.path.join(dir.conda, "samtools.yml") @@ -142,18 +142,11 @@ rule convert_sam_to_bam: sam_in, output: temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".bam")), - log: - os.path.join( - dir.out.logs, - "sam2bam", - "{sample}", - "{fasta}" + contig + ".log", - ), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.sml.time, - threads: config.resources.sml.cpu + threads: config.resources.norm.cpu conda: os.path.join(dir.conda, "samtools.yml") container: @@ -170,10 +163,6 @@ rule merge_bams: lambda wildcards: aggregate(wildcards, dir.out.bam, "bam"), output: os.path.join(dir.out.bam, "{sample}.bam"), - benchmark: - os.path.join(dir.out.bench, "merge", "{sample}.txt") - log: - os.path.join(dir.out.logs, "merge", "{sample}.log"), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", @@ -214,8 +203,6 @@ rule get_bam_coverage: os.path.join(dir.out.bam, "{sample}.bam"), output: temp(os.path.join(dir.out.bam, "{sample}.txt")), - log: - os.path.join(dir.out.logs, "coverage", "{sample}.log"), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", @@ -291,10 +278,6 @@ rule tax_profile_to_biobox: dmp=os.path.join(TAXONKIT, "names.dmp"), output: os.path.join(dir.out.tax, "{sample}_{abundance}.txt"), - log: - os.path.join( - dir.out.logs, "taxonkit", "profile2cami", "{sample}_{abundance}.log" - ), params: dir=TAXONKIT, ranks=RANKS, @@ -378,12 +361,6 @@ if not SKIP_SHUFFLE: else temp(os.path.join(dir.out.shuffle, "{sample}.fq.gz")), params: lambda wildcards: SHUFFLE[wildcards.sample], - benchmark: - ( - os.path.join(dir.out.bench, "seqkit", "shuffle", "{sample}_R{p}.txt") - if PAIRED - else os.path.join(dir.out.bench, "seqkit", "shuffle", "{sample}.txt") - ) log: os.path.join(dir.out.logs, "seqkit", "shuffle", "{sample}_R{p}.log") if PAIRED @@ -415,12 +392,6 @@ if not SKIP_SHUFFLE: os.path.join(dir.out.fastq, "{sample}_R{p}.fq.gz") if PAIRED else os.path.join(dir.out.fastq, "{sample}.fq.gz"), - benchmark: - ( - os.path.join(dir.out.bench, "seqkit", "anonymize", "{sample}_R{p}.txt") - if PAIRED - else os.path.join(dir.out.bench, "seqkit", "anonymize", "{sample}.txt") - ) log: os.path.join(dir.out.logs, "seqkit", "replace", "{sample}_R{p}.log") if PAIRED From 8c61950b274975002c224ae136ce804a2b87fe93 Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 28 Oct 2024 17:36:50 +0100 Subject: [PATCH 28/55] updated assembly_finder --- mess/workflow/envs/conda/assembly_finder.yml | 4 ++-- mess/workflow/envs/containers.yml | 2 +- mess/workflow/rules/download/assembly_finder.smk | 4 +--- mess/workflow/rules/preflight/targets_download.smk | 2 -- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/mess/workflow/envs/conda/assembly_finder.yml b/mess/workflow/envs/conda/assembly_finder.yml index 0055859..3328d70 100644 --- a/mess/workflow/envs/conda/assembly_finder.yml +++ b/mess/workflow/envs/conda/assembly_finder.yml @@ -4,8 +4,8 @@ channels: - bioconda - defaults dependencies: - - assembly_finder =0.7.6 - - ncbi-datasets-cli =16.26.2 + - assembly_finder =0.8.0 + - ncbi-datasets-cli =16.31.0 - taxonkit =0.17.0 - csvtk =0.30.0 - rsync =3.3.0 diff --git a/mess/workflow/envs/containers.yml b/mess/workflow/envs/containers.yml index eb97a5f..56338a2 100644 --- a/mess/workflow/envs/containers.yml +++ b/mess/workflow/envs/containers.yml @@ -1,5 +1,5 @@ art: docker://quay.io/biocontainers/art:2016.06.05--heacdb12_11 -assembly_finder: docker://ghcr.io/metagenlab/assembly_finder:v0.7.7 +assembly_finder: docker://ghcr.io/metagenlab/assembly_finder:v0.8.0 bioconvert: docker://quay.io/biocontainers/bioconvert:1.1.1--pyhdfd78af_0 curl: docker://quay.io/biocontainers/curl:7.80.0 pigz: docker://quay.io/biocontainers/pigz:2.8 diff --git a/mess/workflow/rules/download/assembly_finder.smk b/mess/workflow/rules/download/assembly_finder.smk index 0d7ac6f..a8cc64b 100644 --- a/mess/workflow/rules/download/assembly_finder.smk +++ b/mess/workflow/rules/download/assembly_finder.smk @@ -20,7 +20,7 @@ rule get_unique_entries: ) -af_args = "" +af_args = "--no-use-conda " if TAXON: af_args += "--taxon " if LIMIT: @@ -59,7 +59,6 @@ checkpoint download_assemblies: tsv=os.path.join(dir.out.base, "uniq_entries.tsv"), output: asm=os.path.join(dir.out.base, "assembly_finder/assembly_summary.tsv"), - seq=os.path.join(dir.out.base, "assembly_finder/sequence_report.tsv"), tax=os.path.join(dir.out.base, "assembly_finder/taxonomy.tsv"), params: args=af_args, @@ -82,6 +81,5 @@ checkpoint download_assemblies: --taxonkit {params.taxonkit} \\ --threads {threads} \\ {params.args} \\ - --no-use-conda \\ -o {params.out} 2> {log} """ diff --git a/mess/workflow/rules/preflight/targets_download.smk b/mess/workflow/rules/preflight/targets_download.smk index 88e6b20..610bff8 100644 --- a/mess/workflow/rules/preflight/targets_download.smk +++ b/mess/workflow/rules/preflight/targets_download.smk @@ -2,10 +2,8 @@ All target download files are declared here """ - TargetDownloads = [ os.path.join(dir.out.base, "uniq_entries.tsv"), os.path.join(dir.out.base, "assembly_finder/assembly_summary.tsv"), - os.path.join(dir.out.base, "assembly_finder/sequence_report.tsv"), os.path.join(dir.out.base, "assembly_finder/taxonomy.tsv"), ] From f79213012152e6bc03f767d7f62bd6cda508e8fa Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 08:46:45 +0100 Subject: [PATCH 29/55] simplified list_reads --- mess/workflow/rules/preflight/functions.smk | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index 5303e5d..1d7add3 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -13,25 +13,21 @@ wildcard_constraints: def list_reads(wildcards): + fastqs = "{sample}.fq.gz" + args = {"sample": SAMPLES} + if PAIRED: - reads = expand( - os.path.join(dir.out.fastq, "{sample}_R{p}.fq.gz"), - sample=SAMPLES, - p=PAIRS, - ) - else: - reads = expand( - os.path.join(dir.out.fastq, "{sample}.fq.gz"), - sample=SAMPLES, - ) + fastqs = "{sample}_R{p}.fq.gz" + args.update({"p": PAIRS}) + reads = collect(os.path.join(dir.out.fastq, fastqs), **args) if BAM: - bams = expand( + bams = collect( os.path.join(dir.out.bam, "{sample}.{bam}"), sample=SAMPLES, bam=["bam", "bam.bai"], ) - tax = expand( + tax = collect( os.path.join(dir.out.tax, "{sample}_{abundance}.txt"), sample=SAMPLES, abundance=["seq", "tax"], From 6eda0496b24064e5bb6bc96c8fbf3cce37c83b85 Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 08:46:51 +0100 Subject: [PATCH 30/55] fixed logs --- mess/workflow/rules/processing/reads.smk | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index 13760a9..f3f63f0 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -29,8 +29,6 @@ if PASSES > 1: contig + ".ccs.bam", ) ), - log: - os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", contig + ".log"), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", @@ -42,8 +40,8 @@ if PASSES > 1: containers.samtools shell: """ - samtools view -@ {threads} -bS {input} | \\ - samtools sort -@ {threads} > {output} 2> {log} + samtools view -@ {threads} -Sb {input} | \\ + samtools sort -@ {threads} -o {output} 2> {log} """ rule ccs_bam_to_fastq: @@ -153,8 +151,8 @@ rule convert_sam_to_bam: containers.samtools shell: """ - samtools view -@ {threads} -bS {input} | \\ - samtools sort -@ {threads} > {output} 2> {log} + samtools view -@ {threads} -Sb {input} | \\ + samtools sort -@ {threads} -o {output} """ @@ -163,6 +161,13 @@ rule merge_bams: lambda wildcards: aggregate(wildcards, dir.out.bam, "bam"), output: os.path.join(dir.out.bam, "{sample}.bam"), + log: + os.path.join( + dir.out.logs, + "samtools", + "merge", + "{sample}.log", + ), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", @@ -214,7 +219,7 @@ rule get_bam_coverage: containers.samtools shell: """ - samtools coverage {input} > {output} 2> {log} + samtools coverage {input} > {output} """ From 65d0e9ae98ca00accf847f72e1b978ccfd1270f1 Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 09:21:30 +0100 Subject: [PATCH 31/55] added xargs for fastq concat --- mess/workflow/rules/processing/reads.smk | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index f3f63f0..e00e36c 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -343,13 +343,16 @@ rule cat_fastqs: lambda wildcards: aggregate(wildcards, fastq_dir, "fq.gz"), output: sample_fastq_out, + params: + dir=os.path.join(fastq_dir, "{sample}"), + name="*{p}.fq.gz" if PAIRED else "*.fq.gz", resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.norm.time, shell: """ - cat {input} > {output} + find {params.dir} -name "{params.name}" | xargs cat > {output} """ From a5192009e25adfcffbeea113d96f715c64d6b7ec Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 15:30:21 +0100 Subject: [PATCH 32/55] fixed shuffle seed --- mess/workflow/Snakefile | 1 + mess/workflow/simulate.smk | 1 + 2 files changed, 2 insertions(+) diff --git a/mess/workflow/Snakefile b/mess/workflow/Snakefile index 8e89441..d38afb6 100644 --- a/mess/workflow/Snakefile +++ b/mess/workflow/Snakefile @@ -131,6 +131,7 @@ else: # reads post-processsing options +random.seed(SEED) SHUFFLE = dict(zip(SAMPLES, random.sample(range(1, 100000), len(SAMPLES)))) SKIP_SHUFFLE = config.args.skip_shuffle RANKS = config.args.ranks diff --git a/mess/workflow/simulate.smk b/mess/workflow/simulate.smk index 4f2b23f..580e945 100644 --- a/mess/workflow/simulate.smk +++ b/mess/workflow/simulate.smk @@ -110,6 +110,7 @@ else: # reads post-processsing options +random.seed(SEED) SHUFFLE = dict(zip(SAMPLES, random.sample(range(1, 100000), len(SAMPLES)))) SKIP_SHUFFLE = config.args.skip_shuffle RANKS = config.args.ranks From b188a7b5619632e77774dcef732d3bcb93ff32b2 Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 15:30:45 +0100 Subject: [PATCH 33/55] added fasta in wildcard_constraints --- mess/workflow/rules/preflight/functions.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index 1d7add3..cd55730 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -10,6 +10,7 @@ import random wildcard_constraints: sample="[^/]+", contig="[^/]+", + fasta="[^/]+", def list_reads(wildcards): @@ -120,7 +121,7 @@ def get_asm_summary(wildcards): except AttributeError: if FASTA and not ASM_SUMMARY: - table = os.path.join(dir.out.processing, "seqkit_stats.tsv") + table = "seqkit_stats.tsv" else: table = ASM_SUMMARY return table From 3d3ee4643fce302182492504c1facd2664009cbe Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 15:36:24 +0100 Subject: [PATCH 34/55] print only 3 first fastqs in cat_fastqs --- mess/workflow/rules/processing/reads.smk | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index e00e36c..cc996ed 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -346,10 +346,13 @@ rule cat_fastqs: params: dir=os.path.join(fastq_dir, "{sample}"), name="*{p}.fq.gz" if PAIRED else "*.fq.gz", + head=lambda wildcards, input: list(input)[:3], resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.norm.time, + message: + "Concatenating {wildcards.sample} reads : {params.head} ... " shell: """ find {params.dir} -name "{params.name}" | xargs cat > {output} From e14c7e486b8fd937afa35cb2ebdc2c353edcf8a7 Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 15:36:43 +0100 Subject: [PATCH 35/55] improved formatting --- mess/workflow/rules/preflight/targets_simulate.smk | 1 - mess/workflow/rules/simulate/long_reads.smk | 10 ++++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/mess/workflow/rules/preflight/targets_simulate.smk b/mess/workflow/rules/preflight/targets_simulate.smk index 79d223b..e96efa9 100644 --- a/mess/workflow/rules/preflight/targets_simulate.smk +++ b/mess/workflow/rules/preflight/targets_simulate.smk @@ -2,5 +2,4 @@ All simulated reads are declared here """ - TargetSimreads = [list_reads, os.path.join(dir.out.base, "cleanup.done")] diff --git a/mess/workflow/rules/simulate/long_reads.smk b/mess/workflow/rules/simulate/long_reads.smk index 82be4f4..c2336d4 100644 --- a/mess/workflow/rules/simulate/long_reads.smk +++ b/mess/workflow/rules/simulate/long_reads.smk @@ -7,6 +7,7 @@ if CIRCULAR: ) id_prefix = os.path.basename(prefix) + if PASSES > 1: pbsim3_out = temp(prefix + ".sam") rename = f"mv {prefix}_0001.sam {prefix}.sam" @@ -17,7 +18,7 @@ else: rule pbsim3: input: - fa=fasta, + fasta, output: pbsim3_out, temp(prefix + ".maf"), @@ -35,7 +36,7 @@ rule pbsim3: seed=lambda wildcards: int(get_value("seed", wildcards)), prefix=prefix, id_prefix=id_prefix, - reads_rename=rename, + rename=rename, log: os.path.join(dir.out.logs, "pbsim3", "{sample}", "{fasta}", "{contig}.log") if not CIRCULAR @@ -64,9 +65,10 @@ rule pbsim3: --qshmm {params.model} \\ --pass-num {params.passes} \\ --accuracy-mean {params.accuracy} \\ - --depth {params.cov} --genome {input.fa} &> {log} + --depth {params.cov} \\ + --genome {input} &> {log} mv {params.prefix}_0001.maf {params.prefix}.maf mv {params.prefix}_0001.ref {params.prefix}.ref - {params.reads_rename} + {params.rename} """ From 926e0025a3e0582d8c98eb77ddc1d970beb47ce5 Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 15:59:02 +0100 Subject: [PATCH 36/55] fixed aggregate --- mess/workflow/rules/preflight/functions.smk | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index cd55730..17cfc4d 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -121,7 +121,7 @@ def get_asm_summary(wildcards): except AttributeError: if FASTA and not ASM_SUMMARY: - table = "seqkit_stats.tsv" + table = os.path.join(dir.out.processing, "seqkit_stats.tsv") else: table = ASM_SUMMARY return table @@ -141,12 +141,8 @@ def is_circular(): def aggregate(wildcards, outdir, ext): df = get_cov_table(wildcards, "aggregate", ["samplename"]) - files = [ - os.path.join(outdir, wildcards.sample, row.fasta, f"{row.contig}.{ext}") - for row in df.loc[wildcards.sample].itertuples() - ] files = [] - for row in df.loc[wildcards.sample].itertuples(): + for row in df.loc[[wildcards.sample]].itertuples(): prefix = f"{row.contig}" if CIRCULAR: prefix += f"_{row.n}" From fbf1b900be8b85837dbdde750e6acab9a15a4c49 Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 16:04:40 +0100 Subject: [PATCH 37/55] removed ccs_sam_to_bam log --- mess/workflow/rules/processing/reads.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index cc996ed..bf84bee 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -41,7 +41,7 @@ if PASSES > 1: shell: """ samtools view -@ {threads} -Sb {input} | \\ - samtools sort -@ {threads} -o {output} 2> {log} + samtools sort -@ {threads} -o {output} """ rule ccs_bam_to_fastq: From 804d1657b8e92d2f6783bfc63a96cd9266b03c0d Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 15:30:21 +0100 Subject: [PATCH 38/55] fixed shuffle seed --- mess/workflow/Snakefile | 1 + mess/workflow/simulate.smk | 1 + 2 files changed, 2 insertions(+) diff --git a/mess/workflow/Snakefile b/mess/workflow/Snakefile index 4261ec0..6b84992 100644 --- a/mess/workflow/Snakefile +++ b/mess/workflow/Snakefile @@ -132,6 +132,7 @@ else: # reads post-processsing options +random.seed(SEED) SHUFFLE = dict(zip(SAMPLES, random.sample(range(1, 100000), len(SAMPLES)))) SKIP_SHUFFLE = config.args.skip_shuffle RANKS = config.args.ranks diff --git a/mess/workflow/simulate.smk b/mess/workflow/simulate.smk index f8f0ba7..5a5c2d0 100644 --- a/mess/workflow/simulate.smk +++ b/mess/workflow/simulate.smk @@ -111,6 +111,7 @@ else: # reads post-processsing options +random.seed(SEED) SHUFFLE = dict(zip(SAMPLES, random.sample(range(1, 100000), len(SAMPLES)))) SKIP_SHUFFLE = config.args.skip_shuffle RANKS = config.args.ranks From 7595b8975b2fb762723a2b9114db15ae83bb8725 Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 15:30:45 +0100 Subject: [PATCH 39/55] added fasta in wildcard_constraints --- mess/workflow/rules/preflight/functions.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index 3f7bb01..4797568 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -10,6 +10,7 @@ import random wildcard_constraints: sample="[^/]+", contig="[^/]+", + fasta="[^/]+", def list_reads(wildcards): @@ -128,7 +129,7 @@ def get_asm_summary(wildcards): except AttributeError: if FASTA and not ASM_SUMMARY: - table = os.path.join(dir.out.processing, "seqkit_stats.tsv") + table = "seqkit_stats.tsv" else: table = ASM_SUMMARY return table From 1e42020bca0ab480b55f162d5758273a55be5144 Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 15:36:24 +0100 Subject: [PATCH 40/55] print only 3 first fastqs in cat_fastqs --- mess/workflow/rules/processing/reads.smk | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index cad7ba6..89f55d5 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -349,10 +349,13 @@ rule cat_fastqs: params: dir=os.path.join(fastq_dir, "{sample}"), name="*{p}.fq.gz" if PAIRED else "*.fq.gz", + head=lambda wildcards, input: list(input)[:3], resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.norm.time, + message: + "Concatenating {wildcards.sample} reads : {params.head} ... " shell: """ find {params.dir} -name "{params.name}" | xargs cat > {output} From 06acb19b2346a30150c0cfc8f975fa1bf7b79f64 Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 15:36:43 +0100 Subject: [PATCH 41/55] improved formatting --- mess/workflow/rules/preflight/targets_simulate.smk | 1 - mess/workflow/rules/simulate/long_reads.smk | 10 ++++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/mess/workflow/rules/preflight/targets_simulate.smk b/mess/workflow/rules/preflight/targets_simulate.smk index 79d223b..e96efa9 100644 --- a/mess/workflow/rules/preflight/targets_simulate.smk +++ b/mess/workflow/rules/preflight/targets_simulate.smk @@ -2,5 +2,4 @@ All simulated reads are declared here """ - TargetSimreads = [list_reads, os.path.join(dir.out.base, "cleanup.done")] diff --git a/mess/workflow/rules/simulate/long_reads.smk b/mess/workflow/rules/simulate/long_reads.smk index 82be4f4..c2336d4 100644 --- a/mess/workflow/rules/simulate/long_reads.smk +++ b/mess/workflow/rules/simulate/long_reads.smk @@ -7,6 +7,7 @@ if CIRCULAR: ) id_prefix = os.path.basename(prefix) + if PASSES > 1: pbsim3_out = temp(prefix + ".sam") rename = f"mv {prefix}_0001.sam {prefix}.sam" @@ -17,7 +18,7 @@ else: rule pbsim3: input: - fa=fasta, + fasta, output: pbsim3_out, temp(prefix + ".maf"), @@ -35,7 +36,7 @@ rule pbsim3: seed=lambda wildcards: int(get_value("seed", wildcards)), prefix=prefix, id_prefix=id_prefix, - reads_rename=rename, + rename=rename, log: os.path.join(dir.out.logs, "pbsim3", "{sample}", "{fasta}", "{contig}.log") if not CIRCULAR @@ -64,9 +65,10 @@ rule pbsim3: --qshmm {params.model} \\ --pass-num {params.passes} \\ --accuracy-mean {params.accuracy} \\ - --depth {params.cov} --genome {input.fa} &> {log} + --depth {params.cov} \\ + --genome {input} &> {log} mv {params.prefix}_0001.maf {params.prefix}.maf mv {params.prefix}_0001.ref {params.prefix}.ref - {params.reads_rename} + {params.rename} """ From dc3dd0fb894fc677c23f32cbfc2b6aa8cab656a1 Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 15:59:02 +0100 Subject: [PATCH 42/55] fixed aggregate --- mess/workflow/rules/preflight/functions.smk | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index 4797568..3e5cc2f 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -129,7 +129,7 @@ def get_asm_summary(wildcards): except AttributeError: if FASTA and not ASM_SUMMARY: - table = "seqkit_stats.tsv" + table = os.path.join(dir.out.processing, "seqkit_stats.tsv") else: table = ASM_SUMMARY return table @@ -149,12 +149,8 @@ def is_circular(): def aggregate(wildcards, outdir, ext): df = get_cov_table(wildcards, "aggregate", ["samplename"]) - files = [ - os.path.join(outdir, wildcards.sample, row.fasta, f"{row.contig}.{ext}") - for row in df.loc[wildcards.sample].itertuples() - ] files = [] - for row in df.loc[wildcards.sample].itertuples(): + for row in df.loc[[wildcards.sample]].itertuples(): prefix = f"{row.contig}" if CIRCULAR: prefix += f"_{row.n}" From 34a412a6247758457dd579126e2b9574bb4d52c4 Mon Sep 17 00:00:00 2001 From: farchaab Date: Tue, 29 Oct 2024 16:04:40 +0100 Subject: [PATCH 43/55] removed ccs_sam_to_bam log --- mess/workflow/rules/processing/reads.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index 89f55d5..45be37f 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -44,7 +44,7 @@ if PASSES > 1: shell: """ samtools view -@ {threads} -Sb {input} | \\ - samtools sort -@ {threads} -o {output} 2> {log} + samtools sort -@ {threads} -o {output} """ rule ccs_bam_to_fastq: From ac5c470777cbee5d2cc09542499f83a1471bcbc1 Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Tue, 29 Oct 2024 16:56:35 +0000 Subject: [PATCH 44/55] update file extension --- mess/workflow/rules/processing/reads.smk | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index 45be37f..124876b 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -7,8 +7,7 @@ sam_in_ef = os.path.join(dir.out.ef, "{sample}", "{fasta}", contig + ".sam") if SEQ_TECH == "illumina": fastq_dir = dir.out.short - sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed") - sam_in_ef = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_ef.fixed") + sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".sam") fastq = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq") fastq_gz = temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq.gz")) From cdd8b0f35b81f2fd53c3ddc50aa7a2d25a9f449a Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Tue, 29 Oct 2024 17:00:40 +0000 Subject: [PATCH 45/55] remove fix_art_sam_ef and update usage of aggregate() --- mess/workflow/rules/processing/reads.smk | 30 ++++-------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index 124876b..0074352 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -429,32 +429,10 @@ if not SKIP_SHUFFLE: paste -d '\t' <(seqkit seq -n {output}) <(seqkit seq -n {input}) > {log[1]} """ -if ERRFREE: - rule fix_art_sam_ef: - """ - rule to replace SAM cigar string with read length + M - Fixes truncated art_illumina SAM files with some genomes - """ - input: - os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_errFree.sam"), - output: - temp(os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_ef.fixed")), - resources: - mem_mb=config.resources.sml.mem, - mem=str(config.resources.sml.mem) + "MB", - time=config.resources.sml.time, - params: - maxlen=MEAN_LEN, - shell: - """ - awk 'BEGIN {{OFS="\t"}} {{ if ($1 ~ /^@/) {{ print $0 }} \\ - else {{ $6 = "{params.maxlen}M"; print $0 }} }}' \\ - {input} > {output} - """ - +if ERRFREE: rule convert_sam_to_bam_ef: input: - sam_in_ef, + os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_errFree.sam"), output: temp(os.path.join(dir.out.ef, "{sample}", "{fasta}", contig + ".bam")), log: @@ -481,7 +459,7 @@ if ERRFREE: rule merge_contig_bams_ef: input: - lambda wildcards: aggregate(wildcards, dir.out.ef, "contig", "bam"), + lambda wildcards: aggregate(wildcards, dir.out.ef, "bam"), output: temp(os.path.join(dir.out.ef, "{sample}", "{fasta}.bam")), benchmark: @@ -504,7 +482,7 @@ if ERRFREE: rule merge_sample_bams_ef: input: - lambda wildcards: aggregate(wildcards, dir.out.ef, "fasta", "bam"), + lambda wildcards: aggregate(wildcards, dir.out.ef, "bam"), output: temp(os.path.join(dir.out.ef, "{sample}.unsorted")), benchmark: From 5c849c00ebf4fdae74cebb92a61d6678c0e39ab9 Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Tue, 29 Oct 2024 17:07:28 +0000 Subject: [PATCH 46/55] update convert_sam_to_bam_ef rule to match updated convert_sam_to_bam rule --- mess/workflow/rules/processing/reads.smk | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index 0074352..153473d 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -8,6 +8,7 @@ sam_in_ef = os.path.join(dir.out.ef, "{sample}", "{fasta}", contig + ".sam") if SEQ_TECH == "illumina": fastq_dir = dir.out.short sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".sam") + sam_in_ef = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_errFree.sam"), fastq = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq") fastq_gz = temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq.gz")) @@ -432,29 +433,22 @@ if not SKIP_SHUFFLE: if ERRFREE: rule convert_sam_to_bam_ef: input: - os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_errFree.sam"), + sam_in_ef, output: temp(os.path.join(dir.out.ef, "{sample}", "{fasta}", contig + ".bam")), - log: - os.path.join( - dir.out.logs, - "bioconvert", - "sam2bam", - "{sample}", - "{fasta}" + contig + "_ef.log", - ), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.sml.time, - threads: config.resources.sml.cpu + threads: config.resources.norm.cpu conda: - os.path.join(dir.conda, "bioconvert.yml") + os.path.join(dir.conda, "samtools.yml") container: - containers.bioconvert + containers.samtools shell: """ - bioconvert sam2bam {input} {output} -t {threads} 2> {log} + samtools view -@ {threads} -Sb {input} | \\ + samtools sort -@ {threads} -o {output} """ rule merge_contig_bams_ef: From e11410313927bdf8af212795bd4b83b7c23efb1e Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Tue, 29 Oct 2024 17:09:26 +0000 Subject: [PATCH 47/55] update merge_bams_ef rule to match merge_bams rule --- mess/workflow/rules/processing/reads.smk | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index 153473d..b4c8659 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -451,24 +451,27 @@ if ERRFREE: samtools sort -@ {threads} -o {output} """ - rule merge_contig_bams_ef: + rule merge_bams_ef: input: lambda wildcards: aggregate(wildcards, dir.out.ef, "bam"), output: - temp(os.path.join(dir.out.ef, "{sample}", "{fasta}.bam")), - benchmark: - os.path.join(dir.out.bench, "samtools", "merge", "{sample}", "{fasta}_ef.txt") + os.path.join(dir.out.ef, "{sample}", "{fasta}.bam"), log: - os.path.join(dir.out.logs, "samtools", "merge", "{sample}", "{fasta}_ef.log"), + os.path.join( + dir.out.logs, + "samtools", + "merge", + "{sample}_ef.log", + ), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.sml.time, - threads: config.resources.sml.cpu + threads: config.resources.norm.cpu conda: - os.path.join(dir.conda, "bioconvert.yml") + os.path.join(dir.conda, "samtools.yml") container: - containers.bioconvert + containers.samtools shell: """ samtools merge -@ {threads} -o {output} {input} 2> {log} From 32e228c88b513dd29c6bcd26ee3c3bb58c2cf20a Mon Sep 17 00:00:00 2001 From: Ryan Teo Date: Tue, 29 Oct 2024 17:14:25 +0000 Subject: [PATCH 48/55] update ef rules to match existing workflow --- mess/workflow/rules/processing/reads.smk | 59 +++++++----------------- 1 file changed, 16 insertions(+), 43 deletions(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index b4c8659..3060633 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -455,7 +455,7 @@ if ERRFREE: input: lambda wildcards: aggregate(wildcards, dir.out.ef, "bam"), output: - os.path.join(dir.out.ef, "{sample}", "{fasta}.bam"), + os.path.join(dir.out.ef, "{sample}.bam"), log: os.path.join( dir.out.logs, @@ -477,71 +477,44 @@ if ERRFREE: samtools merge -@ {threads} -o {output} {input} 2> {log} """ - rule merge_sample_bams_ef: - input: - lambda wildcards: aggregate(wildcards, dir.out.ef, "bam"), - output: - temp(os.path.join(dir.out.ef, "{sample}.unsorted")), - benchmark: - os.path.join(dir.out.bench, "samtools", "merge", "{sample}_ef.txt") - log: - os.path.join(dir.out.logs, "samtools", "merge", "{sample}_ef.log"), - resources: - mem_mb=config.resources.sml.mem, - mem=str(config.resources.sml.mem) + "MB", - time=config.resources.sml.time, - threads: config.resources.norm.cpu - conda: - os.path.join(dir.conda, "bioconvert.yml") - container: - containers.bioconvert - shell: - """ - samtools merge -@ {threads} -o {output} {input} 2> {log} - """ - - rule sort_bams_ef: + rule index_bams_ef: input: - os.path.join(dir.out.ef, "{sample}.unsorted"), - output: os.path.join(dir.out.ef, "{sample}.bam"), + output: + os.path.join(dir.out.ef, "{sample}.bam.bai"), benchmark: - os.path.join(dir.out.bench, "samtools", "sort", "{sample}_ef.txt"), - log: - os.path.join(dir.out.logs, "samtools", "sort", "{sample}_ef.log"), + os.path.join(dir.out.bench, "samtools", "index", "{sample}_ef.txt") resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", - time=config.resources.sml.time, + time=config.resources.norm.time, threads: config.resources.norm.cpu conda: - os.path.join(dir.conda, "bioconvert.yml") + os.path.join(dir.conda, "samtools.yml") container: - containers.bioconvert + containers.samtools shell: """ - samtools sort -@ {threads} {input} -o {output} 2> {log} + samtools index -@ {threads} {input} """ - rule index_bams_ef: + rule get_bam_coverage_ef: input: os.path.join(dir.out.ef, "{sample}.bam"), output: - os.path.join(dir.out.ef, "{sample}.bam.bai"), - benchmark: - os.path.join(dir.out.bench, "samtools", "index", "{sample}_ef.txt") + temp(os.path.join(dir.out.ef, "{sample}.txt")), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", - time=config.resources.norm.time, - threads: config.resources.norm.cpu + time=config.resources.sml.time, + threads: config.resources.sml.cpu conda: - os.path.join(dir.conda, "bioconvert.yml") + os.path.join(dir.conda, "samtools.yml") container: - containers.bioconvert + containers.samtools shell: """ - samtools index -@ {threads} {input} + samtools coverage {input} > {output} """ From c5866cc81c1630323922228a8e95abdd1c32f808 Mon Sep 17 00:00:00 2001 From: farchaab Date: Wed, 30 Oct 2024 18:26:00 +0100 Subject: [PATCH 49/55] moved error free bams in bam dir --- mess/workflow/rules/preflight/directories.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mess/workflow/rules/preflight/directories.smk b/mess/workflow/rules/preflight/directories.smk index ec2f5d1..5fd343d 100644 --- a/mess/workflow/rules/preflight/directories.smk +++ b/mess/workflow/rules/preflight/directories.smk @@ -34,7 +34,7 @@ dir.out.short = os.path.join(dir.out.processing, "short") dir.out.long = os.path.join(dir.out.processing, "long") dir.out.fastq = os.path.join(dir.out.base, "fastq") dir.out.bam = os.path.join(dir.out.base, "bam") -dir.out.ef = os.path.join(dir.out.base, "ef") +dir.out.ef = os.path.join(dir.out.bam, "error-free") dir.out.tax = os.path.join(dir.out.base, "tax") From 4afff3cdf8a27ff77c50021e3216b116d16b6c93 Mon Sep 17 00:00:00 2001 From: farchaab Date: Wed, 30 Oct 2024 18:27:19 +0100 Subject: [PATCH 50/55] added logs for samtools sort stderr --- mess/workflow/rules/processing/reads.smk | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index 3060633..721fd43 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -8,7 +8,9 @@ sam_in_ef = os.path.join(dir.out.ef, "{sample}", "{fasta}", contig + ".sam") if SEQ_TECH == "illumina": fastq_dir = dir.out.short sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".sam") - sam_in_ef = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_errFree.sam"), + sam_in_ef = ( + os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_errFree.sam"), + ) fastq = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq") fastq_gz = temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq.gz")) @@ -143,6 +145,8 @@ rule convert_sam_to_bam: sam_in, output: temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".bam")), + log: + os.path.join(dir.out.logs, "sam2bam", "{sample}", "{fasta}", contig + ".log"), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", @@ -155,7 +159,7 @@ rule convert_sam_to_bam: shell: """ samtools view -@ {threads} -Sb {input} | \\ - samtools sort -@ {threads} -o {output} + samtools sort -@ {threads} -o {output} 2> {log} """ @@ -430,12 +434,18 @@ if not SKIP_SHUFFLE: paste -d '\t' <(seqkit seq -n {output}) <(seqkit seq -n {input}) > {log[1]} """ -if ERRFREE: + +if ERRFREE: + rule convert_sam_to_bam_ef: input: sam_in_ef, output: temp(os.path.join(dir.out.ef, "{sample}", "{fasta}", contig + ".bam")), + log: + os.path.join( + dir.out.logs, "sam2bam", "{sample}", "{fasta}", contig + ".log" + ), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", @@ -448,9 +458,9 @@ if ERRFREE: shell: """ samtools view -@ {threads} -Sb {input} | \\ - samtools sort -@ {threads} -o {output} + samtools sort -@ {threads} -o {output} 2> {log} """ - + rule merge_bams_ef: input: lambda wildcards: aggregate(wildcards, dir.out.ef, "bam"), @@ -476,7 +486,7 @@ if ERRFREE: """ samtools merge -@ {threads} -o {output} {input} 2> {log} """ - + rule index_bams_ef: input: os.path.join(dir.out.ef, "{sample}.bam"), From 48ead09c9fa6ebf3d40807172026d0dca458cedd Mon Sep 17 00:00:00 2001 From: farchaab Date: Wed, 30 Oct 2024 18:27:51 +0100 Subject: [PATCH 51/55] linted options --- mess/__main__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mess/__main__.py b/mess/__main__.py index d1e383d..891937b 100644 --- a/mess/__main__.py +++ b/mess/__main__.py @@ -64,7 +64,13 @@ }, { "name": "art_illumina options", - "options": ["--custom-err", "--paired", "--frag-len", "--frag-sd", "--errfree"], + "options": [ + "--custom-err", + "--errfree", + "--paired", + "--frag-len", + "--frag-sd", + ], }, { "name": "pbsim3 options", From db88db9d227e7af916b7afac6ed2b2d0772038d2 Mon Sep 17 00:00:00 2001 From: farchaab Date: Wed, 30 Oct 2024 18:28:42 +0100 Subject: [PATCH 52/55] moved errfree option --- mess/util.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mess/util.py b/mess/util.py index ca781aa..ccf412b 100644 --- a/mess/util.py +++ b/mess/util.py @@ -291,6 +291,11 @@ def sim_options(func): type=str, default=None, ), + click.option( + "--errfree", + help="Generate error free alignments with art_illumina", + is_flag=True, + ), click.option( "--replicates", help="Number of replicates per sample", @@ -425,13 +430,6 @@ def sim_options(func): default=1, show_default=True, ), - click.option( - "--errfree", - help="Generate a zero sequencing errors SAM file", - is_flag=True, - default=False, - show_default=True, - ), ] for option in reversed(options): From 9c9f4175de70910897e9c464c59debfeef072fc4 Mon Sep 17 00:00:00 2001 From: farchaab Date: Fri, 1 Nov 2024 12:43:17 +0100 Subject: [PATCH 53/55] removed bam coverage for ef bams, fixed logs --- mess/workflow/rules/processing/reads.smk | 30 ++++++------------------ 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index 721fd43..6ab2e62 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -34,6 +34,10 @@ if PASSES > 1: contig + ".ccs.bam", ) ), + log: + os.path.join( + dir.out.logs, "sam2bam", "{sample}", "{fasta}", contig + ".ccs.log" + ), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", @@ -46,7 +50,7 @@ if PASSES > 1: shell: """ samtools view -@ {threads} -Sb {input} | \\ - samtools sort -@ {threads} -o {output} + samtools sort -@ {threads} -o {output} 2> {log} """ rule ccs_bam_to_fastq: @@ -121,7 +125,7 @@ if BAM: log: os.path.join( dir.out.logs, - "maf2paf", + "maf2sam", "{sample}", "{fasta}" + "_" + contig + ".log", ), @@ -171,7 +175,6 @@ rule merge_bams: log: os.path.join( dir.out.logs, - "samtools", "merge", "{sample}.log", ), @@ -444,7 +447,7 @@ if ERRFREE: temp(os.path.join(dir.out.ef, "{sample}", "{fasta}", contig + ".bam")), log: os.path.join( - dir.out.logs, "sam2bam", "{sample}", "{fasta}", contig + ".log" + dir.out.logs, "sam2bam", "{sample}", "{fasta}", contig + "_ef.log" ), resources: mem_mb=config.resources.sml.mem, @@ -508,25 +511,6 @@ if ERRFREE: samtools index -@ {threads} {input} """ - rule get_bam_coverage_ef: - input: - os.path.join(dir.out.ef, "{sample}.bam"), - output: - temp(os.path.join(dir.out.ef, "{sample}.txt")), - resources: - mem_mb=config.resources.sml.mem, - mem=str(config.resources.sml.mem) + "MB", - time=config.resources.sml.time, - threads: config.resources.sml.cpu - conda: - os.path.join(dir.conda, "samtools.yml") - container: - containers.samtools - shell: - """ - samtools coverage {input} > {output} - """ - rule cleanup_files: input: From 58c2d5db8144bfd057903e35bb6aa58cdf7ef179 Mon Sep 17 00:00:00 2001 From: farchaab Date: Fri, 1 Nov 2024 14:20:45 +0100 Subject: [PATCH 54/55] improved sam output --- mess/workflow/rules/simulate/short_reads.smk | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/mess/workflow/rules/simulate/short_reads.smk b/mess/workflow/rules/simulate/short_reads.smk index c84a2cf..88d1d2f 100644 --- a/mess/workflow/rules/simulate/short_reads.smk +++ b/mess/workflow/rules/simulate/short_reads.smk @@ -26,8 +26,13 @@ fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}") if CIRCULAR: fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}_{n}") -sam_out = temp(fq_prefix + ".sam") if BAM else temp(fq_prefix + ".txt") -sam_ef_out = temp(fq_prefix + "_errFree.sam") if ERRFREE else temp(fq_prefix + "_errFree.txt") +sam_out = [temp(fq_prefix + ".txt")] +ext_cmd = f"touch {sam_out[0]} " +if BAM: + sam_out += [temp(fq_prefix + ".sam")] +if ERRFREE: + sam_out += [temp(fq_prefix + "_errFree.sam")] + fastq_out = [ temp(fq_prefix + "1.fq"), @@ -48,15 +53,15 @@ rule art_illumina: input: fasta, output: - sam=sam_out, fastqs=fastq_out, - sam_ef=sam_ef_out, + sam=sam_out, params: args=art_args, read_len=MEAN_LEN, cov=lambda wildcards: get_value("cov_sim", wildcards), seed=lambda wildcards: int(get_value("seed", wildcards)), prefix=fq_prefix, + cmd=ext_cmd, log: os.path.join(dir.out.logs, "art", "{sample}", "{fasta}", "{contig}.log") if not CIRCULAR @@ -77,6 +82,5 @@ rule art_illumina: -rs {params.seed} -l {params.read_len} \\ -f {params.cov} -na {params.args} \\ -o {params.prefix} &> {log} - touch {output.sam} - touch {output.sam_ef} + {params.cmd} """ From 568e0808e6459418426e18b184d2f2fbbac342ac Mon Sep 17 00:00:00 2001 From: farchaab Date: Fri, 1 Nov 2024 15:17:23 +0100 Subject: [PATCH 55/55] added apptainer setup actions --- .github/workflows/unit-tests.yml | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index b614402..5faab87 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -46,6 +46,9 @@ jobs: with: fetch-depth: 0 + - name: Setup apptainer + uses: eWaterCycle/setup-apptainer@v2.0.0 + - name: Setup MeSS environment uses: conda-incubator/setup-miniconda@v3 with: @@ -54,18 +57,6 @@ jobs: python-version: ${{ matrix.python-version }} auto-update-conda: true - - name: Setup apt dependencies - run: | - sudo add-apt-repository -y ppa:apptainer/ppa - sudo apt-get update - sudo apt install -y squashfuse fuse2fs gocryptfs apptainer - - - name: Disable apparmor namespace restrictions for apptainer - run: | - sudo sh -c 'echo kernel.apparmor_restrict_unprivileged_userns=0 \ - >/etc/sysctl.d/90-disable-userns-restrictions.conf' - sudo sysctl -p /etc/sysctl.d/90-disable-userns-restrictions.conf - - name: Install MeSS and pytest-cov run: | pip install -e .