diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index b614402..5faab87 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -46,6 +46,9 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: Setup apptainer
+        uses: eWaterCycle/setup-apptainer@v2.0.0
+
       - name: Setup MeSS environment
         uses: conda-incubator/setup-miniconda@v3
         with:
@@ -54,18 +57,6 @@ jobs:
           python-version: ${{ matrix.python-version }}
           auto-update-conda: true
 
-      - name: Setup apt dependencies
-        run: |
-          sudo add-apt-repository -y ppa:apptainer/ppa
-          sudo apt-get update
-          sudo apt install -y squashfuse fuse2fs gocryptfs apptainer
-
-      - name: Disable apparmor namespace restrictions for apptainer
-        run: |
-          sudo sh -c 'echo kernel.apparmor_restrict_unprivileged_userns=0 \
-            >/etc/sysctl.d/90-disable-userns-restrictions.conf'
-          sudo sysctl -p /etc/sysctl.d/90-disable-userns-restrictions.conf
-
       - name: Install MeSS and pytest-cov
         run: |
           pip install -e .
diff --git a/mess/__main__.py b/mess/__main__.py
index 9e2a51a..891937b 100644
--- a/mess/__main__.py
+++ b/mess/__main__.py
@@ -64,7 +64,13 @@
     },
     {
         "name": "art_illumina options",
-        "options": ["--custom-err", "--paired", "--frag-len", "--frag-sd"],
+        "options": [
+            "--custom-err",
+            "--errfree",
+            "--paired",
+            "--frag-len",
+            "--frag-sd",
+        ],
     },
     {
         "name": "pbsim3 options",
diff --git a/mess/config/config.yaml b/mess/config/config.yaml
index 1c16f56..36e8155 100644
--- a/mess/config/config.yaml
+++ b/mess/config/config.yaml
@@ -11,7 +11,8 @@ args:
   configfile:
   custom_err:
   dist:
-  error:
+  error:
+  errfree:
   fasta:
   frag_len:
   frag_sd:
diff --git a/mess/test_data/minimal_test.tsv b/mess/test_data/minimal_test.tsv
index d533b26..437d863 100644
--- a/mess/test_data/minimal_test.tsv
+++ b/mess/test_data/minimal_test.tsv
@@ -1,3 +1,3 @@
 taxon	nb	cov_sim	sample
 staphylococcus_aureus	1	0.1	sample1
-1290	1	0.1	sample2
+1290	1	0.1	sample2
\ No newline at end of file
diff --git a/mess/util.py b/mess/util.py
index 749be9c..ccf412b 100644
--- a/mess/util.py
+++ b/mess/util.py
@@ -291,6 +291,11 @@ def sim_options(func):
             type=str,
             default=None,
         ),
+        click.option(
+            "--errfree",
+            help="Generate error-free alignments with art_illumina",
+            is_flag=True,
+        ),
         click.option(
             "--replicates",
             help="Number of replicates per sample",
diff --git a/mess/workflow/Snakefile b/mess/workflow/Snakefile
index 8e89441..6b84992 100644
--- a/mess/workflow/Snakefile
+++ b/mess/workflow/Snakefile
@@ -112,6 +112,7 @@ include: os.path.join("rules", "processing", "fastas.smk")
 CUSTOM_ERR = config.args.custom_err
 ERROR = config.args.error
 BAM = config.args.bam
+ERRFREE = config.args.errfree
 MIN_LEN = config.args.min_len
 MAX_LEN = config.args.max_len
 SD_LEN = config.args.sd_len
@@ -131,6 +132,7 @@ else:
 
 
 # reads post-processing options
+random.seed(SEED)
 SHUFFLE = dict(zip(SAMPLES, random.sample(range(1, 100000), len(SAMPLES))))
 SKIP_SHUFFLE = config.args.skip_shuffle
 RANKS = config.args.ranks
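Note: seeding the RNG before drawing the per-sample shuffle seeds makes `SHUFFLE` reproducible; without `random.seed(SEED)`, `random.sample` draws a different seed set on every invocation, so reruns of the same workflow would shuffle reads differently. A minimal standalone sketch of the effect (`SEED` and sample names are hypothetical stand-ins for the config values):

```python
import random

SEED = 42  # stand-in for config.args.seed
SAMPLES = ["sample1", "sample2"]

random.seed(SEED)  # fix the RNG state so reruns draw identical values
SHUFFLE = dict(zip(SAMPLES, random.sample(range(1, 100000), len(SAMPLES))))
print(SHUFFLE)  # same {sample: seed} mapping on every run
```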
diff --git a/mess/workflow/envs/conda/assembly_finder.yml b/mess/workflow/envs/conda/assembly_finder.yml
index 0055859..3328d70 100644
--- a/mess/workflow/envs/conda/assembly_finder.yml
+++ b/mess/workflow/envs/conda/assembly_finder.yml
@@ -4,8 +4,8 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - assembly_finder =0.7.6
-  - ncbi-datasets-cli =16.26.2
+  - assembly_finder =0.8.0
+  - ncbi-datasets-cli =16.31.0
   - taxonkit =0.17.0
   - csvtk =0.30.0
   - rsync =3.3.0
diff --git a/mess/workflow/envs/conda/samtools.yml b/mess/workflow/envs/conda/samtools.yml
new file mode 100644
index 0000000..448f1d0
--- /dev/null
+++ b/mess/workflow/envs/conda/samtools.yml
@@ -0,0 +1,7 @@
+name: samtools
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - samtools =1.21
\ No newline at end of file
diff --git a/mess/workflow/envs/containers.yml b/mess/workflow/envs/containers.yml
index fa9f89e..56338a2 100644
--- a/mess/workflow/envs/containers.yml
+++ b/mess/workflow/envs/containers.yml
@@ -1,5 +1,5 @@
 art: docker://quay.io/biocontainers/art:2016.06.05--heacdb12_11
-assembly_finder: docker://ghcr.io/metagenlab/assembly_finder:v0.7.7
+assembly_finder: docker://ghcr.io/metagenlab/assembly_finder:v0.8.0
 bioconvert: docker://quay.io/biocontainers/bioconvert:1.1.1--pyhdfd78af_0
 curl: docker://quay.io/biocontainers/curl:7.80.0
 pigz: docker://quay.io/biocontainers/pigz:2.8
@@ -7,3 +7,4 @@ pbccs: docker://quay.io/biocontainers/pbccs:6.4.0--h9ee0642_0
 pbsim3: docker://quay.io/biocontainers/pbsim3:3.0.4--h4ac6f70_0
 seqkit: docker://quay.io/biocontainers/seqkit:2.8.2--h9ee0642_0
 taxonkit: docker://quay.io/biocontainers/taxonkit:0.17.0--h9ee0642_1
+samtools: docker://quay.io/biocontainers/samtools:1.21--h50ea8bc_0
diff --git a/mess/workflow/rules/download/assembly_finder.smk b/mess/workflow/rules/download/assembly_finder.smk
index 9b06676..a8cc64b 100644
--- a/mess/workflow/rules/download/assembly_finder.smk
+++ b/mess/workflow/rules/download/assembly_finder.smk
@@ -20,7 +20,7 @@ rule get_unique_entries:
     )
 
 
-af_args = ""
+af_args = "--no-use-conda "
 if TAXON:
     af_args += "--taxon "
 if LIMIT:
@@ -59,14 +59,11 @@ checkpoint download_assemblies:
         tsv=os.path.join(dir.out.base, "uniq_entries.tsv"),
     output:
         asm=os.path.join(dir.out.base, "assembly_finder/assembly_summary.tsv"),
-        seq=os.path.join(dir.out.base, "assembly_finder/sequence_report.tsv"),
         tax=os.path.join(dir.out.base, "assembly_finder/taxonomy.tsv"),
     params:
         args=af_args,
         taxonkit=TAXONKIT,
         out=os.path.join(dir.out.base, "assembly_finder"),
-    benchmark:
-        os.path.join(dir.out.bench, "assembly_finder.txt")
     log:
         os.path.join(dir.out.logs, "assembly_finder.log"),
     resources:
@@ -84,6 +81,5 @@ checkpoint download_assemblies:
         --taxonkit {params.taxonkit} \\
         --threads {threads} \\
         {params.args} \\
-        --no-use-conda \\
         -o {params.out} 2> {log}
         """
diff --git a/mess/workflow/rules/preflight/directories.smk b/mess/workflow/rules/preflight/directories.smk
index fdcd5cc..5fd343d 100644
--- a/mess/workflow/rules/preflight/directories.smk
+++ b/mess/workflow/rules/preflight/directories.smk
@@ -34,6 +34,7 @@ dir.out.short = os.path.join(dir.out.processing, "short")
 dir.out.long = os.path.join(dir.out.processing, "long")
 dir.out.fastq = os.path.join(dir.out.base, "fastq")
 dir.out.bam = os.path.join(dir.out.base, "bam")
+dir.out.ef = os.path.join(dir.out.bam, "error-free")
 dir.out.tax = os.path.join(dir.out.base, "tax")
 
 
diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk
index 74655e1..3e5cc2f 100644
--- a/mess/workflow/rules/preflight/functions.smk
+++ b/mess/workflow/rules/preflight/functions.smk
@@ -10,34 +10,39 @@ import random
 wildcard_constraints:
     sample="[^/]+",
     contig="[^/]+",
+    fasta="[^/]+",
 
 
 def list_reads(wildcards):
+    fastqs = "{sample}.fq.gz"
+    args = {"sample": SAMPLES}
+
     if PAIRED:
-        reads = expand(
-            os.path.join(dir.out.fastq, "{sample}_R{p}.fq.gz"),
-            sample=SAMPLES,
-            p=PAIRS,
-        )
-    else:
-        reads = expand(
-            os.path.join(dir.out.fastq, "{sample}.fq.gz"),
-            sample=SAMPLES,
-        )
+        fastqs = "{sample}_R{p}.fq.gz"
+        args.update({"p": PAIRS})
+    reads = collect(os.path.join(dir.out.fastq, fastqs), **args)
     if BAM:
-        bams = expand(
+        bams = collect(
             os.path.join(dir.out.bam, "{sample}.{bam}"),
             sample=SAMPLES,
             bam=["bam", "bam.bai"],
         )
-        tax = expand(
+        tax = collect(
             os.path.join(dir.out.tax, "{sample}_{abundance}.txt"),
             sample=SAMPLES,
             abundance=["seq", "tax"],
         )
         reads = reads + bams + tax
+    if ERRFREE:
+        bams_ef = expand(
+            os.path.join(dir.out.ef, "{sample}.{bam}"),
+            sample=SAMPLES,
+            bam=["bam", "bam.bai"],
+        )
+        reads = reads + bams_ef
+
     return reads
@@ -67,32 +72,43 @@ def parse_samples(indir, replicates):
 
 fasta_cache = {}
 
 
-def fasta_input(wildcards):
-    table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0]
+def get_fasta_table(wildcards):
+    fa_table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0]
+    if fa_table not in fasta_cache:
+        fa_df = pd.read_csv(fa_table, sep="\t", index_col="fasta")
+        fasta_cache[fa_table] = fa_df
+    fa_df = fasta_cache[fa_table]
+    return fa_df
 
-    df = pd.read_csv(table, sep="\t", index_col="fasta")
 
+def fasta_input(wildcards):
+    df = get_fasta_table(wildcards)
     try:
         return df.loc[wildcards.fasta]["path"].drop_duplicates()
     except AttributeError:
         return df.loc[wildcards.fasta]["path"]
-    # some samples use the same genome path, drop duplicates to avoid duplicate paths when processing fasta
 
 
 def list_fastas(wildcards):
-    table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0]
-    if table not in fasta_cache:
-        df = pd.read_csv(table, sep="\t")
-        fasta_cache[table] = df
-    df = fasta_cache[table]
-    fastas = list(set(df["fasta"]))
-    return expand(os.path.join(dir.out.processing, "{fasta}.fasta"), fasta=fastas)
+    df = get_fasta_table(wildcards)
+    return expand(
+        os.path.join(dir.out.processing, "{fasta}.fasta"), fasta=list(set(df.index))
+    )
 
 
 table_cache = {}
 
 
-def get_value(value, wildcards):
+def get_cov_table(wildcards, key, idx_col):
+    cov_table = checkpoints.split_contigs.get(**wildcards).output[0]
+    if key not in table_cache:
+        cov_df = pd.read_csv(cov_table, sep="\t", index_col=idx_col).sort_index()
+        table_cache[key] = cov_df
+    cov_df = table_cache[key]
+    return cov_df
+
+
+def get_value(value, wildcards):
     vals = (
         f"{wildcards.sample}",
         f"{wildcards.fasta}",
outdir, "{sample}", "{fasta}", "{contig}_{n}{p}.{ext}" - ), - sample=wildcards.sample, - fasta=wildcards.fasta, - n=list(range(1, rotates + 1)), - p=wildcards.p, - contig=contigs, - ext=ext, - ) - else: - return expand( - os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}"), - sample=wildcards.sample, - fasta=wildcards.fasta, - p=wildcards.p, - contig=contigs, - ext=ext, - ) - - else: - if "rotate" in df.columns: - return expand( - os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}.{ext}"), - sample=wildcards.sample, - fasta=wildcards.fasta, - contig=contigs, - n=list(range(1, rotates + 1)), - ext=ext, - ) - else: - return expand( - os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}"), - sample=wildcards.sample, - fasta=wildcards.fasta, - contig=contigs, - ext=ext, - ) - if level == "fasta": - fastas = list(set(df.loc[wildcards.sample].index)) +def aggregate(wildcards, outdir, ext): + df = get_cov_table(wildcards, "aggregate", ["samplename"]) + files = [] + for row in df.loc[[wildcards.sample]].itertuples(): + prefix = f"{row.contig}" + if CIRCULAR: + prefix += f"_{row.n}" if PAIRED and ext != "bam": - return expand( - os.path.join(outdir, "{sample}", "{fasta}{p}.{ext}"), - sample=wildcards.sample, - fasta=fastas, - p=wildcards.p, - ext=ext, - ) - else: - return expand( - os.path.join(outdir, "{sample}", "{fasta}.{ext}"), - sample=wildcards.sample, - fasta=fastas, - ext=ext, - ) + prefix += f"{wildcards.p}" + files.append( + os.path.join(outdir, wildcards.sample, row.fasta, f"{prefix}.{ext}") + ) + return files def get_header(fa): diff --git a/mess/workflow/rules/preflight/targets_download.smk b/mess/workflow/rules/preflight/targets_download.smk index 88e6b20..610bff8 100644 --- a/mess/workflow/rules/preflight/targets_download.smk +++ b/mess/workflow/rules/preflight/targets_download.smk @@ -2,10 +2,8 @@ All target download files are declared here """ - TargetDownloads = [ os.path.join(dir.out.base, "uniq_entries.tsv"), os.path.join(dir.out.base, "assembly_finder/assembly_summary.tsv"), - os.path.join(dir.out.base, "assembly_finder/sequence_report.tsv"), os.path.join(dir.out.base, "assembly_finder/taxonomy.tsv"), ] diff --git a/mess/workflow/rules/preflight/targets_simulate.smk b/mess/workflow/rules/preflight/targets_simulate.smk index 79d223b..e96efa9 100644 --- a/mess/workflow/rules/preflight/targets_simulate.smk +++ b/mess/workflow/rules/preflight/targets_simulate.smk @@ -2,5 +2,4 @@ All simulated reads are declared here """ - TargetSimreads = [list_reads, os.path.join(dir.out.base, "cleanup.done")] diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index d625f53..6ab2e62 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -3,9 +3,14 @@ contig = "{contig}" if CIRCULAR: contig = "{contig}_{n}" sam_in = os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".sam") +sam_in_ef = os.path.join(dir.out.ef, "{sample}", "{fasta}", contig + ".sam") + if SEQ_TECH == "illumina": fastq_dir = dir.out.short - sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed") + sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".sam") + sam_in_ef = ( + os.path.join(fastq_dir, "{sample}", "{fasta}", contig + "_errFree.sam"), + ) fastq = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq") fastq_gz = temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq.gz")) @@ -30,20 +35,22 @@ if PASSES > 1: ) ), log: - os.path.join(dir.out.logs, 
"ccs", "{sample}", "{fasta}", contig + ".log"), + os.path.join( + dir.out.logs, "sam2bam", "{sample}", "{fasta}", contig + ".ccs.log" + ), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", - time=config.resources.norm.time, + time=config.resources.sml.time, threads: config.resources.norm.cpu conda: - os.path.join(dir.conda, "bioconvert.yml") + os.path.join(dir.conda, "samtools.yml") container: - containers.bioconvert + containers.samtools shell: """ - samtools view -@ {threads} -bS {input} | \ - samtools sort -@ {threads} > {output} 2> {log} + samtools view -@ {threads} -Sb {input} | \\ + samtools sort -@ {threads} -o {output} 2> {log} """ rule ccs_bam_to_fastq: @@ -118,7 +125,6 @@ if BAM: log: os.path.join( dir.out.logs, - "bioconvert", "maf2sam", "{sample}", "{fasta}" + "_" + contig + ".log", @@ -138,126 +144,72 @@ if BAM: """ -rule fix_art_sam: - """ - rule to replace SAM cigar string with read length + M - Fixes truncated art_illumina SAM files with some genomes - """ - input: - os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".sam"), - output: - temp(os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed")), - resources: - mem_mb=config.resources.sml.mem, - mem=str(config.resources.sml.mem) + "MB", - time=config.resources.sml.time, - params: - maxlen=MEAN_LEN, - shell: - """ - awk 'BEGIN {{OFS="\t"}} {{ if ($1 ~ /^@/) {{ print $0 }} \\ - else {{ $6 = "{params.maxlen}M"; print $0 }} }}' \\ - {input} > {output} - """ - - rule convert_sam_to_bam: input: sam_in, output: temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".bam")), log: - os.path.join( - dir.out.logs, - "bioconvert", - "sam2bam", - "{sample}", - "{fasta}" + contig + ".log", - ), - resources: - mem_mb=config.resources.sml.mem, - mem=str(config.resources.sml.mem) + "MB", - time=config.resources.sml.time, - threads: config.resources.sml.cpu - conda: - os.path.join(dir.conda, "bioconvert.yml") - container: - containers.bioconvert - shell: - """ - bioconvert sam2bam {input} {output} -t {threads} 2> {log} - """ - - -rule merge_contig_bams: - input: - lambda wildcards: aggregate(wildcards, dir.out.bam, "contig", "bam"), - output: - temp(os.path.join(dir.out.bam, "{sample}", "{fasta}.bam")), - benchmark: - os.path.join(dir.out.bench, "samtools", "merge", "{sample}", "{fasta}.txt") - log: - os.path.join(dir.out.logs, "samtools", "merge", "{sample}", "{fasta}.log"), + os.path.join(dir.out.logs, "sam2bam", "{sample}", "{fasta}", contig + ".log"), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.sml.time, - threads: config.resources.sml.cpu + threads: config.resources.norm.cpu conda: - os.path.join(dir.conda, "bioconvert.yml") + os.path.join(dir.conda, "samtools.yml") container: - containers.bioconvert + containers.samtools shell: """ - samtools merge -@ {threads} -o {output} {input} 2> {log} + samtools view -@ {threads} -Sb {input} | \\ + samtools sort -@ {threads} -o {output} 2> {log} """ -rule merge_sample_bams: +rule merge_bams: input: - lambda wildcards: aggregate(wildcards, dir.out.bam, "fasta", "bam"), + lambda wildcards: aggregate(wildcards, dir.out.bam, "bam"), output: - temp(os.path.join(dir.out.bam, "{sample}.unsorted")), - benchmark: - os.path.join(dir.out.bench, "samtools", "merge", "{sample}.txt") + os.path.join(dir.out.bam, "{sample}.bam"), log: - os.path.join(dir.out.logs, "samtools", "merge", "{sample}.log"), + os.path.join( + dir.out.logs, + "merge", + "{sample}.log", + ), resources: 
@@ -138,126 +144,72 @@ if BAM:
             """
 
 
-rule fix_art_sam:
-    """
-    rule to replace SAM cigar string with read length + M
-    Fixes truncated art_illumina SAM files with some genomes
-    """
-    input:
-        os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".sam"),
-    output:
-        temp(os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed")),
-    resources:
-        mem_mb=config.resources.sml.mem,
-        mem=str(config.resources.sml.mem) + "MB",
-        time=config.resources.sml.time,
-    params:
-        maxlen=MEAN_LEN,
-    shell:
-        """
-        awk 'BEGIN {{OFS="\t"}} {{ if ($1 ~ /^@/) {{ print $0 }} \\
-        else {{ $6 = "{params.maxlen}M"; print $0 }} }}' \\
-        {input} > {output}
-        """
-
-
 rule convert_sam_to_bam:
     input:
         sam_in,
     output:
         temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".bam")),
     log:
-        os.path.join(
-            dir.out.logs,
-            "bioconvert",
-            "sam2bam",
-            "{sample}",
-            "{fasta}" + contig + ".log",
-        ),
+        os.path.join(dir.out.logs, "sam2bam", "{sample}", "{fasta}", contig + ".log"),
     resources:
         mem_mb=config.resources.sml.mem,
         mem=str(config.resources.sml.mem) + "MB",
         time=config.resources.sml.time,
-    threads: config.resources.sml.cpu
+    threads: config.resources.norm.cpu
     conda:
-        os.path.join(dir.conda, "bioconvert.yml")
+        os.path.join(dir.conda, "samtools.yml")
     container:
-        containers.bioconvert
+        containers.samtools
     shell:
         """
-        bioconvert sam2bam {input} {output} -t {threads} 2> {log}
+        samtools view -@ {threads} -Sb {input} | \\
+        samtools sort -@ {threads} -o {output} 2> {log}
         """
 
 
-rule merge_contig_bams:
-    input:
-        lambda wildcards: aggregate(wildcards, dir.out.bam, "contig", "bam"),
-    output:
-        temp(os.path.join(dir.out.bam, "{sample}", "{fasta}.bam")),
-    benchmark:
-        os.path.join(dir.out.bench, "samtools", "merge", "{sample}", "{fasta}.txt")
-    log:
-        os.path.join(dir.out.logs, "samtools", "merge", "{sample}", "{fasta}.log"),
-    resources:
-        mem_mb=config.resources.sml.mem,
-        mem=str(config.resources.sml.mem) + "MB",
-        time=config.resources.sml.time,
-    threads: config.resources.sml.cpu
-    conda:
-        os.path.join(dir.conda, "bioconvert.yml")
-    container:
-        containers.bioconvert
-    shell:
-        """
-        samtools merge -@ {threads} -o {output} {input} 2> {log}
-        """
-
-
-rule merge_sample_bams:
+rule merge_bams:
     input:
-        lambda wildcards: aggregate(wildcards, dir.out.bam, "fasta", "bam"),
+        lambda wildcards: aggregate(wildcards, dir.out.bam, "bam"),
     output:
-        temp(os.path.join(dir.out.bam, "{sample}.unsorted")),
-    benchmark:
-        os.path.join(dir.out.bench, "samtools", "merge", "{sample}.txt")
+        os.path.join(dir.out.bam, "{sample}.bam"),
     log:
-        os.path.join(dir.out.logs, "samtools", "merge", "{sample}.log"),
+        os.path.join(
+            dir.out.logs,
+            "merge",
+            "{sample}.log",
+        ),
     resources:
         mem_mb=config.resources.sml.mem,
         mem=str(config.resources.sml.mem) + "MB",
         time=config.resources.sml.time,
     threads: config.resources.norm.cpu
     conda:
-        os.path.join(dir.conda, "bioconvert.yml")
+        os.path.join(dir.conda, "samtools.yml")
     container:
-        containers.bioconvert
+        containers.samtools
     shell:
         """
         samtools merge -@ {threads} -o {output} {input} 2> {log}
         """
 
 
-rule sort_bams:
+rule index_bams:
     input:
-        os.path.join(dir.out.bam, "{sample}.unsorted"),
-    output:
         os.path.join(dir.out.bam, "{sample}.bam"),
-    benchmark:
-        os.path.join(dir.out.bench, "samtools", "sort", "{sample}.txt")
-    log:
-        os.path.join(dir.out.logs, "samtools", "sort", "{sample}.log"),
+    output:
+        os.path.join(dir.out.bam, "{sample}.bam.bai"),
     resources:
         mem_mb=config.resources.sml.mem,
         mem=str(config.resources.sml.mem) + "MB",
-        time=config.resources.sml.time,
+        time=config.resources.norm.time,
     threads: config.resources.norm.cpu
     conda:
-        os.path.join(dir.conda, "bioconvert.yml")
+        os.path.join(dir.conda, "samtools.yml")
     container:
-        containers.bioconvert
+        containers.samtools
     shell:
         """
-        samtools sort -@ {threads} {input} -o {output} 2> {log}
+        samtools index -@ {threads} {input}
        """
@@ -266,27 +218,25 @@ rule get_bam_coverage:
     input:
         os.path.join(dir.out.bam, "{sample}.bam"),
     output:
         temp(os.path.join(dir.out.bam, "{sample}.txt")),
-    log:
-        os.path.join(dir.out.logs, "samtools", "coverage", "{sample}.log"),
     resources:
         mem_mb=config.resources.sml.mem,
         mem=str(config.resources.sml.mem) + "MB",
         time=config.resources.sml.time,
     threads: config.resources.sml.cpu
     conda:
-        os.path.join(dir.conda, "bioconvert.yml")
+        os.path.join(dir.conda, "samtools.yml")
     container:
-        containers.bioconvert
+        containers.samtools
     shell:
         """
-        samtools coverage {input} > {output} 2> {log}
+        samtools coverage {input} > {output}
         """
 
 
 rule get_tax_profile:
     input:
         cov=os.path.join(dir.out.bam, "{sample}.txt"),
-        tax=get_cov_table,
+        tax=os.path.join(dir.out.processing, "cov.tsv"),
     output:
         counts=os.path.join(dir.out.tax, "{sample}.tsv"),
         seq_abundance=temp(os.path.join(dir.out.tax, "{sample}_seq.tsv")),
@@ -343,10 +293,6 @@ rule tax_profile_to_biobox:
         dmp=os.path.join(TAXONKIT, "names.dmp"),
     output:
         os.path.join(dir.out.tax, "{sample}_{abundance}.txt"),
-    log:
-        os.path.join(
-            dir.out.logs, "taxonkit", "profile2cami", "{sample}_{abundance}.log"
-        ),
     params:
         dir=TAXONKIT,
         ranks=RANKS,
@@ -369,28 +315,6 @@ rule tax_profile_to_biobox:
             """
 
 
-rule index_bams:
-    input:
-        os.path.join(dir.out.bam, "{sample}.bam"),
-    output:
-        os.path.join(dir.out.bam, "{sample}.bam.bai"),
-    benchmark:
-        os.path.join(dir.out.bench, "samtools", "index", "{sample}.txt")
-    resources:
-        mem_mb=config.resources.sml.mem,
-        mem=str(config.resources.sml.mem) + "MB",
-        time=config.resources.norm.time,
-    threads: config.resources.norm.cpu
-    conda:
-        os.path.join(dir.conda, "bioconvert.yml")
-    container:
-        containers.bioconvert
-    shell:
-        """
-        samtools index -@ {threads} {input}
-        """
-
-
 rule compress_contig_fastqs:
     input:
         fastq,
@@ -411,24 +335,6 @@ rule compress_contig_fastqs:
         """
 
 
-rule cat_contig_fastqs:
-    input:
-        flag=get_cov_table,
-        fq=lambda wildcards: aggregate(wildcards, fastq_dir, "contig", "fq.gz"),
-    output:
-        temp(os.path.join(fastq_dir, "{sample}", "{fasta}{p}.fq.gz"))
-        if PAIRED
-        else temp(os.path.join(fastq_dir, "{sample}", "{fasta}.fq.gz")),
-    resources:
-        mem_mb=config.resources.sml.mem,
-        mem=str(config.resources.sml.mem) + "MB",
-        time=config.resources.sml.time,
-    shell:
-        """
-        cat {input.fq} > {output}
-        """
-
-
 sample_fastq_out = []
 if SKIP_SHUFFLE:
     if PAIRED:
@@ -442,18 +348,24 @@ else:
     sample_fastq_out = temp(os.path.join(dir.out.cat, "{sample}.fq.gz"))
 
 
-rule cat_sample_fastqs:
+rule cat_fastqs:
     input:
-        lambda wildcards: aggregate(wildcards, fastq_dir, "fasta", "fq.gz"),
+        lambda wildcards: aggregate(wildcards, fastq_dir, "fq.gz"),
     output:
         sample_fastq_out,
+    params:
+        dir=os.path.join(fastq_dir, "{sample}"),
+        name="*{p}.fq.gz" if PAIRED else "*.fq.gz",
+        head=lambda wildcards, input: list(input)[:3],
     resources:
         mem_mb=config.resources.sml.mem,
         mem=str(config.resources.sml.mem) + "MB",
         time=config.resources.norm.time,
+    message:
+        "Concatenating {wildcards.sample} reads: {params.head} ..."
     shell:
         """
-        cat {input} > {output}
+        find {params.dir} -name "{params.name}" | xargs cat > {output}
         """
@@ -470,12 +382,6 @@ if not SKIP_SHUFFLE:
         else temp(os.path.join(dir.out.shuffle, "{sample}.fq.gz")),
         params:
             lambda wildcards: SHUFFLE[wildcards.sample],
-        benchmark:
-            (
-                os.path.join(dir.out.bench, "seqkit", "shuffle", "{sample}_R{p}.txt")
-                if PAIRED
-                else os.path.join(dir.out.bench, "seqkit", "shuffle", "{sample}.txt")
-            )
         log:
             os.path.join(dir.out.logs, "seqkit", "shuffle", "{sample}_R{p}.log")
             if PAIRED
@@ -507,12 +413,6 @@ if not SKIP_SHUFFLE:
             os.path.join(dir.out.fastq, "{sample}_R{p}.fq.gz")
             if PAIRED
             else os.path.join(dir.out.fastq, "{sample}.fq.gz"),
-        benchmark:
-            (
-                os.path.join(dir.out.bench, "seqkit", "anonymize", "{sample}_R{p}.txt")
-                if PAIRED
-                else os.path.join(dir.out.bench, "seqkit", "anonymize", "{sample}.txt")
-            )
         log:
             os.path.join(dir.out.logs, "seqkit", "replace", "{sample}_R{p}.log")
             if PAIRED
@@ -538,6 +438,80 @@ if not SKIP_SHUFFLE:
             """
 
 
+if ERRFREE:
+
+    rule convert_sam_to_bam_ef:
+        input:
+            sam_in_ef,
+        output:
+            temp(os.path.join(dir.out.ef, "{sample}", "{fasta}", contig + ".bam")),
+        log:
+            os.path.join(
+                dir.out.logs, "sam2bam", "{sample}", "{fasta}", contig + "_ef.log"
+            ),
+        resources:
+            mem_mb=config.resources.sml.mem,
+            mem=str(config.resources.sml.mem) + "MB",
+            time=config.resources.sml.time,
+        threads: config.resources.norm.cpu
+        conda:
+            os.path.join(dir.conda, "samtools.yml")
+        container:
+            containers.samtools
+        shell:
+            """
+            samtools view -@ {threads} -Sb {input} | \\
+            samtools sort -@ {threads} -o {output} 2> {log}
+            """
+
+    rule merge_bams_ef:
+        input:
+            lambda wildcards: aggregate(wildcards, dir.out.ef, "bam"),
+        output:
+            os.path.join(dir.out.ef, "{sample}.bam"),
+        log:
+            os.path.join(
+                dir.out.logs,
+                "samtools",
+                "merge",
+                "{sample}_ef.log",
+            ),
+        resources:
+            mem_mb=config.resources.sml.mem,
+            mem=str(config.resources.sml.mem) + "MB",
+            time=config.resources.sml.time,
+        threads: config.resources.norm.cpu
+        conda:
+            os.path.join(dir.conda, "samtools.yml")
+        container:
+            containers.samtools
+        shell:
+            """
+            samtools merge -@ {threads} -o {output} {input} 2> {log}
+            """
+
+    rule index_bams_ef:
+        input:
+            os.path.join(dir.out.ef, "{sample}.bam"),
+        output:
+            os.path.join(dir.out.ef, "{sample}.bam.bai"),
+        benchmark:
+            os.path.join(dir.out.bench, "samtools", "index", "{sample}_ef.txt")
+        resources:
+            mem_mb=config.resources.sml.mem,
+            mem=str(config.resources.sml.mem) + "MB",
+            time=config.resources.norm.time,
+        threads: config.resources.norm.cpu
+        conda:
+            os.path.join(dir.conda, "samtools.yml")
+        container:
+            containers.samtools
+        shell:
+            """
+            samtools index -@ {threads} {input}
+            """
+
+
 rule cleanup_files:
     input:
         list_reads,
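Note: with `--errfree`, `list_reads` appends per-sample error-free BAMs (written under `bam/error-free/` via `dir.out.ef`) to the final targets, which the three `*_ef` rules above produce. A sketch of the resulting target list for a paired-end run with `--bam --errfree` (sample names hypothetical; paths relative to the output directory):

```python
SAMPLES = ["sample1", "sample2"]
PAIRS = [1, 2]

fastqs = [f"fastq/{s}_R{p}.fq.gz" for s in SAMPLES for p in PAIRS]
bams = [f"bam/{s}.{e}" for s in SAMPLES for e in ("bam", "bam.bai")]
bams_ef = [f"bam/error-free/{s}.{e}" for s in SAMPLES for e in ("bam", "bam.bai")]
tax = [f"tax/{s}_{a}.txt" for s in SAMPLES for a in ("seq", "tax")]
targets = fastqs + bams + bams_ef + tax  # what list_reads returns
```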
diff --git a/mess/workflow/rules/simulate/long_reads.smk b/mess/workflow/rules/simulate/long_reads.smk
index 82be4f4..c2336d4 100644
--- a/mess/workflow/rules/simulate/long_reads.smk
+++ b/mess/workflow/rules/simulate/long_reads.smk
@@ -7,6 +7,7 @@ if CIRCULAR:
     )
 id_prefix = os.path.basename(prefix)
 
+
 if PASSES > 1:
     pbsim3_out = temp(prefix + ".sam")
     rename = f"mv {prefix}_0001.sam {prefix}.sam"
@@ -17,7 +18,7 @@ else:
 
 rule pbsim3:
     input:
-        fa=fasta,
+        fasta,
     output:
         pbsim3_out,
         temp(prefix + ".maf"),
@@ -35,7 +36,7 @@ rule pbsim3:
         seed=lambda wildcards: int(get_value("seed", wildcards)),
         prefix=prefix,
         id_prefix=id_prefix,
-        reads_rename=rename,
+        rename=rename,
     log:
         os.path.join(dir.out.logs, "pbsim3", "{sample}", "{fasta}", "{contig}.log")
         if not CIRCULAR
@@ -64,9 +65,10 @@ rule pbsim3:
         --qshmm {params.model} \\
         --pass-num {params.passes} \\
         --accuracy-mean {params.accuracy} \\
-        --depth {params.cov} --genome {input.fa} &> {log}
+        --depth {params.cov} \\
+        --genome {input} &> {log}
 
         mv {params.prefix}_0001.maf {params.prefix}.maf
         mv {params.prefix}_0001.ref {params.prefix}.ref
-        {params.reads_rename}
+        {params.rename}
         """
diff --git a/mess/workflow/rules/simulate/short_reads.smk b/mess/workflow/rules/simulate/short_reads.smk
index 6d83d11..88d1d2f 100644
--- a/mess/workflow/rules/simulate/short_reads.smk
+++ b/mess/workflow/rules/simulate/short_reads.smk
@@ -1,4 +1,4 @@
-art_args = ""
+art_args = "-k 0 "
 if CUSTOM_ERR == None:
     art_args += f"-ss {ERROR} "
 if CUSTOM_ERR:
@@ -16,16 +16,22 @@ if PAIRED:
 
 
 if BAM:
-    art_args += "-sam -M"
+    art_args += "-sam -M "
+
+if ERRFREE:
+    art_args += "-ef "
 
 
 fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}")
 if CIRCULAR:
     fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}_{n}")
 
-sam_out = temp(fq_prefix + ".txt")
+sam_out = [temp(fq_prefix + ".txt")]
+ext_cmd = f"touch {sam_out[0]} "
 if BAM:
-    sam_out = temp(fq_prefix + ".sam")
+    sam_out += [temp(fq_prefix + ".sam")]
+if ERRFREE:
+    sam_out += [temp(fq_prefix + "_errFree.sam")]
 
 
 fastq_out = [
@@ -47,14 +53,15 @@ rule art_illumina:
     input:
         fasta,
     output:
-        sam=sam_out,
         fastqs=fastq_out,
+        sam=sam_out,
     params:
         args=art_args,
         read_len=MEAN_LEN,
         cov=lambda wildcards: get_value("cov_sim", wildcards),
         seed=lambda wildcards: int(get_value("seed", wildcards)),
         prefix=fq_prefix,
+        cmd=ext_cmd,
     log:
         os.path.join(dir.out.logs, "art", "{sample}", "{fasta}", "{contig}.log")
         if not CIRCULAR
@@ -75,5 +82,5 @@ rule art_illumina:
         -rs {params.seed} -l {params.read_len} \\
         -f {params.cov} -na {params.args} \\
         -o {params.prefix} &> {log}
-        touch {output.sam}
+        {params.cmd}
         """
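Note: the art_illumina argument string is now assembled flag by flag, and the trailing `touch` moved into `params.cmd`. A sketch of the assembly for a run with `--bam` and `--errfree` (profile name hypothetical; per ART's help, `-ef` writes an extra zero-error `<prefix>_errFree.sam`, and `-k 0` is assumed here to mean maxIndel=0, which would make the removed `fix_art_sam` CIGAR workaround unnecessary):

```python
ERROR = "HS25"  # hypothetical --error profile
BAM = True      # --bam
ERRFREE = True  # --errfree

art_args = "-k 0 "           # assumption: caps indels at 0, keeping CIGARs full-length M
art_args += f"-ss {ERROR} "  # built-in error profile, unless --custom-err is given
if BAM:
    art_args += "-sam -M "   # emit an alignment SAM using CIGAR 'M'
if ERRFREE:
    art_args += "-ef "       # also emit <prefix>_errFree.sam

print(art_args.strip())  # -k 0 -ss HS25 -sam -M -ef
```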
diff --git a/mess/workflow/simulate.smk b/mess/workflow/simulate.smk
index 4f2b23f..5a5c2d0 100644
--- a/mess/workflow/simulate.smk
+++ b/mess/workflow/simulate.smk
@@ -91,6 +91,7 @@ include: os.path.join("rules", "processing", "fastas.smk")
 CUSTOM_ERR = config.args.custom_err
 ERROR = config.args.error
 BAM = config.args.bam
+ERRFREE = config.args.errfree
 MIN_LEN = config.args.min_len
 MAX_LEN = config.args.max_len
 SD_LEN = config.args.sd_len
@@ -110,6 +111,7 @@ else:
 
 
 # reads post-processing options
+random.seed(SEED)
 SHUFFLE = dict(zip(SAMPLES, random.sample(range(1, 100000), len(SAMPLES))))
 SKIP_SHUFFLE = config.args.skip_shuffle
 RANKS = config.args.ranks
diff --git a/profiles/slurm/slurm-jobscript.sh b/profiles/slurm/slurm-jobscript.sh
old mode 100755
new mode 100644
diff --git a/profiles/slurm/slurm-sidecar.py b/profiles/slurm/slurm-sidecar.py
old mode 100755
new mode 100644
diff --git a/profiles/slurm/slurm-status.py b/profiles/slurm/slurm-status.py
old mode 100755
new mode 100644
diff --git a/profiles/slurm/slurm-submit.py b/profiles/slurm/slurm-submit.py
old mode 100755
new mode 100644