diff --git a/mess/__main__.py b/mess/__main__.py index 3b6fdee..9e2a51a 100644 --- a/mess/__main__.py +++ b/mess/__main__.py @@ -105,7 +105,11 @@ local_sim_options = ( { "name": "Local genomes options", - "options": ["--asm-summary", "--fasta"], + "options": [ + "--asm-summary", + "--fasta", + "--rotate", + ], }, ) diff --git a/mess/util.py b/mess/util.py index 2fba7cd..193a4c2 100644 --- a/mess/util.py +++ b/mess/util.py @@ -116,7 +116,6 @@ def run_snakemake( if prefix: snake_command += ["--conda-prefix", prefix] if sdm == "apptainer": - snake_command += [ f"--sdm apptainer --apptainer-args '-B {workflow_basedir}:{workflow_basedir}'" ] @@ -248,6 +247,13 @@ def sim_options(func): default=False, required=False, ), + click.option( + "--rotate", + help="Number of times to shuffle genome start for circular assemblies (2 or more for circular)", + type=int, + default=1, + show_default=True, + ), click.option( "--tech", help="Sequencing technology", diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index 6e75413..edf0a4a 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -89,19 +89,47 @@ def list_fastas(wildcards): table_cache = {} +def get_value(value, wildcards): + vals = ( + f"{wildcards.sample}", + f"{wildcards.fasta}", + f"{wildcards.contig}", + ) + idx_col = ["samplename", "fasta", "contig"] -def get_value(table, wildcards, value): + if ROTATE > 1: + idx_col += ["n"] + vals += (int(wildcards.n),) + + table = checkpoints.split_contigs.get(**wildcards).output[0] if table not in table_cache: df = pd.read_csv( table, sep="\t", - index_col=["samplename", "fasta", "contig"], + index_col=idx_col, ) table_cache[table] = df df = table_cache[table] - val = df.loc[wildcards.sample].loc[wildcards.fasta].loc[wildcards.contig][value] - return val + return df.loc[vals, value] + + +fa_table_cache = {} +def get_random_start(wildcards): + table = checkpoints.split_contigs.get(**wildcards).output[0] + if table not in fa_table_cache: + df = pd.read_csv( + table, + sep="\t", + index_col=["fasta","contig","n"] + ) + fa_table_cache[table] = df + vals=(f"{wildcards.fasta}",f"{wildcards.contig}",int(wildcards.n)) + df = fa_table_cache[table] + return df.loc[vals, "random_start"] + + + def get_asm_summary(wildcards): @@ -130,22 +158,46 @@ def aggregate(wildcards, outdir, level, ext): else: contigs = list(set(contigs)) if PAIRED and ext != "bam": - return expand( - os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}"), - sample=wildcards.sample, - fasta=wildcards.fasta, - p=wildcards.p, - contig=contigs, - ext=ext, - ) + if ROTATE > 1: + return expand( + os.path.join( + outdir, "{sample}", "{fasta}", "{contig}_{n}{p}.{ext}" + ), + sample=wildcards.sample, + fasta=wildcards.fasta, + n=list(range(1, ROTATE + 1)), + p=wildcards.p, + contig=contigs, + ext=ext, + ) + else: + return expand( + os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}"), + sample=wildcards.sample, + fasta=wildcards.fasta, + p=wildcards.p, + contig=contigs, + ext=ext, + ) + else: - return expand( - os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}"), - sample=wildcards.sample, - fasta=wildcards.fasta, - contig=contigs, - ext=ext, - ) + if ROTATE > 1: + return expand( + os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}.{ext}"), + sample=wildcards.sample, + fasta=wildcards.fasta, + contig=contigs, + n=list(range(1, ROTATE + 1)), + ext=ext, + ) + else: + return expand( + os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}"), + sample=wildcards.sample, + fasta=wildcards.fasta, + contig=contigs, + ext=ext, + ) if level == "fasta": df = pd.read_csv(table, sep="\t", index_col=["samplename", "fasta"]) fastas = list(set(df.loc[wildcards.sample].index)) diff --git a/mess/workflow/rules/processing/fastas.smk b/mess/workflow/rules/processing/fastas.smk index e9346d6..c24014c 100644 --- a/mess/workflow/rules/processing/fastas.smk +++ b/mess/workflow/rules/processing/fastas.smk @@ -50,9 +50,45 @@ checkpoint split_contigs: output: tsv=os.path.join(dir.out.processing, "cov.tsv"), dir=directory(os.path.join(dir.out.processing, "split")), + params: + rotate=ROTATE, resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.sml.time, script: os.path.join(dir.scripts, "split_contigs.py") + + +if ROTATE > 1: + + rule rotate_contigs: + input: + fa=os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna"), + df=get_cov_table, + output: + os.path.join(dir.out.processing, "rotate", "{fasta}_{contig}_{n}.fna"), + params: + get_random_start, + log: + os.path.join( + dir.out.logs, + "seqkit", + "restart", + "{fasta}_{contig}_{n}.log", + ), + resources: + mem_mb=config.resources.sml.mem, + mem=str(config.resources.sml.mem) + "MB", + time=config.resources.sml.time, + threads: config.resources.sml.cpu + conda: + os.path.join(dir.conda, "seqkit.yml") + container: + containers.seqkit + shell: + """ + seqkit restart -i {params} {input.fa} | \ + seqkit seq -i | \ + seqkit replace -p .+ -r {wildcards.contig}_{wildcards.n} > {output} + """ diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index fc7d8f3..60d0c8e 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -1,8 +1,11 @@ fastq_dir = dir.out.long -sam_in = os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.sam") +contig = "{contig}" +if ROTATE > 1: + contig = "{contig}_{n}" +sam_in = os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".sam") if SEQ_TECH == "illumina": fastq_dir = dir.out.short - sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fixed") + sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed") fastq = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq") fastq_gz = temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq.gz")) @@ -15,7 +18,7 @@ if PASSES > 1: rule ccs_sam_to_bam: input: - os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.sam"), + os.path.join(dir.out.long, "{sample}", "{fasta}", contig + ".sam"), output: temp( os.path.join( @@ -23,19 +26,11 @@ if PASSES > 1: "ccs", "{sample}", "{fasta}", - "{contig}.ccs.bam", + contig + ".ccs.bam", ) ), - benchmark: - os.path.join( - dir.out.bench, - "samtools", - "sam2bam", - "{sample}", - "{fasta}_{contig}.txt", - ) log: - os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", "{contig}.log"), + os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", contig + ".log"), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", @@ -53,14 +48,16 @@ if PASSES > 1: rule ccs_bam_to_fastq: input: - os.path.join(dir.out.base, "ccs", "{sample}", "{fasta}", "{contig}.ccs.bam"), + os.path.join( + dir.out.base, "ccs", "{sample}", "{fasta}", contig + ".ccs.bam" + ), output: fq=temp( os.path.join( dir.out.long, "{sample}", "{fasta}", - "{contig}.fq.gz", + contig + ".fq.gz", ) ), json=temp( @@ -68,13 +65,11 @@ if PASSES > 1: dir.out.long, "{sample}", "{fasta}", - "{contig}.zmw_metrics.json.gz", + contig + ".zmw_metrics.json.gz", ) ), - benchmark: - os.path.join(dir.out.bench, "ccs", "{sample}", "{fasta}", "{contig}.txt") log: - os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", "{contig}.log"), + os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", contig + ".log"), params: passes=PASSES, accuracy=ACCURACY, @@ -100,10 +95,10 @@ if BAM: rule add_reference_name: input: - maf=os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.maf"), - fa=os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.ref"), + maf=os.path.join(dir.out.long, "{sample}", "{fasta}", contig + ".maf"), + fa=os.path.join(dir.out.long, "{sample}", "{fasta}", contig + ".ref"), output: - temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.maf")), + temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".maf")), params: seqname=lambda wildcards, input: get_header(input.fa), resources: @@ -117,24 +112,16 @@ if BAM: rule convert_maf_to_sam: input: - os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.maf"), + os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".maf"), output: - temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.sam")), - benchmark: - os.path.join( - dir.out.logs, - "bioconvert", - "maf2sam", - "{sample}", - "{fasta}_{contig}.txt", - ) + temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".sam")), log: os.path.join( dir.out.logs, "bioconvert", "maf2sam", "{sample}", - "{fasta}_{contig}.log", + "{fasta}" + "_" + contig + ".log", ), resources: mem_mb=config.resources.sml.mem, @@ -157,9 +144,9 @@ rule fix_art_sam: Fixes truncated art_illumina SAM files with some genomes """ input: - os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.sam"), + os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".sam"), output: - temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fixed")), + temp(os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed")), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", @@ -178,10 +165,14 @@ rule convert_sam_to_bam: input: sam_in, output: - temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.bam")), + temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".bam")), log: os.path.join( - dir.out.logs, "bioconvert", "sam2bam", "{sample}", "{fasta}{contig}.log" + dir.out.logs, + "bioconvert", + "sam2bam", + "{sample}", + "{fasta}" + contig + ".log", ), resources: mem_mb=config.resources.sml.mem, @@ -288,7 +279,7 @@ rule get_bam_coverage: containers.bioconvert shell: """ - samtools coverage {input} | tee {output} {log} + samtools coverage {input} > {output} 2> {log} """ @@ -306,10 +297,12 @@ rule get_tax_profile: time=config.resources.sml.time, threads: config.resources.sml.cpu params: - paired=PAIRED, + rotate=ROTATE, run: tax_df = pd.read_csv(input.tax, sep="\t") tax_df = tax_df[tax_df.samplename == wildcards.sample] + if params.rotate > 1: + tax_df["contig"] = tax_df["contig"] + "_" + tax_df["n"].astype(str) cov_df = pd.read_csv(input.cov, sep="\t") cov_df.rename(columns={"#rname": "contig"}, inplace=True) merge_df = tax_df.merge(cov_df) diff --git a/mess/workflow/rules/simulate/long_reads.smk b/mess/workflow/rules/simulate/long_reads.smk index fa9af5a..c6e4e46 100644 --- a/mess/workflow/rules/simulate/long_reads.smk +++ b/mess/workflow/rules/simulate/long_reads.smk @@ -1,20 +1,25 @@ prefix = os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}") +fasta = os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna") +if ROTATE > 1: + prefix = os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}_{n}") + fasta = os.path.join(dir.out.processing, "rotate", "{fasta}_{contig}_{n}.fna") +id_prefix = os.path.basename(prefix) + if PASSES > 1: - pbsim3_out = temp(os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.sam")) + pbsim3_out = temp(prefix + ".sam") rename = f"mv {prefix}_0001.sam {prefix}.sam" else: - pbsim3_out = temp(os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.fq")) + pbsim3_out = temp(prefix + ".fq") rename = f"mv {prefix}_0001.fastq {prefix}.fq" rule pbsim3: input: - fa=os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna"), - df=get_cov_table, + fa=fasta, output: pbsim3_out, - temp(os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.maf")), - temp(os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.ref")), + temp(prefix + ".maf"), + temp(prefix + ".ref"), params: model=os.path.join(MODEL_PATH, f"{MODEL}.model"), ratio=RATIO, @@ -24,14 +29,17 @@ rule pbsim3: maxlen=MAX_LEN, passes=PASSES, accuracy=ACCURACY, - cov=lambda wildcards, input: get_value(input.df, wildcards, "cov_sim"), - seed=lambda wildcards, input: int(get_value(input.df, wildcards, "seed")), + cov=lambda wildcards: get_value("cov_sim", wildcards), + seed=lambda wildcards: int(get_value("seed", wildcards)), prefix=prefix, + id_prefix=id_prefix, reads_rename=rename, - benchmark: - os.path.join(dir.out.bench, "pbsim3", "{sample}", "{fasta}", "{contig}.txt") log: - os.path.join(dir.out.logs, "pbsim3", "{sample}", "{fasta}", "{contig}.log"), + os.path.join(dir.out.logs, "pbsim3", "{sample}", "{fasta}", "{contig}.log") + if ROTATE == 1 + else os.path.join( + dir.out.logs, "pbsim3", "{sample}", "{fasta}", "{contig}_{n}.log" + ), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", @@ -43,7 +51,7 @@ rule pbsim3: shell: """ pbsim --strategy wgs --method qshmm \\ - --id-prefix {wildcards.contig} \\ + --id-prefix {params.id_prefix} \\ --difference-ratio {params.ratio} \\ --length-min {params.minlen} \\ --length-max {params.maxlen} \\ diff --git a/mess/workflow/rules/simulate/short_reads.smk b/mess/workflow/rules/simulate/short_reads.smk index a7cf6b5..7d60e2a 100644 --- a/mess/workflow/rules/simulate/short_reads.smk +++ b/mess/workflow/rules/simulate/short_reads.smk @@ -19,25 +19,31 @@ if BAM: art_args += "-sam -M" -sam_out = temp(os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}.txt")) +fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}") +if ROTATE > 1: + fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}_{n}") + +sam_out = temp(fq_prefix + ".txt") if BAM: - sam_out = temp(os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}.sam")) + sam_out = temp(temp(fq_prefix + ".sam")) fastq_out = [ - temp(os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}1.fq")), - temp(os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}2.fq")), + temp(fq_prefix + "1.fq"), + temp(fq_prefix + "2.fq"), ] -fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}") - if not PAIRED: - fastq_out = temp(os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}.fq")) + fastq_out = temp(fq_prefix + ".fq") + +fasta = os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna") +if ROTATE > 1: + fasta = os.path.join(dir.out.processing, "rotate", "{fasta}_{contig}_{n}.fna") rule art_illumina: input: - fa=os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna"), + fa=fasta, df=get_cov_table, output: sam=sam_out, @@ -45,13 +51,15 @@ rule art_illumina: params: args=art_args, read_len=MEAN_LEN, - cov=lambda wildcards, input: get_value(input.df, wildcards, "cov_sim"), - seed=lambda wildcards, input: int(get_value(input.df, wildcards, "seed")), + cov=lambda wildcards: get_value("cov_sim", wildcards), + seed=lambda wildcards: int(get_value("seed", wildcards)), prefix=fq_prefix, - benchmark: - os.path.join(dir.out.bench, "art", "{sample}", "{fasta}", "{contig}.txt") log: - os.path.join(dir.out.logs, "art", "{sample}", "{fasta}", "{contig}.log"), + os.path.join(dir.out.logs, "art", "{sample}", "{fasta}", "{contig}.log") + if ROTATE == 1 + else os.path.join( + dir.out.logs, "art", "{sample}", "{fasta}", "{contig}_{n}.log" + ), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", diff --git a/mess/workflow/scripts/split_contigs.py b/mess/workflow/scripts/split_contigs.py index 4505b2c..6750dc4 100644 --- a/mess/workflow/scripts/split_contigs.py +++ b/mess/workflow/scripts/split_contigs.py @@ -2,6 +2,12 @@ import pandas as pd import os from itertools import chain +import random + + +def get_random_start(seed, contig_length, n): + random.seed(seed) + return [random.randint(0, contig_length - 1) for _ in range(n)] def split_fasta(fa, outdir): @@ -11,7 +17,9 @@ def split_fasta(fa, outdir): for record in SeqIO.parse(fa, "fasta"): SeqIO.write(record, fasta_name + "_" + record.id + ".fna", "fasta") - record_ids.append({"contig": record.id, "fasta": name}) + record_ids.append( + {"contig": record.id, "fasta": name, "contig_length": len(record.seq)} + ) return record_ids @@ -24,6 +32,20 @@ def split_fasta(fa, outdir): id2fa = list(chain.from_iterable(id2fa)) contig_df = pd.DataFrame.from_records(id2fa) df = pd.merge(contig_df, cov_df, how="left", on="fasta") -df[["samplename", "fasta", "contig", "tax_id", "seed", "cov_sim"]].to_csv( - snakemake.output.tsv, sep="\t", index=False -) + +cols = ["samplename", "fasta", "contig", "contig_length", "tax_id", "seed", "cov_sim"] + +if snakemake.params.rotate > 1: + cols += ["n", "random_start"] + df["random_start"] = df.apply( + lambda row: get_random_start( + row["seed"], row["contig_length"], snakemake.params.rotate + ), + axis=1, + ) + df_expanded = df.explode("random_start").reset_index(drop=True) + df_expanded["n"] = df_expanded.groupby(["samplename", "contig"]).cumcount() + 1 + df = df_expanded + df["cov_sim"] = df["cov_sim"] / snakemake.params.rotate + +df[cols].to_csv(snakemake.output.tsv, sep="\t", index=False) diff --git a/mess/workflow/simulate.smk b/mess/workflow/simulate.smk index 32706ab..5e0a8cb 100644 --- a/mess/workflow/simulate.smk +++ b/mess/workflow/simulate.smk @@ -77,6 +77,12 @@ MEAN_LEN = config.args.mean_len # calculate coverages include: os.path.join("rules", "processing", "coverages.smk") + + +# fasta processing options +ROTATE = config.args.rotate + + include: os.path.join("rules", "processing", "fastas.smk")