Add support for circular genomes #31

Merged · 13 commits · Oct 17, 2024
6 changes: 5 additions & 1 deletion mess/__main__.py
@@ -105,7 +105,11 @@
 local_sim_options = (
     {
         "name": "Local genomes options",
-        "options": ["--asm-summary", "--fasta"],
+        "options": [
+            "--asm-summary",
+            "--fasta",
+            "--rotate",
+        ],
     },
 )

8 changes: 7 additions & 1 deletion mess/util.py
@@ -116,7 +116,6 @@ def run_snakemake(
     if prefix:
         snake_command += ["--conda-prefix", prefix]
     if sdm == "apptainer":
-
         snake_command += [
             f"--sdm apptainer --apptainer-args '-B {workflow_basedir}:{workflow_basedir}'"
         ]
@@ -248,6 +247,13 @@ def sim_options(func):
            default=False,
            required=False,
        ),
+        click.option(
+            "--rotate",
+            help="Number of times to shuffle genome start for circular assemblies (2 or more for circular)",
+            type=int,
+            default=1,
+            show_default=True,
+        ),
        click.option(
            "--tech",
            help="Sequencing technology",
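A minimal click sketch of the new flag in isolation, to show how the default and help text render (a hypothetical standalone command, not code from this PR; the assumed semantics are that 1 keeps the original start and 2 or more simulates that many rotated copies):

import click


# Hypothetical toy command reusing the option definition above.
@click.command()
@click.option(
    "--rotate",
    help="Number of times to shuffle genome start for circular assemblies",
    type=int,
    default=1,
    show_default=True,
)
def cli(rotate):
    # rotate=1: contigs untouched; rotate >= 2: rotated copies are simulated.
    click.echo(f"rotate={rotate}")


if __name__ == "__main__":
    cli()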
90 changes: 71 additions & 19 deletions mess/workflow/rules/preflight/functions.smk
@@ -89,19 +89,47 @@ def list_fastas(wildcards):
 
 
 table_cache = {}
+
+
+def get_value(value, wildcards):
+    vals = (
+        f"{wildcards.sample}",
+        f"{wildcards.fasta}",
+        f"{wildcards.contig}",
+    )
+    idx_col = ["samplename", "fasta", "contig"]
+
-def get_value(table, wildcards, value):
+    if ROTATE > 1:
+        idx_col += ["n"]
+        vals += (int(wildcards.n),)
+
     table = checkpoints.split_contigs.get(**wildcards).output[0]
     if table not in table_cache:
         df = pd.read_csv(
             table,
             sep="\t",
-            index_col=["samplename", "fasta", "contig"],
+            index_col=idx_col,
         )
         table_cache[table] = df
     df = table_cache[table]
-    val = df.loc[wildcards.sample].loc[wildcards.fasta].loc[wildcards.contig][value]
-    return val
+    return df.loc[vals, value]
+
+
+fa_table_cache = {}
+
+
+def get_random_start(wildcards):
+    table = checkpoints.split_contigs.get(**wildcards).output[0]
+    if table not in fa_table_cache:
+        df = pd.read_csv(
+            table,
+            sep="\t",
+            index_col=["fasta", "contig", "n"],
+        )
+        fa_table_cache[table] = df
+    vals = (f"{wildcards.fasta}", f"{wildcards.contig}", int(wildcards.n))
+    df = fa_table_cache[table]
+    return df.loc[vals, "random_start"]
 
 
 def get_asm_summary(wildcards):
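The rewritten get_value replaces three chained .loc calls with a single lookup on a pandas MultiIndex, optionally extended with the rotation counter n; get_random_start uses the same pattern keyed on (fasta, contig, n). A self-contained sketch of that lookup, with made-up values:

import pandas as pd

# Toy stand-in for the cov.tsv written by the split_contigs checkpoint.
df = pd.DataFrame(
    {
        "samplename": ["s1", "s1"],
        "fasta": ["ecoli", "ecoli"],
        "contig": ["chr1", "chr1"],
        "n": [1, 2],
        "random_start": [1043, 88211],
    }
).set_index(["samplename", "fasta", "contig", "n"])

# One .loc call with the full key tuple replaces the old chained
# df.loc[sample].loc[fasta].loc[contig][value] lookups.
print(df.loc[("s1", "ecoli", "chr1", 2), "random_start"])  # 88211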
@@ -130,22 +158,46 @@ def aggregate(wildcards, outdir, level, ext):
         else:
             contigs = list(set(contigs))
         if PAIRED and ext != "bam":
-            return expand(
-                os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}"),
-                sample=wildcards.sample,
-                fasta=wildcards.fasta,
-                p=wildcards.p,
-                contig=contigs,
-                ext=ext,
-            )
+            if ROTATE > 1:
+                return expand(
+                    os.path.join(
+                        outdir, "{sample}", "{fasta}", "{contig}_{n}{p}.{ext}"
+                    ),
+                    sample=wildcards.sample,
+                    fasta=wildcards.fasta,
+                    n=list(range(1, ROTATE + 1)),
+                    p=wildcards.p,
+                    contig=contigs,
+                    ext=ext,
+                )
+            else:
+                return expand(
+                    os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}"),
+                    sample=wildcards.sample,
+                    fasta=wildcards.fasta,
+                    p=wildcards.p,
+                    contig=contigs,
+                    ext=ext,
+                )
         else:
-            return expand(
-                os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}"),
-                sample=wildcards.sample,
-                fasta=wildcards.fasta,
-                contig=contigs,
-                ext=ext,
-            )
+            if ROTATE > 1:
+                return expand(
+                    os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}.{ext}"),
+                    sample=wildcards.sample,
+                    fasta=wildcards.fasta,
+                    contig=contigs,
+                    n=list(range(1, ROTATE + 1)),
+                    ext=ext,
+                )
+            else:
+                return expand(
+                    os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}"),
+                    sample=wildcards.sample,
+                    fasta=wildcards.fasta,
+                    contig=contigs,
+                    ext=ext,
+                )
 
     if level == "fasta":
         df = pd.read_csv(table, sep="\t", index_col=["samplename", "fasta"])
         fastas = list(set(df.loc[wildcards.sample].index))
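The new branches only change the filename pattern handed to expand(); a rough pure-Python stand-in for the ROTATE > 1 case (names and values are illustrative):

import os
from itertools import product

# Assumed values, for illustration only.
outdir, sample, fasta, ext = "out", "s1", "ecoli", "fq"
contigs, ROTATE = ["chr1"], 3

# Mirrors expand() over the "{contig}_{n}.{ext}" pattern.
paths = [
    os.path.join(outdir, sample, fasta, f"{contig}_{n}.{ext}")
    for contig, n in product(contigs, range(1, ROTATE + 1))
]
print(paths)
# ['out/s1/ecoli/chr1_1.fq', 'out/s1/ecoli/chr1_2.fq', 'out/s1/ecoli/chr1_3.fq']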
36 changes: 36 additions & 0 deletions mess/workflow/rules/processing/fastas.smk
@@ -50,9 +50,45 @@ checkpoint split_contigs:
     output:
         tsv=os.path.join(dir.out.processing, "cov.tsv"),
         dir=directory(os.path.join(dir.out.processing, "split")),
+    params:
+        rotate=ROTATE,
     resources:
         mem_mb=config.resources.sml.mem,
         mem=str(config.resources.sml.mem) + "MB",
         time=config.resources.sml.time,
     script:
         os.path.join(dir.scripts, "split_contigs.py")
+
+
+if ROTATE > 1:
+
+    rule rotate_contigs:
+        input:
+            fa=os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna"),
+            df=get_cov_table,
+        output:
+            os.path.join(dir.out.processing, "rotate", "{fasta}_{contig}_{n}.fna"),
+        params:
+            get_random_start,
+        log:
+            os.path.join(
+                dir.out.logs,
+                "seqkit",
+                "restart",
+                "{fasta}_{contig}_{n}.log",
+            ),
+        resources:
+            mem_mb=config.resources.sml.mem,
+            mem=str(config.resources.sml.mem) + "MB",
+            time=config.resources.sml.time,
+        threads: config.resources.sml.cpu
+        conda:
+            os.path.join(dir.conda, "seqkit.yml")
+        container:
+            containers.seqkit
+        shell:
+            """
+            seqkit restart -i {params} {input.fa} | \
+            seqkit seq -i | \
+            seqkit replace -p .+ -r {wildcards.contig}_{wildcards.n} > {output}
+            """
71 changes: 32 additions & 39 deletions mess/workflow/rules/processing/reads.smk
@@ -1,8 +1,11 @@
 fastq_dir = dir.out.long
-sam_in = os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.sam")
+contig = "{contig}"
+if ROTATE > 1:
+    contig = "{contig}_{n}"
+sam_in = os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".sam")
 if SEQ_TECH == "illumina":
     fastq_dir = dir.out.short
-    sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fixed")
+    sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed")
 
 fastq = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq")
 fastq_gz = temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq.gz"))
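The contig variable is plain string composition over wildcard placeholders, so every rule below can share one pattern; a tiny sketch of the resulting paths (assumed values):

import os

ROTATE = 3  # assumed for illustration
contig = "{contig}_{n}" if ROTATE > 1 else "{contig}"
sam_in = os.path.join("bam", "{sample}", "{fasta}", contig + ".sam")
print(sam_in)  # bam/{sample}/{fasta}/{contig}_{n}.sam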
@@ -15,27 +18,19 @@ if PASSES > 1:
 
     rule ccs_sam_to_bam:
         input:
-            os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.sam"),
+            os.path.join(dir.out.long, "{sample}", "{fasta}", contig + ".sam"),
         output:
             temp(
                 os.path.join(
                     dir.out.base,
                     "ccs",
                     "{sample}",
                     "{fasta}",
-                    "{contig}.ccs.bam",
+                    contig + ".ccs.bam",
                 )
             ),
-        benchmark:
-            os.path.join(
-                dir.out.bench,
-                "samtools",
-                "sam2bam",
-                "{sample}",
-                "{fasta}_{contig}.txt",
-            )
         log:
-            os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", "{contig}.log"),
+            os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", contig + ".log"),
         resources:
             mem_mb=config.resources.sml.mem,
             mem=str(config.resources.sml.mem) + "MB",
@@ -53,28 +48,28 @@ if PASSES > 1:
 
     rule ccs_bam_to_fastq:
         input:
-            os.path.join(dir.out.base, "ccs", "{sample}", "{fasta}", "{contig}.ccs.bam"),
+            os.path.join(
+                dir.out.base, "ccs", "{sample}", "{fasta}", contig + ".ccs.bam"
+            ),
         output:
             fq=temp(
                 os.path.join(
                     dir.out.long,
                     "{sample}",
                     "{fasta}",
-                    "{contig}.fq.gz",
+                    contig + ".fq.gz",
                 )
             ),
             json=temp(
                 os.path.join(
                     dir.out.long,
                     "{sample}",
                     "{fasta}",
-                    "{contig}.zmw_metrics.json.gz",
+                    contig + ".zmw_metrics.json.gz",
                 )
             ),
-        benchmark:
-            os.path.join(dir.out.bench, "ccs", "{sample}", "{fasta}", "{contig}.txt")
         log:
-            os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", "{contig}.log"),
+            os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", contig + ".log"),
         params:
             passes=PASSES,
             accuracy=ACCURACY,
@@ -100,10 +95,10 @@ if BAM:
 
     rule add_reference_name:
         input:
-            maf=os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.maf"),
-            fa=os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.ref"),
+            maf=os.path.join(dir.out.long, "{sample}", "{fasta}", contig + ".maf"),
+            fa=os.path.join(dir.out.long, "{sample}", "{fasta}", contig + ".ref"),
         output:
-            temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.maf")),
+            temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".maf")),
         params:
             seqname=lambda wildcards, input: get_header(input.fa),
         resources:
@@ -117,24 +112,16 @@ if BAM:
 
     rule convert_maf_to_sam:
         input:
-            os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.maf"),
+            os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".maf"),
         output:
-            temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.sam")),
-        benchmark:
-            os.path.join(
-                dir.out.logs,
-                "bioconvert",
-                "maf2sam",
-                "{sample}",
-                "{fasta}_{contig}.txt",
-            )
+            temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".sam")),
         log:
             os.path.join(
                 dir.out.logs,
                 "bioconvert",
                 "maf2sam",
                 "{sample}",
-                "{fasta}_{contig}.log",
+                "{fasta}" + "_" + contig + ".log",
             ),
         resources:
             mem_mb=config.resources.sml.mem,
@@ -157,9 +144,9 @@ rule fix_art_sam:
     Fixes truncated art_illumina SAM files with some genomes
     """
     input:
-        os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.sam"),
+        os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".sam"),
     output:
-        temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fixed")),
+        temp(os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed")),
     resources:
         mem_mb=config.resources.sml.mem,
         mem=str(config.resources.sml.mem) + "MB",
@@ -178,10 +165,14 @@ rule convert_sam_to_bam:
     input:
         sam_in,
     output:
-        temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.bam")),
+        temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".bam")),
     log:
         os.path.join(
-            dir.out.logs, "bioconvert", "sam2bam", "{sample}", "{fasta}{contig}.log"
+            dir.out.logs,
+            "bioconvert",
+            "sam2bam",
+            "{sample}",
+            "{fasta}" + contig + ".log",
         ),
     resources:
         mem_mb=config.resources.sml.mem,
@@ -288,7 +279,7 @@ rule get_bam_coverage:
         containers.bioconvert
     shell:
         """
-        samtools coverage {input} | tee {output} {log}
+        samtools coverage {input} > {output} 2> {log}
         """


@@ -306,10 +297,12 @@ rule get_tax_profile:
         time=config.resources.sml.time,
     threads: config.resources.sml.cpu
     params:
-        paired=PAIRED,
+        rotate=ROTATE,
     run:
         tax_df = pd.read_csv(input.tax, sep="\t")
        tax_df = tax_df[tax_df.samplename == wildcards.sample]
+        if params.rotate > 1:
+            tax_df["contig"] = tax_df["contig"] + "_" + tax_df["n"].astype(str)
         cov_df = pd.read_csv(input.cov, sep="\t")
         cov_df.rename(columns={"#rname": "contig"}, inplace=True)
         merge_df = tax_df.merge(cov_df)
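The renaming in the run block is a vectorized string concatenation that makes tax-table contig names match the rotated sequence names before the merge; a minimal sketch with made-up rows:

import pandas as pd

# Toy tax table; only the columns the run block touches.
tax_df = pd.DataFrame({"contig": ["chr1", "chr1"], "n": [1, 2]})

# Suffix each contig with its rotation counter (chr1 -> chr1_1, chr1_2)
# so it matches the "#rname" values in the samtools coverage output.
tax_df["contig"] = tax_df["contig"] + "_" + tax_df["n"].astype(str)
print(tax_df["contig"].tolist())  # ['chr1_1', 'chr1_2']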