Skip to content

Commit

Permalink
Test for #243
Browse files Browse the repository at this point in the history
  • Loading branch information
lucventurini committed Oct 18, 2019
1 parent 3c225e3 commit 19f3b2f
Show file tree
Hide file tree
Showing 3 changed files with 163 additions and 2 deletions.
83 changes: 81 additions & 2 deletions sample_data/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ configfile: "configuration.yaml"

# Aggregate target: once every file listed under `input` exists, the rule
# touches `finished.ok`. One target per line with explicit commas, so that two
# adjacent string literals can never be silently concatenated into one bogus
# file name.
rule complete:
    input:
        "compare.stats",
        "compare_subloci.stats",
        "compare_input.stats",
        "check.ok",
        "check_metrics.ok",
        "daijin_test/mikado.yaml",
        "g11.ok",
        "refmap_check.ok",
        # Produced by `check_external_pick` (external-scores test).
        "external.ok"
    output: touch("finished.ok")


Expand Down Expand Up @@ -128,7 +129,6 @@ rule compare_subloci:
message: """mikado compare -r {input[reference]} -p {input[prediction]} -o compare_subloci -l {log}"""
shell: """mikado compare -r {input[reference]} -p {input[prediction]} -o compare_subloci -l {log}"""


rule check_refmap:
input:
refmap=rules.compare_input.output.refmap
Expand Down Expand Up @@ -177,6 +177,85 @@ rule check_pick:
open(output[0], "wt")


# Build a kallisto index from the `prep_fasta` output of the `daijin` rule.
# The index is written next to the other files under Daijin/5-mikado.
rule test_external_kal_index:
    input:
        fasta=rules.daijin.output.prep_fasta
    output:
        index=os.path.join("Daijin", "5-mikado", "mikado_prepared.idx")
    threads: 1
    conda: "kallisto.yaml"
    shell:
        "kallisto index -i {output.index} {input.fasta}"

# Quantify the paired ERR588038 reads against the kallisto index, then reduce
# kallisto's `abundance.tsv` to a two-column table (fields 1 and 5) whose
# `target_id` header is renamed to `tid`.
# NOTE(review): field 5 is presumably the TPM column, given that the downstream
# check looks for `external.tpm` - confirm against the kallisto output format.
rule test_external_kal_quant:
    input:
        index=rules.test_external_kal_index.output.index,
        r1="ERR588038.R1.fq.gz",
        r2="ERR588038.R2.fq.gz"
    output:
        kal_tsv=os.path.join("Daijin", "5-mikado", "kallisto", "abundance.tsv"),
        data=os.path.join("Daijin", "5-mikado", "data.txt")
    params:
        # Directory handed to `kallisto quant -o`; `kal_tsv` lives inside it.
        folder=os.path.join("Daijin", "5-mikado", "kallisto")
    threads: 2
    conda: "kallisto.yaml"
    shell:
        """kallisto quant -t {threads} -o {params.folder} -i {input.index} {input.r1} {input.r2} && \
cut -f 1,5 {output.kal_tsv} | sed 's/target_id/tid/' > {output.data}"""


# Run `mikado serialise` with the kallisto-derived table passed via
# `--external`, writing a separate database (`mikado_external.db`) into
# Daijin/5-mikado. After serialising, the shell block queries the `external`
# table with sqlite3 and fails the rule unless it holds at least one row.
rule test_external_kal_serialise:
    input:
        # `db` is not used in the command itself.
        # NOTE(review): presumably listed only to order this rule after
        # `daijin` - confirm.
        db=rules.daijin.output.db,
        data=rules.test_external_kal_quant.output.data,
    output:
        db=os.path.join("Daijin", "5-mikado", "mikado_external.db"),
        check=touch(os.path.join("Daijin", "5-mikado", "mikado_external.check.ok"))
    log: os.path.join("Daijin", "5-mikado", "mikado_serialise_external.log")
    threads: 2
    params:
        # Bare file name; combined with `-od Daijin/5-mikado` it matches `output.db`.
        db="mikado_external.db"
    shell:
        """
mikado serialise --xml=Daijin/5-mikado/blast/xmls --blast_targets=Daijin/5-mikado/blast/index/blastdb-proteins.fa \
--start-method=spawn --transcripts=Daijin/5-mikado/mikado_prepared.fasta \
--genome_fai=Daijin/5-mikado/chr5.fas.gz.fai --json-conf=configuration.yaml \
--external {input.data} -nsa --force \
--orfs=Daijin/5-mikado/transdecoder/transcripts.fasta.transdecoder.bed \
-od Daijin/5-mikado --procs={threads} -l {log} {params.db};
if [[ $(sqlite3 {output.db} "select count(*) > 0 from external") != 1 ]]; then exit 1; else exit 0; fi"""


# Run `mikado pick` in permissive mode against the database that contains the
# external scores, using the `plant_external.yaml` scoring file. Results go to
# Daijin/5-mikado/pick/external.
rule test_external_kal_pick:
    input:
        db=rules.test_external_kal_serialise.output.db,
        gtf=rules.daijin.output.prep,
        scoring="plant_external.yaml"
    output:
        loci=os.path.join("Daijin", "5-mikado", "pick", "external", "mikado-permissive.loci.gff3"),
        scores=os.path.join("Daijin", "5-mikado", "pick", "external", "mikado-permissive.loci.scores.tsv"),
        subscores=os.path.join("Daijin", "5-mikado", "pick", "external", "mikado.subloci.scores.tsv")
    params:
        outdir=os.path.join("Daijin", "5-mikado", "pick", "external"),
        # Bare file name; `-od {params.outdir}` supplies the directory.
        loci_out="mikado-permissive.loci.gff3"
    log: os.path.join("Daijin", "5-mikado", "pick", "external", "mikado-permissive.log")
    threads: 2
    # `--procs` follows the rule's `threads` value instead of a hard-coded 2, so
    # the command never uses more processes than Snakemake has allotted (the
    # scheduler lowers `threads` when fewer cores are available).
    shell: """mikado pick --scoring-file {input.scoring} --source Mikado_permissive \
--mode=permissive --procs={threads} --start-method=spawn \
--json-conf=configuration.yaml -od {params.outdir} \
-l {log} --loci-out {params.loci_out} -lv INFO -db {input.db} {input.gtf}"""

# Verify that the external scores reached the `mikado pick` output: both the
# loci scores table and the subloci scores table must contain an
# `external.tpm` column whose maximum is greater than zero. On success the
# rule touches `external.ok`.
rule check_external_pick:
    input:
        scores=rules.test_external_kal_pick.output.scores,
        subscores=rules.test_external_kal_pick.output.subscores,
    output: touch("external.ok")
    run:
        import pandas as pd

        # Same two checks for each table, in the original order.
        for table_key in ("scores", "subscores"):
            table = pd.read_csv(input[table_key], delimiter="\t")
            assert "external.tpm" in table.columns
            assert table["external.tpm"].max() > 0

rule test_g11_prodigal:
input:
transcripts=rules.daijin.output.prep_fasta
Expand Down
5 changes: 5 additions & 0 deletions sample_data/kallisto.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Conda environment referenced by the `conda: "kallisto.yaml"` directive of
# the kallisto test rules in the Snakefile.
channels:
  - bioconda
  - conda-forge
dependencies:
  - kallisto
77 changes: 77 additions & 0 deletions sample_data/plant_external.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Scoring file suitable for any species with intron sizes similar to plants.
# Compared with a plain scoring file, the `scoring` section additionally
# rewards the `external.tpm` metric (last entry).
#
# Layout of the three filter sections below: `parameters` maps a metric name
# (optionally with a `.suffix` so the same metric can appear with different
# thresholds) to an operator/value comparison; `expression` combines those
# names with `and` / `or`.
requirements:
  expression: [(combined_cds_fraction.ncrna or combined_cds_fraction.coding), and, ((exon_num.multi and (cdna_length.multi, or, combined_cds_length.multi) and max_intron_length, and, min_intron_length and proportion_verified_introns_inlocus ) or (exon_num.mono and (combined_cds_length.mono or cdna_length.mono))) ]
  parameters:
    combined_cds_fraction.ncrna: {operator: eq, value: 0}
    combined_cds_fraction.coding: {operator: gt, value: 0.30}
    cdna_length.mono: {operator: gt, value: 400}
    cdna_length.multi: {operator: ge, value: 300}
    combined_cds_length.mono: {operator: gt, value: 225}
    combined_cds_length.multi: {operator: gt, value: 150}
    exon_num.mono: {operator: eq, value: 1}
    exon_num.multi: {operator: gt, value: 1}
    max_intron_length: {operator: le, value: 20000}
    min_intron_length: {operator: ge, value: 5}
    proportion_verified_introns_inlocus: {operator: gt, value: 0}
as_requirements:
  expression: [cdna_length and three_utr_length and five_utr_length and utr_length and suspicious_splicing]
  parameters:
    cdna_length: {operator: ge, value: 300}
    utr_length: {operator: le, value: 2500}
    five_utr_length: {operator: le, value: 2500}
    three_utr_length: {operator: le, value: 2500}
    suspicious_splicing: {operator: ne, value: true}
    # NOTE(review): `max_intron_length` is defined here but not referenced in
    # the expression above - confirm whether it is intentionally unused.
    max_intron_length: {operator: le, value: 10000}
not_fragmentary:
  expression: [((exon_num.multi and (cdna_length.multi or combined_cds_length.multi)), or, (exon_num.mono and combined_cds_length.mono))]
  parameters:
    # NOTE(review): `is_complete` is defined here but not referenced in the
    # expression above - confirm whether it is intentionally unused.
    is_complete: {operator: eq, value: true}
    exon_num.multi: {operator: gt, value: 1}
    cdna_length.multi: {operator: ge, value: 400}
    combined_cds_length.multi: {operator: gt, value: 200}
    exon_num.mono: {operator: eq, value: 1}
    combined_cds_length.mono: {operator: gt, value: 600}
# Per-metric scoring rules. Every entry sets `rescaling` to max, min or
# target; target entries also carry a `value`, and some entries add a `filter`
# comparison.
scoring:
  blast_score: {rescaling: max}
  cdna_length: {rescaling: max}
  cds_not_maximal: {rescaling: min}
  cds_not_maximal_fraction: {rescaling: min}
  exon_num:
    rescaling: max
    filter: {operator: ge, value: 3}
  five_utr_length:
    filter: {operator: le, value: 2500}
    rescaling: target
    value: 200
  five_utr_num:
    filter: {operator: lt, value: 4}
    rescaling: target
    value: 2
  end_distance_from_junction:
    filter: {operator: lt, value: 55}
    rescaling: min
  highest_cds_exon_number: {rescaling: max}
  intron_fraction: {rescaling: max}
  is_complete: {rescaling: target, value: true}
  number_internal_orfs: {rescaling: target, value: 1}
  non_verified_introns_num: {rescaling: min}
  proportion_verified_introns_inlocus: {rescaling: max}
  retained_fraction: {rescaling: min}
  retained_intron_num: {rescaling: min}
  selected_cds_fraction: {rescaling: target, value: 0.8}
  selected_cds_intron_fraction: {rescaling: max}
  selected_cds_length: {rescaling: max}
  selected_cds_num: {rescaling: max}
  three_utr_length:
    filter: {operator: le, value: 2500}
    rescaling: target
    value: 400
  three_utr_num:
    filter: {operator: lt, value: 3}
    rescaling: target
    value: 1
  combined_cds_locus_fraction: {rescaling: max}
  # External metric; the Snakefile's `check_external_pick` rule asserts that
  # an `external.tpm` column appears in the pick score tables.
  external.tpm: {rescaling: max}

0 comments on commit 19f3b2f

Please sign in to comment.