Skip to content

Commit

Permalink
feat: one DNA or RNA count file across multiple replicates (#144)
Browse files Browse the repository at this point in the history
* feat: using one experiment across all replicates for DNA (or RNA)

* snakemake format

---------

Co-authored-by: Max Schubach <max.schubach@bih-charite.de>
  • Loading branch information
visze and Max Schubach authored Dec 6, 2024
1 parent f1944b8 commit ccbacee
Show file tree
Hide file tree
Showing 7 changed files with 228 additions and 124 deletions.
7 changes: 5 additions & 2 deletions workflow/rules/assigned_counts.smk
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ rule assigned_counts_assignBarcodes:
conda:
"../envs/python3.yaml"
input:
counts=lambda wc: getFinalCounts(wc.project, wc.config, wc.type, "counts"),
counts=lambda wc: getFinalCounts(
wc.project, wc.config, wc.condition, wc.type, "counts"
),
association="results/experiments/{project}/assignment/{assignment}.tsv.gz",
script=getScript("count/merge_BC_and_assignment.py"),
output:
Expand Down Expand Up @@ -213,7 +215,8 @@ rule assigned_counts_combine_replicates_barcode_output:
]["bc_threshold"],
bc_counts=lambda wc: " ".join(
[
"--counts %s results/experiments/%s/assigned_counts/%s/%s/%s_%s_barcode_assigned_counts.tsv.gz" % (rep, wc.project, wc.assignment, wc.config, wc.condition, rep)
"--counts %s results/experiments/%s/assigned_counts/%s/%s/%s_%s_barcode_assigned_counts.tsv.gz"
% (rep, wc.project, wc.assignment, wc.config, wc.condition, rep)
for rep in getReplicatesOfCondition(wc.project, wc.condition)
]
),
Expand Down
67 changes: 38 additions & 29 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -276,15 +276,16 @@ def getOutputConditionReplicateType_helper(files, project, skip={}):
return []
conditions = getConditions(project)
for condition in conditions:
replicates = getReplicatesOfCondition(project, condition)
for file in files:
output += expand(
file,
project=project,
condition=condition,
replicate=replicates,
type=["RNA", "DNA"],
)
for type in ["DNA", "RNA"]:
replicates = getReplicatesOfConditionType(project, condition, type)
output += expand(
file,
project=project,
condition=condition,
replicate=replicates,
type=type,
)
return output


Expand Down Expand Up @@ -688,24 +689,46 @@ def counts_getSamplingConfig(project, conf, dna_or_rna, command):
return ""


def getFinalCounts(project, conf, rna_or_dna, raw_or_assigned):
def getReplicatesOfConditionType(project, condition, rna_or_dna):
    """Return the replicates to use for one condition and molecule type.

    The starting point is ``getReplicatesOfCondition(project, condition)``.
    That list is reduced to a one-element list holding only its first entry
    when all of the following hold:

    * the experiment table has a ``<rna_or_dna>_BC_F`` column,
    * the condition has more than one replicate, and
    * that column holds exactly one distinct value across the rows whose
      ``Condition`` equals ``condition``.

    In every other case the full replicate list is returned unchanged.

    Args:
        project: Project identifier passed on to the experiment lookups.
        condition: Condition name used to select the experiment rows.
        rna_or_dna: Prefix of the ``*_BC_F`` column to inspect
            (callers in this file pass ``"DNA"`` or ``"RNA"``).

    Returns:
        The list of replicates, or a list with only the first replicate.
    """
    experiments = getExperiments(project)
    all_replicates = getReplicatesOfCondition(project, condition)

    bc_column = f"{rna_or_dna}_BC_F"
    if bc_column not in experiments.columns:
        return all_replicates

    rows_of_condition = experiments[experiments.Condition == condition]

    # Keep the short-circuit: the column is only inspected when there is
    # more than one replicate to collapse.
    if len(all_replicates) > 1 and rows_of_condition[bc_column].nunique() == 1:
        return [all_replicates[0]]

    return all_replicates


def getFinalCounts(project, conf, condition, rna_or_dna, raw_or_assigned):
output = ""

replicates = getReplicatesOfConditionType(project, condition, rna_or_dna)
if len(replicates) > 1:
replicate = "{replicate}"
else:
replicate = replicates[0]

if raw_or_assigned == "counts":
if useSampling(project, conf, rna_or_dna):
output = (
"results/experiments/{project}/%s/{condition}_{replicate}_%s_final_counts.sampling.{config}.tsv.gz"
% (raw_or_assigned, rna_or_dna)
"results/experiments/{project}/%s/{condition}_%s_%s_final_counts.sampling.{config}.tsv.gz"
% (raw_or_assigned, replicate, rna_or_dna)
)

else:
output = (
"results/experiments/{project}/%s/{condition}_{replicate}_%s_final_counts.tsv.gz"
% (raw_or_assigned, rna_or_dna)
"results/experiments/{project}/%s/{condition}_%s_%s_final_counts.tsv.gz"
% (raw_or_assigned, replicate, rna_or_dna)
)
else:
output = (
"results/experiments/{project}/%s/{condition}_{replicate}_%s_final_counts.config.{config}.tsv.gz"
% (raw_or_assigned, rna_or_dna)
"results/experiments/{project}/%s/{condition}_%s_%s_final_counts.config.{config}.tsv.gz"
% (raw_or_assigned, replicate, rna_or_dna)
)
return output

Expand Down Expand Up @@ -733,20 +756,6 @@ def assignedCounts_getAssignmentSamplingConfig(project, assignment, command):
# statistic.smk specific functions


# get all counts of experiment (rule statistic_counts)
def getCountStats(project, countType):
    """Collect the per-experiment count statistic paths for a project.

    For every row of the experiment table, the path pattern is expanded with
    that row's ``Condition`` and ``Replicate`` and with both types
    (``"DNA"`` then ``"RNA"``). The doubled braces keep ``{project}`` and
    ``{countType}`` as placeholders in the result; ``countType`` itself is
    not read here.

    Args:
        project: Project identifier used to load the experiment table.
        countType: Unused by this function (kept as a placeholder in paths).

    Returns:
        A list of path patterns, two per experiment row.
    """
    experiments = getExperiments(project)
    stat_files = []
    for _, entry in experiments.iterrows():
        stat_files.extend(
            expand(
                "results/experiments/{{project}}/statistic/counts/{condition}_{replicate}_{type}_{{countType}}_counts.tsv.gz",
                condition=entry["Condition"],
                replicate=entry["Replicate"],
                type=["DNA", "RNA"],
            )
        )
    return stat_files


# get all barcodes of experiment (rule statistic_BC_in_RNA_DNA)
def getBCinRNADNAStats(wc):
exp = getExperiments(wc.project)
Expand Down
8 changes: 6 additions & 2 deletions workflow/rules/counts.smk
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,12 @@ rule counts_dna_rna_merge_counts:
conda:
"../envs/default.yaml"
input:
dna=lambda wc: getFinalCounts(wc.project, wc.config, "DNA", wc.raw_or_assigned),
rna=lambda wc: getFinalCounts(wc.project, wc.config, "RNA", wc.raw_or_assigned),
dna=lambda wc: getFinalCounts(
wc.project, wc.config, wc.condition, "DNA", wc.raw_or_assigned
),
rna=lambda wc: getFinalCounts(
wc.project, wc.config, wc.condition, "RNA", wc.raw_or_assigned
),
output:
"results/experiments/{project}/{raw_or_assigned}/{condition}_{replicate}.merged.config.{config}.tsv.gz",
params:
Expand Down
10 changes: 7 additions & 3 deletions workflow/rules/statistic/bc_overlap.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ rule statistic_bc_overlap_run:
"../../envs/r.yaml"
input:
files=lambda wc: expand(
getFinalCounts(wc.project, wc.config, wc.type, wc.raw_or_assigned),
getFinalCounts(
wc.project, wc.config, wc.condition, wc.type, wc.raw_or_assigned
),
project=wc.project,
condition=wc.condition,
config=wc.config,
Expand All @@ -22,7 +24,9 @@ rule statistic_bc_overlap_run:
params:
input=lambda wc: ",".join(
expand(
getFinalCounts(wc.project, wc.config, wc.type, wc.raw_or_assigned),
getFinalCounts(
wc.project, wc.config, wc.condition, wc.type, wc.raw_or_assigned
),
project=wc.project,
condition=wc.condition,
config=wc.config,
Expand All @@ -31,7 +35,7 @@ rule statistic_bc_overlap_run:
),
cond="{condition}_{type}",
replicates=lambda wc: ",".join(
getReplicatesOfCondition(wc.project, wc.condition)
getReplicatesOfConditionType(wc.project, wc.condition, wc.type)
),
log:
temp(
Expand Down
15 changes: 7 additions & 8 deletions workflow/rules/statistic/counts.smk
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
include: "counts_common.smk"


#################################
## count 10 most frequent UMIs ##
#################################
Expand Down Expand Up @@ -148,15 +151,11 @@ rule statistic_counts_BC_in_RNA_DNA:
conda:
"../../envs/default.yaml"
input:
dna=lambda wc: (
"results/experiments/{project}/counts/{condition}_{replicate}_DNA_{countType}_counts.tsv.gz"
if wc.countType != "raw"
else getRawCounts(wc.project, "DNA")
dna=lambda wc: statistic_counts_BC_in_RNA_DNA_helper(
project, wc.condition, "DNA", wc.countType
),
rna=lambda wc: (
"results/experiments/{project}/counts/{condition}_{replicate}_RNA_{countType}_counts.tsv.gz"
if wc.countType != "raw"
else getRawCounts(wc.project, "RNA")
rna=lambda wc: statistic_counts_BC_in_RNA_DNA_helper(
project, wc.condition, "RNA", wc.countType
),
output:
temp(
Expand Down
37 changes: 37 additions & 0 deletions workflow/rules/statistic/counts_common.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
def statistic_counts_BC_in_RNA_DNA_helper(project, condition, dna_or_rna, countType):
    """Build the count-file path for the BC-in-RNA/DNA statistic.

    For ``countType == "raw"`` the path comes from
    ``getRawCounts(project, dna_or_rna)``; otherwise it is the
    ``results/experiments/{project}/counts/...`` pattern with ``dna_or_rna``
    filled in. If ``getReplicatesOfConditionType`` yields exactly one
    replicate, the ``{replicate}`` placeholder is replaced by that replicate;
    otherwise the placeholder is left in place.

    Args:
        project: Project identifier.
        condition: Condition whose replicates are looked up.
        dna_or_rna: Type inserted into the path (callers pass ``"DNA"`` or
            ``"RNA"``).
        countType: ``"raw"`` selects the raw-count path; any other value
            selects the counts pattern.

    Returns:
        The path pattern, with ``{replicate}`` possibly resolved.
    """
    replicates = getReplicatesOfConditionType(project, condition, dna_or_rna)

    if countType != "raw":
        template = (
            "results/experiments/{project}/counts/{condition}_{replicate}_%s_{countType}_counts.tsv.gz"
            % dna_or_rna
        )
    else:
        template = getRawCounts(project, dna_or_rna)

    if len(replicates) != 1:
        return template

    # A single replicate means the file name is fixed rather than wildcarded.
    return template.replace("{replicate}", replicates[0])


# get all counts of experiment (rule statistic_counts)
def getCountStats(project, countType):
    """Collect the count statistic paths for every experiment row.

    For each row of the experiment table and each type (``"DNA"`` then
    ``"RNA"``), one path is produced. The replicate in the path is the single
    entry from ``getReplicatesOfConditionType`` when that list has exactly
    one element, and the row's own ``Replicate`` otherwise.

    When a condition's replicates collapse to one, every row of that
    condition produces the same path. Those repeats are dropped, keeping the
    first occurrence, so each statistic file is listed once.

    The doubled braces keep ``{project}`` and ``{countType}`` as placeholders
    in the result; ``countType`` itself is not read here.

    Args:
        project: Project identifier used to load the experiment table.
        countType: Unused by this function (kept as a placeholder in paths).

    Returns:
        A list of unique path patterns in first-seen order.
    """
    exp = getExperiments(project)
    output = []
    for _, row in exp.iterrows():
        condition = row["Condition"]
        for dna_or_rna in ["DNA", "RNA"]:
            replicates = getReplicatesOfConditionType(project, condition, dna_or_rna)
            replicate = replicates[0] if len(replicates) == 1 else row["Replicate"]
            output += expand(
                "results/experiments/{{project}}/statistic/counts/{condition}_{replicate}_{type}_{{countType}}_counts.tsv.gz",
                condition=condition,
                replicate=replicate,
                type=dna_or_rna,
            )
    # dict keys preserve insertion order, so this removes repeats stably.
    return list(dict.fromkeys(output))
Loading

0 comments on commit ccbacee

Please sign in to comment.