forked from stjudecloud/workflows
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* [WIP] feat: calc_gene_lengths task * feat: add calc_gene_lengths * feat: add calc_tpm task * fix: correct ESTIMATE dockerfile path * refactor: change name conventions * feat: add run_ESTIMATE task * feat: add ESTIMATE workflow * feat(ESTIMATE): filter to common_genes. Output both calculations * style(estimate): swap filtered and unfiltered emphasis * style(estimate): don't output unfiltered estimate scores * fix: misspelled variable name * ci: add lint-check ignore line ability * ci: don't fail on bad grep * feat: add calc_gene_lengths workflow * chore: point to master branch
- Loading branch information
Showing
9 changed files
with
220 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# R base image with the ESTIMATE package installed.
FROM r-base:4.0.3

# The package is fetched from R-Forge over HTTPS.
# install.packages() only emits a warning when an install fails, so the
# trailing library() call is what turns a broken install into a failed build.
RUN R -e 'install.packages("estimate", repos="https://r-forge.r-project.org", dependencies=TRUE); library("estimate")'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Stage 1: build a conda environment named "gtfparse" holding gtfparse 1.2.1.
FROM stjudecloud/conda-base:1.0.0 AS builder

# -y on both commands keeps the build non-interactive (no confirmation prompt).
RUN conda create -n gtfparse \
    gtfparse==1.2.1 \
    -y \
    && conda clean --all -y

# Stage 2: slim runtime image that receives only the environment's bin/ and lib/ trees.
FROM debian:10-slim
COPY --from=builder /opt/conda/envs/gtfparse/bin/ /opt/conda/envs/gtfparse/bin/
COPY --from=builder /opt/conda/envs/gtfparse/lib/ /opt/conda/envs/gtfparse/lib/
# Put the environment's executables ahead of the system ones.
ENV PATH=/opt/conda/envs/gtfparse/bin:$PATH

# Containers run the environment's python by default.
ENTRYPOINT [ "python" ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Ubuntu 18.04 image with wget, zip, gcc, zlib headers and Python 3 installed.
FROM ubuntu:18.04

# One layer: refresh the package index, upgrade, install the tools,
# then remove the downloaded package lists.
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y gcc python3 wget zip zlib1g-dev \
    && rm -r /var/lib/apt/lists/*
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
version 1.0 | ||
|
||
# Convert a two-column raw-count table into TPM (transcripts per million).
#
# Inputs:
#   counts        tab-separated "<gene>\t<count>" rows, no header. Rows whose
#                 gene field starts with "__" are not treated as genes
#                 (presumably summary counters from the counting tool).
#   gene_lengths  one header line, then tab-separated "<gene>\t<length>" rows.
#                 Every gene listed here must also appear in `counts`.
#   outfile       name of the TPM table to write.
#   max_retries   forwarded to the runtime's maxRetries.
# Output:
#   out           "Gene name\t<sample>" header followed by "<gene>\t<TPM>" rows,
#                 sorted by gene name, TPM printed with 3 decimals.
task calc_tpm {
    input {
        File counts
        File gene_lengths
        String outfile = basename(counts, ".feature-counts.txt") + ".TPM.txt"
        Int max_retries = 1
    }

    command <<<
        # Paths are handed to Python through the environment. The heredoc
        # delimiter is quoted so the shell passes the script through verbatim.
        COUNTS="~{counts}" GENE_LENGTHS="~{gene_lengths}" OUTFILE="~{outfile}" python3 - <<'END'
import os  # lint-check: ignore

# Raw counts keyed by gene name.
counts = {}
with open(os.environ['COUNTS'], 'r') as counts_file:
    for line in counts_file:
        if not line.strip():
            continue  # tolerate blank lines
        gene, count = line.split('\t')
        if gene[0:2] == '__':
            # Skipped rather than ending the scan, so gene rows that
            # follow such a row are still counted.
            continue
        counts[gene.strip()] = int(count.strip())

# Reads per kilobase for every gene in the length table, plus their sum.
rpks = {}
tot_rpk = 0
with open(os.environ['GENE_LENGTHS'], 'r') as lengths_file:
    lengths_file.readline()  # discard header
    for line in lengths_file:
        if not line.strip():
            continue  # tolerate blank lines
        gene, length = line.split('\t')
        rpk = counts[gene.strip()] / int(length.strip()) * 1000
        tot_rpk += rpk
        rpks[gene.strip()] = rpk

# Per-million scaling factor; zero when every gene has a zero count.
sf = tot_rpk / 1000000

# Column header: the output name minus its last two dot-separated parts.
sample_name = '.'.join(os.environ['OUTFILE'].split('.')[:-2])
with open(os.environ['OUTFILE'], 'w') as outfile:
    print(f"Gene name\t{sample_name}", file=outfile)
    for gene, rpk in sorted(rpks.items()):
        # An all-zero sample yields 0.000 everywhere instead of dividing by zero.
        tpm = rpk / sf if sf else 0.0
        print(f"{gene}\t{tpm:.3f}", file=outfile)
END
    >>>

    runtime {
        memory: "4 GB"
        disk: "4 GB"
        docker: 'stjudecloud/util:1.1.0'
        maxRetries: max_retries
    }

    output {
        File out = "~{outfile}"
    }
}
|
||
# Score a gene-expression table with the ESTIMATE R package.
#
# Inputs:
#   gene_expression_file  tab-separated table with a header row; the gene
#                         column is read back in R as "Gene.name"
#   outfile               name given to the resulting score file
#   max_retries           forwarded to the runtime's maxRetries
# Output:
#   out                   the file written by estimateScore(), renamed to `outfile`
task run_ESTIMATE {
    input {
        File gene_expression_file
        String outfile = basename(gene_expression_file, ".TPM.txt") + ".ESTIMATE.gct"
        Int max_retries = 1
    }

    command <<<
        # Stop at the first failing command so later steps never run on missing files.
        set -e

        cp "~{gene_expression_file}" gene_expression.txt
        # The delimiter is quoted so the shell never touches the $ in the R code.
        Rscript - <<'END'
library("estimate")
infile <- read.table(file = "gene_expression.txt", sep = '\t', header = TRUE)
# Keep only rows whose gene is listed in common_genes (not defined in this
# script; presumably a dataset provided by the estimate package).
filtered <- infile[infile$"Gene.name" %in% common_genes[['GeneSymbol']], ]
write.table(filtered, sep = "\t", file = "filtered.tsv", row.names = FALSE, quote = FALSE)
# Convert the filtered table to GCT, then score it.
outputGCT("filtered.tsv", "gene_expression.gct")
estimateScore("gene_expression.gct", "common_estimate.gct", platform = "illumina")
END
        mv common_estimate.gct "~{outfile}"
    >>>

    runtime {
        memory: "4 GB"
        disk: "4 GB"
        docker: 'stjudecloud/estimate:1.0.0'
        maxRetries: max_retries
    }

    output {
        File out = "~{outfile}"
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
version 1.0 | ||
|
||
import "https://raw.githubusercontent.com/stjudecloud/workflows/master/tools/estimate.wdl" | ||
|
||
# Compute TPM values from raw counts, then run ESTIMATE on the TPM table.
#
# Inputs:
#   counts_file        raw count table passed to calc_tpm as `counts`
#   gene_lengths_file  gene length table passed to calc_tpm as `gene_lengths`
# Outputs:
#   tpm                TPM table produced by calc_tpm
#   gene_lengths       same file as `tpm`; kept under its original name for
#                      existing callers even though it holds TPM values,
#                      not gene lengths
#   estimate_out       ESTIMATE result produced by run_ESTIMATE
workflow ESTIMATE {
    input {
        File counts_file
        File gene_lengths_file
    }

    call estimate.calc_tpm { input: counts=counts_file, gene_lengths=gene_lengths_file }
    call estimate.run_ESTIMATE { input: gene_expression_file=calc_tpm.out }

    output {
        File tpm = calc_tpm.out
        File gene_lengths = calc_tpm.out
        File estimate_out = run_ESTIMATE.out
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
version 1.0 | ||
|
||
import "https://raw.githubusercontent.com/stjudecloud/workflows/master/tools/util.wdl" | ||
|
||
# Standalone workflow wrapping the util.calc_gene_lengths task.
#
# Input:
#   gtf           file handed to the task as its `gtf` input
# Output:
#   gene_lengths  the task's `out` file
workflow calc_gene_lengths {
    input {
        File gtf
    }

    call util.calc_gene_lengths as calc {
        input:
            gtf = gtf
    }

    output {
        File gene_lengths = calc.out
    }
}