Skip to content

Commit

Permalink
feat: ESTIMATE (stjudecloud#35)
Browse files Browse the repository at this point in the history
* [WIP] feat: calc_gene_lengths task

* feat: add calc_gene_lengths

* feat: add calc_tpm task

* fix: correct ESTIMATE dockerfile path

* refactor: change name conventions

* feat: add run_ESTIMATE task

* feat: add ESTIMATE workflow

* feat(ESTIMATE): filter to common_genes. Output both calculations

* style(estimate): swap filtered and unfiltered emphasis

* style(estimate): don't output unfiltered estimate scores

* fix: misspelled variable name

* ci: add lint-check ignore line ability

* ci: don't fail on bad grep

* feat: add calc_gene_lengths workflow

* chore: point to master branch
  • Loading branch information
a-frantz authored Jan 11, 2021
1 parent 5c56d60 commit d6121b6
Show file tree
Hide file tree
Showing 9 changed files with 220 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/lint-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
for file in $(find . -name '*.wdl'); do
>&2 echo "Checking file $file..."
import_lines=$(awk '/import/' "$file")
bad_lines=$(echo "$import_lines" | awk '!/https:\/\/raw.githubusercontent.com\/stjudecloud\/workflows\/master/')
bad_lines=$(echo "$import_lines" | awk '!/https:\/\/raw.githubusercontent.com\/stjudecloud\/workflows\/master/' | grep -v '# lint-check: ignore') || true
if [ -n "$bad_lines" ]; then
>&2 echo "Must import files from the master branch on Github."
>&2 echo "The following lines are bad:"
Expand Down
3 changes: 3 additions & 0 deletions docker/estimate/1.0.0/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# R base image with the "estimate" package installed from R-Forge.
FROM r-base:4.0.3

# install.packages() reports a failed installation only as a warning, so R
# still exits with status 0 and the image would build without the package.
# The follow-up library() call errors (non-zero exit) if "estimate" cannot be
# loaded, which makes the build fail instead of producing a broken image.
RUN R -e 'install.packages("estimate", repos="http://r-forge.r-project.org", dependencies=TRUE)' \
    && R -e 'library("estimate")'
13 changes: 13 additions & 0 deletions docker/gtfparse/1.0.0/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Stage 1: build a conda environment named "gtfparse" holding gtfparse 1.2.1.
FROM stjudecloud/conda-base:1.0.0 AS builder

RUN conda create -y -n gtfparse gtfparse==1.2.1 \
    && conda clean --all

# Stage 2: slim runtime image that receives only the environment's bin/ and
# lib/ directories from the builder stage.
FROM debian:10-slim
COPY --from=builder /opt/conda/envs/gtfparse/bin/ /opt/conda/envs/gtfparse/bin/
COPY --from=builder /opt/conda/envs/gtfparse/lib/ /opt/conda/envs/gtfparse/lib/

# Put the environment's executables first on PATH so "python" resolves to it.
ENV PATH=/opt/conda/envs/gtfparse/bin:$PATH

ENTRYPOINT ["python"]
6 changes: 6 additions & 0 deletions docker/util/1.1.0/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# General-purpose utility image: Ubuntu 18.04 plus a few command-line tools,
# a C toolchain with zlib headers, and python3.
FROM ubuntu:18.04

# Refresh the package index, upgrade, install the tools, then drop the apt
# lists in the same layer so they do not end up in the image.
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y \
        wget \
        zip \
        gcc \
        zlib1g-dev \
        python3 \
    && rm -r /var/lib/apt/lists/*
90 changes: 90 additions & 0 deletions tools/estimate.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
version 1.0

# Convert a two-column (gene, count) table into TPM values.
#
# Inputs:
#   counts       - tab-separated "gene<TAB>count" file without a header. Reading
#                  stops at the first row whose name starts with "__"
#                  (presumably the htseq-count summary counters - confirm).
#   gene_lengths - tab-separated "gene<TAB>length" file with one header line.
#   outfile      - name of the TPM table to write; the sample name in its header
#                  is the outfile name minus its last two dot-separated parts.
#   max_retries  - passed to the runtime as maxRetries.
#
# Output:
#   out - "Gene name<TAB>sample" header followed by one "gene<TAB>TPM" row per
#         gene in gene_lengths, sorted by gene name, TPM with three decimals.
#
# The script fails if gene_lengths lists a gene that has no count. Genes with a
# length of 0 get a TPM of 0 (with a warning on stderr) instead of crashing the
# task, and an all-zero count table yields all-zero TPMs.
task calc_tpm {
    input {
        File counts
        File gene_lengths
        String outfile = basename(counts, ".feature-counts.txt") + ".TPM.txt"
        Int max_retries = 1
    }

    command <<<
        # The heredoc delimiter is quoted so the shell never expands anything
        # inside the Python script; all values are passed via the environment.
        COUNTS="~{counts}" GENE_LENGTHS="~{gene_lengths}" OUTFILE="~{outfile}" python3 - <<'END'
        import os  # lint-check: ignore
        import sys  # lint-check: ignore

        # Load the per-gene counts.
        counts = {}
        with open(os.environ['COUNTS'], 'r') as counts_file:
            for line in counts_file:
                if not line.strip():
                    continue  # tolerate blank lines
                gene, count = line.split('\t')
                if gene[0:2] == '__':
                    break
                counts[gene.strip()] = int(count.strip())

        # Reads per kilobase for every gene that has a length.
        rpks = {}
        tot_rpk = 0
        with open(os.environ['GENE_LENGTHS'], 'r') as lengths_file:
            lengths_file.readline()  # discard header
            for line in lengths_file:
                if not line.strip():
                    continue  # tolerate blank lines
                gene, length = line.split('\t')
                gene = gene.strip()
                length = int(length.strip())
                if gene not in counts:
                    raise RuntimeError(f"Gene '{gene}' has a length but no count")
                if length > 0:
                    rpk = counts[gene] / length * 1000
                else:
                    # A zero length would divide by zero; report and score as 0.
                    print(f"WARNING: gene '{gene}' has length 0; TPM set to 0", file=sys.stderr)
                    rpk = 0.0
                tot_rpk += rpk
                rpks[gene] = rpk

        # Per-million scaling factor; 0 when every count is 0.
        sf = tot_rpk / 1000000
        sample_name = '.'.join(os.environ['OUTFILE'].split('.')[:-2])
        with open(os.environ['OUTFILE'], 'w') as outfile:
            print(f"Gene name\t{sample_name}", file=outfile)
            for gene, rpk in sorted(rpks.items()):
                tpm = rpk / sf if sf > 0 else 0.0
                print(f"{gene}\t{tpm:.3f}", file=outfile)
        END
    >>>

    runtime {
        memory: "4 GB"
        disk: "4 GB"
        docker: 'stjudecloud/util:1.1.0'
        maxRetries: max_retries
    }

    output {
        File out = "~{outfile}"
    }
}

# Run the ESTIMATE R package on a gene expression table.
#
# Inputs:
#   gene_expression_file - tab-separated table with a header line whose first
#                          column is "Gene name" (the format calc_tpm writes).
#   outfile              - name of the ESTIMATE score file to produce.
#   max_retries          - passed to the runtime as maxRetries.
#
# Output:
#   out - the file written by estimateScore(), renamed to outfile.
task run_ESTIMATE {
    input {
        File gene_expression_file
        String outfile = basename(gene_expression_file, ".TPM.txt") + ".ESTIMATE.gct"
        Int max_retries = 1
    }

    command <<<
        # Stop at the first failing step. Without this the exit status of the
        # task is that of the final mv, which can hide an Rscript failure.
        set -euo pipefail

        cp "~{gene_expression_file}" gene_expression.txt

        # The heredoc delimiter is quoted so the shell leaves the "$" used for
        # column access in the R script untouched.
        Rscript - <<'END'
        library("estimate")

        # read.table() rewrites the "Gene name" header to "Gene.name".
        infile <- read.table(file = "gene_expression.txt", sep = '\t', header = TRUE)

        # Keep only rows whose gene symbol appears in common_genes (presumably
        # the gene list shipped with the estimate package - confirm).
        filtered <- infile[infile$"Gene.name" %in% common_genes[['GeneSymbol']], ]
        write.table(filtered, sep = "\t", file = "filtered.tsv", row.names = FALSE, quote = FALSE)

        # Convert to GCT, then score.
        outputGCT("filtered.tsv", "gene_expression.gct")
        estimateScore("gene_expression.gct", "common_estimate.gct", platform = "illumina")
        END

        mv common_estimate.gct "~{outfile}"
    >>>

    runtime {
        memory: "4 GB"
        disk: "4 GB"
        docker: 'stjudecloud/estimate:1.0.0'
        maxRetries: max_retries
    }

    output {
        File out = "~{outfile}"
    }
}
2 changes: 1 addition & 1 deletion tools/htseq.wdl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
## # HTSeq
##
## This WDL tool wraps the [htseq](https://github.com/simon-anders/htseq) tool.
## This WDL tool wraps the [htseq](https://github.com/htseq/htseq) tool.
## HTSeq is a Python library for analyzing sequencing data.
version 1.0
Expand Down
73 changes: 73 additions & 0 deletions tools/util.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,76 @@ task file_prefix {
String out = read_string("stdout.txt")
}
}

# Compute, for every gene in a GTF, the number of bases covered by at least one
# of its exons (overlapping exons are counted once).
#
# Inputs:
#   gtf         - annotation file read with gtfparse.read_gtf(); records are
#                 grouped by their gene_name attribute.
#   outfile     - name of the table to write.
#   max_retries - passed to the runtime as maxRetries.
#
# Output:
#   out - "Gene name<TAB>length" header followed by one row per gene, sorted by
#         gene name.
#
# NOTE(review): genes are keyed by gene_name, so two gene records sharing a
# name overwrite each other - confirm names are unique in the GTFs used.
task calc_gene_lengths {
    input {
        File gtf
        String outfile = basename(gtf, ".gtf.gz") + ".genelengths.txt"
        Int max_retries = 1
    }

    Float gtf_size = size(gtf, "GiB")
    Int disk_size = ceil(gtf_size * 2 + 10)

    command <<<
        # The heredoc delimiter is quoted so the shell never expands anything
        # inside the Python script; all values are passed via the environment.
        GTF="~{gtf}" OUTFILE="~{outfile}" python - <<'END'
        import os  # lint-check: ignore
        import gtfparse  # lint-check: ignore
        import numpy as np  # lint-check: ignore

        gtf = gtfparse.read_gtf(os.environ['GTF'])
        only_genes = gtf[gtf['feature'] == 'gene']
        only_exons = gtf[gtf['feature'] == 'exon']

        gene_start_offset = {}
        gene_exon_intersection = {}

        # One boolean mask per gene, one element per base of the gene.
        for (index, value) in only_genes.iterrows():
            gene_name = value['gene_name']
            start = value['start']
            end = value['end']
            # GTF coordinates are 1-based and inclusive at both ends, so a
            # feature spanning start..end covers end - start + 1 bases.
            size = end - start + 1
            if size <= 0:
                raise RuntimeError("Size of gene is negative!")
            gene_start_offset[gene_name] = start
            # bool instead of the default float64: 1 byte per base, not 8.
            gene_exon_intersection[gene_name] = np.zeros(size, dtype=bool)

        # Mark every base covered by an exon; overlaps simply re-mark.
        for (index, value) in only_exons.iterrows():
            gene_name = value['gene_name']
            offset = gene_start_offset[gene_name]
            first = value['start'] - offset
            last = value['end'] - offset
            # +1 because the exon end coordinate is itself part of the exon.
            gene_exon_intersection[gene_name][first:last + 1] = True

        with open(os.environ['OUTFILE'], 'w') as outfile:
            print("Gene name\tlength", file=outfile)
            for gene in sorted(gene_exon_intersection):
                length = int(np.sum(gene_exon_intersection[gene]))
                print(f"{gene}\t{length}", file=outfile)
        END
    >>>

    runtime {
        memory: "8 GB"
        disk: disk_size + " GB"
        docker: 'stjudecloud/gtfparse:1.0.0'
        maxRetries: max_retries
    }

    output {
        File out = "~{outfile}"
    }
}
18 changes: 18 additions & 0 deletions workflows/rnaseq/ESTIMATE.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
version 1.0

import "https://raw.githubusercontent.com/stjudecloud/workflows/master/tools/estimate.wdl"

# Compute TPM values from a counts table and gene lengths, then run ESTIMATE
# on the TPM table.
#
# Inputs:
#   counts_file       - counts table handed to calc_tpm.
#   gene_lengths_file - gene length table handed to calc_tpm.
#
# Outputs:
#   tpm          - TPM table produced by calc_tpm.
#   gene_lengths - same file as tpm. The name is misleading (it is not a gene
#                  length table); it is kept only so existing consumers of this
#                  workflow keep working. Prefer tpm.
#   estimate_out - ESTIMATE scores produced by run_ESTIMATE.
workflow ESTIMATE {
    input {
        File counts_file
        File gene_lengths_file
    }

    call estimate.calc_tpm { input: counts=counts_file, gene_lengths=gene_lengths_file }
    call estimate.run_ESTIMATE { input: gene_expression_file=calc_tpm.out }

    output {
        File tpm=calc_tpm.out
        File gene_lengths=calc_tpm.out
        File estimate_out=run_ESTIMATE.out
    }
}
15 changes: 15 additions & 0 deletions workflows/rnaseq/calc-gene-lengths.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
version 1.0

import "https://raw.githubusercontent.com/stjudecloud/workflows/master/tools/util.wdl"

# Thin workflow wrapper around the util.calc_gene_lengths task.
#
# Input:
#   gtf - annotation file forwarded unchanged to the task.
#
# Output:
#   gene_lengths - the table produced by the task.
workflow calc_gene_lengths {
    input {
        File gtf
    }

    call util.calc_gene_lengths as calc {
        input:
            gtf = gtf
    }

    output {
        File gene_lengths = calc.out
    }
}

0 comments on commit d6121b6

Please sign in to comment.