feat: ESTIMATE (stjudecloud#35)

* [WIP] feat: calc_gene_lengths task
* feat: add calc_gene_lengths
* feat: add calc_tpm task
* fix: correct ESTIMATE dockerfile path
* refactor: change name conventions
* feat: add run_ESTIMATE task
* feat: add ESTIMATE workflow
* feat(ESTIMATE): filter to common_genes. Output both calculations
* style(estimate): swap filtered and unfiltered emphasis
* style(estimate): don't output unfiltered estimate scores
* fix: misspelled variable name
* ci: add lint-check ignore line ability
* ci: don't fail on bad grep
* feat: add calc_gene_lengths workflow
* chore: point to master branch
NGeneBio-Genomics-Platform · Jan 11, 2021 · d6121b6 · d6121b6
1 parent 5c56d60
commit d6121b6
Show file tree

Hide file tree

Showing 9 changed files with 220 additions and 2 deletions.
diff --git a/.github/workflows/lint-check.yml b/.github/workflows/lint-check.yml
@@ -19,7 +19,7 @@ jobs:
         for file in $(find . -name '*.wdl'); do
           >&2 echo "Checking file $file..."
           import_lines=$(awk '/import/' "$file")
-          bad_lines=$(echo "$import_lines" | awk '!/https:\/\/raw.githubusercontent.com\/stjudecloud\/workflows\/master/')
+          bad_lines=$(echo "$import_lines" | awk '!/https:\/\/raw.githubusercontent.com\/stjudecloud\/workflows\/master/' | grep -v '# lint-check: ignore') || true
           if [ -n "$bad_lines" ]; then
             >&2 echo "Must import files from the master branch on Github."
             >&2 echo "The following lines are bad:"

diff --git a/docker/estimate/1.0.0/Dockerfile b/docker/estimate/1.0.0/Dockerfile
@@ -0,0 +1,3 @@
+FROM r-base:4.0.3
+
+# install.packages() only emits a warning when an installation fails, which
+# would let the image build "succeed" without the package. Loading it right
+# after makes R exit with an error, so a broken install fails the build.
+# The repository is fetched over HTTPS rather than plain HTTP.
+RUN R -e 'install.packages("estimate", repos="https://r-forge.r-project.org", dependencies=TRUE); library("estimate")'
diff --git a/docker/gtfparse/1.0.0/Dockerfile b/docker/gtfparse/1.0.0/Dockerfile
@@ -0,0 +1,13 @@
+# Build stage: create a dedicated conda environment that holds gtfparse.
+FROM stjudecloud/conda-base:1.0.0 AS builder
+
+# Both conda commands get -y so neither can stop at a confirmation prompt
+# during a non-interactive image build.
+RUN conda create -n gtfparse \
+    gtfparse==1.2.1 \
+    -y \
+    && conda clean --all -y
+
+# Runtime stage: only the environment's bin/ and lib/ directories are copied
+# over from the build stage.
+FROM debian:10-slim
+COPY --from=builder /opt/conda/envs/gtfparse/bin/ /opt/conda/envs/gtfparse/bin/
+COPY --from=builder /opt/conda/envs/gtfparse/lib/ /opt/conda/envs/gtfparse/lib/
+# key=value form; the space-separated ENV form is the legacy syntax.
+ENV PATH=/opt/conda/envs/gtfparse/bin:$PATH
+
+# "python" is looked up through PATH, which lists the copied environment first.
+ENTRYPOINT [ "python" ]
diff --git a/docker/util/1.1.0/Dockerfile b/docker/util/1.1.0/Dockerfile
@@ -0,0 +1,6 @@
+# Ubuntu base image with a small set of command-line tools and python3.
+FROM ubuntu:18.04
+
+# One layer: refresh the package index, apply pending upgrades, install the
+# tools, then delete the apt package lists.
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y wget zip gcc zlib1g-dev python3 \
+    && rm -r /var/lib/apt/lists/*
diff --git a/tools/estimate.wdl b/tools/estimate.wdl
@@ -0,0 +1,90 @@
+version 1.0
+
+# Converts a two-column counts table into TPM (transcripts per million)
+# using per-gene lengths.
+#
+# Inputs:
+#   counts       - tab-separated "gene<TAB>count" lines without a header;
+#                  reading stops at the first gene id that starts with "__"
+#   gene_lengths - one header line followed by tab-separated
+#                  "gene<TAB>length" lines
+#   outfile      - name of the TPM table to write
+#   max_retries  - forwarded to the runtime maxRetries attribute
+# Output:
+#   out - table with header "Gene name<TAB><sample name>" and one TPM value
+#         (three decimals) per gene listed in gene_lengths, sorted by gene
+task calc_tpm {
+    input {
+        File counts
+        File gene_lengths
+        String outfile = basename(counts, ".feature-counts.txt") + ".TPM.txt"
+        Int max_retries = 1
+    }
+
+    command <<<
+        COUNTS="~{counts}" GENE_LENGTHS="~{gene_lengths}" OUTFILE="~{outfile}" python3 - <<END
+import os  # lint-check: ignore
+import sys  # lint-check: ignore
+
+# Load the raw counts. Rows whose id starts with "__" end the table
+# (presumably the htseq-count summary rows - confirm against the producer).
+counts = {}
+with open(os.environ['COUNTS'], 'r') as counts_file:
+    for line in counts_file:
+        gene, count = line.split('\t')
+        if gene[0:2] == '__':
+            break
+        counts[gene.strip()] = int(count.strip())
+
+# Reads per kilobase for every gene in the lengths table.
+rpks = {}
+tot_rpk = 0
+with open(os.environ['GENE_LENGTHS'], 'r') as lengths_file:
+    lengths_file.readline()  # discard header
+    for line in lengths_file:
+        gene, length = line.split('\t')
+        gene = gene.strip()
+        length = int(length.strip())
+        if length <= 0:
+            # A gene without a positive length has no defined rate; report it
+            # as zero instead of aborting with a ZeroDivisionError.
+            print(f"warning: gene {gene} has length {length}; reporting a TPM of 0", file=sys.stderr)
+            rpk = 0.0
+        else:
+            rpk = counts[gene] / length * 1000
+        tot_rpk += rpk
+        rpks[gene] = rpk
+
+# Per-million scale factor; it is 0 when no gene received any reads.
+sf = tot_rpk / 1000000
+
+# The sample name is the output file name minus its last two dot-separated
+# parts (".TPM.txt" for the default name).
+sample_name = '.'.join(os.environ['OUTFILE'].split('.')[:-2])
+with open(os.environ['OUTFILE'], 'w') as outfile:
+    print(f"Gene name\t{sample_name}", file=outfile)
+    for gene, rpk in sorted(rpks.items()):
+        # With a zero scale factor every RPK is zero as well, so emit 0.
+        tpm = rpk / sf if sf else 0.0
+        print(f"{gene}\t{tpm:.3f}", file=outfile)
+END
+    >>>
+
+    runtime {
+        memory: "4 GB"
+        disk: "4 GB"
+        docker: 'stjudecloud/util:1.1.0'
+        maxRetries: max_retries
+    }
+
+    output {
+        File out = "~{outfile}"
+    }
+}
+
+# Runs the ESTIMATE R package on a gene expression table.
+#
+# Inputs:
+#   gene_expression_file - tab-separated table with a header row whose gene
+#                          column is read back as "Gene.name"
+#   outfile              - name given to the resulting score file
+#   max_retries          - forwarded to the runtime maxRetries attribute
+# Output:
+#   out - the file written by estimateScore, renamed to outfile
+task run_ESTIMATE {
+    input {
+        File gene_expression_file
+        String outfile = basename(gene_expression_file, ".TPM.txt") + ".ESTIMATE.gct"
+        Int max_retries = 1
+    }
+
+    command <<<
+        cp "~{gene_expression_file}" gene_expression.txt
+        Rscript - <<END
+library("estimate")
+
+# Read the expression table and keep only the rows whose gene is listed in
+# common_genes (not defined here; presumably a dataset shipped with the
+# estimate package - confirm).
+expression <- read.table("gene_expression.txt", header = TRUE, sep = "\t")
+is_common <- expression$"Gene.name" %in% common_genes[["GeneSymbol"]]
+common_only <- expression[is_common, ]
+
+# Write the filtered table, convert it to GCT, then score it.
+write.table(common_only, file = "filtered.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
+outputGCT("filtered.tsv", "gene_expression.gct")
+estimateScore("gene_expression.gct", "common_estimate.gct", platform = "illumina")
+END
+    mv common_estimate.gct "~{outfile}"
+    >>>
+
+    runtime {
+        memory: "4 GB"
+        disk: "4 GB"
+        docker: 'stjudecloud/estimate:1.0.0'
+        maxRetries: max_retries
+    }
+
+    output {
+        File out = "~{outfile}"
+    }
+}
diff --git a/tools/htseq.wdl b/tools/htseq.wdl
@@ -1,6 +1,6 @@
 ## # HTSeq
 ##
-## This WDL tool wraps the [htseq](https://github.com/simon-anders/htseq) tool.
+## This WDL tool wraps the [htseq](https://github.com/htseq/htseq) tool.
 ## HTSeq is a Python library for analyzing sequencing data.
 
 version 1.0

diff --git a/tools/util.wdl b/tools/util.wdl
@@ -70,3 +70,76 @@ task file_prefix {
         String out = read_string("stdout.txt")
     }
 }
+
+# Computes, for every gene record in a GTF, the number of bases covered by
+# at least one of its exons (the union of the exons) and writes the result
+# as a two-column table.
+#
+# Inputs:
+#   gtf         - annotation file; the default outfile name strips ".gtf.gz"
+#   outfile     - name of the table to write
+#   max_retries - forwarded to the runtime maxRetries attribute
+# Output:
+#   out - tab-separated table with header "Gene name<TAB>length", sorted by
+#         gene name
+task calc_gene_lengths {
+    input {
+        File gtf
+        String outfile = basename(gtf, ".gtf.gz") + ".genelengths.txt"
+        Int max_retries = 1
+    }
+
+    Float gtf_size = size(gtf, "GiB")
+    Int disk_size = ceil(gtf_size * 2 + 10)
+
+    command <<<
+        GTF="~{gtf}" OUTFILE="~{outfile}" python - <<END
+import os  # lint-check: ignore
+import gtfparse  # lint-check: ignore
+import numpy as np  # lint-check: ignore
+
+gtf_name = os.environ['GTF']
+
+gtf = gtfparse.read_gtf(gtf_name)
+
+only_genes = gtf[gtf['feature'] == 'gene']
+only_exons = gtf[gtf['feature'] == 'exon']
+gene_start_offset = {}
+gene_exon_intersection = {}
+
+# One boolean mask per gene, one element per base of the gene.
+# NOTE(review): masks are keyed by gene_name; if two gene records share a
+# name, the later record replaces the earlier one. Confirm names are unique.
+for (index, value) in only_genes.iterrows():
+    gene_name = value['gene_name']
+    start = value['start']
+    end = value['end']
+    # GTF coordinates are 1-based and inclusive on both ends, so a feature
+    # spanning start..end covers end - start + 1 bases.
+    size = end - start + 1
+
+    if size <= 0:
+        raise RuntimeError(f"Gene {gene_name} has a non-positive size!")
+
+    gene_start_offset[gene_name] = start
+    gene_exon_intersection[gene_name] = np.zeros(size, dtype=bool)
+
+# Mark every base covered by an exon; overlapping exons mark the same
+# positions, so each base is counted once.
+for (index, value) in only_exons.iterrows():
+    gene_name = value['gene_name']
+    offset = gene_start_offset[gene_name]
+    start = value['start'] - offset
+    end = value['end'] - offset + 1  # inclusive end -> exclusive slice bound
+    gene_exon_intersection[gene_name][start:end] = True
+
+with open(os.environ['OUTFILE'], 'w') as outfile:
+    print("Gene name\tlength", file=outfile)
+    for (gene, exonic_intersection) in sorted(gene_exon_intersection.items()):
+        length = int(np.sum(exonic_intersection))
+        print(f"{gene}\t{length}", file=outfile)
+END
+    >>>
+
+    runtime {
+        memory: "8 GB"
+        disk: disk_size + " GB"
+        docker: 'stjudecloud/gtfparse:1.0.0'
+        maxRetries: max_retries
+    }
+
+    output {
+        File out = "~{outfile}"
+    }
+}
diff --git a/workflows/rnaseq/ESTIMATE.wdl b/workflows/rnaseq/ESTIMATE.wdl
@@ -0,0 +1,18 @@
+version 1.0
+
+import "https://raw.githubusercontent.com/stjudecloud/workflows/master/tools/estimate.wdl"
+
+# Turns a counts table into TPM values and scores them with ESTIMATE.
+workflow ESTIMATE {
+    input {
+        File counts_file
+        File gene_lengths_file
+    }
+
+    # Step 1: counts plus gene lengths -> TPM table.
+    call estimate.calc_tpm {
+        input:
+            counts=counts_file,
+            gene_lengths=gene_lengths_file
+    }
+
+    # Step 2: TPM table -> ESTIMATE scores.
+    call estimate.run_ESTIMATE {
+        input:
+            gene_expression_file=calc_tpm.out
+    }
+
+    output {
+        # NOTE(review): despite its name, this output is the table produced
+        # by calc_tpm, not the gene lengths input. The name is kept so that
+        # existing consumers of this workflow keep working.
+        File gene_lengths=calc_tpm.out
+        File estimate_out=run_ESTIMATE.out
+    }
+}
diff --git a/workflows/rnaseq/calc-gene-lengths.wdl b/workflows/rnaseq/calc-gene-lengths.wdl
@@ -0,0 +1,15 @@
+version 1.0
+
+import "https://raw.githubusercontent.com/stjudecloud/workflows/master/tools/util.wdl"
+
+# Thin wrapper that exposes the util.calc_gene_lengths task as a workflow.
+workflow calc_gene_lengths {
+    input {
+        File gtf
+    }
+
+    # The call is aliased to "calc" so it does not share the workflow's name.
+    call util.calc_gene_lengths as calc {
+        input:
+            gtf=gtf
+    }
+
+    output {
+        File gene_lengths=calc.out
+    }
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		FROM r-base:4.0.3

		# install.packages() only emits a warning when an installation fails, which
		# would let the image build "succeed" without the package. Loading it right
		# after makes R exit with an error, so a broken install fails the build.
		# The repository is fetched over HTTPS rather than plain HTTP.
		RUN R -e 'install.packages("estimate", repos="https://r-forge.r-project.org", dependencies=TRUE); library("estimate")'