AlexsLemonade · jashapiro · Sep 21, 2020 · Sep 2, 2020 · Sep 2, 2020 · Sep 2, 2020
diff --git a/.gitignore b/.gitignore
@@ -44,5 +44,4 @@ rstudio-server.json
 # intermediate reference and analysis files
 workflows/rnaseq-ref-index/annotation/
 workflows/rnaseq-ref-index/fasta/
-workflows/alevin-quant/data/
-
+workflows/benchmarks/data/
diff --git a/workflows/benchmarks/README.md b/workflows/benchmarks/README.md
@@ -0,0 +1,15 @@
+## Docker
+
+Running the benchmark analysis notebook should be done via docker, with the following command to mount the local directory as the home direcory and pass in the current user's AWS credentials and launch RStudio.
-Running the benchmark analysis notebook should be done via docker, with the following command to mount the local directory as the home direcory and pass in the current user's AWS credentials and launch RStudio.
+Running the benchmark analysis notebook should be done via docker, with the following command to mount the local directory as the home directory and pass in the current user's AWS credentials and launch RStudio.
-Running the benchmark analysis notebook should be done via docker, with the following command to mount the local directory as the home direcory and pass in the current user's AWS credentials and launch RStudio.
+Running the benchmark analysis notebook should be done via docker, with the following command to mount the local directory as the home directory and pass in the current user's AWS credentials and launch RStudio.
+
+```
+docker run \
+  --mount type=bind,target=/home/rstudio,source=$PWD \
+  --mount type=bind,target=/home/rstudio/.aws,source=$HOME/.aws \
+  -e PASSWORD=<MYPASS> \
+  -p 8787:8787 \
+  ghcr.io/alexslemonade/scpca-r
+```
+
+The rstudio server can then be logged into via the browser by navigating to http://localhost:8787.
+Login with the username `rstudio` and the password chosen above.
diff --git a/.../alevin-quant/alevin-benchmark-indexes.nf → ...ws/benchmarks/alevin-benchmark-indexes.nf b/.../alevin-quant/alevin-benchmark-indexes.nf → ...ws/benchmarks/alevin-benchmark-indexes.nf
diff --git a/...in-quant/alevin-benchmark_2020-08-31.html → ...nchmarks/alevin-benchmark_2020-08-31.html b/...in-quant/alevin-benchmark_2020-08-31.html → ...nchmarks/alevin-benchmark_2020-08-31.html
diff --git a/...in-quant/alevin-benchmark_2020-09-08.html → ...nchmarks/alevin-benchmark_2020-09-08.html b/...in-quant/alevin-benchmark_2020-09-08.html → ...nchmarks/alevin-benchmark_2020-09-08.html
diff --git a/workflows/benchmarks/benchmark-analysis.Rmd b/workflows/benchmarks/benchmark-analysis.Rmd
@@ -0,0 +1,324 @@
+---
+title: "Alevin Index comparisons"
+author: "Joshua Shapiro for CCDL"
+output: 
+  html_notebook:
+    toc: true
+    toc_float: true
+---
+
+This notebook contains analysis of some initial benchmark in runs of Alevin using different index files, looking at mapping rates.
+
+## Setup
+
+### Load Libraries
+```{r setup}
+library(ggplot2)
+library(magrittr)
+library(SingleCellExperiment)
+library(tximport)
+
+# set seed
+set.seed(2020)
+```
+
+
+
+### File and directory setup
+```{r}
+data_dir <- file.path("data", "alevin-quant")
+dir.create(data_dir, recursive = TRUE, showWarnings = FALSE)
+
+quant_s3 <- "s3://nextflow-ccdl-results/scpca-benchmark/alevin-quant"
+
+```
+
+### Sync S3 files
+
+```{r}
+sync_call <- paste("aws s3 sync", quant_s3, data_dir)
+system(sync_call, ignore.stdout = TRUE)
+```
+
+Get the sample list and make a data frame with sample info
+
+```{r}
+quant_dirs <- list.dirs(data_dir, full.names = FALSE, recursive = FALSE)
+
+# split ids into components for later processing
+quant_info <- data.frame (quant_dir = quant_dirs, info = quant_dirs) %>%
+  tidyr::separate(info, sep = "[-]", 
+                  into = c("sample", "index_type")) %>%
+  tidyr::separate(index_type, 
+                  into = c("index_content", "kmer", "decoy"), 
+                  extra = "drop") %>%
+  dplyr::mutate(kmer = stringr::str_remove(kmer, "k"))
+
+```
+
+### Get Annotations from AnnotationHub
+
+```{r}
+hub = AnnotationHub::AnnotationHub(ask = FALSE)
+# Ensembl v100 Homo sapiens is AH79689
+ensdb = hub[["AH79689"]]
+ensg <- genes(ensdb)
+```
+
+Create vectors of mitochondrial genes and coding genes for later.
+
+```{r}
+# create the mitochondrial gene list
+mito_genes <- ensg[seqnames(ensg) == 'MT']$gene_id
+
+coding_genes <- ensg[ensg$gene_biotype == "protein_coding"]$gene_id
+```
+
+
+
+## Process Alevin quantification
+
+### tximport and make SCEs
+
+```{r}
+sces <- quant_dirs %>%
+  purrr::map(
+    ~ tximport(file.path(data_dir, .x, "alevin", "quants_mat.gz"),
+               type = "alevin")) %>%
+  purrr::map(
+    ~ SingleCellExperiment(list(counts = .x$counts))) 
+
+# add names
+names(sces) <- quant_dirs
+```
+
+
+Calculate cell and feature QC statistics for each sample.
+
+```{r}
+sces <- sces %>% 
+  purrr::map(
+    ~ scater::addPerCellQC(
+      .x,
+      subsets = list(mito = mito_genes[mito_genes %in% rownames(.x)],
+                     ncRNA = rownames(.x)[!rownames(.x) %in% coding_genes] )
+    ) 
+  ) %>%
+  purrr::map(scater::addPerFeatureQC)
+```
+
+Combine all cell QC stats into a single data frame
+```{r}
+sce_cell_qc <- purrr::map_df(sces, 
+                        ~ as.data.frame(colData(.x)) %>%
+                          tibble::rownames_to_column(var = "cell_id"), 
+                        .id = "quant_id") %>%
+  dplyr::left_join(quant_info, by = c("quant_id" = "quant_dir"))
+```
+
+
+Combine all feature QC stats into a single data frame.
+```{r}
+sce_feature_qc <- purrr::map_df(sces, 
+                        ~ as.data.frame(rowData(.x)) %>%
+                          tibble::rownames_to_column(var = "gene_id"), 
+                        .id = "quant_id") %>%
+  dplyr::left_join(quant_info, by = c("quant_id" = "quant_dir"))
+```
+
+### Plotting QC stats
+
+Plot cell QC gene accumulation curves & mito fraction
+```{r}
+ggplot(sce_cell_qc %>% dplyr::filter(kmer == "31"), 
+       aes(x = sum, y = detected, color = subsets_mito_percent)) +
+  geom_point() +
+  facet_grid(sample ~ paste(index_content, decoy) ) +
+  scale_color_viridis_c() +
+  scale_x_log10() +
+  scale_y_log10()
+
+```
+
+### Find features that are reliably detected
+
+To start, will select features that are detected in > 5% of cells in at least one sample/index. 
+
+```{r}
+detected_features <- sce_feature_qc %>%
+  dplyr::filter(detected >= 5.0) %>%
+  dplyr::pull(gene_id) %>%
+  unique()
+# How many?
+length(detected_features)
+```
+
+How many of these are not protein coding?
+
+```{r}
+detected_nc <- detected_features[!(detected_features %in% coding_genes)]
+
+length(detected_nc)
+```
+
+Lets look at these features in a bit more depth, comparing the same sample with full transcriptome vs cDNA only.
+```{r}
+# select noncoding detected features
+nc_feature_qc <- sce_feature_qc %>% 
+  dplyr::filter(gene_id %in% detected_nc) 
+
+# Get detected ncRNAs in 834-txome_k31_no_sa as a transcriptome 
+
+nc_txome <-  nc_feature_qc %>%
+  dplyr::filter(quant_id == "834-txome_k31_no_sa") %>%
+  dplyr::arrange(desc(mean))
+
+nc_cdna <-  nc_feature_qc %>%
+  dplyr::filter(quant_id == "834-cdna_k31_no_sa")
+
+# Join the tables
+nc_feature_comparison <- nc_txome %>%
+  dplyr::left_join(nc_cdna, 
+                   by = "gene_id", 
+                   suffix = c("_txome", "_cdna")) %>%
+  dplyr::select("gene_id", "mean_txome", "mean_cdna", "detected_txome", "detected_cdna")
+
+nc_feature_comparison
+```
+
+As expected, most of the features that are not listed as coding do not appear in the cDNA only set!
+Some do, and spot checking indicates that those are listed as pseudogenes, which are for whatever reason still included in the cDNA file.
+
+Of those that do not, the top few genes are MALAT1 and mitochondrial rRNAs. 
+MALAT1 is potentially interesting, and it does seem that we should probably keep it!
+
+Lets make a vector of the noncoding cDNAs for future work:
+```{r}
+nc_cdna_genes <- sce_feature_qc %>%
+  dplyr::filter(quant_id == "834-cdna_k31_no_sa",
+                ! gene_id %in% coding_genes) %>%
+  dplyr::pull(gene_id)
+```
+
+
+## Comparing coding and noncoding txomes
+
+First, looking at the mean expression of each gene, comparing coding and noncoding: comparing within samples, coding to noncoding.
+
+```{r}
+sce_feature_filtered <- sce_feature_qc %>% 
+  dplyr::filter(kmer == "31", decoy == "no") %>%
+  dplyr::mutate(class = dplyr::case_when(gene_id %in% coding_genes ~ "coding",
+                                         gene_id %in% nc_cdna_genes ~ "noncoding cDNA",
+                                         TRUE ~ "ncRNA"))
+
+ggplot(sce_feature_filtered, 
+       aes (x = mean, color = class)) +
+  geom_density() + 
+  scale_x_log10() + 
+  scale_color_brewer(palette = "Dark2") +
+  facet_grid(index_content ~ sample)
+```
+
+The noncoding cDNA genes seem to follow a very similar distribution to the noncoding cDNA genes (mostly pseudogenes?), but I can't see any particular reason to exclude them. 
+If anything, there are ncRNA genes tend to have somewhat higher mean expression than the noncoding cDNAs which are in the cDNA dataset.
+
+Just to check, lets look at the correlation of coding genes within a sample:
+
+
+```{r}
+# pick a random set of cells to look at
+cell_sample <- sample(colnames(sces[["905_3-cdna_k31_no_sa"]]), 100)
+
+features <- rownames(sces[["905_3-cdna_k31_no_sa"]])
+coding_features <- features[features %in% coding_genes]
+
+compare_expression <- data.frame(
+  cdna = as.vector(counts(sces[["905_3-cdna_k31_no_sa"]][coding_features, cell_sample])),
+  txome = as.vector(counts(sces[["905_3-txome_k31_no_sa"]][coding_features, cell_sample]))
+)
+
+cor(compare_expression, method = "spearman")
+```
+```{r}
+ggplot(compare_expression, aes(x = log1p(cdna), y = log1p(txome))) +
+  geom_point()
+```
+Correlation is very good for genes present in both, with expression more often lower in the transcriptome set, which may make sense in the case of multimapping introduced by the larger number of potential targets.
+Interesting though that there are some genes which do not appear to be expressed in the cDNA set that have expression in the txome.
+
+### Transcriptome Conclusion
+
+I think there is little cost, and a not insignificant potential gain to including noncoding RNA transcripts in the mapping sets for this project.
+
+## Comparing decoy indexes
+
+Get cDNA-only decoy comparison data.
+```{r}
+sce_feature_filtered <- sce_feature_qc %>% 
+  dplyr::filter(kmer == "31", index_content == "cdna")
+```
+
+```{r}
+decoy_means <- sce_feature_filtered %>%
+  tidyr::pivot_wider(id_cols = c(gene_id, sample),
+                     names_from = decoy,
+                     values_from = mean) 
+```
+
+First off, how well correlated are the means for various decoy choices? 
+We expect very good correlations.
+
+```{r}
+cor(decoy_means[,3:5], method = "spearman", use = "complete.obs")
+```
+
+
+
+And we have them... interestingly, the best correlation is between no decoys and the partial decoy: the full decoy index is more different by this measure.
+This leads me to the preliminary conclusion that the partial index (which notably uses a slightly different set of transcripts at this stage, due to not being built locally) may not be worth pursuing, as it seems to make very little difference.
+
+Don't forget to look at your data!
+```{r}
+pairs(decoy_means[,3:5])
+```
+Whoa... seems like a lot is being driven by a few points that are very different in the full index data. 
+What are those genes?
+
+
+```{r}
+decoy_means %>% 
+  dplyr::mutate(diff = (no - full)/(full + 1)) %>%
+  dplyr::arrange(desc(diff))
+```
+ENSG00000269028, which is by far the most different, is the gene *MT-RNR2 like 12* which, aside from being related to at least 12 other genes, does not ring any particular bells.
+However, ENSG00000255823 is *MT-RNR2-like 8* 
+
+Other genes at the top of the differential list:
+ENSG00000163864: NMNAT3 nicotinamide nucleotide adenylyltransferase 3
+ENSG00000197563: PIGN Phosphatidylinositol Glycan Anchor Biosynthesis Class N
+ENSG00000173559: NABP1 Nucleic Acid Binding Protein 1
+
+I don't really know what to do with this list... it seems like few genes will have large differences, but a few differences are enormous!
+
+
+Just as a sanity check, lets repeat one of the analyses we did before, comparing the cDNA to the transcriptome for the full decoy index (since we now have both):
+
+```{r}
+# using the same `cell_sample` and `coding_features` as previously
+
+compare_expression_full <- data.frame(
+  cdna = as.vector(counts(sces[["905_3-cdna_k31_full_sa"]][coding_features, cell_sample])),
+  txome = as.vector(counts(sces[["905_3-txome_k31_full_sa"]][coding_features, cell_sample]))
+)
+
+cor(compare_expression_full, method = "spearman")
+```
+```{r}
+ggplot(compare_expression_full, aes(log1p(cdna), y = log1p(txome))) +
+  geom_point()
+```
+
+Interestingly, the correlation between cDNA and txome expression values (for coding genes) using the full decoy seems substantially better than with the no decoy sequences!
+
+This seems to me to be a good argument for using the full decoy, if it is less sensitive to the chosen transcript list.
diff --git a/workflows/benchmarks/benchmark-analysis.nb.html b/workflows/benchmarks/benchmark-analysis.nb.html
diff --git a/workflows/alevin-quant/trace_2020-08-31.txt → workflows/benchmarks/trace_2020-08-31.txt b/workflows/alevin-quant/trace_2020-08-31.txt → workflows/benchmarks/trace_2020-08-31.txt
diff --git a/workflows/alevin-quant/trace_2020-09-08.txt → workflows/benchmarks/trace_2020-09-08.txt b/workflows/alevin-quant/trace_2020-09-08.txt → workflows/benchmarks/trace_2020-09-08.txt
diff --git a/workflows/images/scpca_r/Dockerfile → workflows/images/scpca-r/Dockerfile b/workflows/images/scpca_r/Dockerfile → workflows/images/scpca-r/Dockerfile
@@ -7,10 +7,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends apt-utils dialo
 
 # Add curl, bzip2 and dev libs
 RUN apt-get update -qq && apt-get -y --no-install-recommends install \
+    awscli \
     bzip2 \
     curl \
     libbz2-dev \
     libgdal-dev \
+    libglpk-dev \
     liblzma-dev \
     libreadline-dev \
     libudunits2-dev
@@ -29,6 +31,7 @@ RUN install2.r --error --deps TRUE \
 RUN R -e "BiocManager::install(c( \
     'AnnotationHub', \
     'ensembldb', \
+    'fishpond', \
     'scran', \
     'scater', \
     'tximport'), \

diff --git a/workflows/images/scpca-r/README.md b/workflows/images/scpca-r/README.md
@@ -0,0 +1,27 @@
+This folder contains a Dockerfile for the scpca-r image used for R analysis in this repository.
+
+The image is built from a versioned Rocker image, with additional R packages and other utilities added as needed.
+
+It can be built with the following command run in this directory:
+
+```
+docker build . -t ghcr.io/alexslemonade/scpca-r
+```
+
+Alternatively, the lastest version should be available via:
+
+```
+docker pull ghcr.io/alexslemonade/scpca-r
+```
+
+To use and launch the RStudio server, you can use the following example command, which uses the current directory for the internal home directory.
+Note that this also mounts the current user's `.aws` credentials folder, which can be useful for easy S3 access, as most data files in this repository are stored on S3 in private buckets.
+
+```
+docker run \
+  --mount type=bind,target=/home/rstudio,source=$PWD \
+  --mount type=bind,target=/home/rstudio/.aws,source=$HOME/.aws \
+  -e PASSWORD=<YOURPASS> \
+  -p 8787:8787 \
+  ghcr.io/alexslemonade/scpca-r
+```