Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Running replicates statistics on bins and bed files #80

Merged
merged 6 commits into from
Oct 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ jobs:
- name: Manually install BioConductor dependencies
run: |
install.packages("BiocManager")
BiocManager::install(c("rtracklayer", "GenomeInfoDb", "GenomicRanges", "BSgenome.Mmusculus.UCSC.mm9", "BSgenome.Hsapiens.UCSC.hg38"))
BiocManager::install(c("rtracklayer", "GenomeInfoDb", "GenomicRanges", "BSgenome.Mmusculus.UCSC.mm9", "BSgenome.Hsapiens.UCSC.hg38", "DESeq2"))
shell: Rscript {0}
working-directory: elsasserlib

Expand Down
5 changes: 3 additions & 2 deletions elsasserlib/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: elsasserlib
Type: Package
Title: General utilities used within Elsasser lab
Version: 1.0.9
Version: 1.1.0
Authors@R: c(
person("Carmen", "Navarro",
email = "carmen.navarro@scilifelab.se",
Expand Down Expand Up @@ -41,7 +41,8 @@ Imports:
scales,
RColorBrewer,
RCurl,
pheatmap
pheatmap,
DESeq2
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.1
Expand Down
10 changes: 10 additions & 0 deletions elsasserlib/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

export(build_bins)
export(bw_bed)
export(bw_bed_diff_analysis)
export(bw_bins)
export(bw_bins_diff_analysis)
export(bw_granges_diff_analysis)
export(bw_profile)
export(mean_ratio_norm)
export(palette_categorical)
Expand All @@ -19,6 +22,12 @@ export(trim_quantile)
import(ggplot2)
importFrom(BSgenome.Hsapiens.UCSC.hg38,BSgenome.Hsapiens.UCSC.hg38)
importFrom(BSgenome.Mmusculus.UCSC.mm9,BSgenome.Mmusculus.UCSC.mm9)
importFrom(DESeq2,DESeqDataSetFromMatrix)
importFrom(DESeq2,`sizeFactors<-`)
importFrom(DESeq2,estimateDispersions)
importFrom(DESeq2,estimateSizeFactors)
importFrom(DESeq2,nbinomWaldTest)
importFrom(DESeq2,results)
importFrom(GenomeInfoDb,seqinfo)
importFrom(GenomeInfoDb,sortSeqlevels)
importFrom(GenomicRanges,makeGRangesFromDataFrame)
Expand Down Expand Up @@ -49,6 +58,7 @@ importFrom(rmarkdown,render)
importFrom(rtracklayer,BigWigFile)
importFrom(rtracklayer,import)
importFrom(rtracklayer,mcols)
importFrom(stats,complete.cases)
importFrom(stats,median)
importFrom(stats,sd)
importFrom(stringr,str_sort)
Expand Down
139 changes: 139 additions & 0 deletions elsasserlib/R/bwstats.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
#' Differential analysis of two conditions over genome-wide bins
#'
#' Scores the bigWig files of each condition on genome-wide bins of the given
#' size with \code{bw_bins} and hands both results to
#' \code{bw_granges_diff_analysis}. As in that function, the
#' estimateSizeFactors step is skipped by default, because this is accounted
#' for in the scaling step of MINUTE-ChIP samples.
#'
#' @param bwfiles_c1 Path or array of paths to the bigWig files of the first
#'   condition.
#' @param bwfiles_c2 Path or array of paths to the bigWig files of the second
#'   condition.
#' @param genome Genome. Available choices are mm9, hg38.
#' @param bin_size Bin size.
#' @inheritParams bw_granges_diff_analysis
#' @return a DESeqResults object as returned by DESeq2::results function
#' @export
bw_bins_diff_analysis <- function(bwfiles_c1,
                                  bwfiles_c2,
                                  label_c1,
                                  label_c2,
                                  bin_size = 10000,
                                  genome = "mm9",
                                  estimate_size_factors = FALSE) {

  # Score condition 1 first, then condition 2, with identical bin settings so
  # both results are computed with the same genome and bin size.
  binned <- lapply(
    list(bwfiles_c1, bwfiles_c2),
    bw_bins,
    genome = genome,
    bin_size = bin_size
  )

  bw_granges_diff_analysis(
    binned[[1]],
    binned[[2]],
    label_c1,
    label_c2,
    estimate_size_factors = estimate_size_factors
  )
}

#' Differential analysis of two conditions over the loci of a BED file
#'
#' Scores the bigWig files of each condition on the loci listed in a BED file
#' with \code{bw_bed} and hands both results to
#' \code{bw_granges_diff_analysis}. As in that function, the
#' estimateSizeFactors step is skipped by default, because this is accounted
#' for in the scaling step of MINUTE-ChIP samples.
#'
#' @param bwfiles_c1 Path or array of paths to the bigWig files of the first
#'   condition.
#' @param bwfiles_c2 Path or array of paths to the bigWig files of the second
#'   condition.
#' @param bedfile BED file for locus specific analysis.
#' @inheritParams bw_granges_diff_analysis
#' @return a DESeqResults object as returned by DESeq2::results function
#' @export
bw_bed_diff_analysis <- function(bwfiles_c1,
                                 bwfiles_c2,
                                 bedfile,
                                 label_c1,
                                 label_c2,
                                 estimate_size_factors = FALSE) {

  # Score condition 1 first, then condition 2, against the same BED file so
  # both results are computed from the same locus definitions.
  scored <- lapply(
    list(bwfiles_c1, bwfiles_c2),
    bw_bed,
    bedfile = bedfile
  )

  bw_granges_diff_analysis(
    scored[[1]],
    scored[[2]],
    label_c1,
    label_c2,
    estimate_size_factors = estimate_size_factors
  )
}


#' Compute DESeq2 differential analysis on GRanges objects
#'
#' Runs a DESeq2 analysis on loci specified on GRanges objects.
#' The particularity of this analysis is that it skips the estimateSizeFactors
#' step by default, because this is accounted for in the scaling step of
#' MINUTE-ChIP samples. In that case every sample gets a size factor of one.
#'
#' Both objects are sorted, bound column-wise and converted to rounded counts
#' with \code{get_nreads_columns}; loci with a missing value in any sample
#' are dropped before testing.
#'
#' @param granges_c1 GRanges object containing the values for condition 1.
#' @param granges_c2 GRanges object containing the values for condition 2.
#'   Note that these objects must correspond to the same loci. An error is
#'   raised if they do not contain the same number of ranges.
#' @param label_c1 Condition name for condition 1.
#' @param label_c2 Condition name for condition 2.
#' @param estimate_size_factors If TRUE, normal DESeq2 procedure is done. Set it
#'   to true to analyze non-MINUTE data. Any value other than a single
#'   \code{TRUE} keeps size factors fixed to one.
#' @importFrom DESeq2 DESeqDataSetFromMatrix estimateDispersions nbinomWaldTest
#' @importFrom DESeq2 `sizeFactors<-` results estimateSizeFactors
#' @return a DESeqResults object as returned by DESeq2::results function
#' @export
bw_granges_diff_analysis <- function(granges_c1,
                                     granges_c2,
                                     label_c1,
                                     label_c2,
                                     estimate_size_factors = FALSE) {

  # The two inputs are bound side by side below, so they have to describe the
  # same loci. A differing number of ranges can never satisfy that; fail early
  # with a clear message instead of a binding error or misaligned rows.
  if (length(granges_c1) != length(granges_c2)) {
    stop("granges_c1 and granges_c2 must contain the same number of loci",
         call. = FALSE)
  }

  # Bind first, get numbers after: dropping incomplete cases on each condition
  # separately could leave the two tables with different rows.
  granges_c1 <- sort(sortSeqlevels(granges_c1))
  granges_c2 <- sort(sortSeqlevels(granges_c2))

  cts_df <- cbind(data.frame(granges_c1), mcols(granges_c2))

  # Value columns are taken from column 6 onwards.
  # NOTE(review): assumes the first five columns are the range coordinates
  # that data.frame() produces for a GRanges -- confirm.
  cts <- get_nreads_columns(cts_df[, 6:ncol(cts_df)])

  # One label per value column, condition 1 columns first.
  # NOTE(review): labels are passed as plain values, so the reference level of
  # the contrast is presumably picked by factor ordering rather than by
  # argument order -- confirm the fold-change direction.
  condition_labels <- c(rep(label_c1, length(mcols(granges_c1))),
                        rep(label_c2, length(mcols(granges_c2))))

  coldata <- data.frame(colnames(cts), condition = condition_labels)

  dds <- DESeqDataSetFromMatrix(countData = cts,
                                colData = coldata,
                                design = ~ condition)

  if (isTRUE(estimate_size_factors)) {
    dds <- estimateSizeFactors(dds)
  } else {
    # Since files are scaled, we do not want to estimate size factors, so give
    # it an array of ones.
    sizeFactors(dds) <- rep(1, ncol(cts))
  }

  dds <- estimateDispersions(dds)
  dds <- nbinomWaldTest(dds)

  results(dds)
}


#' Get values in a data frame object as round numeric values in a matrix.
#'
#' This is an auxiliary function for stats. It drops NAs or NaN values, only
#' complete cases are used. The remaining values are multiplied by
#' \code{length_factor} and rounded to whole numbers.
#'
#' The result always keeps one column per column of \code{df}, even when a
#' single row is complete or \code{df} has a single column.
#'
#' @param df Target data frame
#' @param length_factor Scaling factor to multiply coverage values by.
#'
#' @return A numeric matrix of whole-number values, with one row per complete
#'   case of \code{df} and the same columns as \code{df}.
#' @importFrom stats complete.cases
get_nreads_columns <- function(df, length_factor = 1000) {
  # TODO: Consider whether to multiply by locus length. For bins analysis
  # this should not affect results, but for genes or loci of different length
  # it might. Since we skip the size factor step, we may bias the results?
  # So right now it's only fragment length
  cts <- as.matrix(df)

  # drop = FALSE keeps the subset two-dimensional. Without it a single
  # complete row collapses to a vector that as.matrix() would turn into a
  # transposed one-column matrix, and a single-column input loses its name.
  cts <- cts[complete.cases(cts), , drop = FALSE]

  round(cts * length_factor)
}


3 changes: 2 additions & 1 deletion elsasserlib/README.Md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ dependencies before running the installation:
'GenomicRanges',
'rtracklayer',
'BSgenome.Mmusculus.UCSC.mm9',
'BSgenome.Hsapiens.UCSC.hg38'))
'BSgenome.Hsapiens.UCSC.hg38',
'DESeq2'))

Then you can install directly from this GitHub repository:

Expand Down
38 changes: 38 additions & 0 deletions elsasserlib/man/bw_bed_diff_analysis.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

41 changes: 41 additions & 0 deletions elsasserlib/man/bw_bins_diff_analysis.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 36 additions & 0 deletions elsasserlib/man/bw_granges_diff_analysis.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions elsasserlib/man/get_nreads_columns.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading