Skip to content

Commit

Permalink
Bug fixing and writing tests for release version 1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Sarah Killcoyne committed Jan 14, 2020
1 parent 64d87c9 commit 64033c0
Show file tree
Hide file tree
Showing 11 changed files with 123,986 additions and 75 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
Package: BarrettsProgressionRisk
Title: Risk progression from Barrett's Esophagus to Adenocarcinoma
Title: Prognostic adenocarcinoma risk prediction for Barrett's Esophagus
Type: Package
Version: 0.5.0.1
Version: 1.0
Date: 2020-01-10
Authors@R: person("Sarah","Killcoyne", email="skillcoy@ebi.ac.uk", role=c("aut","cre"))
Description: Process and predict risk of progression from shallow WGS data.
Description: Process and predict risk of adenocarcinoma progression from shallow whole-genome sequencing data.
License: GPL-3
Encoding: UTF-8
LazyData: true
Expand Down
31 changes: 24 additions & 7 deletions R/plot_utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ plotCorrectedCoverage<-function(brr, as=c('plot','list')) {
showPredictionCalibration<-function(df=NULL) {
if (is.null(df)) df = BarrettsProgressionRisk:::be_model$pred.confidence

if (!is.factor(df$Risk))
df = df %>% mutate(Risk = factor(Risk, levels=names(riskColors()), ordered=T))

mm = range(df[c('r1','r2')])

cuts = seq(mm[1], mm[2], by=df$r1[2])
Expand All @@ -97,7 +100,7 @@ showPredictionCalibration<-function(df=NULL) {

ggplot(df, aes(mn, perc)) +
geom_rect(aes(xmin=r1, xmax=r2, ymin=0,ymax=1, fill=Risk), alpha=0.6) +
scale_fill_manual(values=riskColors(), limits=levels(df$Risk) ) +
scale_fill_manual(values=unlist(riskColors()), limits=levels(df$Risk) ) +
geom_vline(xintercept=cuts[2:(length(cuts)-1)], color='grey88') +
geom_smooth(method='lm',formula=y~x, color='grey39', linetype='dashed', fill='grey88', size=0.5, fullrange=T) +
geom_point() + geom_errorbar(aes(ymin=ci.low, ymax=ci.high), size=0.5, width=0.01) +
Expand Down Expand Up @@ -163,17 +166,22 @@ patientRiskTilesPlot<-function(brr, col='Endoscopy', direction=c('fwd','rev')) {
}

gej.dist = grep('Distance',colnames(preds),value=T)

if (length(gej.dist) > 0 & !is.factor(preds[[gej.dist]])) {
preds[[gej.dist]] = fct_rev(factor(preds[[gej.dist]], ordered=T))
} else if (length(gej.dist <= 0)) {
} else if (length(gej.dist) <= 0) {
preds[[gej.dist]] = 1
}
preds = preds %>% mutate_if(is.numeric, list(~factor(.,ordered=T)))

if (!is.factor(preds[[col]])) {
preds[[col]] = factor(preds[[col]], ordered=T)
}

preds = preds %>% mutate_if(is.numeric, list(~factor(.,ordered=T))) %>%
mutate(Risk = factor(Risk, levels=names(riskColors()), ordered=T))

p = ggplot(preds, aes_(as.name(col), as.name(gej.dist))) +
geom_tile(aes(fill=Risk), color='white',size=2) +
scale_fill_manual(values=riskColors(), limits=names(riskColors())) +
scale_fill_manual(values=unlist(riskColors()), limits=names(riskColors())) +
labs(y='Esophageal Location (GEJ...)')

if (dir == 'rev') p = p + scale_x_discrete(limits=rev(levels(preds[[col]])))
Expand All @@ -200,14 +208,23 @@ patientEndoscopyPlot<-function(brr) {
if (length(which(class(brr) %in% c('BarrettsRiskRx'))) <= 0)
stop("BarrettsRiskRx required")

# Build the HTML snippet for one risk estimate: a run of silhouette icons
# (image chosen by `risk`) followed by "<b>x%</b> (low%-high%)".
#   x     value shown as the percentage; also passed to rep() as the icon count
#   low   lower bound shown inside the parentheses
#   high  upper bound shown inside the parentheses
#   risk  'High', 'Moderate', 'Low' or 'Unknown'; a factor is accepted as well
# Returns a single character string.
printRisk <- function(x,low,high, risk='Unknown') {
  # switch() on a factor dispatches on its integer codes (and warns), which
  # would select the icon by level position instead of by label, so coerce to
  # character before dispatching.
  img <- switch(as.character(risk),
    'High'='img/Human_body_silhouette-RED.png',
    'Moderate'='img/Human_body_silhouette-YELLOW.png',
    'Low'='img/Human_body_silhouette-BLUE.png',
    # 'Unknown' and any unrecognised label fall through to the grey icon
    # instead of producing an empty src attribute.
    'img/Human_body_silhouette-GREY.png')

  icon <- paste0('<img src="',img,'" alt="%" width="8"></img>')
  icons <- paste0(c("", rep(icon, x)), collapse="")

  paste0(icons, ' <b>',x, '%</b> (',low,'%-',high,'%)')
}

preds = absoluteRiskCI(brr)
preds = preds %>% rowwise() %>% dplyr::mutate( img=printRisk(Probability*100,CI.low*100,CI.high*100,Risk) )

ggplot(preds, aes(Endoscopy, Probability)) + ylim(0,1) +
geom_line(color='grey') +
geom_errorbar(aes(ymin=CI.low,ymax=CI.high, color=Risk), width=5, show.legend=F) +
geom_errorbar(aes(ymin=CI.low,ymax=CI.high), color='grey39', width=3, show.legend=F, size=1) +
geom_point(aes(color=Risk), size=5) +
scale_color_manual(values=riskColors(), limits=names(riskColors())) +
scale_color_manual(values=unlist(riskColors()), limits=names(riskColors())) +
scale_x_date(date_breaks = "2 month", date_labels = "%b %Y") +
labs(y='Absolute Risk', x='Endoscopy Date',title='Absolute risks over time') +
theme_bw() + theme(legend.position='bottom', axis.text.x = element_text(angle=45, hjust=1))
Expand Down
54 changes: 39 additions & 15 deletions R/process_swgs.R
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ runQDNAseq<-function(bam=NULL,path=NULL,outputPath=NULL, minMapQ=37, binsize=50)
#' @export
loadSampleInformation<-function(samples, path=c('NDBE','ID','LGD','HGD','IMC','OAC')) {
if (is.character(samples)) {
#sample.info = .readFile(samples, col_types = cols('Patient ID'=col_character(), 'Pathology'=col_character(), 'Sample'=col_character(), 'P53 IHC'=col_integer()))
sample.info = .readFile(samples)
} else if (is.data.frame(samples)) {
sample.info = samples
Expand All @@ -117,11 +116,14 @@ loadSampleInformation<-function(samples, path=c('NDBE','ID','LGD','HGD','IMC','O

colnames(sample.info) = .titleCase(colnames(sample.info))

exp_cols = c('Pathology','GEJ.Distance', 'P53 IHC')
cols_found = sapply( exp_cols, function(x) x %in% colnames(sample.info) )

if (length(which(!cols_found)) > 0)
warning(paste0('Missing expected columns from sample information: ',paste(names(which(!cols_found)), collapse=', '), ". Recommendations will be based on predicted risks and: ",paste(names(which(cols_found)), collapse=', ')) )
col_regex = 'Pathology|GEJ(\\.| )Distance|P53(\\.| )IHC'
cols_found = grep(col_regex, colnames(sample.info), value=T, ignore.case = T)

if (length(cols_found) < 3) {
message = paste0("Missing expected columns from sample information. Recommendations will be based on predicted risks")
if (length(cols_found) > 0) message = paste0(message, ' and: ', paste(cols_found,collapse=','))
warning(message)
}

endo.date<-function(endo) {
if (is.character(endo)) {
Expand All @@ -136,18 +138,40 @@ loadSampleInformation<-function(samples, path=c('NDBE','ID','LGD','HGD','IMC','O
as.Date(endo)
}
}

sample.info = sample.info %>% rowwise %>% dplyr::mutate(Endoscopy = endo.date(Endoscopy))

#sample.info$Endoscopy = factor(sample.info$Endoscopy, ordered=T)
# Don't change the factor if it's already done.
gej_col = grep('GEJ(\\.| )Distance', colnames(sample.info), value=T)
if (length(gej_col) > 0 && !is.factor(sample.info[[gej_col]])) {
# Convert a column of GEJ distance values to a single consistent type.
#   gej  vector of distance values as read from the sample information
# Returns the result of applying the chosen converter to each element via
# sapply().
gej.dist<-function(gej) {
# Pick the converter by name: numeric when no value becomes NA under
# as.numeric(), otherwise character when no value is NA as a string.
# NOTE(review): if neither condition holds, case_when() presumably yields NA
# and the sapply() call below will fail -- confirm this cannot happen.
funct = case_when(
length(which(is.na(as.numeric(gej)))) <= 0 ~ 'as.numeric',
length(which(is.na(as.character(gej)))) <= 0 ~ 'as.character'
)
# eval() of a character string returns the string unchanged; sapply() then
# looks the function up by that name.
sapply(gej, eval(funct))
}

pathCol = grep('Pathology',colnames(sample.info))
if (length(pathCol) > 0)
sample.info[[pathCol]] = factor(sample.info[[pathCol]], levels=path, labels=path, ordered=T)
sample.info = sample.info %>%
dplyr::mutate_at(dplyr::vars(!!gej_col), list(~gej.dist(.)))

levels = sort(sample.info %>% dplyr::select(!!gej_col) %>% distinct %>% pull)

p53Col = grep('P53',colnames(sample.info))
if (length(p53Col) > 0)
sample.info[[p53Col]] = factor(sample.info[[p53Col]], levels=c(0,1), labels=c('Normal','Aberrant'), ordered=T)
sample.info = sample.info %>%
dplyr::mutate_at(dplyr::vars(!!gej_col), list(~factor(., levels=levels, ordered=T)))
}

p53_col = grep('P53', colnames(sample.info), value=T)
if (length(p53_col) > 0 &&
length(grep('Normal|Aberrant', dplyr::select(sample.info, !!p53_col) %>% distinct %>% pull)) > 0) {
sample.info = sample.info %>%
dplyr::mutate_at(dplyr::vars(matches('P53')), list(~factor(., levels=c('Normal','Aberrant'), ordered=T)))
} else {
sample.info = sample.info %>% dplyr::mutate_at(dplyr::vars(dplyr::matches('P53')), list(~factor(., levels=c(0,1), labels=c('Normal','Aberrant'), ordered=T)))
}

sample.info = sample.info %>% rowwise %>% dplyr::mutate(Endoscopy = endo.date(Endoscopy))

sample.info = sample.info %>%
dplyr::mutate_at(dplyr::vars(dplyr::matches('Pathology')), list(~factor(., levels=path, ordered=T)))

class(sample.info) <- c('SampleInformation', class(sample.info))
return(sample.info)
Expand Down
21 changes: 14 additions & 7 deletions R/risk_prediction.R
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ per.sample.error<-function(df, seg.tile.error, arm.tile.error, be.model) {
}


predictRisk<-function(obj, merged.tiles, be.model = NULL, verbose=T) {
predictRisk<-function(obj, merged.tiles, be.model=NULL, verbose=T) {
if (is.null(be.model)) {
be.model = BarrettsProgressionRisk:::be_model
}
Expand All @@ -119,7 +119,8 @@ predictRisk<-function(obj, merged.tiles, be.model = NULL, verbose=T) {
for(i in colnames(merged.tiles$tiles)) sparsed_test_data[,i] = merged.tiles$tiles[,i]

perSampleError = tibble('Sample'=names(merged.tiles$per.sample.error),'Error'=merged.tiles$per.sample.error)
perSampleError = left_join(perSampleError, obj$sample.info, by='Sample') %>% dplyr::select('Sample','Error','Endoscopy')
perSampleError = left_join(perSampleError, obj$sample.info, by='Sample') %>%
dplyr::select('Sample','Error','Endoscopy')

# Predict and generate absolute probabilities
RR = predict(be.model$fit, newx=sparsed_test_data, s=be.model$lambda, type='link')
Expand Down Expand Up @@ -166,7 +167,7 @@ predictRiskFromSegments<-function(obj, be.model = NULL, verbose=T) {

# Tile, scale, then merge segmented values into 5Mb and arm-length windows across the genome.
binnedSamples = tryCatch({
tileSamples(obj, be.model, verbose)
tileSamples(obj, be.model, scale=T, MARGIN=2, verbose=verbose)
}, error = function(e) {
msg = paste("ERROR tiling segmented data:", e)
stop(msg)
Expand Down Expand Up @@ -375,14 +376,16 @@ rx<-function(brr, by=c('endoscopy','sample')) {
time = match.arg(time)

if (riskBy == 'Sample') warning('Rx rules are intended to be applied on a per-endoscopy basis, not per-sample.')

if (!is.numeric.Date(preds[[riskBy]]) | !is.numeric(preds[[riskBy]])) {

time = 'date'
if (!class(preds[[riskBy]]) %in% 'Date' || is.numeric(preds[[riskBy]])) {
time = 'numeric'
preds[[riskBy]] = factor(preds[[riskBy]])
}

preds = preds %>% rowwise() %>% dplyr::mutate(Risk = .risk(Probability, pred.confidence)) %>%
mutate(Risk = factor(Risk, levels=c('Low','Moderate','High'), ordered=T))
preds = preds %>% rowwise() %>%
dplyr::mutate(Risk = .risk(Probability, pred.confidence)) %>%
mutate(Risk = factor(Risk, levels=names(riskColors()), ordered=T))

p53Col = grep('p53', colnames(preds), value=T, ignore.case=T)
pathCol = grep('pathology', colnames(preds), value=T, ignore.case=T)
Expand Down Expand Up @@ -432,10 +435,14 @@ rx<-function(brr, by=c('endoscopy','sample')) {
rules = add_row(rules, 'Time 1' = t1, 'Time 2' = t2, 'Rule' = rule)
}

# If the previous pair resulted in an increased-surveillance rx, then should the last one do so as well?
if ( i == nrow(preds) && rules[i,'Rule'] > rules[(i-1), 'Rule'] ) rules[i,'Rule'] = rules[(i-1), 'Rule']

if (i == nrow(preds)) break;
}
rules = rules %>% mutate(Rx = map_chr(Rule, .rule.rx))


return(rules)
}

Expand Down
4 changes: 2 additions & 2 deletions R/utility_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ getcachedir<-function() {
}

# Renames 'get.chr.lengths'
chrInfo<-function(chrs=c(1:22, 'X','Y'), prefix='chr', build='hg19', file=NULL) {
chrInfo<-function(chrs=c(1:22, 'X','Y'), prefix='chr', build='hg19', file=NULL, verbose=F) {
local_file = paste(build,'_info.txt',sep='')

local_file = system.file("extdata", local_file, package="BarrettsProgressionRisk")
Expand All @@ -60,7 +60,7 @@ chrInfo<-function(chrs=c(1:22, 'X','Y'), prefix='chr', build='hg19', file=NULL)
file = tmp_file
}

if (!is.null(file) && file.exists(file)) {
if (!is.null(file) && file.exists(file) && verbose) {
message(paste0("Reading chromosome information for build ",build," from ",file))

chr.lengths = read.table(file, header = T, sep='\t', colClasses = c(character(), numeric(), numeric(), numeric(), numeric()), stringsAsFactors = F) %>% as_tibble()
Expand Down
44 changes: 24 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,50 +15,57 @@ library(BarrettsProgressionRisk)
# Load example data set
data(package='BarrettsProgressionRisk',ExampleQDNAseqData)
# bamPath needs to contain one or more bam files for a patient, the default binsize is 50.
# This should not be changed without extensive testing of the data unless you are retraining the underlying model!
# example of fitted data output from QDNAseq function call: runQDNAseq(bamPath='.', outputPath='<my path>/', binsize=50)
print(fit.data)
# example of raw data output from QDNAseq function call
print(raw.data)
# example of sample information table
print(info)
# Segment the fitted and raw data
segObj = segmentRawData(loadSampleInformation(info), raw.data, fit.data, verbose=F)
# Predict risks from segmented data
pr = predictRiskFromSegments(segObj, verbose=F)
## Results
## -- Results -- ##
# get QC information
sampleResiduals(pr)
# plot raw data
plotSegmentData(pr)
# risk per sample
predictions(pr,'sample')
# Per sample mountain plots with annotations for coefficients
copyNumberMountainPlot(pr, annotate=T)
### Per sample classifications/probabilities
# TODO error
patientRiskTilesPlot(pr)
# risk per sample for samples that pass QC (see sampleResiduals(...))
predictions(pr,'sample')
# TODO error
patientEndoscopyPlot(pr)
# output the absolute risk CI per sample
absoluteRiskCI(pr, 'sample')
# Plot the sample classifications by Endoscopy and location
patientRiskTilesPlot(pr)
### Per endoscopy classifications/probabilities
# risk per endoscopy
predictions(pr,'endoscopy')
# output the absolute risk CI per sample
absoluteRiskCI(pr, 'sample')
# output the absolute risk CI per endoscopy
absoluteRiskCI(pr, 'endoscopy')
# Plot the endoscopy predictions over time, with risk classifications and confidence intervals based on the table output by absoluteRiskCI(...)
patientEndoscopyPlot(pr)
# Get recommendations per endoscopy, given as either sequential integers or dates in the sample information loaded initially.
rx(pr)
```
Expand All @@ -77,14 +84,11 @@ library(knitr)
# bamPath needs to contain one or more bam files for a patient, the default binsize is 50.
# This should not be changed without extensive testing of the data unless you are retraining the underlying model!
BarrettsProgressionRisk::runQDNAseq(bamPath='.', outputPath=<path to qdnaseq output>, binsize=50)
# BarrettsProgressionRisk::runQDNAseq(bamPath='.', outputPath=<path to qdnaseq output>, binsize=50)
# qdnaseq.path=<path to qdnaseq output>
qdnaseq.path='examples/'
# info.file=<path to per sample p53 IHC/pathology file>
info.file = 'example/endoscopy.xlsx'
qdnaseq.path=system.file('extdata/example',package="BarrettsProgressionRisk")
info.file = system.file('extdata/example','endoscopy.xlsx',package="BarrettsProgressionRisk")
output.dir='~/tmp'
options(warn = -1)
Expand Down
Binary file added inst/extdata/example/endoscopy.xlsx
Binary file not shown.
Loading

0 comments on commit 64033c0

Please sign in to comment.