.Rhistory

if(length(curparents)==0){NA}else{ #for each row in gsea, extract the parents (j is the child)
overlaps <- sapply(curparents, function(i) ratiomat[j,i]) #extract overlap values for all parents (normalized by the set size of the child)
parent <- curparents[which(overlaps==max(overlaps))] #choose the parent with the highest overlap value. This could lead to several parents
overlaps2 <- ratiomat[ rep(j,length(parent)) , parent ] #extract overlap values for all parents (normalized by the set size of the parent)
parent <- ifelse(length(parent)==1, parent, parent[which(overlaps2==max(overlaps2))][1] ) #choose the parent with the highest overlap value. Still, this could leave several parents, which is why we just pick the first one
return(parent)
}
})
parent <- ifelse(is.na(parent), gsea$condenseID, parent) #if there is no parent, the child becomes its own parent
gsea$condenseParentID <- unlist(parent)
gsea$condenseSurvive <- ifelse(gsea$condenseID %in% eaten, FALSE, TRUE)
View(gsea)
library(gseaCondenser)
devtools::document()
devtools::document()
library(gseaCondenser)
library(gseaCondenser)
library(gseaCondenser)
library(gseaCondenser)
library(gseaCondenser)
library(gseaCondenser)
gsea <- gseaCondenser::myGsea
View(gsea)
gsea <- condenseGsea(gsea, finalParents=c("GO:0006508"), colname = "GO_ID")
gsea <- condenseGsea(gsea, finalParents=c("GO:0006508"), colname = "pathway")
gsea <- condenseGsea(gsea, finalParents=c("GO:0006508"), namecol = "pathway")
View(gsea)
gsea <- condenseGsea(gsea, finalParents=c("GO:0006508", "GO:0002576"), namecol = "pathway")
#' @param sep charachter. What genes in the column string? E.g. if a cell of the column would look like "HOXA9,HOXA3", you need to set sep=","
#' @param similarity number, what has to be the minimum gene overlap between two terms in order for one to be regarded as redundant?
#' @return data.frame similar to the input, but with 3 added columns: condenseID has a simple numeric ID for each row. condenseChildren lists the IDs of all sets that were eaten by this set. condenseDropout states whether or not this set was eaten itself.
#' @export
#' @examples
#' library(gseaCondenser)
#'
#' gsea <- gseaCondenser::myGsea
#' gsea <- condenseGsea(gsea, similarity=0.3)
#' head(gsea)
condenseGsea <- function(gsea, colname="genes", sep=",", similarity=0.9, n_finalParents=NULL, finalParents=NULL, namecol=NULL){
if(!is.null(n_finalParents) | !is.null(finalParents)){
message("Similarity cutoff will be ignored, because n_finalParents was defined!!!")
similarity <- -1
}
gsea$condenseID <- seq(nrow(gsea))
genes <- strsplit(gsea[,colname], split=sep)
#ratiomat will become matrix where each row is a set and each column is a set. The cells will contain the percentage of overlap between the sets.
ratiomat <- sapply(genes, function(x) sapply(genes, function(y) length(intersect(x,y)) )) #first, the cells contain absolute numbers
ratiomat <- as.data.frame(ratiomat)
rownames(ratiomat) <- NULL
colnames(ratiomat) <- seq(nrow(gsea))
ratiomat <- apply(ratiomat, 2, function(x) x/max(x)) #ratio is intersect of the sets vs. set size (of the column set)
ratiomat <- as.data.frame(ratiomat)
# define possible parents (only needed in case n_finalParents is defined)
possibleParents <- seq(nrow(gsea))
if(!is.null(finalParents)){
if(is.null(namecol)) stop("please provide a column name for the column containing the pathway names")
possibleParents <- which(gsea[,namecol] %in% finalParents)
}
if(!is.null(n_finalParents)){
cumsim <- apply(ratiomat, 1, var)
names(cumsim) <- seq_along(cumsim)
possibleParents <- as.numeric(head(names(cumsim[order(cumsim)]), n_finalParents)) #take those that have the least(!) variance in correlations
}
#each time a similarity value reaches the threshold, the smaller set's ID is added to "eaten" and to the "condenseChildren" column of the bigger set
eaten <- NA
gsea$condenseChildren <- ""
gsea$condenseParents <- ""
for(i in seq(nrow(ratiomat))) { #the ratio is intersect/n (number of genes of the column set), So j
for(j in seq(ncol(ratiomat))){
#if the ratio of intersect and set size (of set j) is bigger than the ratio of intersect and set size (of set i), it means that set j is smaller. Thus, it will get eaten
if(ratiomat[i,j]>similarity & i %in% possibleParents  & (ratiomat[i,j] > ratiomat[j,i] | !j %in% possibleParents)){ #by using >, sets do not eat themselves, and it is always the smaller set that gets eaten
gsea$condenseChildren[i] <- paste0(gsea$condenseChildren[i],",",j) #the set that does not get eaten gets the ID of j for its Children column
gsea$condenseParents[j] <- paste0(gsea$condenseParents[j],",",i) # the set that does get eaten gets the ID of i for its Parent column
eaten <- c(eaten,j) # just a vector, gathering all IDs from sets that were eaten
}
}
}
gsea$condenseChildren <- gsub("^,","",gsea$condenseChildren)
gsea$condenseParents <- gsub("^,","",gsea$condenseParents)
parents <- strsplit(sapply(gsea$condenseParents, function(x) x), split=",")
parents <- lapply(parents, as.numeric)
#the best parent will be determined by which parent shares the most genes with the child
parent <- lapply(seq_along(parents), function(j) {
curparents <- parents[[j]]
if(length(curparents)==0){NA}else{ #for each row in gsea, extract the parents (j is the child)
overlaps <- sapply(curparents, function(i) ratiomat[j,i]) #extract overlap values for all parents (normalized by the set size of the child)
parent <- curparents[which(overlaps==max(overlaps))] #choose the parent with the highest overlap value. This could lead to several parents
overlaps2 <- ratiomat[ rep(j,length(parent)) , parent ] #extract overlap values for all parents (normalized by the set size of the parent)
parent <- ifelse(length(parent)==1, parent, parent[which(overlaps2==max(overlaps2))][1] ) #choose the parent with the highest overlap value. Still, this could leave several parents, which is why we just pick the first one
return(parent)
}
})
parent <- ifelse(is.na(parent), gsea$condenseID, parent) #if there is no parent, the child becomes its own parent
gsea$condenseParentID <- unlist(parent)
gsea$condenseSurvive <- ifelse(gsea$condenseID %in% eaten, FALSE, TRUE)
return(gsea)
}
gsea <- condenseGsea(gsea, finalParents=c("GO:0006508", "GO:0002576"), namecol = "pathway")
View(gsea)
library(gseaCondenser)
#' @param sep charachter. What genes in the column string? E.g. if a cell of the column would look like "HOXA9,HOXA3", you need to set sep=","
#' @param similarity number, what has to be the minimum gene overlap between two terms in order for one to be regarded as redundant?
#' @return data.frame similar to the input, but with 3 added columns: condenseID has a simple numeric ID for each row. condenseChildren lists the IDs of all sets that were eaten by this set. condenseDropout states whether or not this set was eaten itself.
#' @export
#' @examples
#' library(gseaCondenser)
#'
#' gsea <- gseaCondenser::myGsea
#' gsea <- condenseGsea(gsea, similarity=0.3)
#' head(gsea)
condenseGsea <- function(gsea, colname="genes", sep=",", similarity=0.9, n_finalParents=NULL, finalParents=NULL, namecol=NULL){
if(!is.null(n_finalParents) | !is.null(finalParents)){
message("Similarity cutoff will be ignored, because n_finalParents was defined!!!")
similarity <- -1
}
gsea$condenseID <- seq(nrow(gsea))
genes <- strsplit(gsea[,colname], split=sep)
#ratiomat will become matrix where each row is a set and each column is a set. The cells will contain the percentage of overlap between the sets.
ratiomat <- sapply(genes, function(x) sapply(genes, function(y) length(intersect(x,y)) )) #first, the cells contain absolute numbers
ratiomat <- as.data.frame(ratiomat)
rownames(ratiomat) <- NULL
colnames(ratiomat) <- seq(nrow(gsea))
ratiomat <- apply(ratiomat, 2, function(x) x/max(x)) #ratio is intersect of the sets vs. set size (of the column set)
ratiomat <- as.data.frame(ratiomat)
# define possible parents (only needed in case n_finalParents is defined)
possibleParents <- seq(nrow(gsea))
if(!is.null(finalParents)){
if(is.null(namecol)) stop("please provide a column name for the column containing the pathway names")
possibleParents <- which(gsea[,namecol] %in% finalParents)
}
if(!is.null(n_finalParents)){
cumsim <- apply(ratiomat, 1, var)
names(cumsim) <- seq_along(cumsim)
possibleParents <- as.numeric(head(names(cumsim[order(cumsim)]), n_finalParents)) #take those that have the least(!) variance in correlations
}
#each time a similarity value reaches the threshold, the smaller set's ID is added to "eaten" and to the "condenseChildren" column of the bigger set
eaten <- NA
gsea$condenseChildren <- ""
gsea$condenseParents <- ""
for(i in seq(nrow(ratiomat))) { #the ratio is intersect/n (number of genes of the column set), So j
for(j in seq(ncol(ratiomat))){
#if the ratio of intersect and set size (of set j) is bigger than the ratio of intersect and set size (of set i), it means that set j is smaller. Thus, it will get eaten
if(ratiomat[i,j]>similarity & #the similarity threshold has to be passed
i %in% possibleParents  &  # i has to be a listed
(
ratiomat[i,j] > ratiomat[j,i] | #the parent has to be bigger than the child (and not the two cannot be the same)
!j %in% possibleParents |       #if j is not listed, than i can be a parent even if it is smaller than j
(i=j | !is.nul(finalParents))   #inormally a term does not become its own parent, but if it is one of the listed finalParents, it will be
)
){ #by using >, sets do not eat themselves, and it is always the smaller set that gets eaten
gsea$condenseChildren[i] <- paste0(gsea$condenseChildren[i],",",j) #the set that does not get eaten gets the ID of j for its Children column
gsea$condenseParents[j] <- paste0(gsea$condenseParents[j],",",i) # the set that does get eaten gets the ID of i for its Parent column
eaten <- c(eaten,j) # just a vector, gathering all IDs from sets that were eaten
}
}
}
gsea$condenseChildren <- gsub("^,","",gsea$condenseChildren)
gsea$condenseParents <- gsub("^,","",gsea$condenseParents)
parents <- strsplit(sapply(gsea$condenseParents, function(x) x), split=",")
parents <- lapply(parents, as.numeric)
#the best parent will be determined by which parent shares the most genes with the child
parent <- lapply(seq_along(parents), function(j) {
curparents <- parents[[j]]
if(length(curparents)==0){NA}else{ #for each row in gsea, extract the parents (j is the child)
overlaps <- sapply(curparents, function(i) ratiomat[j,i]) #extract overlap values for all parents (normalized by the set size of the child)
parent <- curparents[which(overlaps==max(overlaps))] #choose the parent with the highest overlap value. This could lead to several parents
overlaps2 <- ratiomat[ rep(j,length(parent)) , parent ] #extract overlap values for all parents (normalized by the set size of the parent)
parent <- ifelse(length(parent)==1, parent, parent[which(overlaps2==max(overlaps2))][1] ) #choose the parent with the highest overlap value. Still, this could leave several parents, which is why we just pick the first one
return(parent)
}
})
parent <- ifelse(is.na(parent), gsea$condenseID, parent) #if there is no parent, the child becomes its own parent
gsea$condenseParentID <- unlist(parent)
gsea$condenseSurvive <- ifelse(gsea$condenseID %in% eaten, FALSE, TRUE)
return(gsea)
}
gsea <- gseaCondenser::myGsea
gsea <- condenseGsea(gsea, finalParents=c("GO:0006508", "GO:0002576"), namecol = "pathway")
#' @param sep charachter. What genes in the column string? E.g. if a cell of the column would look like "HOXA9,HOXA3", you need to set sep=","
#' @param similarity number, what has to be the minimum gene overlap between two terms in order for one to be regarded as redundant?
#' @return data.frame similar to the input, but with 3 added columns: condenseID has a simple numeric ID for each row. condenseChildren lists the IDs of all sets that were eaten by this set. condenseDropout states whether or not this set was eaten itself.
#' @export
#' @examples
#' library(gseaCondenser)
#'
#' gsea <- gseaCondenser::myGsea
#' gsea <- condenseGsea(gsea, similarity=0.3)
#' head(gsea)
condenseGsea <- function(gsea, colname="genes", sep=",", similarity=0.9, n_finalParents=NULL, finalParents=NULL, namecol=NULL){
if(!is.null(n_finalParents) | !is.null(finalParents)){
message("Similarity cutoff will be ignored, because n_finalParents was defined!!!")
similarity <- -1
}
gsea$condenseID <- seq(nrow(gsea))
genes <- strsplit(gsea[,colname], split=sep)
#ratiomat will become matrix where each row is a set and each column is a set. The cells will contain the percentage of overlap between the sets.
ratiomat <- sapply(genes, function(x) sapply(genes, function(y) length(intersect(x,y)) )) #first, the cells contain absolute numbers
ratiomat <- as.data.frame(ratiomat)
rownames(ratiomat) <- NULL
colnames(ratiomat) <- seq(nrow(gsea))
ratiomat <- apply(ratiomat, 2, function(x) x/max(x)) #ratio is intersect of the sets vs. set size (of the column set)
ratiomat <- as.data.frame(ratiomat)
# define possible parents (only needed in case n_finalParents is defined)
possibleParents <- seq(nrow(gsea))
if(!is.null(finalParents)){
if(is.null(namecol)) stop("please provide a column name for the column containing the pathway names")
possibleParents <- which(gsea[,namecol] %in% finalParents)
}
if(!is.null(n_finalParents)){
cumsim <- apply(ratiomat, 1, var)
names(cumsim) <- seq_along(cumsim)
possibleParents <- as.numeric(head(names(cumsim[order(cumsim)]), n_finalParents)) #take those that have the least(!) variance in correlations
}
#each time a similarity value reaches the threshold, the smaller set's ID is added to "eaten" and to the "condenseChildren" column of the bigger set
eaten <- NA
gsea$condenseChildren <- ""
gsea$condenseParents <- ""
for(i in seq(nrow(ratiomat))) { #the ratio is intersect/n (number of genes of the column set), So j
for(j in seq(ncol(ratiomat))){
#if the ratio of intersect and set size (of set j) is bigger than the ratio of intersect and set size (of set i), it means that set j is smaller. Thus, it will get eaten
if(ratiomat[i,j]>similarity & #the similarity threshold has to be passed
i %in% possibleParents  &  # i has to be a listed
(
ratiomat[i,j] > ratiomat[j,i] | #the parent has to be bigger than the child (and not the two cannot be the same)
!j %in% possibleParents |       #if j is not listed, than i can be a parent even if it is smaller than j
(i=j | !is.null(finalParents))   #inormally a term does not become its own parent, but if it is one of the listed finalParents, it will be
)
){ #by using >, sets do not eat themselves, and it is always the smaller set that gets eaten
gsea$condenseChildren[i] <- paste0(gsea$condenseChildren[i],",",j) #the set that does not get eaten gets the ID of j for its Children column
gsea$condenseParents[j] <- paste0(gsea$condenseParents[j],",",i) # the set that does get eaten gets the ID of i for its Parent column
eaten <- c(eaten,j) # just a vector, gathering all IDs from sets that were eaten
}
}
}
gsea$condenseChildren <- gsub("^,","",gsea$condenseChildren)
gsea$condenseParents <- gsub("^,","",gsea$condenseParents)
parents <- strsplit(sapply(gsea$condenseParents, function(x) x), split=",")
parents <- lapply(parents, as.numeric)
#the best parent will be determined by which parent shares the most genes with the child
parent <- lapply(seq_along(parents), function(j) {
curparents <- parents[[j]]
if(length(curparents)==0){NA}else{ #for each row in gsea, extract the parents (j is the child)
overlaps <- sapply(curparents, function(i) ratiomat[j,i]) #extract overlap values for all parents (normalized by the set size of the child)
parent <- curparents[which(overlaps==max(overlaps))] #choose the parent with the highest overlap value. This could lead to several parents
overlaps2 <- ratiomat[ rep(j,length(parent)) , parent ] #extract overlap values for all parents (normalized by the set size of the parent)
parent <- ifelse(length(parent)==1, parent, parent[which(overlaps2==max(overlaps2))][1] ) #choose the parent with the highest overlap value. Still, this could leave several parents, which is why we just pick the first one
return(parent)
}
})
parent <- ifelse(is.na(parent), gsea$condenseID, parent) #if there is no parent, the child becomes its own parent
gsea$condenseParentID <- unlist(parent)
gsea$condenseSurvive <- ifelse(gsea$condenseID %in% eaten, FALSE, TRUE)
return(gsea)
}
gsea <- condenseGsea(gsea, finalParents=c("GO:0006508", "GO:0002576"), namecol = "pathway")
colname="genes"
sep=","
similarity=0.9
similarity=0.9
finalParents=c("GO:0006508", "GO:0002576")
namecol = "pathway"
if(!is.null(n_finalParents) | !is.null(finalParents)){
message("Similarity cutoff will be ignored, because n_finalParents was defined!!!")
similarity <- -1
}
gsea$condenseID <- seq(nrow(gsea))
genes <- strsplit(gsea[,colname], split=sep)
n_finalParents=NULL
if(!is.null(n_finalParents) | !is.null(finalParents)){
message("Similarity cutoff will be ignored, because n_finalParents was defined!!!")
similarity <- -1
}
gsea$condenseID <- seq(nrow(gsea))
genes <- strsplit(gsea[,colname], split=sep)
#ratiomat will become matrix where each row is a set and each column is a set. The cells will contain the percentage of overlap between the sets.
ratiomat <- sapply(genes, function(x) sapply(genes, function(y) length(intersect(x,y)) )) #first, the cells contain absolute numbers
ratiomat <- as.data.frame(ratiomat)
rownames(ratiomat) <- NULL
colnames(ratiomat) <- seq(nrow(gsea))
ratiomat <- apply(ratiomat, 2, function(x) x/max(x)) #ratio is intersect of the sets vs. set size (of the column set)
ratiomat <- as.data.frame(ratiomat)
# define possible parents (only needed in case n_finalParents is defined)
possibleParents <- seq(nrow(gsea))
if(!is.null(finalParents)){
if(is.null(idcol)) stop("please provide a column name for the column containing the pathway names")
possibleParents <- which(gsea[,idcol] %in% finalParents)
}
if(!is.null(n_finalParents)){
cumsim <- apply(ratiomat, 1, var)
names(cumsim) <- seq_along(cumsim)
possibleParents <- as.numeric(head(names(cumsim[order(cumsim)]), n_finalParents)) #take those that have the least(!) variance in correlations
}
#each time a similarity value reaches the threshold, the smaller set's ID is added to "eaten" and to the "condenseChildren" column of the bigger set
eaten <- NA
gsea$condenseChildren <- ""
gsea$condenseParents <- ""
for(i in seq(nrow(ratiomat))) { #the ratio is intersect/n (number of genes of the column set), So j
for(j in seq(ncol(ratiomat))){
#if the ratio of intersect and set size (of set j) is bigger than the ratio of intersect and set size (of set i), it means that set j is smaller. Thus, it will get eaten
if(ratiomat[i,j]>similarity & #the similarity threshold has to be passed
i %in% possibleParents  &  # i has to be a listed
(
ratiomat[i,j] > ratiomat[j,i] | #the parent has to be bigger than the child (and not the two cannot be the same)
!j %in% possibleParents |       #if j is not listed, than i can be a parent even if it is smaller than j
(i=j | !is.null(finalParents))   #inormally a term does not become its own parent, but if it is one of the listed finalParents, it will be
)
){ #by using >, sets do not eat themselves, and it is always the smaller set that gets eaten
gsea$condenseChildren[i] <- paste0(gsea$condenseChildren[i],",",j) #the set that does not get eaten gets the ID of j for its Children column
gsea$condenseParents[j] <- paste0(gsea$condenseParents[j],",",i) # the set that does get eaten gets the ID of i for its Parent column
eaten <- c(eaten,j) # just a vector, gathering all IDs from sets that were eaten
}
}
}
warnings()
i=1
j=2
#if the ratio of intersect and set size (of set j) is bigger than the ratio of intersect and set size (of set i), it means that set j is smaller. Thus, it will get eaten
if(ratiomat[i,j]>similarity & #the similarity threshold has to be passed
i %in% possibleParents  &  # i has to be a listed
(
ratiomat[i,j] > ratiomat[j,i] | #the parent has to be bigger than the child (and not the two cannot be the same)
!j %in% possibleParents |       #if j is not listed, than i can be a parent even if it is smaller than j
(i=j | !is.null(finalParents))   #inormally a term does not become its own parent, but if it is one of the listed finalParents, it will be
)
){ #by using >, sets do not eat themselves, and it is always the smaller set that gets eaten
gsea$condenseChildren[i] <- paste0(gsea$condenseChildren[i],",",j) #the set that does not get eaten gets the ID of j for its Children column
gsea$condenseParents[j] <- paste0(gsea$condenseParents[j],",",i) # the set that does get eaten gets the ID of i for its Parent column
eaten <- c(eaten,j) # just a vector, gathering all IDs from sets that were eaten
}
for(i in seq(nrow(ratiomat))) { #the ratio is intersect/n (number of genes of the column set), So j
for(j in seq(ncol(ratiomat))){
#if the ratio of intersect and set size (of set j) is bigger than the ratio of intersect and set size (of set i), it means that set j is smaller. Thus, it will get eaten
if(ratiomat[i,j]>similarity & #the similarity threshold has to be passed
i %in% possibleParents  &  # i has to be a listed
(
ratiomat[i,j] > ratiomat[j,i] | #the parent has to be bigger than the child (and not the two cannot be the same)
!j %in% possibleParents |       #if j is not listed, than i can be a parent even if it is smaller than j
(i=j | !is.null(finalParents))   #inormally a term does not become its own parent, but if it is one of the listed finalParents, it will be
)
){ #by using >, sets do not eat themselves, and it is always the smaller set that gets eaten
gsea$condenseChildren[i] <- paste0(gsea$condenseChildren[i],",",j) #the set that does not get eaten gets the ID of j for its Children column
gsea$condenseParents[j] <- paste0(gsea$condenseParents[j],",",i) # the set that does get eaten gets the ID of i for its Parent column
eaten <- c(eaten,j) # just a vector, gathering all IDs from sets that were eaten
}
}
}
#if the ratio of intersect and set size (of set j) is bigger than the ratio of intersect and set size (of set i), it means that set j is smaller. Thus, it will get eaten
if(ratiomat[i,j]>similarity & #the similarity threshold has to be passed
i %in% possibleParents  &  # i has to be a listed
(
ratiomat[i,j] > ratiomat[j,i] | #the parent has to be bigger than the child (and not the two cannot be the same)
!j %in% possibleParents |       #if j is not listed, than i can be a parent even if it is smaller than j
(i=j | !is.null(finalParents))   #inormally a term does not become its own parent, but if it is one of the listed finalParents, it will be
)
){ #by using >, sets do not eat themselves, and it is always the smaller set that gets eaten
gsea$condenseChildren[i] <- paste0(gsea$condenseChildren[i],",",j) #the set that does not get eaten gets the ID of j for its Children column
gsea$condenseParents[j] <- paste0(gsea$condenseParents[j],",",i) # the set that does get eaten gets the ID of i for its Parent column
eaten <- c(eaten,j) # just a vector, gathering all IDs from sets that were eaten
}
ratiomat[i,j]>similarity
i
j
i
View(ratiomat)
#ratiomat will become matrix where each row is a set and each column is a set. The cells will contain the percentage of overlap between the sets.
ratiomat <- sapply(genes, function(x) sapply(genes, function(y) length(intersect(x,y)) )) #first, the cells contain absolute numbers
ratiomat <- as.data.frame(ratiomat)
rownames(ratiomat) <- NULL
colnames(ratiomat) <- seq(nrow(gsea))
ratiomat <- apply(ratiomat, 2, function(x) x/max(x)) #ratio is intersect of the sets vs. set size (of the column set)
ratiomat <- as.data.frame(ratiomat)
ratiomat[i,j]>similarity
ratiomat[1,5]
i
j
i=5
ratiomat[i,j]>similarity
for(i in seq(nrow(ratiomat))) { #the ratio is intersect/n (number of genes of the column set), So j
for(j in seq(ncol(ratiomat))){
#if the ratio of intersect and set size (of set j) is bigger than the ratio of intersect and set size (of set i), it means that set j is smaller. Thus, it will get eaten
if(ratiomat[i,j]>similarity & #the similarity threshold has to be passed
i %in% possibleParents  &  # i has to be a listed
(
ratiomat[i,j] > ratiomat[j,i] | #the parent has to be bigger than the child (and not the two cannot be the same)
!j %in% possibleParents |       #if j is not listed, than i can be a parent even if it is smaller than j
(i==j | !is.null(finalParents))   #inormally a term does not become its own parent, but if it is one of the listed finalParents, it will be
)
){ #by using >, sets do not eat themselves, and it is always the smaller set that gets eaten
gsea$condenseChildren[i] <- paste0(gsea$condenseChildren[i],",",j) #the set that does not get eaten gets the ID of j for its Children column
gsea$condenseParents[j] <- paste0(gsea$condenseParents[j],",",i) # the set that does get eaten gets the ID of i for its Parent column
eaten <- c(eaten,j) # just a vector, gathering all IDs from sets that were eaten
}
}
}
gsea$condenseChildren <- gsub("^,","",gsea$condenseChildren)
gsea$condenseParents <- gsub("^,","",gsea$condenseParents)
parents <- strsplit(sapply(gsea$condenseParents, function(x) x), split=",")
parents <- lapply(parents, as.numeric)
parents
warnings()
if(!is.null(n_finalParents) | !is.null(finalParents)){
message("Similarity cutoff will be ignored, because n_finalParents was defined!!!")
similarity <- -1
}
gsea$condenseID <- seq(nrow(gsea))
genes <- strsplit(gsea[,colname], split=sep)
#ratiomat will become matrix where each row is a set and each column is a set. The cells will contain the percentage of overlap between the sets.
ratiomat <- sapply(genes, function(x) sapply(genes, function(y) length(intersect(x,y)) )) #first, the cells contain absolute numbers
ratiomat <- as.data.frame(ratiomat)
rownames(ratiomat) <- NULL
colnames(ratiomat) <- seq(nrow(gsea))
ratiomat <- apply(ratiomat, 2, function(x) x/max(x)) #ratio is intersect of the sets vs. set size (of the column set)
ratiomat <- as.data.frame(ratiomat)
# define possible parents (only needed in case n_finalParents is defined)
possibleParents <- seq(nrow(gsea))
if(!is.null(finalParents)){
if(is.null(idcol)) stop("please provide a column name for the column containing the pathway names")
possibleParents <- which(gsea[,idcol] %in% finalParents)
}
if(!is.null(n_finalParents)){
cumsim <- apply(ratiomat, 1, var)
names(cumsim) <- seq_along(cumsim)
possibleParents <- as.numeric(head(names(cumsim[order(cumsim)]), n_finalParents)) #take those that have the least(!) variance in correlations
}
idcol="pathway"
if(!is.null(n_finalParents) | !is.null(finalParents)){
message("Similarity cutoff will be ignored, because n_finalParents was defined!!!")
similarity <- -1
}
gsea$condenseID <- seq(nrow(gsea))
genes <- strsplit(gsea[,colname], split=sep)
#ratiomat will become matrix where each row is a set and each column is a set. The cells will contain the percentage of overlap between the sets.
ratiomat <- sapply(genes, function(x) sapply(genes, function(y) length(intersect(x,y)) )) #first, the cells contain absolute numbers
ratiomat <- as.data.frame(ratiomat)
rownames(ratiomat) <- NULL
colnames(ratiomat) <- seq(nrow(gsea))
ratiomat <- apply(ratiomat, 2, function(x) x/max(x)) #ratio is intersect of the sets vs. set size (of the column set)
ratiomat <- as.data.frame(ratiomat)
# define possible parents (only needed in case n_finalParents is defined)
possibleParents <- seq(nrow(gsea))
if(!is.null(finalParents)){
if(is.null(idcol)) stop("please provide a column name for the column containing the pathway names")
possibleParents <- which(gsea[,idcol] %in% finalParents)
}
if(!is.null(n_finalParents)){
cumsim <- apply(ratiomat, 1, var)
names(cumsim) <- seq_along(cumsim)
possibleParents <- as.numeric(head(names(cumsim[order(cumsim)]), n_finalParents)) #take those that have the least(!) variance in correlations
}
#each time a similarity value reaches the threshold, the smaller set's ID is added to "eaten" and to the "condenseChildren" column of the bigger set
eaten <- NA
gsea$condenseChildren <- ""
gsea$condenseParents <- ""
for(i in seq(nrow(ratiomat))) { #the ratio is intersect/n (number of genes of the column set), So j
for(j in seq(ncol(ratiomat))){
#if the ratio of intersect and set size (of set j) is bigger than the ratio of intersect and set size (of set i), it means that set j is smaller. Thus, it will get eaten
if(ratiomat[i,j]>similarity & #the similarity threshold has to be passed
i %in% possibleParents  &  # i has to be a listed
(
ratiomat[i,j] > ratiomat[j,i] | #the parent has to be bigger than the child (and not the two cannot be the same)
!j %in% possibleParents |       #if j is not listed, than i can be a parent even if it is smaller than j
(i==j | !is.null(finalParents))   #inormally a term does not become its own parent, but if it is one of the listed finalParents, it will be
)
){ #by using >, sets do not eat themselves, and it is always the smaller set that gets eaten
gsea$condenseChildren[i] <- paste0(gsea$condenseChildren[i],",",j) #the set that does not get eaten gets the ID of j for its Children column
gsea$condenseParents[j] <- paste0(gsea$condenseParents[j],",",i) # the set that does get eaten gets the ID of i for its Parent column
eaten <- c(eaten,j) # just a vector, gathering all IDs from sets that were eaten
}
}
}
gsea$condenseChildren <- gsub("^,","",gsea$condenseChildren)
gsea$condenseParents <- gsub("^,","",gsea$condenseParents)
parents <- strsplit(sapply(gsea$condenseParents, function(x) x), split=",")
parents <- lapply(parents, as.numeric)
#the best parent will be determined by which parent shares the most genes with the child
parent <- lapply(seq_along(parents), function(j) {
curparents <- parents[[j]]
if(length(curparents)==0){NA}else{ #for each row in gsea, extract the parents (j is the child)
overlaps <- sapply(curparents, function(i) ratiomat[j,i]) #extract overlap values for all parents (normalized by the set size of the child)
parent <- curparents[which(overlaps==max(overlaps))] #choose the parent with the highest overlap value. This could lead to several parents
overlaps2 <- ratiomat[ rep(j,length(parent)) , parent ] #extract overlap values for all parents (normalized by the set size of the parent)
parent <- ifelse(length(parent)==1, parent, parent[which(overlaps2==max(overlaps2))][1] ) #choose the parent with the highest overlap value. Still, this could leave several parents, which is why we just pick the first one
return(parent)
}
})
parent <- ifelse(is.na(parent), gsea$condenseID, parent) #if there is no parent, the child becomes its own parent
gsea$condenseParentID <- unlist(parent)
gsea$condenseSurvive <- ifelse(gsea$condenseID %in% eaten, FALSE, TRUE)
parent
View(gsea)
library(gseaCondenser)
library(gseaCondenser)
#the best parent will be determined by which parent shares the most genes with the child
parentdata <- lapply(seq_along(parents), function(j) {
curparents <- parents[[j]]
if(length(curparents)==0){NA}else{ #for each row in gsea, extract the parents (j is the child)
overlaps <- sapply(curparents, function(i) ratiomat[j,i]) #extract overlap values for all parents (normalized by the set size of the child)
parent <- curparents[which(overlaps==max(overlaps))] #choose the parent with the highest overlap value. This could lead to several parents
overlaps2 <- ratiomat[ rep(j,length(parent)) , parent ] #extract overlap values for all parents (normalized by the set size of the parent)
parent <- ifelse(length(parent)==1, parent, parent[which(overlaps2==max(overlaps2))][1] ) #choose the parent with the highest overlap value. Still, this could leave several parents, which is why we just pick the first one
return(c(parent,max(overlaps)))
}
})
parent <- unlist(lapply(parentdata, function(x) x[1]))
parent
gsea$condenseOverlapratio <- unlist(lapply(parentdata, function(x) x[2]))
View(gsea)
library(gseaCondenser)
library(gseaCondenser)
devtools::document()
library(gseaCondenser)
gsea <- gseaCondenser::myGsea