Skip to content

Commit

Permalink
adds test for left censorship
Browse files Browse the repository at this point in the history
  • Loading branch information
rmflight committed Apr 8, 2024
1 parent 781119b commit 056f338
Show file tree
Hide file tree
Showing 6 changed files with 981 additions and 3 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: ICIKendallTau
Title: Calculates information-content-informed Kendall-tau
Version: 1.0.3
Date: 2024-01-29
Version: 1.0.4
Date: 2024-04-08
Authors@R: c(person(given = c("Robert", "M"), family = "Flight", role =
c("aut", "cre"), email = "rflight79@gmail.com", comment =
c(ORCID = "0000-0001-8141-7788")), person(given = c("Hunter",
Expand All @@ -14,7 +14,7 @@ VignetteBuilder: knitr
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
LinkingTo: Rcpp
Imports: Rcpp, purrr, utils, stringr
Suggests: furrr, future, testthat (>= 3.0.0), microbenchmark,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ export(log_message)
export(long_df_2_cor_matrix)
export(pairwise_completeness)
export(show_progress)
export(test_left_censorship)
importFrom(Rcpp,sourceCpp)
useDynLib(ICIKendallTau)
119 changes: 119 additions & 0 deletions R/left_censorship.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#' Test for left censorship
#'
#' Does a binomial test to check if the most likely cause of missing values
#' is due to values being below the limit of detection, or coming from a
#' left-censored distribution.
#'
#' @param in_data matrix or data.frame of numeric data, with features as rows
#'   and samples as columns
#' @param sample_classes which samples are in which class, one entry per
#'   column of `in_data`. If `NULL`, all samples are treated as one class.
#' @param global_na what represents zero or missing?
#'
#' @details
#' Within each class of samples, every feature that is missing in at least one
#' sample is kept as a candidate to test. For each sample, we calculate the
#' median value with any missing values removed. For each kept feature, we then
#' check whether its remaining non-missing values are below the median of the
#' sample they come from. The binomial test uses the total number of
#' non-missing values of the kept features as the number of trials, and the
#' number of those values that are below their sample median as the number of
#' successes. It is run against a null probability of 0.5 with the one-sided
#' alternative `"greater"`.
#'
#' @export
#' @return list with `values`, a data.frame of trials and successes for each
#'   class, and `binomial_test`, the `binom.test` result
test_left_censorship <- function(in_data, sample_classes = NULL, global_na = c(0, NA))
{
  if (is.null(sample_classes)) {
    sample_classes <- rep("A", ncol(in_data))
  }

  split_indices <- split(seq_len(ncol(in_data)), sample_classes)
  in_data_missing <- setup_missing_matrix(in_data, global_na)

  # split the dataset by group
  split_counts <- purrr::imap(split_indices, \(in_split, split_id){
    # grab the group we want to work with
    split_missing <- in_data_missing[, in_split, drop = FALSE]

    # count the number of missing samples for each feature,
    # and keep those that have at least one
    n_miss <- rowSums(is.na(split_missing))
    # drop = FALSE keeps a two-dimensional object even when only a single
    # feature (or a single sample) remains; otherwise nrow() / ncol() below
    # return NULL and building the median matrix fails
    keep_miss <- split_missing[n_miss > 0, , drop = FALSE]

    # get sample medians
    sample_medians <- calculate_matrix_medians(split_missing, use = "col", na.rm = TRUE)

    # turn the medians into a matrix to make life easier; byrow = TRUE lines
    # each sample's median up with that sample's column
    median_matrix <- matrix(sample_medians, nrow = nrow(keep_miss),
                            ncol = ncol(keep_miss), byrow = TRUE)
    # do the comparison; missing values stay NA here
    keep_miss_updown <- keep_miss < median_matrix

    # count how many trials we ran, and how many successes we have
    all_trials <- (nrow(keep_miss_updown) * ncol(keep_miss_updown)) - sum(is.na(keep_miss_updown))
    all_success <- sum(keep_miss_updown, na.rm = TRUE)

    data.frame(trials = all_trials, success = all_success, class = split_id)
  }) |>
    purrr::list_rbind()

  total_trials <- sum(split_counts$trials)
  total_success <- sum(split_counts$success)

  binom_res <- binom.test(total_success, total_trials, p = 0.5, alternative = "greater")
  list(values = split_counts,
       binomial_test = binom_res)
}

# Median of every column (use = "col", the default) or every row
# (use = "row") of a matrix-like object.
#
# in_matrix: two-dimensional object that supports `[ , j]` indexing.
# use:       "row" transposes the input first so that rows are summarised;
#            any other value leaves the input as is and summarises columns.
# ...:       forwarded to median(), e.g. na.rm = TRUE.
#
# Returns an unnamed double vector with one median per column (or row).
calculate_matrix_medians <- function(in_matrix, use = "col", ...) {
  # Row medians reuse the column logic on the transposed input.
  target <- if (use %in% "row") t(in_matrix) else in_matrix

  vapply(
    seq_len(ncol(target)),
    function(col_idx) median(target[, col_idx], ...),
    numeric(1)
  )
}

# Replace every value that should count as "missing" with NA.
#
# data_matrix: two-dimensional object of values.
# global_na:   values that mark an entry as missing. An NA in here flags every
#              NA entry of data_matrix, an infinite value flags every infinite
#              entry, and any remaining value is matched with `==`.
#
# Returns a copy of data_matrix with all flagged entries set to NA.
setup_missing_matrix <- function(data_matrix, global_na) {
  is_missing <- matrix(FALSE, nrow = nrow(data_matrix), ncol = ncol(data_matrix))

  # Sort the markers into the two special cases and the plain sentinel values.
  marks_na <- is.na(global_na)
  marks_inf <- is.infinite(global_na)
  sentinels <- global_na[!marks_na & !marks_inf]

  if (any(marks_na)) {
    is_missing[is.na(data_matrix)] <- TRUE
  }
  if (any(marks_inf)) {
    is_missing[is.infinite(data_matrix)] <- TRUE
  }
  # NA results of the comparison are ignored by the scalar assignment.
  for (sentinel in sentinels) {
    is_missing[data_matrix == sentinel] <- TRUE
  }

  replace(data_matrix, is_missing, NA)
}

# Create replicate columns of `value` with random noise added.
#
# NOTE: despite the function name, the noise is drawn with rnorm(), i.e. from
# a normal distribution with mean 0 and standard deviation `sd`.
#
# n_rep:    number of replicate columns to generate.
# value:    vector of base values; each column of the result starts as a copy.
# sd:       standard deviation passed to rnorm().
# use_zero: if TRUE, return only the noise without adding `value`.
#
# Returns a matrix with length(value) rows and n_rep columns.
add_uniform_noise <- function(n_rep, value, sd, use_zero = FALSE) {
  n_row <- length(value)

  # The noise is always drawn, whether or not the base values are added.
  noise <- matrix(rnorm(n_rep * n_row, 0, sd), nrow = n_row, ncol = n_rep)

  if (!use_zero) {
    base_values <- matrix(value, nrow = n_row, ncol = n_rep, byrow = FALSE)
    return(base_values + noise)
  }

  noise
}
Loading

0 comments on commit 056f338

Please sign in to comment.