From d6ca45dbbfb112a108a94083625938a0a490882f Mon Sep 17 00:00:00 2001
From: rmflight
Date: Wed, 10 Apr 2024 10:52:55 -0400
Subject: [PATCH] adds test_left_censorship, and all the docs to do it
---
DESCRIPTION | 44 ++--
NEWS.md | 4 +
R/kendalltau.R | 16 +-
R/left_censorship.R | 4 +-
README.Rmd | 8 +-
README.html | 26 +--
README.md | 26 +--
docs/404.html | 3 +-
docs/CODE_OF_CONDUCT.html | 3 +-
docs/LICENSE-text.html | 3 +-
docs/LICENSE.html | 5 +-
docs/articles/ici-kendalltau.html | 25 ++-
docs/articles/index.html | 5 +-
.../articles/testing-for-left-censorship.html | 202 ++++++++++++++++++
.../figure-html/examine-missingness-1.png | Bin 0 -> 72263 bytes
docs/authors.html | 11 +-
docs/index.html | 27 ++-
docs/news/index.html | 7 +-
docs/pkgdown.yml | 5 +-
docs/reference/add_uniform_noise.html | 111 ++++++++++
docs/reference/calculate_matrix_medians.html | 104 +++++++++
docs/reference/cor_matrix_2_long_df.html | 3 +-
docs/reference/disable_logging.html | 3 +-
docs/reference/enable_logging.html | 3 +-
docs/reference/ici_kendalltau.html | 27 ++-
docs/reference/ici_kendalltau_ref.html | 5 +-
docs/reference/ici_kt.html | 3 +-
docs/reference/index.html | 23 +-
docs/reference/kt_fast.html | 7 +-
docs/reference/log_memory.html | 3 +-
docs/reference/log_message.html | 3 +-
docs/reference/long_df_2_cor_matrix.html | 3 +-
docs/reference/missing_dataset.html | 100 +++++++++
docs/reference/pairwise_completeness.html | 7 +-
docs/reference/show_progress.html | 3 +-
docs/reference/test_left_censorship.html | 149 +++++++++++++
docs/search.json | 2 +-
docs/sitemap.xml | 15 ++
man/ici_kendalltau.Rd | 11 +-
man/kt_fast.Rd | 4 +-
man/pairwise_completeness.Rd | 3 +
man/test_left_censorship.Rd | 6 +-
vignettes/ici-kendalltau.Rmd | 4 +-
vignettes/testing-for-left-censorship.Rmd | 91 ++++++++
44 files changed, 998 insertions(+), 119 deletions(-)
create mode 100644 docs/articles/testing-for-left-censorship.html
create mode 100644 docs/articles/testing-for-left-censorship_files/figure-html/examine-missingness-1.png
create mode 100644 docs/reference/add_uniform_noise.html
create mode 100644 docs/reference/calculate_matrix_medians.html
create mode 100644 docs/reference/missing_dataset.html
create mode 100644 docs/reference/test_left_censorship.html
create mode 100644 vignettes/testing-for-left-censorship.Rmd
diff --git a/DESCRIPTION b/DESCRIPTION
index 47c3d21..9c47462 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,24 +1,44 @@
Package: ICIKendallTau
Title: Calculates information-content-informed Kendall-tau
-Version: 1.0.5
-Date: 2024-04-08
-Authors@R: c(person(given = c("Robert", "M"), family = "Flight", role =
- c("aut", "cre"), email = "rflight79@gmail.com", comment =
- c(ORCID = "0000-0001-8141-7788")), person(given = c("Hunter",
- "NB"), family = "Moseley", role = "aut", comment = c(ORCID =
- "0000-0003-3995-5368")))
+Version: 1.1.0
+Authors@R: c(
+ person(
+ given = c("Robert", "M"),
+ family = "Flight",
+ role = c("aut", "cre"),
+ email = "rflight79@gmail.com",
+ comment = c(ORCID = "0000-0001-8141-7788")),
+ person(
+ given = c("Hunter", "NB"),
+ family = "Moseley",
+ role = "aut",
+ comment = c(ORCID = "0000-0003-3995-5368")))
Description: Provides functions for calculating
information-content-informed Kendall-tau. This version of
Kendall-tau allows for the inclusion of missing values.
VignetteBuilder: knitr
+LazyData: true
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
LinkingTo: Rcpp
-Imports: Rcpp, purrr, utils, stringr
-Suggests: furrr, future, testthat (>= 3.0.0), microbenchmark,
- rmarkdown, knitr, dplyr, logger
-URL: https://moseleybioinformaticslab.github.io/ICIKendallTau
- https://github.com/moseleybioinformaticslab/ICIKendallTau
+Imports: Rcpp,
+ purrr,
+ utils,
+ stringr,
+ stats
+Suggests: furrr,
+ future,
+ testthat (>= 3.0.0),
+ microbenchmark,
+ rmarkdown,
+ knitr,
+ dplyr,
+ logger,
+ withr,
+ naniar
+URL: https://moseleybioinformaticslab.github.io/ICIKendallTau/
+ https://github.com/moseleybioinformaticslab/ICIKendallTau/
+BugReports: https://github.com/moseleybioinformaticslab/ICIKendallTau/issues
Config/testthat/edition: 3
diff --git a/NEWS.md b/NEWS.md
index 6881600..b3206b8 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,7 @@
+# ICIKendallTau 1.1.0
+
+- adds the function `test_left_censorship` to verify if `ici_kendalltau` is appropriate to use on the data or not.
+
# ICIKendallTau 1.0.0
- Calculates correlation between columns of the matrix, **not** the rows.
diff --git a/R/kendalltau.R b/R/kendalltau.R
index 1e7ab6d..fdb843c 100644
--- a/R/kendalltau.R
+++ b/R/kendalltau.R
@@ -12,11 +12,9 @@
#' @param check_timing logical to determine should we try to estimate run time for full dataset? (default is FALSE)
#' @param return_matrix logical, should the data.frame or matrix result be returned?
#'
-#' @seealso [vignette("ici-kendalltau", package = "ICIKendallTau")] [test_left_censorship()]
+#' @seealso [test_left_censorship()] [pairwise_completeness()] [kt_fast()]
#'
-#' @details For more details, see the ICI-Kendall-tau vignette
-#'
-#' \code{browseVignettes("ICIKendallTau")}
+#' @details For more details, see the vignette `vignette("ici-kendalltau", package = "ICIKendallTau")`
#'
#' The default for \code{global_na} includes what values in the data to replace with NA for the Kendall-tau calculation. By default these are \code{global_na = c(NA, Inf, 0)}. If you want to replace something other than 0, for example, you might use \code{global_na = c(NA, Inf, -2)}, and all values of -2 will be replaced instead of 0.
#'
@@ -30,6 +28,10 @@
#' * taumax: the theoretical maximum kendall-tau value possible
#'
#' Eventually, we plan to provide two more parameters for replacing values, \code{feature_na} for feature specific NA values and \code{sample_na} for sample specific NA values.
+#'
+#' If you want to know if the missing values in your data are possibly due to
+#' left-censorship, we recommend testing that hypothesis with [test_left_censorship()]
+#' first.
#'
#' @return list with cor, raw, pval, taumax
#'
@@ -315,8 +317,8 @@ ici_kendalltau = function(data_matrix,
#' @param use an optional character string giving a method for computing correlations in the presence of missing values. This must be (an abbreviation of) one of the strings "everything", "all.obs", "complete.obs", or "pairwise.complete.obs".
#' @param return_matrix Should the matrices of values be returned, or a long data.frame
#'
-#' @details Although the interface is *mostly* identical to the built-in `stats::cor` method,
-#' there are some differences.
+#' @details Although the interface is *mostly* identical to the built-in
+#' [stats::cor()] method, there are some differences.
#'
#' * if providing both `x` and `y`, it is assumed they are both
#' single vectors.
@@ -535,6 +537,8 @@ kt_fast = function(x, y = NULL, use = "everything", return_matrix = TRUE)
#' @param include_only is there certain comparisons to do?
#' @param return_matrix should the matrix or data.frame be returned?
#'
+#' @seealso [ici_kendalltau()]
+#'
#' @export
#'
#' @return matrix of degree of completeness
diff --git a/R/left_censorship.R b/R/left_censorship.R
index 040cd51..039b102 100644
--- a/R/left_censorship.R
+++ b/R/left_censorship.R
@@ -17,7 +17,9 @@
#' instances (minus missing values) as the number of trials, and the number of
#' of features below the sample medians as the number of successes.
#'
-#' @seealso [vignette("testing-for-left-censorship", package = "ICIKendallTau")]
+#' There is a bit more detail in the vignette: `vignette("testing-for-left-censorship", package = "ICIKendallTau")`
+#'
+#' @seealso [ici_kendalltau()]
#'
#' @examples
#' # this example has 80% missing due to left-censorship
diff --git a/README.Rmd b/README.Rmd
index 14da6aa..40e33d5 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -21,7 +21,7 @@ knitr::opts_chunk$set(
[![ICIKendallTau status badge](https://moseleybioinformaticslab.r-universe.dev/badges/ICIKendallTau)](https://moseleybioinformaticslab.r-universe.dev)
-You can see the pkgdown site [here](https://moseleybioinformaticslab.github.io/ICIKendallTau).
+You can see the pkgdown site [here](https://moseleybioinformaticslab.github.io/ICIKendallTau/).
## Installation
@@ -49,7 +49,7 @@ install.packages("ICIKendallTau")
* In these cases, NA is informative.
* Therefore, in **most** analytical measurements (gene expression, proteomics, metabolomics), missing measurements should be included, and contribute to the correlation.
-If you want to read more on **how** we solve this problem, see the package vignette.
+If you want to read more on **how** we solve this problem, see the package [vignette](https://moseleybioinformaticslab.github.io/ICIKendallTau/articles/ici-kendalltau.html).
## Package Functions
@@ -65,6 +65,8 @@ The functions that implement this include:
* `plan(multiprocess)`
* Otherwise will only use a single core.
+We've also included a function for testing if the missingness in your data comes from left-censorship, `test_left_censorship`. We walk through creating example data and testing it in the vignette [Testing for Left Censorship](https://moseleybioinformaticslab.github.io/ICIKendallTau/articles/testing-for-left-censorship.html).
+
## Examples
The most common case is a large matrix of independent samples (columns) and measured features in each of the samples (i.e. gene expression).
@@ -158,7 +160,7 @@ r_3 = ici_kendalltau(matrix_2)
In the case of hundreds of thousands of comparisons to be done, the result matrices can become very, very large, and require lots of memory for storage.
They are also inefficient, as both the lower and upper triangular components are stored.
An alternative storage format is as a `data.frame`, where there is a single row for each comparison performed.
-This is actually how the results are stored internally, and then they are converted to a matrix form if requested (the default).s
+This is actually how the results are stored internally, and then they are converted to a matrix form if requested (the default).
To keep the `data.frame` output, add the argument `return_matrix=FALSE` to the call of `ici_kendalltau`.
```{r}
diff --git a/README.html b/README.html
index c3315be..b6d099e 100644
--- a/README.html
+++ b/README.html
@@ -606,10 +606,10 @@
We’ve also included a function for testing if the missingness in your
+data comes from left-censorship, test_left_censorship. We
+walk through creating example data and testing it in the vignette Testing
+for Left Censorship.
Examples
The most common case is a large matrix of independent samples
(columns) and measured features in each of the samples (i.e. gene
@@ -721,14 +725,10 @@
Is It Fast?
times =5)#> Unit: microseconds
-#> expr min lq mean median uq max
-#> cor(x, y, method = "kendall") 11506.697 11670.094 12169.6628 12006.418 12482.883 13182.222
-#> ici_kt(x, y, "global") 243.866 250.125 294.6542 275.104 320.058 384.118
-#> ici_kt(x2, y2, "global") 13467.011 13739.312 14658.5050 14945.446 14987.140 16153.616
-#> neval
-#> 5
-#> 5
-#> 5
+#> expr min lq mean median uq max neval
+#> cor(x, y, method = "kendall") 11685.244 12730.878 12860.9060 13071.630 13406.514 13410.264 5
+#> ici_kt(x, y, "global") 263.306 268.503 332.1288 274.858 283.589 570.388 5
+#> ici_kt(x2, y2, "global") 14110.743 14322.836 15782.6490 16053.907 16595.979 17829.780 5
In the case of 40,000 features, the average time on a modern CPU is
14 milliseconds.
Of course, if you want to use it to calculate Kendall-tau-b without
@@ -757,7 +757,7 @@
Many Many Comparisons
triangular components are stored. An alternative storage format is as a
data.frame, where there is a single row for each comparison
performed. This is actually how the results are stored internally, and
-then they are converted to a matrix form if requested (the default).s To
+then they are converted to a matrix form if requested (the default). To
keep the data.frame output, add the argument
return_matrix=FALSE to the call of
ici_kendalltau.
@@ -770,7 +770,7 @@
Please note that the ICIKendallTau project is released with a Contributor
Code of Conduct. By contributing to this project, you agree to abide
diff --git a/README.md b/README.md
index 7435e20..c50e03f 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ badge](https://moseleybioinformaticslab.r-universe.dev/badges/ICIKendallTau)](ht
You can see the pkgdown site
-[here](https://moseleybioinformaticslab.github.io/ICIKendallTau).
+[here](https://moseleybioinformaticslab.github.io/ICIKendallTau/).
## Installation
@@ -47,7 +47,8 @@ install.packages("ICIKendallTau")
and contribute to the correlation.
If you want to read more on **how** we solve this problem, see the
-package vignette.
+package
+[vignette](https://moseleybioinformaticslab.github.io/ICIKendallTau/articles/ici-kendalltau.html).
## Package Functions
@@ -66,6 +67,11 @@ The functions that implement this include:
- `plan(multiprocess)`
- Otherwise will only use a single core.
+We’ve also included a function for testing if the missingness in your
+data comes from left-censorship, `test_left_censorship`. We walk through
+creating example data and testing it in the vignette [Testing for Left
+Censorship](https://moseleybioinformaticslab.github.io/ICIKendallTau/articles/testing-for-left-censorship.html).
+
## Examples
The most common case is a large matrix of independent samples (columns)
@@ -136,14 +142,10 @@ microbenchmark(
times = 5
)
#> Unit: microseconds
-#> expr min lq mean median uq max
-#> cor(x, y, method = "kendall") 11506.697 11670.094 12169.6628 12006.418 12482.883 13182.222
-#> ici_kt(x, y, "global") 243.866 250.125 294.6542 275.104 320.058 384.118
-#> ici_kt(x2, y2, "global") 13467.011 13739.312 14658.5050 14945.446 14987.140 16153.616
-#> neval
-#> 5
-#> 5
-#> 5
+#> expr min lq mean median uq max neval
+#> cor(x, y, method = "kendall") 11685.244 12730.878 12860.9060 13071.630 13406.514 13410.264 5
+#> ici_kt(x, y, "global") 263.306 268.503 332.1288 274.858 283.589 570.388 5
+#> ici_kt(x2, y2, "global") 14110.743 14322.836 15782.6490 16053.907 16595.979 17829.780 5
```
In the case of 40,000 features, the average time on a modern CPU is 14
@@ -188,7 +190,7 @@ for storage. They are also inefficient, as both the lower and upper
triangular components are stored. An alternative storage format is as a
`data.frame`, where there is a single row for each comparison performed.
This is actually how the results are stored internally, and then they
-are converted to a matrix form if requested (the default).s To keep the
+are converted to a matrix form if requested (the default). To keep the
`data.frame` output, add the argument `return_matrix=FALSE` to the call
of `ici_kendalltau`.
@@ -202,7 +204,7 @@ r_4
#> 3 s4 s4 0 1.0000000 0 1.000000 1.0000000
#>
#> $run_time
-#> [1] 0.01606894
+#> [1] 0.01747489
```
## Code of Conduct
diff --git a/docs/404.html b/docs/404.html
index 30f4568..fc211cc 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -24,7 +24,7 @@
ICIKendallTau
- 1.0.2
+ 1.1.0
{ICIKendallTau} has a very specific assumption, that the missing
+values are largely due to being below the limit of
+detection, or the result of left-censorship. Therefore, it should only
+be used if the missing values are from left-censorship.
+Ideally, it would be nice to have a way to test for it.
+
+
+
Strategy
+
+
To test this, we do the following (implemented in
+test_left_censorship). For any feature that is missing in
+one or more samples in a class of samples, we check if the non-missing
+entries are below their sample’s median values. We count all of the
+entries below sample medians as successes, and the total number of
+possible entries as the number of trials in a binomial test, aggregating
+across all features that had a missing value. We can then perform a
+one-tailed binomial test with the expectation that the proportion of
+successes is greater than 0.5.
+
+
+
Fake Data
+
+
To start, we need to make some fake data that we can evaluate the
+test on. We will make a smallish dataset, with 1000 features across 20
+samples, and we will have 80% of the missing values be due to being
+left-censored.
+
We sort the initial data so we know where we can easily put
+missingness due to left-censoring. We also use a log-normal distribution
+initially, just because.
+missing_test=test_left_censorship(missing_dataset)
+missing_test
+#> $values
+#> trials success class
+#> 1 1900 1520 A
+#>
+#> $binomial_test
+#>
+#> Exact binomial test
+#>
+#> data: total_success and total_trials
+#> number of successes = 1520, number of trials = 1900, p-value < 2.2e-16
+#> alternative hypothesis: true probability of success is greater than 0.5
+#> 95 percent confidence interval:
+#> 0.7843033 1.0000000
+#> sample estimates:
+#> probability of success
+#> 0.8