Skip to content

Commit

Permalink
add hashing (#774)
Browse files Browse the repository at this point in the history
closes #751 

The dataset are being hashed using `rlang::hash` in:
1) `data_to_datasets` when using `qenv`
2) `get_rcode` when using chunks `chunks`

-`calculate_hashes` function has been created to be used in both places.
-the filtering is not returned in `get_datasets_code` now but rather in
`get_rcode` and `datasets_to_data`.
-messages and warnings calls are now returned in `get_datasets_code`
instead of character strings.

Please test with `chunks` and `qenv` examples from tmg, tmc ...


Also closes #752

Signed-off-by: Mahmoud Hallal <86970066+mhallal1@users.noreply.github.com>
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Nikolas Burkoff <nikolas.burkoff@roche.com>
  • Loading branch information
3 people authored Nov 11, 2022
1 parent 2dba87e commit bbaf8c9
Show file tree
Hide file tree
Showing 9 changed files with 162 additions and 45 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

* Updated examples to use `scda.2022`.
* Added R session information into a link in the footer of `teal` applications.
* Added data hashing step using `rlang` instead of `digest` package to calculate the hash (which has been moved from `teal.data` and `teal.slice`). There is now an explicit hashing check in the reproducible code output

# teal 0.12.0

Expand Down
45 changes: 18 additions & 27 deletions R/get_rcode.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,9 @@ get_rcode <- function(datasets = NULL,
)
str_install <- paste(c(get_rcode_str_install(), ""), collapse = "\n")
str_libs <- paste(get_rcode_libraries(), "\n")
str_code <- get_datasets_code(datanames, datasets)

hashes <- calculate_hashes(datanames, datasets)
str_code <- c(get_datasets_code(datanames, datasets, hashes), teal.slice::get_filter_expr(datasets, datanames))
} else {
str_header <- get_rcode_header(title = title, description = description)
str_install <- character(0)
Expand Down Expand Up @@ -134,37 +136,29 @@ get_rcode <- function(datasets = NULL,
#' Get datasets code
#'
#' Get combined code from `FilteredData` and from `CodeClass` object.
#'
#' @param datanames (`character`) names of datasets to extract code from
#' @param datasets (`FilteredData`) object
#' @param hashes named (`list`) of hashes per dataset
#'
#' @return `character(3)` containing following elements:
#' - code from `CodeClass` (data loading code)
#' - hash of loaded objects
#' - filter panel code
#' - hash check of loaded objects
#'
#' @keywords internal
get_datasets_code <- function(datanames, datasets) {
get_datasets_code <- function(datanames, datasets, hashes) {
str_code <- datasets$get_code(datanames)
if (length(str_code) == 0 || (length(str_code) == 1 && str_code == "")) {
str_code <- paste0(c(
"#################################################################",
"# ___ ____ ____ ___ ____ ____ ____ ____ ____ ____ _ _ _ ____ #",
"# |__] |__/ |___ |__] |__/ | | | |___ [__ [__ | |\\ | | __ #",
"# | | \\ |___ | | \\ |__| |___ |___ ___] ___] | | \\| |__] #",
"# _ ____ ____ _ _ ___ ___ _ _ #",
"# | [__ |___ |\\/| |__] | \\_/ #",
"# | ___] |___ | | | | | #",
"#################################################################\n"
), collapse = "\n")
str_code <- "message('Preprocessing is empty')"
} else if (length(str_code) > 0) {
str_code <- paste0(str_code, "\n\n")
}

if (!datasets$get_check()) {
check_note_string <- paste0(
c(
"## NOTE: Reproducibility of data import and preprocessing was not",
"## explicitly checked (argument \"check = FALSE\" is set).",
"## The app developer has the choice to check the reproducibility",
"## and might have omitted this step for some reason. Please reach",
"## out to the app developer for details.\n"
"message(paste(\"Reproducibility of data import and preprocessing was not explicitly checked\",",
" \" ('check = FALSE' is set). Contact app developer if this is an issue.\n\"))"
),
collapse = "\n"
)
Expand All @@ -177,9 +171,9 @@ get_datasets_code <- function(datanames, datasets) {
datanames,
function(dataname) {
sprintf(
"# %s MD5 hash at the time of analysis: %s",
dataname,
datasets$get_filtered_dataset(dataname)$get_hash()
"stopifnot(%s == %s)",
deparse1(bquote(rlang::hash(.(as.name(dataname))))),
deparse1(hashes[[dataname]])
)
},
character(1)
Expand All @@ -189,11 +183,7 @@ get_datasets_code <- function(datanames, datasets) {
"\n\n"
)

str_filter <- teal.slice::get_filter_expr(datasets, datanames)
if (str_filter != "") {
str_filter <- paste0(str_filter, "\n\n")
}
c(str_code, str_hash, str_filter)
c(str_code, str_hash)
}

## Module ----
Expand All @@ -203,6 +193,7 @@ get_datasets_code <- function(datanames, datasets) {
#'
#' @inheritParams get_rcode
#' @inheritParams shiny::moduleServer
#'
#' @param modal_title optional, (`character`) title of the modal
#' @param code_header optional, (`character`) header inside R
#' @param disable_buttons optional, (`reactive`)
Expand Down
34 changes: 32 additions & 2 deletions R/module_nested_tabs.R
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ srv_nested_tabs.teal_modules <- function(id, datasets, modules, reporter) {
"module { deparse1(modules$label) }."
)
)

modules_reactive <- sapply(names(modules$children), USE.NAMES = TRUE, function(id) {
srv_nested_tabs(id = id, datasets = datasets, modules = modules$children[[id]], reporter = reporter)
})
Expand Down Expand Up @@ -232,13 +233,16 @@ srv_nested_tabs.teal_module <- function(id, datasets, modules, reporter) {
#'
#' Converts `FilteredData` object to `tdata` object containing datasets needed for a specific module.
#' Please note that if module needs dataset which has a parent, then parent will be also returned.
#' A hash per `dataset` is calculated internally and returned in the code.
#'
#' @param module (`teal_module`) module where needed filters are taken from
#' @param datasets (`FilteredData`) object where needed data are taken from
#' @return list of reactive datasets with following attributes:
#' - `code` (`character`) containing datasets reproducible code.
#' @keywords internal
#' - `join_keys` (`JoinKeys`) containing relationships between datasets.
#' - `metadata` (`list`) containing metadata of datasets.
#'
#' @keywords internal
.datasets_to_data <- function(module, datasets) {
datanames <- if (identical("all", module$filter) || is.null(module$filter)) {
datasets$datanames()
Expand All @@ -255,13 +259,39 @@ srv_nested_tabs.teal_module <- function(id, datasets, modules, reporter) {
}
)

hashes <- calculate_hashes(datanames, datasets)
metadata <- lapply(datanames, datasets$get_metadata)
names(metadata) <- datanames

new_tdata(
data,
reactive(get_datasets_code(datanames, datasets)),
reactive(
c(
get_rcode_str_install(),
get_rcode_libraries(),
get_datasets_code(datanames, datasets, hashes),
teal.slice::get_filter_expr(datasets, datanames)
)
),
datasets$get_join_keys(),
metadata
)
}

#' Get the hash of a dataset
#'
#' @param datanames (`character`) names of datasets
#' @param datasets (`FilteredData`) object holding the data
#'
#' @return A list of hashes per dataset
#' @keywords internal
#'
calculate_hashes <- function(datanames, datasets) {
sapply(
datanames,
simplify = FALSE,
function(x) {
rlang::hash(datasets$get_data(x, filtered = FALSE))
}
)
}
20 changes: 20 additions & 0 deletions man/calculate_hashes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 3 additions & 7 deletions man/dot-datasets_to_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions man/get_datasets_code.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions tests/testthat/test-get_rcode.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,16 @@ testthat::test_that("get_datasets_code returns code only for specified datanames
)
)

hashes <- calculate_hashes(datasets$datanames(), datasets)
testthat::expect_true(
!grepl(
"mtcars",
paste(get_datasets_code(datasets = datasets, dataname = "IRIS"), collapse = "\n"),
paste(get_datasets_code(datasets = datasets, dataname = "IRIS", hashes = hashes), collapse = "\n"),
ignore.case = TRUE
) &&
grepl(
"iris",
paste(get_datasets_code(datasets = datasets, dataname = "IRIS"), collapse = "\n"),
paste(get_datasets_code(datasets = datasets, dataname = "IRIS", hashes = hashes), collapse = "\n"),
ignore.case = TRUE
)
)
Expand Down
80 changes: 78 additions & 2 deletions tests/testthat/test-module_nested_tabs.R
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,22 @@ testthat::test_that(".datasets_to_data returns tdata object", {

# code
testthat::expect_equal(
isolate(get_code(data)[1]),
"d1 <- data.frame(id = 1:5, pk = c(2, 3, 2, 1, 4), val = 1:5)\nd2 <- data.frame(id = 1:5, value = 1:5)\n\n"
isolate(get_code(data)),
c(
"# Add any code to install/load your NEST environment here",
paste0(
"library(testthat)\nlibrary(shiny)\nlibrary(teal.data)\nlibrary(magrittr)\nlibrary(teal.transform)\n",
"library(teal)\nlibrary(matrixStats)\nlibrary(MatrixGenerics)\nlibrary(BiocGenerics)\nlibrary(S4Vectors)\n",
"library(IRanges)\nlibrary(GenomeInfoDb)\nlibrary(GenomicRanges)\nlibrary(Biobase)\n",
"library(SummarizedExperiment)\nlibrary(MultiAssayExperiment)"
),
"d1 <- data.frame(id = 1:5, pk = c(2, 3, 2, 1, 4), val = 1:5)\nd2 <- data.frame(id = 1:5, value = 1:5)\n\n",
paste0(
"stopifnot(rlang::hash(d1) == \"f6f90d2c133ca4abdeb2f7a7d85b731e\")\n",
"stopifnot(rlang::hash(d2) == \"6e30be195b7d914a1311672c3ebf4e4f\") \n\n"
),
""
)
)

# metadata
Expand Down Expand Up @@ -343,3 +357,65 @@ testthat::test_that(".datasets_to_data returns parent datasets for CDISC data",
data <- .datasets_to_data(module, datasets)
testthat::expect_setequal(isolate(names(data)), c("ADSL", "ADAE"))
})

testthat::test_that("calculate_hashes takes a FilteredData and vector of datanames as input", {
adsl <- data.frame(STUDYID = 1, USUBJID = 1)
adae <- data.frame(STUDYID = 1, USUBJID = 1, ASTDTM = 1, AETERM = 1, AESEQ = 1)
adtte <- data.frame(STUDYID = 1, USUBJID = 1, PARAMCD = 1)

datasets <- teal.slice::init_filtered_data(
teal.data::cdisc_data(
teal.data::cdisc_dataset("ADSL", adsl),
teal.data::cdisc_dataset("ADAE", adae),
teal.data::cdisc_dataset("ADTTE", adtte)
)
)

testthat::expect_error(calculate_hashes(datanames = c("ADSL", "ADAE", "ADTTE"), datasets = datasets), NA)
})

testthat::test_that("calculate_hashes returns a named list", {
adsl <- data.frame(STUDYID = 1, USUBJID = 1)
adae <- data.frame(STUDYID = 1, USUBJID = 1, ASTDTM = 1, AETERM = 1, AESEQ = 1)
adtte <- data.frame(STUDYID = 1, USUBJID = 1, PARAMCD = 1)

datasets <- teal.slice::init_filtered_data(
teal.data::cdisc_data(
teal.data::cdisc_dataset("ADSL", adsl),
teal.data::cdisc_dataset("ADAE", adae),
teal.data::cdisc_dataset("ADTTE", adtte)
)
)

hashes <- calculate_hashes(datanames = c("ADSL", "ADAE", "ADTTE"), datasets = datasets)
testthat::expect_identical(
hashes,
list(
"ADSL" = "e89f5271357822c78dd5cfddb60c0a95",
"ADAE" = "f71b576ecfd23075f7285841327515e0",
"ADTTE" = "c68c01c86b946a3dfe05150da040aa2a"
)
)
testthat::expect_is(hashes, "list")
testthat::expect_named(hashes)
})

testthat::test_that("calculate_hashes returns the hash of the non Filtered dataset", {
datasets <- teal.slice::init_filtered_data(
teal.data::teal_data(
teal.data::dataset("iris", iris)
)
)

fs <- list(
iris = list(
Sepal.Length = list(c(5.1, 6.4)),
Species = c("setosa", "versicolor")
)
)
datasets$set_filter_state(state = fs)

hashes <- calculate_hashes(datanames = c("iris"), datasets = datasets)
testthat::expect_identical(hashes, list("iris" = "34844aba7bde36f5a34f6d8e39803508"))
testthat::expect_false(hashes == rlang::hash(isolate(datasets$get_data("iris", filtered = TRUE))))
})
5 changes: 3 additions & 2 deletions vignettes/adding-support-for-reporting.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -323,8 +323,9 @@ example_reporter_module <- function(label = "Example") {
app <- init(
data = teal_data(
dataset("AIR", airquality, code = "AIR <- data(airquality)"),
dataset("IRIS", iris, code = "IRIS <- data(iris)")
dataset("AIR", airquality, code = "data(airquality); AIR <- airquality"),
dataset("IRIS", iris, code = "data(iris); IRIS <- iris"),
check = FALSE
),
modules = modules(
example_reporter_module(label = "with Reporter"),
Expand Down

0 comments on commit bbaf8c9

Please sign in to comment.