add hashing (#774)

closes #751 The dataset are being hashed using `rlang::hash` in: 1) `data_to_datasets` when using `qenv` 2) `get_rcode` when using chunks `chunks` -`calculate_hashes` function has been created to be used in both places. -the filtering is not returned in `get_datasets_code` now but rather in `get_rcode` and `datasets_to_data`. -messages and warnings calls are now returned in `get_datasets_code` instead of character strings. Please test with `chunks` and `qenv` examples from tmg, tmc ... Also closes #752 Signed-off-by: Mahmoud Hallal <86970066+mhallal1@users.noreply.github.com> Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Nikolas Burkoff <nikolas.burkoff@roche.com>
insightsengineering · Nov 11, 2022 · bbaf8c9 · bbaf8c9
1 parent 2dba87e
commit bbaf8c9
Show file tree

Hide file tree

Showing 9 changed files with 162 additions and 45 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -13,6 +13,7 @@
 
 * Updated examples to use `scda.2022`.
 * Added R session information into a link in the footer of `teal` applications.
+* Added data hashing step using `rlang` instead of `digest` package to calculate the hash (which has been moved from `teal.data` and `teal.slice`). There is now an explicit hashing check in the reproducible code output 
 
 # teal 0.12.0
 

diff --git a/R/get_rcode.R b/R/get_rcode.R
@@ -91,7 +91,9 @@ get_rcode <- function(datasets = NULL,
     )
     str_install <- paste(c(get_rcode_str_install(), ""), collapse = "\n")
     str_libs <- paste(get_rcode_libraries(), "\n")
-    str_code <- get_datasets_code(datanames, datasets)
+
+    hashes <- calculate_hashes(datanames, datasets)
+    str_code <- c(get_datasets_code(datanames, datasets, hashes), teal.slice::get_filter_expr(datasets, datanames))
   } else {
     str_header <- get_rcode_header(title = title, description = description)
     str_install <- character(0)
@@ -134,37 +136,29 @@ get_rcode <- function(datasets = NULL,
 #' Get datasets code
 #'
 #' Get combined code from `FilteredData` and from `CodeClass` object.
+#'
 #' @param datanames (`character`) names of datasets to extract code from
 #' @param datasets (`FilteredData`) object
+#' @param hashes named (`list`) of hashes per dataset
+#'
 #' @return `character(3)` containing following elements:
 #'  - code from `CodeClass` (data loading code)
-#'  - hash of loaded objects
-#'  - filter panel code
+#'  - hash check of loaded objects
+#'
 #' @keywords internal
-get_datasets_code <- function(datanames, datasets) {
+get_datasets_code <- function(datanames, datasets, hashes) {
   str_code <- datasets$get_code(datanames)
   if (length(str_code) == 0 || (length(str_code) == 1 && str_code == "")) {
-    str_code <- paste0(c(
-      "#################################################################",
-      "# ___  ____ ____ ___  ____ ____ ____ ____ ____ ____ _ _  _ ____ #",
-      "# |__] |__/ |___ |__] |__/ |  | |    |___ [__  [__  | |\\ | | __ #",
-      "# |    |  \\ |___ |    |  \\ |__| |___ |___ ___] ___] | | \\| |__] #",
-      "#              _ ____    ____ _  _ ___  ___ _   _               #",
-      "#              | [__     |___ |\\/| |__]  |   \\_/                #",
-      "#              | ___]    |___ |  | |     |    |                 #",
-      "#################################################################\n"
-    ), collapse = "\n")
+    str_code <- "message('Preprocessing is empty')"
   } else if (length(str_code) > 0) {
     str_code <- paste0(str_code, "\n\n")
   }
+
   if (!datasets$get_check()) {
     check_note_string <- paste0(
       c(
-        "## NOTE: Reproducibility of data import and preprocessing was not",
-        "## explicitly checked (argument \"check = FALSE\" is set).",
-        "## The app developer has the choice to check the reproducibility",
-        "## and might have omitted this step for some reason. Please reach",
-        "## out to the app developer for details.\n"
+        "message(paste(\"Reproducibility of data import and preprocessing was not explicitly checked\",",
+        "   \" ('check = FALSE' is set). Contact app developer if this is an issue.\n\"))"
       ),
       collapse = "\n"
     )
@@ -177,9 +171,9 @@ get_datasets_code <- function(datanames, datasets) {
         datanames,
         function(dataname) {
           sprintf(
-            "# %s MD5 hash at the time of analysis: %s",
-            dataname,
-            datasets$get_filtered_dataset(dataname)$get_hash()
+            "stopifnot(%s == %s)",
+            deparse1(bquote(rlang::hash(.(as.name(dataname))))),
+            deparse1(hashes[[dataname]])
           )
         },
         character(1)
@@ -189,11 +183,7 @@ get_datasets_code <- function(datanames, datasets) {
     "\n\n"
   )
 
-  str_filter <- teal.slice::get_filter_expr(datasets, datanames)
-  if (str_filter != "") {
-    str_filter <- paste0(str_filter, "\n\n")
-  }
-  c(str_code, str_hash, str_filter)
+  c(str_code, str_hash)
 }
 
 ## Module ----
@@ -203,6 +193,7 @@ get_datasets_code <- function(datanames, datasets) {
 #'
 #' @inheritParams get_rcode
 #' @inheritParams shiny::moduleServer
+#'
 #' @param modal_title optional, (`character`) title of the modal
 #' @param code_header optional, (`character`) header inside R
 #' @param disable_buttons optional, (`reactive`)

diff --git a/R/module_nested_tabs.R b/R/module_nested_tabs.R
@@ -160,6 +160,7 @@ srv_nested_tabs.teal_modules <- function(id, datasets, modules, reporter) {
         "module { deparse1(modules$label) }."
       )
     )
+
     modules_reactive <- sapply(names(modules$children), USE.NAMES = TRUE, function(id) {
       srv_nested_tabs(id = id, datasets = datasets, modules = modules$children[[id]], reporter = reporter)
     })
@@ -232,13 +233,16 @@ srv_nested_tabs.teal_module <- function(id, datasets, modules, reporter) {
 #'
 #' Converts `FilteredData` object to `tdata` object containing datasets needed for a specific module.
 #' Please note that if module needs dataset which has a parent, then parent will be also returned.
+#' A hash per `dataset` is calculated internally and returned in the code.
 #'
 #' @param module (`teal_module`) module where needed filters are taken from
 #' @param datasets (`FilteredData`) object where needed data are taken from
 #' @return list of reactive datasets with following attributes:
 #' - `code` (`character`) containing datasets reproducible code.
-#' @keywords internal
 #' - `join_keys` (`JoinKeys`) containing relationships between datasets.
+#' - `metadata` (`list`) containing metadata of datasets.
+#'
+#' @keywords internal
 .datasets_to_data <- function(module, datasets) {
   datanames <- if (identical("all", module$filter) || is.null(module$filter)) {
     datasets$datanames()
@@ -255,13 +259,39 @@ srv_nested_tabs.teal_module <- function(id, datasets, modules, reporter) {
     }
   )
 
+  hashes <- calculate_hashes(datanames, datasets)
   metadata <- lapply(datanames, datasets$get_metadata)
   names(metadata) <- datanames
 
   new_tdata(
     data,
-    reactive(get_datasets_code(datanames, datasets)),
+    reactive(
+      c(
+        get_rcode_str_install(),
+        get_rcode_libraries(),
+        get_datasets_code(datanames, datasets, hashes),
+        teal.slice::get_filter_expr(datasets, datanames)
+      )
+    ),
     datasets$get_join_keys(),
     metadata
   )
 }
+
+#' Get the hash of a dataset
+#'
+#' @param datanames (`character`) names of datasets
+#' @param datasets (`FilteredData`) object holding the data
+#'
+#' @return A list of hashes per dataset
+#' @keywords internal
+#'
+calculate_hashes <- function(datanames, datasets) {
+  sapply(
+    datanames,
+    simplify = FALSE,
+    function(x) {
+      rlang::hash(datasets$get_data(x, filtered = FALSE))
+    }
+  )
+}
diff --git a/man/calculate_hashes.Rd b/man/calculate_hashes.Rd
diff --git a/man/dot-datasets_to_data.Rd b/man/dot-datasets_to_data.Rd
diff --git a/man/get_datasets_code.Rd b/man/get_datasets_code.Rd
diff --git a/tests/testthat/test-get_rcode.R b/tests/testthat/test-get_rcode.R
@@ -80,15 +80,16 @@ testthat::test_that("get_datasets_code returns code only for specified datanames
     )
   )
 
+  hashes <- calculate_hashes(datasets$datanames(), datasets)
   testthat::expect_true(
     !grepl(
       "mtcars",
-      paste(get_datasets_code(datasets = datasets, dataname = "IRIS"), collapse = "\n"),
+      paste(get_datasets_code(datasets = datasets, dataname = "IRIS", hashes = hashes), collapse = "\n"),
       ignore.case = TRUE
     ) &&
       grepl(
         "iris",
-        paste(get_datasets_code(datasets = datasets, dataname = "IRIS"), collapse = "\n"),
+        paste(get_datasets_code(datasets = datasets, dataname = "IRIS", hashes = hashes), collapse = "\n"),
         ignore.case = TRUE
       )
   )

diff --git a/tests/testthat/test-module_nested_tabs.R b/tests/testthat/test-module_nested_tabs.R
@@ -313,8 +313,22 @@ testthat::test_that(".datasets_to_data returns tdata object", {
 
   # code
   testthat::expect_equal(
-    isolate(get_code(data)[1]),
-    "d1 <- data.frame(id = 1:5, pk = c(2, 3, 2, 1, 4), val = 1:5)\nd2 <- data.frame(id = 1:5, value = 1:5)\n\n"
+    isolate(get_code(data)),
+    c(
+      "# Add any code to install/load your NEST environment here",
+      paste0(
+        "library(testthat)\nlibrary(shiny)\nlibrary(teal.data)\nlibrary(magrittr)\nlibrary(teal.transform)\n",
+        "library(teal)\nlibrary(matrixStats)\nlibrary(MatrixGenerics)\nlibrary(BiocGenerics)\nlibrary(S4Vectors)\n",
+        "library(IRanges)\nlibrary(GenomeInfoDb)\nlibrary(GenomicRanges)\nlibrary(Biobase)\n",
+        "library(SummarizedExperiment)\nlibrary(MultiAssayExperiment)"
+      ),
+      "d1 <- data.frame(id = 1:5, pk = c(2, 3, 2, 1, 4), val = 1:5)\nd2 <- data.frame(id = 1:5, value = 1:5)\n\n",
+      paste0(
+        "stopifnot(rlang::hash(d1) == \"f6f90d2c133ca4abdeb2f7a7d85b731e\")\n",
+        "stopifnot(rlang::hash(d2) == \"6e30be195b7d914a1311672c3ebf4e4f\") \n\n"
+      ),
+      ""
+    )
   )
 
   # metadata
@@ -343,3 +357,65 @@ testthat::test_that(".datasets_to_data returns parent datasets for CDISC data",
   data <- .datasets_to_data(module, datasets)
   testthat::expect_setequal(isolate(names(data)), c("ADSL", "ADAE"))
 })
+
+testthat::test_that("calculate_hashes takes a FilteredData and vector of datanames as input", {
+  adsl <- data.frame(STUDYID = 1, USUBJID = 1)
+  adae <- data.frame(STUDYID = 1, USUBJID = 1, ASTDTM = 1, AETERM = 1, AESEQ = 1)
+  adtte <- data.frame(STUDYID = 1, USUBJID = 1, PARAMCD = 1)
+
+  datasets <- teal.slice::init_filtered_data(
+    teal.data::cdisc_data(
+      teal.data::cdisc_dataset("ADSL", adsl),
+      teal.data::cdisc_dataset("ADAE", adae),
+      teal.data::cdisc_dataset("ADTTE", adtte)
+    )
+  )
+
+  testthat::expect_error(calculate_hashes(datanames = c("ADSL", "ADAE", "ADTTE"), datasets = datasets), NA)
+})
+
+testthat::test_that("calculate_hashes returns a named list", {
+  adsl <- data.frame(STUDYID = 1, USUBJID = 1)
+  adae <- data.frame(STUDYID = 1, USUBJID = 1, ASTDTM = 1, AETERM = 1, AESEQ = 1)
+  adtte <- data.frame(STUDYID = 1, USUBJID = 1, PARAMCD = 1)
+
+  datasets <- teal.slice::init_filtered_data(
+    teal.data::cdisc_data(
+      teal.data::cdisc_dataset("ADSL", adsl),
+      teal.data::cdisc_dataset("ADAE", adae),
+      teal.data::cdisc_dataset("ADTTE", adtte)
+    )
+  )
+
+  hashes <- calculate_hashes(datanames = c("ADSL", "ADAE", "ADTTE"), datasets = datasets)
+  testthat::expect_identical(
+    hashes,
+    list(
+      "ADSL" = "e89f5271357822c78dd5cfddb60c0a95",
+      "ADAE" = "f71b576ecfd23075f7285841327515e0",
+      "ADTTE" = "c68c01c86b946a3dfe05150da040aa2a"
+    )
+  )
+  testthat::expect_is(hashes, "list")
+  testthat::expect_named(hashes)
+})
+
+testthat::test_that("calculate_hashes returns the hash of the non Filtered dataset", {
+  datasets <- teal.slice::init_filtered_data(
+    teal.data::teal_data(
+      teal.data::dataset("iris", iris)
+    )
+  )
+
+  fs <- list(
+    iris = list(
+      Sepal.Length = list(c(5.1, 6.4)),
+      Species = c("setosa", "versicolor")
+    )
+  )
+  datasets$set_filter_state(state = fs)
+
+  hashes <- calculate_hashes(datanames = c("iris"), datasets = datasets)
+  testthat::expect_identical(hashes, list("iris" = "34844aba7bde36f5a34f6d8e39803508"))
+  testthat::expect_false(hashes == rlang::hash(isolate(datasets$get_data("iris", filtered = TRUE))))
+})
diff --git a/vignettes/adding-support-for-reporting.Rmd b/vignettes/adding-support-for-reporting.Rmd
@@ -323,8 +323,9 @@ example_reporter_module <- function(label = "Example") {
 
 app <- init(
   data = teal_data(
-    dataset("AIR", airquality, code = "AIR <- data(airquality)"),
-    dataset("IRIS", iris, code = "IRIS <- data(iris)")
+    dataset("AIR", airquality, code = "data(airquality); AIR <- airquality"),
+    dataset("IRIS", iris, code = "data(iris); IRIS <- iris"),
+    check = FALSE
   ),
   modules = modules(
     example_reporter_module(label = "with Reporter"),