Skip to content

Commit

Permalink
Merge pull request #795 from ropensci/12
Browse files Browse the repository at this point in the history
file_in()/file_out() directories
  • Loading branch information
wlandau-lilly authored Mar 22, 2019
2 parents d1fceb8 + 3461fd7 commit 001f2fd
Show file tree
Hide file tree
Showing 18 changed files with 448 additions and 121 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

## Enhancements

- `file_in()` and `file_out()` can now handle entire directories, e.g. `file_in("your_folder_of_input_data_files")` and `file_out("directory_with_a_bunch_of_output_files")`.
- Improve `drake_ggraph()`
- Hide node labels by default and render the arrows behind the nodes.
- Print an informative error message when the user supplies a `drake` plan to the `config` argument of a function.
Expand Down
2 changes: 1 addition & 1 deletion R/api-clean.R
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ clean_single_target <- function(
}
}
if (length(files)) {
unlink(decode_path(files))
unlink(decode_path(files), recursive = TRUE)
}
}

Expand Down
102 changes: 67 additions & 35 deletions R/api-plan.R
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
#' summ = target(
#' sum_fun(data, reg),
#' transform = cross(sum_fun = c(coef, residuals), reg)
#' ),
#' ),
#' winners = target(
#' min(summ),
#' transform = combine(summ, .by = c(data, sum_fun))
Expand All @@ -108,7 +108,7 @@
#' summ = target(
#' sum_fun(data, reg),
#' transform = cross(sum_fun = c(coef, residuals), reg)
#' ),
#' ),
#' winners = target(
#' min(summ),
#' transform = combine(summ, .by = c(data, sum_fun))
Expand Down Expand Up @@ -239,15 +239,13 @@ complete_target_names <- function(commands_list) {
commands_list
}

#' @title Declare the file inputs of a workflow plan command.
#' @description Use this function to help write the commands
#' in your workflow plan data frame. See the examples
#' for a full explanation.
#' @title Declare input files and directories.
#' @description `file_in()` marks individual files
#' (and whole directories) that your targets depend on.
#' @export
#' @seealso [file_out()], [knitr_in()], [ignore()]
#' @return A character vector of declared input file paths.
#' @param ... Character strings. File paths of input files
#' to a command in your workflow plan data frame.
#' @return A character vector of declared input file or directory paths.
#' @param ... Character vector, paths to files and directories.
#' @export
#' @examples
#' \dontrun{
Expand All @@ -259,7 +257,7 @@ complete_target_names <- function(commands_list) {
#' # in your workflow plan data frame.
#' suppressWarnings(
#' plan <- drake_plan(
#' write.csv(mtcars, file_out("mtcars.csv")),
#' out = write.csv(mtcars, file_out("mtcars.csv")),
#' contents = read.csv(file_in("mtcars.csv"))
#' )
#' )
Expand All @@ -268,37 +266,51 @@ complete_target_names <- function(commands_list) {
#' # and a dependency of `contents`. See for yourself:
#' make(plan)
#' file.exists("mtcars.csv")
#' # See also `knitr_in()`. `knitr_in()` is like `file_in()`
#' # except that it analyzes active code chunks in your `knitr`
#' # source file and detects non-file dependencies.
#' # That way, updates to the right dependencies trigger rebuilds
#' # in your report.
#' # You can also work with entire directories this way.
#' # However, in `file_out("your_directory")`, the directory
#' # becomes an entire unit. Thus, `file_in("your_directory")`
#' # is more appropriate for subsequent steps than
#' # `file_in("your_directory/file_inside.txt")`.
#' suppressWarnings(
#' plan <- drake_plan(
#' out = {
#' dir.create(file_out("dir"))
#' write.csv(mtcars, "dir/mtcars.csv")
#' },
#' contents = read.csv(file.path(file_in("dir"), "mtcars.csv"))
#' )
#' )
#' plan
#' make(plan)
#' file.exists("dir/mtcars.csv")
#' # See the connections that the file relationships create:
#' # config <- drake_config(plan) # nolint
#' # vis_drake_graph(config) # nolint
#' })
#' }
file_in <- function(...) {
  # Flatten the dots into a single character vector of paths. The real
  # work happens in drake's static code analysis of plan commands; at run
  # time this function is a no-op passthrough.
  paths <- c(...)
  as.character(paths)
}

#' @title Declare the file outputs of a workflow plan command.
#' @description Use this function to help write the commands
#' in your workflow plan data frame. You can only specify
#' one file output per command. See the examples
#' for a full explanation.
#' @title Declare output files and directories.
#' @description `file_out()` marks individual files
#' (and whole directories) that your targets create.
#' @export
#' @seealso [file_out()], [knitr_in()], [ignore()]
#' @return A character vector of declared output file or directory paths.
#' @param ... Character vector, paths to files and directories.
#' @export
#' @seealso [file_in()], [knitr_in()], [ignore()]
#' @return A character vector of declared output file paths.
#' @param ... Character vector of output file paths.
#' @examples
#' \dontrun{
#' test_with_dir("Contain side effects", {
#' # The `file_out()` and `file_in()` functions
#' # just take in strings and return them.
#' file_out("summaries.txt", "output.csv")
#' file_out("summaries.txt")
#' # Their main purpose is to orchestrate your custom files
#' # in your workflow plan data frame.
#' suppressWarnings(
#' plan <- drake_plan(
#' write.csv(mtcars, file_out("mtcars.csv")),
#' out = write.csv(mtcars, file_out("mtcars.csv")),
#' contents = read.csv(file_in("mtcars.csv"))
#' )
#' )
Expand All @@ -307,20 +319,40 @@ file_in <- function(...) {
#' # and a dependency of `contents`. See for yourself:
#' make(plan)
#' file.exists("mtcars.csv")
#' # See also `knitr_in()`. `knitr_in()` is like `file_in()`
#' # except that it analyzes active code chunks in your `knitr`
#' # source file and detects non-file dependencies.
#' # That way, updates to the right dependencies trigger rebuilds
#' # in your report.
#' # You can also work with entire directories this way.
#' # However, in `file_out("your_directory")`, the directory
#' # becomes an entire unit. Thus, `file_in("your_directory")`
#' # is more appropriate for subsequent steps than
#' # `file_in("your_directory/file_inside.txt")`.
#' suppressWarnings(
#' plan <- drake_plan(
#' out = {
#' dir.create(file_out("dir"))
#' write.csv(mtcars, "dir/mtcars.csv")
#' },
#' contents = read.csv(file.path(file_in("dir"), "mtcars.csv"))
#' )
#' )
#' plan
#' make(plan)
#' # See the connections that the file relationships create:
#' # config <- drake_config(plan) # nolint
#' # vis_drake_graph(config) # nolint
#' file.exists("dir/mtcars.csv")
#' })
#' }
file_out <- file_in

#' @title Declare the `knitr`/`rmarkdown` source files
#' of a workflow plan command.
#' @description Use this function to help write the commands
#' in your workflow plan data frame. See the examples
#' for a full explanation.
#' @title Declare `knitr`/`rmarkdown` source files
#' as dependencies.
#' @description `knitr_in()` marks individual `knitr`/R Markdown
#' reports as dependencies. In `drake`, these reports are pieces
#' of the pipeline. R Markdown is a great tool for *displaying*
#' precomputed results, but not for running a large workflow
#' from end to end. These reports should do as little
#' computation as possible.
#' @details Unlike [file_in()] and [file_out()], `knitr_in()`
#' does not work with entire directories.
#' @export
#' @seealso [file_in()], [file_out()], [ignore()]
#' @return A character vector of declared input file paths.
Expand Down
102 changes: 86 additions & 16 deletions R/exec-meta.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ drake_meta_ <- function(target, config) {
}
# For imported files.
if (meta$isfile) {
meta$mtime <- file.mtime(decode_path(target, config))
meta$mtime <- storage_mtime(decode_path(target, config))
}
if (meta$trigger$command) {
meta$command <- layout$command_standardized
Expand Down Expand Up @@ -76,7 +76,7 @@ self_hash <- function(target, config) {
input_file_hash <- function(
target,
config,
size_cutoff = rehash_file_size_cutoff
size_cutoff = rehash_storage_size_cutoff
) {
deps <- config$layout[[target]]$deps_build
files <- sort(unique(as.character(c(deps$file_in, deps$knitr_in))))
Expand All @@ -86,7 +86,7 @@ input_file_hash <- function(
out <- ht_memo(
ht = config$ht_get_hash,
x = files,
fun = file_hash,
fun = storage_hash,
config = config,
size_cutoff = size_cutoff
)
Expand All @@ -101,7 +101,7 @@ input_file_hash <- function(
output_file_hash <- function(
target,
config,
size_cutoff = rehash_file_size_cutoff
size_cutoff = rehash_storage_size_cutoff
) {
deps <- config$layout[[target]]$deps_build
files <- sort(unique(as.character(deps$file_out)))
Expand All @@ -110,7 +110,7 @@ output_file_hash <- function(
}
out <- vapply(
X = files,
FUN = file_hash,
FUN = storage_hash,
FUN.VALUE = character(1),
config = config,
size_cutoff = size_cutoff
Expand All @@ -123,14 +123,22 @@ output_file_hash <- function(
)
}

rehash_file <- function(target, config) {
# Recompute the hash of a file or directory target.
# Returns NA_character_ for targets that are not encoded paths or whose
# storage does not exist on disk.
rehash_storage <- function(target, config) {
  if (!is_encoded_path(target)) {
    return(NA_character_)
  }
  path <- decode_path(target, config)
  if (!file.exists(path)) {
    return(NA_character_)
  }
  # Dispatch on storage type: directories get the recursive treatment.
  if (dir.exists(path)) {
    return(rehash_dir(path, config))
  }
  rehash_file(path, config)
}

rehash_file <- function(file, config) {
digest::digest(
object = file,
algo = config$cache$driver$hash_algorithm,
Expand All @@ -139,27 +147,49 @@ rehash_file <- function(target, config) {
)
}

safe_rehash_file <- function(target, config) {
# Hash an entire directory by hashing each contained file (recursively)
# and then hashing the concatenation of those per-file hashes.
# NOTE(review): this appears to fold in file contents only (via
# rehash_file), not file names — confirm that a pure rename inside the
# directory is meant to leave the hash unchanged.
rehash_dir <- function(dir, config) {
  contents <- list.files(
    path = dir,
    all.files = TRUE,
    full.names = TRUE,
    recursive = TRUE,
    include.dirs = FALSE
  )
  hashes <- vapply(
    contents,
    rehash_file,
    FUN.VALUE = character(1),
    config = config
  )
  digest::digest(
    paste(hashes, collapse = ""),
    algo = config$cache$driver$hash_algorithm,
    serialize = FALSE
  )
}

# Like rehash_storage(), but short-circuits to NA_character_ when the
# decoded path does not exist on disk.
safe_rehash_storage <- function(target, config) {
  path <- decode_path(target, config)
  if (!file.exists(path)) {
    return(NA_character_)
  }
  rehash_storage(target = target, config = config)
}

# Decide whether a file/directory needs rehashing: small storage is
# always rehashed; larger storage only when its modification time moved
# past the recorded one.
should_rehash_storage <- function(filename, new_mtime, old_mtime,
                                  size_cutoff) {
  # Elementwise `|` keeps NA semantics from either comparison.
  verdict <- storage_size(filename) < size_cutoff | new_mtime > old_mtime
  # Unknown (NA) means we cannot rule out a change, so rehash to be safe.
  if (safe_is_na(verdict)) {
    verdict <- TRUE
  }
  verdict
}

file_hash <- function(
storage_hash <- function(
target,
config,
size_cutoff = rehash_file_size_cutoff
size_cutoff = rehash_storage_size_cutoff
) {
if (!is_encoded_path(target)) {
return(NA_character_)
Expand All @@ -183,16 +213,56 @@ file_hash <- function(
),
-Inf
)
new_mtime <- file.mtime(filename)
do_rehash <- should_rehash_file(
new_mtime <- storage_mtime(filename)
do_rehash <- should_rehash_storage(
filename = filename,
new_mtime = new_mtime,
old_mtime = old_mtime,
size_cutoff = size_cutoff)
old_hash_exists <- config$cache$exists(key = target)
if (do_rehash || !old_hash_exists) {
rehash_file(target = target, config = config)
rehash_storage(target = target, config = config)
} else {
config$cache$get(key = target)
}
}

# Modification time of a file, or — for a directory — the newest
# modification time among the files it contains (see dir_mtime()).
storage_mtime <- function(x) {
  if (!dir.exists(x)) {
    return(file.mtime(x))
  }
  dir_mtime(x)
}

# Size of a file, or — for a directory — the largest single file size
# inside it (see dir_size()).
storage_size <- function(x) {
  if (!dir.exists(x)) {
    return(file.size(x))
  }
  dir_size(x)
}

# Newest modification time among all files inside `x`, recursively.
# NOTE(review): for an empty directory this relies on `%||%` treating a
# zero-length vector as empty (falling back to Inf, i.e. "always stale")
# — confirm drake's `%||%` checks length, not just NULL.
dir_mtime <- function(x) {
  contents <- list.files(
    path = x,
    all.files = TRUE,
    full.names = TRUE,
    recursive = TRUE,
    include.dirs = FALSE
  )
  mtimes <- vapply(contents, file.mtime, FUN.VALUE = numeric(1))
  max(mtimes %||% Inf)
}

# Largest single file size inside `x`, recursively.
# NOTE(review): this is the maximum per-file size, not the directory
# total; it feeds the rehash size cutoff in should_rehash_storage() —
# confirm max (rather than sum) is the intended policy.
dir_size <- function(x) {
  contents <- list.files(
    path = x,
    all.files = TRUE,
    full.names = TRUE,
    recursive = TRUE,
    include.dirs = FALSE
  )
  sizes <- vapply(contents, file.size, FUN.VALUE = numeric(1))
  max(sizes %||% 0)
}
4 changes: 2 additions & 2 deletions R/exec-store.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ store_object <- function(target, value, meta, config) {
store_file <- function(target, meta, config) {
store_object(
target = target,
value = safe_rehash_file(target = target, config = config),
value = safe_rehash_storage(target = target, config = config),
meta = meta,
config = config
)
Expand All @@ -102,7 +102,7 @@ store_output_files <- function(files, meta, config) {
meta$isfile <- TRUE
for (file in files) {
meta$name <- file
meta$mtime <- file.mtime(decode_path(file, config))
meta$mtime <- storage_mtime(decode_path(file, config))
meta$isfile <- TRUE
store_single_output(
target = file,
Expand Down
2 changes: 1 addition & 1 deletion R/utils-checksums.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ mc_get_outfile_checksum <- function(target, config) {
files <- sort(unique(as.character(deps$file_out)))
out <- vapply(
X = files,
FUN = rehash_file,
FUN = rehash_storage,
FUN.VALUE = character(1),
config = config
)
Expand Down
Loading

0 comments on commit 001f2fd

Please sign in to comment.