Skip to content

Commit

Permalink
Merge pull request #795 from ropensci/12
Browse files Browse the repository at this point in the history
file_in()/file_out() directories
  • Loading branch information
wlandau-lilly authored Mar 22, 2019
2 parents d1fceb8 + 3461fd7 commit 001f2fd
Show file tree
Hide file tree
Showing 18 changed files with 448 additions and 121 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

## Enhancements

- `file_in()` and `file_out()` can now handle entire directories, e.g. `file_in("your_folder_of_input_data_files")` and `file_out("directory_with_a_bunch_of_output_files")`.
- Improve `drake_ggraph()`
- Hide node labels by default and render the arrows behind the nodes.
- Print an informative error message when the user supplies a `drake` plan to the `config` argument of a function.
Expand Down
2 changes: 1 addition & 1 deletion R/api-clean.R
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ clean_single_target <- function(
}
}
if (length(files)) {
unlink(decode_path(files))
unlink(decode_path(files), recursive = TRUE)
}
}

Expand Down
102 changes: 67 additions & 35 deletions R/api-plan.R
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
#' summ = target(
#' sum_fun(data, reg),
#' transform = cross(sum_fun = c(coef, residuals), reg)
#' ),
#' ),
#' winners = target(
#' min(summ),
#' transform = combine(summ, .by = c(data, sum_fun))
Expand All @@ -108,7 +108,7 @@
#' summ = target(
#' sum_fun(data, reg),
#' transform = cross(sum_fun = c(coef, residuals), reg)
#' ),
#' ),
#' winners = target(
#' min(summ),
#' transform = combine(summ, .by = c(data, sum_fun))
Expand Down Expand Up @@ -239,15 +239,13 @@ complete_target_names <- function(commands_list) {
commands_list
}

#' @title Declare the file inputs of a workflow plan command.
#' @description Use this function to help write the commands
#' in your workflow plan data frame. See the examples
#' for a full explanation.
#' @title Declare input files and directories.
#' @description `file_in()` marks individual files
#' (and whole directories) that your targets depend on.
#' @export
#' @seealso [file_out()], [knitr_in()], [ignore()]
#' @return A character vector of declared input file paths.
#' @param ... Character strings. File paths of input files
#' to a command in your workflow plan data frame.
#' @return A character vector of declared input file or directory paths.
#' @param ... Character vector, paths to files and directories.
#' @export
#' @examples
#' \dontrun{
Expand All @@ -259,7 +257,7 @@ complete_target_names <- function(commands_list) {
#' # in your workflow plan data frame.
#' suppressWarnings(
#' plan <- drake_plan(
#' write.csv(mtcars, file_out("mtcars.csv")),
#' out = write.csv(mtcars, file_out("mtcars.csv")),
#' contents = read.csv(file_in("mtcars.csv"))
#' )
#' )
Expand All @@ -268,37 +266,51 @@ complete_target_names <- function(commands_list) {
#' # and a dependency of `contents`. See for yourself:
#' make(plan)
#' file.exists("mtcars.csv")
#' # See also `knitr_in()`. `knitr_in()` is like `file_in()`
#' # except that it analyzes active code chunks in your `knitr`
#' # source file and detects non-file dependencies.
#' # That way, updates to the right dependencies trigger rebuilds
#' # in your report.
#' # You can also work with entire directories this way.
#' # However, in `file_out("your_directory")`, the directory
#' # becomes an entire unit. Thus, `file_in("your_directory")`
#' # is more appropriate for subsequent steps than
#' # `file_in("your_directory/file_inside.txt")`.
#' suppressWarnings(
#' plan <- drake_plan(
#' out = {
#' dir.create(file_out("dir"))
#' write.csv(mtcars, "dir/mtcars.csv")
#' },
#' contents = read.csv(file.path(file_in("dir"), "mtcars.csv"))
#' )
#' )
#' plan
#' make(plan)
#' file.exists("dir/mtcars.csv")
#' # See the connections that the file relationships create:
#' # config <- drake_config(plan) # nolint
#' # vis_drake_graph(config) # nolint
#' })
#' }
file_in <- function(...) {
  # Flatten the dots into a single character vector of paths. The real
  # work happens in drake's static code analysis of plan commands; at run
  # time this function is a no-op passthrough.
  paths <- c(...)
  as.character(paths)
}

#' @title Declare the file outputs of a workflow plan command.
#' @description Use this function to help write the commands
#' in your workflow plan data frame. You can only specify
#' one file output per command. See the examples
#' for a full explanation.
#' @title Declare output files and directories.
#' @description `file_out()` marks individual files
#' (and whole directories) that your targets create.
#' @export
#' @seealso [file_out()], [knitr_in()], [ignore()]
#' @return A character vector of declared output file or directory paths.
#' @param ... Character vector, paths to files and directories.
#' @export
#' @seealso [file_in()], [knitr_in()], [ignore()]
#' @return A character vector of declared output file paths.
#' @param ... Character vector of output file paths.
#' @examples
#' \dontrun{
#' test_with_dir("Contain side effects", {
#' # The `file_out()` and `file_in()` functions
#' # just take in strings and return them.
#' file_out("summaries.txt", "output.csv")
#' file_out("summaries.txt")
#' # Their main purpose is to orchestrate your custom files
#' # in your workflow plan data frame.
#' suppressWarnings(
#' plan <- drake_plan(
#' write.csv(mtcars, file_out("mtcars.csv")),
#' out = write.csv(mtcars, file_out("mtcars.csv")),
#' contents = read.csv(file_in("mtcars.csv"))
#' )
#' )
Expand All @@ -307,20 +319,40 @@ file_in <- function(...) {
#' # and a dependency of `contents`. See for yourself:
#' make(plan)
#' file.exists("mtcars.csv")
#' # See also `knitr_in()`. `knitr_in()` is like `file_in()`
#' # except that it analyzes active code chunks in your `knitr`
#' # source file and detects non-file dependencies.
#' # That way, updates to the right dependencies trigger rebuilds
#' # in your report.
#' # You can also work with entire directories this way.
#' # However, in `file_out("your_directory")`, the directory
#' # becomes an entire unit. Thus, `file_in("your_directory")`
#' # is more appropriate for subsequent steps than
#' # `file_in("your_directory/file_inside.txt")`.
#' suppressWarnings(
#' plan <- drake_plan(
#' out = {
#' dir.create(file_out("dir"))
#' write.csv(mtcars, "dir/mtcars.csv")
#' },
#' contents = read.csv(file.path(file_in("dir"), "mtcars.csv"))
#' )
#' )
#' plan
#' make(plan)
#' # See the connections that the file relationships create:
#' # config <- drake_config(plan) # nolint
#' # vis_drake_graph(config) # nolint
#' file.exists("dir/mtcars.csv")
#' })
#' }
file_out <- file_in

#' @title Declare the `knitr`/`rmarkdown` source files
#' of a workflow plan command.
#' @description Use this function to help write the commands
#' in your workflow plan data frame. See the examples
#' for a full explanation.
#' @title Declare `knitr`/`rmarkdown` source files
#' as dependencies.
#' @description `knitr_in()` marks individual `knitr`/R Markdown
#' reports as dependencies. In `drake`, these reports are pieces
#' of the pipeline. R Markdown is a great tool for *displaying*
#' precomputed results, but not for running a large workflow
#' from end to end. These reports should do as little
#' computation as possible.
#' @details Unlike [file_in()] and [file_out()], `knitr_in()`
#' does not work with entire directories.
#' @export
#' @seealso [file_in()], [file_out()], [ignore()]
#' @return A character vector of declared input file paths.
Expand Down
102 changes: 86 additions & 16 deletions R/exec-meta.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ drake_meta_ <- function(target, config) {
}
# For imported files.
if (meta$isfile) {
meta$mtime <- file.mtime(decode_path(target, config))
meta$mtime <- storage_mtime(decode_path(target, config))
}
if (meta$trigger$command) {
meta$command <- layout$command_standardized
Expand Down Expand Up @@ -76,7 +76,7 @@ self_hash <- function(target, config) {
input_file_hash <- function(
target,
config,
size_cutoff = rehash_file_size_cutoff
size_cutoff = rehash_storage_size_cutoff
) {
deps <- config$layout[[target]]$deps_build
files <- sort(unique(as.character(c(deps$file_in, deps$knitr_in))))
Expand All @@ -86,7 +86,7 @@ input_file_hash <- function(
out <- ht_memo(
ht = config$ht_get_hash,
x = files,
fun = file_hash,
fun = storage_hash,
config = config,
size_cutoff = size_cutoff
)
Expand All @@ -101,7 +101,7 @@ input_file_hash <- function(
output_file_hash <- function(
target,
config,
size_cutoff = rehash_file_size_cutoff
size_cutoff = rehash_storage_size_cutoff
) {
deps <- config$layout[[target]]$deps_build
files <- sort(unique(as.character(deps$file_out)))
Expand All @@ -110,7 +110,7 @@ output_file_hash <- function(
}
out <- vapply(
X = files,
FUN = file_hash,
FUN = storage_hash,
FUN.VALUE = character(1),
config = config,
size_cutoff = size_cutoff
Expand All @@ -123,14 +123,22 @@ output_file_hash <- function(
)
}

rehash_file <- function(target, config) {
# Recompute the hash of a file or directory target.
# Returns NA_character_ for targets that are not encoded paths or whose
# storage does not exist on disk.
rehash_storage <- function(target, config) {
  if (!is_encoded_path(target)) {
    return(NA_character_)
  }
  path <- decode_path(target, config)
  if (!file.exists(path)) {
    return(NA_character_)
  }
  # Dispatch on storage type: directories get the recursive treatment.
  if (dir.exists(path)) {
    return(rehash_dir(path, config))
  }
  rehash_file(path, config)
}

rehash_file <- function(file, config) {
digest::digest(
object = file,
algo = config$cache$driver$hash_algorithm,
Expand All @@ -139,27 +147,49 @@ rehash_file <- function(target, config) {
)
}

safe_rehash_file <- function(target, config) {
# Hash an entire directory by hashing each contained file (recursively)
# and then hashing the concatenation of those per-file hashes.
# NOTE(review): this appears to fold in file contents only (via
# rehash_file), not file names — confirm that a pure rename inside the
# directory is meant to leave the hash unchanged.
rehash_dir <- function(dir, config) {
  contents <- list.files(
    path = dir,
    all.files = TRUE,
    full.names = TRUE,
    recursive = TRUE,
    include.dirs = FALSE
  )
  hashes <- vapply(
    contents,
    rehash_file,
    FUN.VALUE = character(1),
    config = config
  )
  digest::digest(
    paste(hashes, collapse = ""),
    algo = config$cache$driver$hash_algorithm,
    serialize = FALSE
  )
}

# Like rehash_storage(), but short-circuits to NA_character_ when the
# decoded path does not exist on disk.
safe_rehash_storage <- function(target, config) {
  path <- decode_path(target, config)
  if (!file.exists(path)) {
    return(NA_character_)
  }
  rehash_storage(target = target, config = config)
}

# Decide whether a file/directory needs rehashing: small storage is
# always rehashed; larger storage only when its modification time moved
# past the recorded one.
should_rehash_storage <- function(filename, new_mtime, old_mtime,
                                  size_cutoff) {
  # Elementwise `|` keeps NA semantics from either comparison.
  verdict <- storage_size(filename) < size_cutoff | new_mtime > old_mtime
  # Unknown (NA) means we cannot rule out a change, so rehash to be safe.
  if (safe_is_na(verdict)) {
    verdict <- TRUE
  }
  verdict
}

file_hash <- function(
storage_hash <- function(
target,
config,
size_cutoff = rehash_file_size_cutoff
size_cutoff = rehash_storage_size_cutoff
) {
if (!is_encoded_path(target)) {
return(NA_character_)
Expand All @@ -183,16 +213,56 @@ file_hash <- function(
),
-Inf
)
new_mtime <- file.mtime(filename)
do_rehash <- should_rehash_file(
new_mtime <- storage_mtime(filename)
do_rehash <- should_rehash_storage(
filename = filename,
new_mtime = new_mtime,
old_mtime = old_mtime,
size_cutoff = size_cutoff)
old_hash_exists <- config$cache$exists(key = target)
if (do_rehash || !old_hash_exists) {
rehash_file(target = target, config = config)
rehash_storage(target = target, config = config)
} else {
config$cache$get(key = target)
}
}

# Modification time of a file, or — for a directory — the newest
# modification time among the files it contains (see dir_mtime()).
storage_mtime <- function(x) {
  if (!dir.exists(x)) {
    return(file.mtime(x))
  }
  dir_mtime(x)
}

# Size of a file, or — for a directory — the largest single file size
# inside it (see dir_size()).
storage_size <- function(x) {
  if (!dir.exists(x)) {
    return(file.size(x))
  }
  dir_size(x)
}

# Newest modification time among all files inside `x`, recursively.
# NOTE(review): for an empty directory this relies on `%||%` treating a
# zero-length vector as empty (falling back to Inf, i.e. "always stale")
# — confirm drake's `%||%` checks length, not just NULL.
dir_mtime <- function(x) {
  contents <- list.files(
    path = x,
    all.files = TRUE,
    full.names = TRUE,
    recursive = TRUE,
    include.dirs = FALSE
  )
  mtimes <- vapply(contents, file.mtime, FUN.VALUE = numeric(1))
  max(mtimes %||% Inf)
}

# Largest single file size inside `x`, recursively.
# NOTE(review): this is the maximum per-file size, not the directory
# total; it feeds the rehash size cutoff in should_rehash_storage() —
# confirm max (rather than sum) is the intended policy.
dir_size <- function(x) {
  contents <- list.files(
    path = x,
    all.files = TRUE,
    full.names = TRUE,
    recursive = TRUE,
    include.dirs = FALSE
  )
  sizes <- vapply(contents, file.size, FUN.VALUE = numeric(1))
  max(sizes %||% 0)
}
4 changes: 2 additions & 2 deletions R/exec-store.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ store_object <- function(target, value, meta, config) {
store_file <- function(target, meta, config) {
store_object(
target = target,
value = safe_rehash_file(target = target, config = config),
value = safe_rehash_storage(target = target, config = config),
meta = meta,
config = config
)
Expand All @@ -102,7 +102,7 @@ store_output_files <- function(files, meta, config) {
meta$isfile <- TRUE
for (file in files) {
meta$name <- file
meta$mtime <- file.mtime(decode_path(file, config))
meta$mtime <- storage_mtime(decode_path(file, config))
meta$isfile <- TRUE
store_single_output(
target = file,
Expand Down
2 changes: 1 addition & 1 deletion R/utils-checksums.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ mc_get_outfile_checksum <- function(target, config) {
files <- sort(unique(as.character(deps$file_out)))
out <- vapply(
X = files,
FUN = rehash_file,
FUN = rehash_storage,
FUN.VALUE = character(1),
config = config
)
Expand Down
Loading

0 comments on commit 001f2fd

Please sign in to comment.