file_in()/file_out() directories #795

Merged 11 commits on Mar 22, 2019

1 change: 1 addition & 0 deletions NEWS.md
@@ -11,6 +11,7 @@

## Enhancements

- `file_in()` and `file_out()` can now handle entire directories, e.g. `file_in("your_folder_of_input_data_files")` and `file_out("directory_with_a_bunch_of_output_files")`.
- Improve `drake_ggraph()`
- Hide node labels by default and render the arrows behind the nodes.
- Print an informative error message when the user supplies a `drake` plan to the `config` argument of a function.
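
To make the NEWS entry above concrete, here is a minimal usage sketch of the new directory support. The plan below and the `data_dir` name are hypothetical, invented only for illustration; `file_in()`, `file_out()`, `drake_plan()`, and `make()` are the `drake` functions involved.

```r
library(drake)

plan <- drake_plan(
  # file_out("data_dir") declares the whole directory as the output of `raw`.
  raw = {
    dir.create(file_out("data_dir"))
    write.csv(mtcars, file.path("data_dir", "mtcars.csv"), row.names = FALSE)
  },
  # file_in("data_dir") makes `summary` depend on the directory as a single unit.
  summary = nrow(read.csv(file.path(file_in("data_dir"), "mtcars.csv")))
)

make(plan)
# Changing or adding any file inside data_dir should invalidate `raw` and
# `summary` on the next make(plan), since the directory is fingerprinted whole.
```
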
2 changes: 1 addition & 1 deletion R/api-clean.R
@@ -169,7 +169,7 @@ clean_single_target <- function(
}
}
if (length(files)) {
unlink(decode_path(files))
unlink(decode_path(files), recursive = TRUE)
}
}

102 changes: 67 additions & 35 deletions R/api-plan.R
@@ -87,7 +87,7 @@
#' summ = target(
#' sum_fun(data, reg),
#' transform = cross(sum_fun = c(coef, residuals), reg)
#' ),
#' ),
#' winners = target(
#' min(summ),
#' transform = combine(summ, .by = c(data, sum_fun))
@@ -108,7 +108,7 @@
#' summ = target(
#' sum_fun(data, reg),
#' transform = cross(sum_fun = c(coef, residuals), reg)
#' ),
#' ),
#' winners = target(
#' min(summ),
#' transform = combine(summ, .by = c(data, sum_fun))
@@ -239,15 +239,13 @@ complete_target_names <- function(commands_list) {
commands_list
}

#' @title Declare the file inputs of a workflow plan command.
#' @description Use this function to help write the commands
#' in your workflow plan data frame. See the examples
#' for a full explanation.
#' @title Declare input files and directories.
#' @description `file_in()` marks individual files
#' (and whole directories) that your targets depend on.
#' @export
#' @seealso [file_out()], [knitr_in()], [ignore()]
#' @return A character vector of declared input file paths.
#' @param ... Character strings. File paths of input files
#' to a command in your workflow plan data frame.
#' @return A character vector of declared input file or directory paths.
#' @param ... Character vector, paths to files and directories.
#' @export
#' @examples
#' \dontrun{
@@ -259,7 +257,7 @@ complete_target_names <- function(commands_list) {
#' # in your workflow plan data frame.
#' suppressWarnings(
#' plan <- drake_plan(
#' write.csv(mtcars, file_out("mtcars.csv")),
#' out = write.csv(mtcars, file_out("mtcars.csv")),
#' contents = read.csv(file_in("mtcars.csv"))
#' )
#' )
@@ -268,37 +266,51 @@
#' # and a dependency of `contents`. See for yourself:
#' make(plan)
#' file.exists("mtcars.csv")
#' # See also `knitr_in()`. `knitr_in()` is like `file_in()`
#' # except that it analyzes active code chunks in your `knitr`
#' # source file and detects non-file dependencies.
#' # That way, updates to the right dependencies trigger rebuilds
#' # in your report.
#' # You can also work with entire directories this way.
#' # However, in `file_out("your_directory")`, the directory
#' # is tracked as a single unit. Thus, `file_in("your_directory")`
#' # is more appropriate for subsequent steps than
#' # `file_in("your_directory/file_inside.txt")`.
#' suppressWarnings(
#' plan <- drake_plan(
#' out = {
#' dir.create(file_out("dir"))
#' write.csv(mtcars, "dir/mtcars.csv")
#' },
#' contents = read.csv(file.path(file_in("dir"), "mtcars.csv"))
#' )
#' )
#' plan
#' make(plan)
#' file.exists("dir/mtcars.csv")
#' # See the connections that the file relationships create:
#' # config <- drake_config(plan) # nolint
#' # vis_drake_graph(config) # nolint
#' })
#' }
file_in <- function(...) {
as.character(c(...))
}

#' @title Declare the file outputs of a workflow plan command.
#' @description Use this function to help write the commands
#' in your workflow plan data frame. You can only specify
#' one file output per command. See the examples
#' for a full explanation.
#' @title Declare output files and directories.
#' @description `file_out()` marks individual files
#' (and whole directories) that your targets create.
#' @export
#' @seealso [file_out()], [knitr_in()], [ignore()]
#' @return A character vector of declared output file or directory paths.
#' @param ... Character vector, paths to files and directories.
#' @export
#' @seealso [file_in()], [knitr_in()], [ignore()]
#' @return A character vector of declared output file paths.
#' @param ... Character vector of output file paths.
#' @examples
#' \dontrun{
#' test_with_dir("Contain side effects", {
#' # The `file_out()` and `file_in()` functions
#' # just take in strings and return them.
#' file_out("summaries.txt", "output.csv")
#' file_out("summaries.txt")
#' # Their main purpose is to orchestrate your custom files
#' # in your workflow plan data frame.
#' suppressWarnings(
#' plan <- drake_plan(
#' write.csv(mtcars, file_out("mtcars.csv")),
#' out = write.csv(mtcars, file_out("mtcars.csv")),
#' contents = read.csv(file_in("mtcars.csv"))
#' )
#' )
@@ -307,20 +319,40 @@ file_in <- function(...) {
#' # and a dependency of `contents`. See for yourself:
#' make(plan)
#' file.exists("mtcars.csv")
#' # See also `knitr_in()`. `knitr_in()` is like `file_in()`
#' # except that it analyzes active code chunks in your `knitr`
#' # source file and detects non-file dependencies.
#' # That way, updates to the right dependencies trigger rebuilds
#' # in your report.
#' # You can also work with entire directories this way.
#' # However, in `file_out("your_directory")`, the directory
#' # is tracked as a single unit. Thus, `file_in("your_directory")`
#' # is more appropriate for subsequent steps than
#' # `file_in("your_directory/file_inside.txt")`.
#' suppressWarnings(
#' plan <- drake_plan(
#' out = {
#' dir.create(file_out("dir"))
#' write.csv(mtcars, "dir/mtcars.csv")
#' },
#' contents = read.csv(file.path(file_in("dir"), "mtcars.csv"))
#' )
#' )
#' plan
#' make(plan)
#' # See the connections that the file relationships create:
#' # config <- drake_config(plan) # nolint
#' # vis_drake_graph(config) # nolint
#' file.exists("dir/mtcars.csv")
#' })
#' }
file_out <- file_in

#' @title Declare the `knitr`/`rmarkdown` source files
#' of a workflow plan command.
#' @description Use this function to help write the commands
#' in your workflow plan data frame. See the examples
#' for a full explanation.
#' @title Declare `knitr`/`rmarkdown` source files
#' as dependencies.
#' @description `knitr_in()` marks individual `knitr`/R Markdown
#' reports as dependencies. In `drake`, these reports are pieces
#' of the pipeline. R Markdown is a great tool for *displaying*
#' precomputed results, but not for running a large workflow
#' from end to end. These reports should do as little
#' computation as possible.
#' @details Unlike [file_in()] and [file_out()], `knitr_in()`
#' does not work with entire directories.
#' @export
#' @seealso [file_in()], [file_out()], [ignore()]
#' @return A character vector of declared input file paths.
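
For context on the `knitr_in()` documentation above, here is a sketch of the usual pattern. The `report.Rmd` file and target names are hypothetical; the idea is that the report's code chunks call `loadd()` or `readd()`, which is how `drake` detects the report's non-file dependencies.

```r
library(drake)

plan <- drake_plan(
  fit = lm(mpg ~ wt, data = mtcars),
  # knitr_in() points at a single .Rmd file (not a directory); drake scans its
  # active code chunks for calls like readd(fit) and adds them as dependencies.
  report = rmarkdown::render(
    knitr_in("report.Rmd"),
    output_file = file_out("report.html"),
    quiet = TRUE
  )
)

# make(plan)  # re-renders the report whenever `fit` or report.Rmd changes
```
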
102 changes: 86 additions & 16 deletions R/exec-meta.R
@@ -19,7 +19,7 @@ drake_meta_ <- function(target, config) {
}
# For imported files.
if (meta$isfile) {
meta$mtime <- file.mtime(decode_path(target, config))
meta$mtime <- storage_mtime(decode_path(target, config))
}
if (meta$trigger$command) {
meta$command <- layout$command_standardized
@@ -76,7 +76,7 @@ self_hash <- function(target, config) {
input_file_hash <- function(
target,
config,
size_cutoff = rehash_file_size_cutoff
size_cutoff = rehash_storage_size_cutoff
) {
deps <- config$layout[[target]]$deps_build
files <- sort(unique(as.character(c(deps$file_in, deps$knitr_in))))
@@ -86,7 +86,7 @@ input_file_hash <- function(
out <- ht_memo(
ht = config$ht_get_hash,
x = files,
fun = file_hash,
fun = storage_hash,
config = config,
size_cutoff = size_cutoff
)
@@ -101,7 +101,7 @@
output_file_hash <- function(
target,
config,
size_cutoff = rehash_file_size_cutoff
size_cutoff = rehash_storage_size_cutoff
) {
deps <- config$layout[[target]]$deps_build
files <- sort(unique(as.character(deps$file_out)))
@@ -110,7 +110,7 @@ output_file_hash <- function(
}
out <- vapply(
X = files,
FUN = file_hash,
FUN = storage_hash,
FUN.VALUE = character(1),
config = config,
size_cutoff = size_cutoff
@@ -123,14 +123,22 @@
)
}

rehash_file <- function(target, config) {
rehash_storage <- function(target, config) {
if (!is_encoded_path(target)) {
return(NA_character_)
}
file <- decode_path(target, config)
if (!file.exists(file) || file.info(file)$isdir) {
if (!file.exists(file)) {
return(NA_character_)
}
if (dir.exists(file)) {
rehash_dir(file, config)
} else {
rehash_file(file, config)
}
}

rehash_file <- function(file, config) {
digest::digest(
object = file,
algo = config$cache$driver$hash_algorithm,
@@ -139,27 +147,49 @@
)
}

safe_rehash_file <- function(target, config) {
rehash_dir <- function(dir, config) {
files <- list.files(
path = dir,
all.files = TRUE,
full.names = TRUE,
recursive = TRUE,
include.dirs = FALSE
)
out <- vapply(
files,
rehash_file,
FUN.VALUE = character(1),
config = config
)
out <- paste(out, collapse = "")
digest::digest(
out,
algo = config$cache$driver$hash_algorithm,
serialize = FALSE
)
}

safe_rehash_storage <- function(target, config) {
if (file.exists(decode_path(target, config))) {
rehash_file(target = target, config = config)
rehash_storage(target = target, config = config)
} else {
NA_character_
}
}

should_rehash_file <- function(filename, new_mtime, old_mtime,
should_rehash_storage <- function(filename, new_mtime, old_mtime,
size_cutoff) {
do_rehash <- file.size(filename) < size_cutoff | new_mtime > old_mtime
do_rehash <- storage_size(filename) < size_cutoff | new_mtime > old_mtime
if (safe_is_na(do_rehash)) {
do_rehash <- TRUE
}
do_rehash
}

file_hash <- function(
storage_hash <- function(
target,
config,
size_cutoff = rehash_file_size_cutoff
size_cutoff = rehash_storage_size_cutoff
) {
if (!is_encoded_path(target)) {
return(NA_character_)
@@ -183,16 +213,56 @@ file_hash <- function(
),
-Inf
)
new_mtime <- file.mtime(filename)
do_rehash <- should_rehash_file(
new_mtime <- storage_mtime(filename)
do_rehash <- should_rehash_storage(
filename = filename,
new_mtime = new_mtime,
old_mtime = old_mtime,
size_cutoff = size_cutoff)
old_hash_exists <- config$cache$exists(key = target)
if (do_rehash || !old_hash_exists) {
rehash_file(target = target, config = config)
rehash_storage(target = target, config = config)
} else {
config$cache$get(key = target)
}
}

storage_mtime <- function(x) {
if (dir.exists(x)) {
dir_mtime(x)
} else {
file.mtime(x)
}
}

storage_size <- function(x) {
if (dir.exists(x)) {
dir_size(x)
} else {
file.size(x)
}
}

dir_mtime <- function(x) {
files <- list.files(
path = x,
all.files = TRUE,
full.names = TRUE,
recursive = TRUE,
include.dirs = FALSE
)
times <- vapply(files, file.mtime, FUN.VALUE = numeric(1))
max(times %||% Inf)
}

dir_size <- function(x) {
files <- list.files(
path = x,
all.files = TRUE,
full.names = TRUE,
recursive = TRUE,
include.dirs = FALSE
)
sizes <- vapply(files, file.size, FUN.VALUE = numeric(1))
max(sizes %||% 0)
}
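
The directory-hashing code above fingerprints a directory by hashing each file and then hashing the concatenation of those hashes. A standalone sketch of the same idea, without `drake`'s cache and `config` plumbing (the `xxhash64` algorithm and the `hash_directory()` name are illustrative choices, not part of the package API):

```r
library(digest)

# Fingerprint a directory: hash every file, then hash the concatenated hashes,
# mirroring rehash_dir() above.
hash_directory <- function(dir, algo = "xxhash64") {
  files <- list.files(
    path = dir,
    all.files = TRUE,
    full.names = TRUE,
    recursive = TRUE,
    include.dirs = FALSE
  )
  per_file <- vapply(files, digest::digest, character(1), algo = algo, file = TRUE)
  digest::digest(paste(per_file, collapse = ""), algo = algo, serialize = FALSE)
}

# hash_directory("data_dir")  # the fingerprint changes when any file inside changes
```
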
4 changes: 2 additions & 2 deletions R/exec-store.R
@@ -92,7 +92,7 @@ store_object <- function(target, value, meta, config) {
store_file <- function(target, meta, config) {
store_object(
target = target,
value = safe_rehash_file(target = target, config = config),
value = safe_rehash_storage(target = target, config = config),
meta = meta,
config = config
)
@@ -102,7 +102,7 @@ store_output_files <- function(files, meta, config) {
meta$isfile <- TRUE
for (file in files) {
meta$name <- file
meta$mtime <- file.mtime(decode_path(file, config))
meta$mtime <- storage_mtime(decode_path(file, config))
meta$isfile <- TRUE
store_single_output(
target = file,
2 changes: 1 addition & 1 deletion R/utils-checksums.R
@@ -16,7 +16,7 @@ mc_get_outfile_checksum <- function(target, config) {
files <- sort(unique(as.character(deps$file_out)))
out <- vapply(
X = files,
FUN = rehash_file,
FUN = rehash_storage,
FUN.VALUE = character(1),
config = config
)