#' Sink Reset
#'
#' @description Closes every output diversion that is currently open on the
#' `sink()` stack.
#'
#' @return `NULL`, invisibly. Called for its side effect of closing all
#' outstanding `sink()`s.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' sink.reset()
#' }
sink.reset <- function() {
  # sink.number() is evaluated once, so exactly the diversions that were open
  # on entry are popped.
  for (i in seq_len(sink.number())) {
    sink(NULL)
  }
  invisible(NULL)
}

# Internal helper shared by add_lins() and ipg2lin(): both reference-file
# arguments must be single strings that name existing files.
.check_lineage_paths <- function(assembly_path, lineagelookup_path) {
  is_string <- function(x) is.character(x) && length(x) == 1L && !is.na(x)

  if (!is_string(assembly_path) || !is_string(lineagelookup_path)) {
    stop(
      "Both 'assembly_path' and 'lineagelookup_path' must be character strings.",
      call. = FALSE
    )
  }
  if (!file.exists(assembly_path)) {
    stop(paste("Assembly file not found at:", assembly_path), call. = FALSE)
  }
  if (!file.exists(lineagelookup_path)) {
    stop(
      paste("Lineage lookup file not found at:", lineagelookup_path),
      call. = FALSE
    )
  }
  invisible(TRUE)
}

#' add_lins
#'
#' @description Looks up the lineage of every accession in one column of `df`
#' with [acc2lin()] and left-joins the result onto `df`.
#'
#' @param df Data frame that contains the accession column
#' @param acc_col Name of the column in `df` holding the protein accessions.
#' Defaults to "AccNum"
#' @param assembly_path String of the path to the assembly_summary file
#' @param lineagelookup_path String of the path to the lineage lookup file
#' (taxid to lineage mapping)
#' @param ipgout_path Path to write the results of the efetch run of the
#' accessions on the ipg database. If NULL, a temporary file is used.
#' Defaults to NULL
#' @param plan Strategy forwarded to [acc2lin()]. Defaults to "sequential"
#'
#' @return `df` merged with the lineage table (`by.x = acc_col`,
#' `by.y = "Protein"`, `all.x = TRUE`), so rows without a lineage are kept.
#' An error is raised when the input is invalid or the lineage lookup fails.
#' @export
#'
#' @examples
#' \dontrun{
#' add_lins()
#' }
add_lins <- function(df, acc_col = "AccNum", assembly_path,
                     lineagelookup_path, ipgout_path = NULL,
                     plan = "sequential") {
  if (!is.data.frame(df)) {
    stop("Input 'df' must be a data frame.", call. = FALSE)
  }
  if (!isTRUE(acc_col %in% colnames(df))) {
    stop(paste("Column", acc_col, "not found in data frame."), call. = FALSE)
  }
  .check_lineage_paths(assembly_path, lineagelookup_path)

  # acc_col must stay a string: merge() below needs the column *name*.
  accessions <- as.character(df[[acc_col]])
  lins <- acc2lin(
    accessions, assembly_path, lineagelookup_path, ipgout_path, plan
  )
  if (is.null(lins)) {
    stop(
      "Lineage lookup failed; see the preceding warning for the cause.",
      call. = FALSE
    )
  }

  # Drop the columns that are not needed downstream; this keeps the merge
  # small and lets unique() collapse rows that differ only in those columns.
  unwanted <- c(
    "Strand", "Start", "Stop", "Nucleotide Accession", "Source",
    "Id", "Strain"
  )
  lins <- as.data.frame(lins)
  lins <- unique(lins[, setdiff(names(lins), unwanted), drop = FALSE])

  merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE)
}

#' acc2lin
#'
#' @author Samuel Chen, Janani Ravi
#'
#' @description This function combines 'efetch_ipg()' and 'ipg2lin()' to map
#' a set of protein accessions to their assembly (GCA_ID), tax ID, and
#' lineage.
#'
#' @param accessions Character vector of protein accessions
#' @param assembly_path String of the path to the assembly_summary path
#' This file can be generated using the "DownloadAssemblySummary()" function
#' @param lineagelookup_path String of the path to the lineage lookup file
#' (taxid to lineage mapping).
#' @param ipgout_path Path to write the results of the efetch run of the
#' accessions on the ipg database. If NULL, the file will not be written.
#' Defaults to NULL
#' @param plan Strategy handed to `future::plan()` by 'efetch_ipg()'.
#' Defaults to "sequential"
#'
#' @return The lineage table produced by 'ipg2lin()', or `NULL` (with a
#' warning describing the cause) when fetching or processing fails.
#' @export
#'
#' @examples
#' \dontrun{
#' acc2lin()
#' }
acc2lin <- function(accessions, assembly_path,
                    lineagelookup_path, ipgout_path = NULL,
                    plan = "sequential") {
  if (is.null(ipgout_path)) {
    ipgout_path <- tempfile("ipg", fileext = ".txt")
    # Remove only the temporary IPG file. Unlinking tempdir() would delete the
    # whole session temp directory and break later tempfile() users.
    on.exit(unlink(ipgout_path), add = TRUE)
  }

  # Only errors are intercepted. A `warning =` handler would unwind at the
  # first warning and discard an otherwise valid result.
  tryCatch(
    {
      efetch_ipg(accessions, out_path = ipgout_path, plan = plan)
      ipg2lin(accessions, ipgout_path, assembly_path, lineagelookup_path)
    },
    error = function(e) {
      warning("acc2lin failed: ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
}

#' efetch_ipg
#'
#' @author Samuel Chen, Janani Ravi
#'
#' @description Perform efetch on the ipg database and write the results to
#' out_path
#'
#' @param accnums Character vector containing the accession numbers to query
#' on the ipg database
#' @param out_path Path to write the efetch results to
#' @param plan Strategy handed to `future::plan()`; either a strategy name
#' or a strategy function. Defaults to "sequential"
#'
#' @return `NULL`, invisibly. Called for its side effect of writing the
#' fetched records to `out_path`. Errors raised while fetching propagate to
#' the caller; the output diversion is closed in either case.
#' @export
#'
#' @examples
#' \dontrun{
#' efetch_ipg()
#' }
efetch_ipg <- function(accnums, out_path, plan = "sequential") {
  if (!is.character(accnums) || length(accnums) == 0L) {
    stop("'accnums' must be a non-empty character vector.", call. = FALSE)
  }
  if (!is.character(out_path) || length(out_path) != 1L ||
    is.na(out_path) || !nzchar(out_path)) {
    stop("'out_path' must be a non-empty string.", call. = FALSE)
  }
  # The default is the string "sequential", so strings must be accepted too.
  if (!is.character(plan) && !is.function(plan)) {
    stop("'plan' must be a strategy name or a strategy function.",
      call. = FALSE
    )
  }

  # Namespace-qualified on purpose: the argument `plan` shadows the function,
  # and when `plan` is itself a function a bare plan(...) would call it.
  future::plan(strategy = plan, .skip = TRUE)

  # Partition the accessions round-robin to limit the number of queries per
  # second for rentrez fetch (limit of 10/second w/ key). The group count must
  # be a whole number, otherwise the stride used to pick members is fractional.
  n_acc <- length(accnums)
  groups <- min(max(ceiling(n_acc / 200), 15), n_acc)
  partitioned_acc <- unname(
    split(accnums, rep_len(seq_len(groups), n_acc))
  )

  # Divert output to the file and guarantee the diversion is closed again,
  # including when a fetch fails part-way through.
  sink(out_path)
  on.exit(sink(NULL), add = TRUE)

  furrr::future_map(seq_along(partitioned_acc), function(x) {
    # Avoid hitting the rate API limit
    if (x %% 9 == 0) {
      Sys.sleep(1)
    }
    cat(
      rentrez::entrez_fetch(
        id = partitioned_acc[[x]],
        db = "ipg",
        rettype = "xml",
        # NOTE(review): placeholder value; presumably the real key should be
        # supplied by the user's environment -- confirm before release.
        api_key = "YOUR_KEY_HERE"
      )
    )
  })

  invisible(NULL)
}

#' ipg2lin
#'
#' @author Samuel Chen, Janani Ravi
#'
#' @description Takes the resulting file of an efetch run on the ipg database
#' and append lineage, and taxid columns
#'
#' @param accessions Character vector of protein accessions
#' @param ipg_file Filepath to the file containing results of an efetch run
#' on the ipg database. The protein accession in 'accessions' should be
#' contained in this file
#' @param assembly_path String of the path to the assembly_summary path
#' This file can be generated using the "DownloadAssemblySummary()" function
#' @param lineagelookup_path String of the path to the lineage lookup file
#' (taxid to lineage mapping).
#'
#' @return The table returned by `GCA2Lins()` for the requested accessions,
#' with rows lacking a `Lineage` removed and duplicate rows dropped.
#' @export
#'
#' @examples
#' \dontrun{
#' ipg2lin()
#' }
#'
ipg2lin <- function(accessions, ipg_file, assembly_path, lineagelookup_path) {
  if (!is.character(accessions) || length(accessions) == 0L) {
    stop("Input 'accessions' must be a non-empty character vector.",
      call. = FALSE
    )
  }
  if (!is.character(ipg_file) || length(ipg_file) != 1L || is.na(ipg_file)) {
    stop("Input 'ipg_file' must be a character string.", call. = FALSE)
  }
  .check_lineage_paths(assembly_path, lineagelookup_path)
  if (!file.exists(ipg_file)) {
    stop(paste("IPG file not found at:", ipg_file), call. = FALSE)
  }

  # Read the efetch output and keep only the requested accessions.
  ipg_dt <- data.table::fread(ipg_file, sep = "\t", fill = TRUE)
  ipg_dt <- ipg_dt[ipg_dt[["Protein"]] %in% accessions, ]

  # GCA2Lins() (defined elsewhere in the package) is given the assembly
  # column under the name 'GCA_ID'; setnames() renames by reference.
  data.table::setnames(ipg_dt, "Assembly", "GCA_ID")
  lins <- GCA2Lins(prot_data = ipg_dt, assembly_path, lineagelookup_path)

  # Discard rows for which no lineage could be resolved.
  unique(lins[!is.na(lins[["Lineage"]]), ])
}