Skip to content

Commit

Permalink
usethis::pr_init("Implement error handling in acc2lin.R functions
Browse files Browse the repository at this point in the history
- Added validation checks for input parameters (accessions, ipg_file, assembly_path, lineagelookup_path).
- Included error messages for missing or invalid inputs and file existence checks.
- Wrapped main logic in tryCatch for graceful error handling during execution.
")
  • Loading branch information
Seyi007 committed Oct 5, 2024
1 parent 94369a2 commit 30d4bf3
Show file tree
Hide file tree
Showing 5 changed files with 207 additions and 70 deletions.
267 changes: 200 additions & 67 deletions R/acc2lin.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,28 @@
#' Sink Reset
#'
#' @description Closes every output diversion that was opened with `sink()`.
#' An error raised while closing is reported as a warning instead of being
#' propagated to the caller.
#'
#' @return No return value (invisible `NULL`); run to close all outstanding
#' `sink()`s.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' sink.reset()
#' }
sink.reset <- function() {
  tryCatch(
    {
      # Re-check the live sink count on every pass so that exactly as many
      # sinks are closed as are open; sink(NULL) with nothing open would warn.
      while (sink.number() > 0) {
        sink(NULL)
      }
      message("All sinks closed")
    },
    error = function(e) {
      warning("sink.reset failed: ", conditionMessage(e), call. = FALSE)
    }
  )
  invisible(NULL)
}


Expand All @@ -44,39 +56,79 @@ sink.reset <- function() {
#' add_lins()
#' }
add_lins <- function(df, acc_col = "AccNum", assembly_path,
                     lineagelookup_path, ipgout_path = NULL,
                     plan = "sequential") {
  # Merges lineage information (looked up with acc2lin()) onto `df`, matching
  # the `acc_col` column of `df` against the "Protein" column of the lineages.
  # Returns the merged table, or NULL (with a warning) if the lookup or the
  # merge fails. Invalid arguments raise an error immediately.

  # ---- Input validation ----
  if (!is.data.frame(df)) {
    stop("Input 'df' must be a data frame.")
  }

  if (!is.character(acc_col) || length(acc_col) != 1L) {
    stop("Input 'acc_col' must be a single column name.")
  }

  if (!acc_col %in% colnames(df)) {
    stop(paste("Column", acc_col, "not found in data frame."))
  }

  if (!is.character(assembly_path) || !is.character(lineagelookup_path)) {
    stop(
      "Both 'assembly_path' and 'lineagelookup_path' must be character strings."
    )
  }

  if (!file.exists(assembly_path)) {
    stop(paste("Assembly file not found at:", assembly_path))
  }

  if (!file.exists(lineagelookup_path)) {
    stop(paste("Lineage lookup file not found at:", lineagelookup_path))
  }

  # ---- Lineage lookup and merge ----
  tryCatch(
    {
      # `acc_col` must stay a plain string: it is used both to extract the
      # accessions and as the `by.x` key of merge().
      accessions <- df[[acc_col]]
      lins <- acc2lin(
        accessions, assembly_path, lineagelookup_path, ipgout_path, plan
      )
      if (is.null(lins)) {
        stop("acc2lin() returned no lineage data.")
      }

      # Drop columns that are not needed downstream; this makes the merge
      # much easier. Only columns that are actually present are removed.
      drop_cols <- intersect(
        c(
          "Strand", "Start", "Stop", "Nucleotide Accession", "Source",
          "Id", "Strain"
        ),
        colnames(lins)
      )
      if (length(drop_cols) > 0) {
        lins[, (drop_cols) := NULL]
      }
      lins <- unique(lins)

      merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE)
    },
    error = function(e) {
      # Best effort: report the failure and hand back NULL rather than a
      # value that could be mistaken for a merged table.
      warning("add_lins failed: ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
}


#' acc2lin
#'
#' @author Samuel Chen, Janani Ravi
#'
#' @description This function combines 'efetch_ipg()'
#' and 'ipg2lin()' to map a set
#' of protein accessions to their assembly (GCA_ID), tax ID, and lineage.
#'
#' @param accessions Character vector of protein accessions
#' @param assembly_path String of the path to the assembly_summary path
#' This file can be generated using the "DownloadAssemblySummary()" function
#' @param lineagelookup_path String of the path to the lineage lookup file
#' (taxid to lineage mapping). This file can be generated using the
#' @param ipgout_path Path to write the results
#' of the efetch run of the accessions
#' on the ipg database. If NULL, the file will not be written. Defaults to NULL
#' @param plan Future strategy forwarded to 'efetch_ipg()'.
#' Defaults to "sequential"
#'
Expand All @@ -87,27 +139,43 @@ add_lins <- function(df, acc_col = "AccNum", assembly_path,
#' \dontrun{
#' acc2lin()
#' }
acc2lin <- function(accessions, assembly_path,
                    lineagelookup_path, ipgout_path = NULL,
                    plan = "sequential") {
  # Fetches IPG records for `accessions` with efetch_ipg() and converts them
  # to lineages with ipg2lin(). Returns the lineage table, or NULL (with a
  # warning) if either step fails.

  if (is.null(ipgout_path)) {
    # No output path requested: write to a throw-away file and remove only
    # that file on exit. The session temp directory itself must be left
    # alone, since it is shared with everything else in the R session.
    ipgout_path <- tempfile("ipg", fileext = ".txt")
    on.exit(unlink(ipgout_path), add = TRUE)
  }

  tryCatch(
    {
      efetch_ipg(accessions, out_path = ipgout_path, plan = plan)
      ipg2lin(accessions, ipgout_path, assembly_path, lineagelookup_path)
    },
    error = function(e) {
      warning("acc2lin failed: ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
}


#' efetch_ipg
#'
#' @author Samuel Chen, Janani Ravi
#'
#' @description Perform efetch on the ipg database
#' and write the results to out_path
#'
#' @param accnums Character vector containing the accession numbers to query on
#' the ipg database
Expand All @@ -126,57 +194,84 @@ acc2lin <- function(accessions, assembly_path, lineagelookup_path, ipgout_path =
#' efetch_ipg()
#' }
efetch_ipg <- function(accnums, out_path, plan = "sequential") {
  # Queries the "ipg" database for `accnums` in batches and writes the
  # concatenated responses to `out_path`. Invalid arguments raise an error;
  # a failure while fetching or writing is reported as a warning.
  # Returns `out_path` invisibly on success and NULL invisibly on failure.

  # ---- Argument validation ----
  if (!is.character(accnums) || length(accnums) == 0) {
    stop("Error: 'accnums' must be a non-empty character vector.")
  }

  if (!is.character(out_path) || length(out_path) != 1L ||
    nchar(out_path) == 0) {
    stop("Error: 'out_path' must be a non-empty string.")
  }

  # A strategy may be given by name (the default is "sequential") or as a
  # strategy function.
  plan_is_name <- is.character(plan) && length(plan) == 1L
  if (!plan_is_name && !is.function(plan)) {
    stop("Error: 'plan' must be a future strategy name or function.")
  }

  # Partition data to limit number of queries per second for rentrez fetch:
  # limit of 10/second w/ key. Element j goes to group ((j - 1) %% groups) + 1.
  # TODO: this helper could be defined outside of efetch_ipg() as a
  # non-exported/internal function.
  partition <- function(in_data, groups) {
    unname(split(in_data, rep_len(seq_len(groups), length(in_data))))
  }

  old_plan <- NULL
  tryCatch(
    {
      # Set the future plan strategy, remembering the previous one so it can
      # be restored afterwards.
      old_plan <- future::plan(strategy = plan, .skip = TRUE)

      # At least 15 groups (but never more groups than accessions), with at
      # most about 200 accessions per group. ceiling() keeps the count whole.
      n_acc <- length(accnums)
      groups <- min(max(ceiling(n_acc / 200), 15), n_acc)
      partitioned_acc <- partition(accnums, groups)

      # Collect the responses and write them in one go instead of diverting
      # stdout with sink(): nothing is left diverted if a request fails.
      fetched <- future_map(seq_along(partitioned_acc), function(x) {
        # Avoid hitting the rate API limit
        if (x %% 9 == 0) {
          Sys.sleep(1)
        }
        entrez_fetch(
          id = partitioned_acc[[x]],
          db = "ipg",
          rettype = "xml",
          # NOTE(review): placeholder key; a real key should not be
          # hard-coded in a public package.
          api_key = "YOUR_KEY_HERE"
        )
      })
      cat(unlist(fetched), file = out_path, sep = "")
      invisible(out_path)
    },
    error = function(e) {
      warning("efetch_ipg failed: ", conditionMessage(e), call. = FALSE)
      invisible(NULL)
    },
    finally = {
      if (!is.null(old_plan)) {
        future::plan(old_plan)
      }
    }
  )
}



#' ipg2lin
#'
#' @author Samuel Chen, Janani Ravi
#'
#' @description Takes the resulting file
#' of an efetch run on the ipg database and
#' converts it to a table of lineages for the given accessions.
#'
#' @param accessions Character vector of protein accessions
#' @param ipg_file Filepath to the file
#' containing results of an efetch run on the
#' ipg database. The protein accession in
#' 'accessions' should be contained in this
#' file
#' @param assembly_path String of the path to the assembly_summary path
#' This file can be generated using the "DownloadAssemblySummary()" function
Expand All @@ -195,16 +290,54 @@ efetch_ipg <- function(accnums, out_path, plan = "sequential") {
#' }
#'
ipg2lin <- function(accessions, ipg_file, assembly_path, lineagelookup_path) {
  # Reads the tab-separated IPG file, keeps the rows whose Protein is in
  # `accessions`, and maps them to lineages with GCA2Lins(). Returns the
  # de-duplicated rows that have a lineage, or NULL (with a warning) if
  # reading or conversion fails. Invalid arguments raise an error.

  # ---- Argument validation ----
  if (!is.character(accessions) || length(accessions) == 0) {
    stop("Input 'accessions' must be a non-empty character vector.")
  }

  if (!is.character(ipg_file)) {
    stop("Input 'ipg_file' must be a character string.")
  }

  if (!is.character(assembly_path) || !is.character(lineagelookup_path)) {
    stop(
      "Both 'assembly_path' and 'lineagelookup_path' must be character strings."
    )
  }

  # ---- Ensure the input files exist ----
  if (!file.exists(assembly_path)) {
    stop(paste("Assembly file not found at:", assembly_path))
  }

  if (!file.exists(lineagelookup_path)) {
    stop(paste("Lineage lookup file not found at:", lineagelookup_path))
  }

  if (!file.exists(ipg_file)) {
    stop(paste("IPG file not found at:", ipg_file))
  }

  # tryCatch(), not try(): try() has no error/warning/finally arguments.
  tryCatch(
    {
      ipg_dt <- fread(ipg_file, sep = "\t", fill = TRUE)

      # Keep only the requested accessions
      ipg_dt <- ipg_dt[Protein %in% accessions]

      # Rename the 'Assembly' column to 'GCA_ID'
      ipg_dt <- setnames(ipg_dt, "Assembly", "GCA_ID")

      lins <- GCA2Lins(prot_data = ipg_dt, assembly_path, lineagelookup_path)

      # Drop rows without lineage information, then de-duplicate
      unique(lins[!is.na(Lineage)])
    },
    error = function(e) {
      warning("ipg2lin failed: ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
}


Expand Down
3 changes: 2 additions & 1 deletion man/acc2lin.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/efetch_ipg.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/ipg2lin.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/sink.reset.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 30d4bf3

Please sign in to comment.