Skip to content

Commit

Permalink
refactor functions in R/cleanup.R
Browse files Browse the repository at this point in the history
  • Loading branch information
teddyCodex committed Oct 6, 2024
1 parent 94369a2 commit 61680ae
Showing 1 changed file with 71 additions and 71 deletions.
142 changes: 71 additions & 71 deletions R/cleanup.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@
#' @return [string] string with only alphanumerics, "_", "+", and "."
#' @examples
#' \dontrun{
#' clean_string()
#' cleanString()
#' }
#'
clean_string <- function(string) {
cleanString <- function(string) {
# replace spaces with "_"
string <- stringr::str_replace_all(string, "\\s+", "_")
# keep only alphanumeric characters, "_", and "."
Expand All @@ -44,7 +44,7 @@ clean_string <- function(string) {

# use the same code as upstream_scripts/00_submit_full.R's
# get_sequences() function to extract accession numbers
#' string2accnum
#' extractAccNum
#'
#' @param string
#'
Expand All @@ -53,9 +53,9 @@ clean_string <- function(string) {
#'
#' @examples
#' \dontrun{
#' string2accnum()
#' extractAccNum()
#' }
string2accnum <- function(string) {
extractAccNum <- function(string) {
if (grepl("\\|", string)) {
accnum <- strsplit(string, "\\|")[[1]][2]
accnum <- strsplit(accnum, " ")[[1]][1]
Expand All @@ -81,9 +81,9 @@ string2accnum <- function(string) {
#' @examples
#' \dontrun{
#' c("xxx", "xxx", "xxx", "yyy", "yyy") |>
#' make_accnums_unique()
#' ensureUniqAccNum()
#' }
make_accnums_unique <- function(accnums) {
ensureUniqAccNum <- function(accnums) {
# group by accnums then use the row count as a proxy
# for the index of occurrence for each accession number
df_accnums <- tibble::tibble("accnum" = accnums)
Expand Down Expand Up @@ -113,22 +113,22 @@ make_accnums_unique <- function(accnums) {
#' @examples
#' \dontrun{
#' AAStringSet(c("xxx" = "ATCG", "xxx" = "GGGC")) |>
#' cleanup_fasta_header()
#' cleanFAHeaders()
#' }
cleanup_fasta_header <- function(fasta) {
cleanFAHeaders <- function(fasta) {
headers <- names(fasta)
# try parsing accession numbers from header
headers <- purrr::map_chr(
headers,
string2accnum
extractAccNum
)
# sanitize string for pathing (file read/write-ing)
headers <- purrr::map_chr(
headers,
fs::path_sanitize
)
# append an index suffix for the ith occurrence of each accnum
headers <- make_accnums_unique(headers)
headers <- ensureUniqAccNum(headers)
names(fasta) <- headers
return(fasta)
}
Expand All @@ -153,9 +153,9 @@ cleanup_fasta_header <- function(fasta) {
#'
#' @examples
#' \dontrun{
#' remove_empty(prot, "DomArch")
#' removeEmptyRows(prot, "DomArch")
#' }
remove_empty <- function(prot, by_column = "DomArch") {
removeEmptyRows <- function(prot, by_column = "DomArch") {
# ?? Don't call other psp functions within these functions
prot <- prot %>%
as_tibble() %>%
Expand All @@ -168,7 +168,7 @@ remove_empty <- function(prot, by_column = "DomArch") {
}

###########################
#' repeat2s
#' condenseRepeatedDomains
#'
#' @description
#' Condense repeated domains
Expand All @@ -181,7 +181,7 @@ remove_empty <- function(prot, by_column = "DomArch") {
#'
#' @param prot A data frame containing 'DomArch', 'GenContext', 'ClustName' columns.
#' @param by_column Column in which repeats are condensed to domain+domain -> domain(s).
#' @param excluded_prots Vector of strings that repeat2s should not reduce to (s). Defaults to c()
#' @param excluded_prots Vector of strings that condenseRepeatedDomains should not reduce to (s). Defaults to c()
#'
#' @return Describe return, in detail
#' @export
Expand All @@ -191,10 +191,10 @@ remove_empty <- function(prot, by_column = "DomArch") {
#'
#' @examples
#' \dontrun{
#' repeat2s(prot, "DomArch")
#' condenseRepeatedDomains(prot, "DomArch")
#' }
repeat2s <- function(prot, by_column = "DomArch", excluded_prots = c()) {
# If there are strings that repeat2s should not affect, the pattern to search
condenseRepeatedDomains <- function(prot, by_column = "DomArch", excluded_prots = c()) {
# If there are strings that condenseRepeatedDomains should not affect, the pattern to search
# for must be changed to exclude a search for those desired strings

collapsed_prots <- paste0(excluded_prots, collapse = "\\s|")
Expand Down Expand Up @@ -253,10 +253,10 @@ repeat2s <- function(prot, by_column = "DomArch", excluded_prots = c()) {
#'
#' @examples
#' \dontrun{
#' replaceQMs()
#' replaceQuestionMarks()
#' }
#'
replaceQMs <- function(prot, by_column = "GenContext") {
replaceQuestionMarks <- function(prot, by_column = "GenContext") {
by <- sym(by_column)

# Regex for finding repeated `?`
Expand Down Expand Up @@ -290,9 +290,9 @@ replaceQMs <- function(prot, by_column = "GenContext") {
#'
#' @examples
#' \dontrun{
#' remove_astrk()
#' removeAsterisks()
#' }
remove_astrk <- function(query_data, colname = "GenContext") {
removeAsterisks <- function(query_data, colname = "GenContext") {
query_data[, colname] <- map(query_data[, colname], function(x) str_remove_all(x, pattern = "\\*"))

return(query_data)
Expand Down Expand Up @@ -323,9 +323,9 @@ remove_astrk <- function(query_data, colname = "GenContext") {
#'
#' @examples
#' \dontrun{
#' remove_tails(prot, "DomArch")
#' removeTails(prot, "DomArch")
#' }
remove_tails <- function(prot, by_column = "DomArch",
removeTails <- function(prot, by_column = "DomArch",
keep_domains = FALSE) { # !! currently redundant

by_column <- sym(by_column)
Expand Down Expand Up @@ -369,7 +369,7 @@ remove_tails <- function(prot, by_column = "DomArch",
#' A cleaned up version of the data table is returned.
#'
#' @param prot A data frame that contains a 'Species' column.
#' @param remove_empty Boolean. If TRUE, rows with empty/unnecessary values in 'Species' are removed.
#' @param removeEmptyRows Boolean. If TRUE, rows with empty/unnecessary values in 'Species' are removed.
#' Default is FALSE.
#'
#' @importFrom stringr coll str_replace_all
Expand All @@ -379,9 +379,9 @@ remove_tails <- function(prot, by_column = "DomArch",
#'
#' @examples
#' \dontrun{
#' cleanup_species(prot, TRUE)
#' cleanSpecies(prot, TRUE)
#' }
cleanup_species <- function(prot, remove_empty = FALSE) {
cleanSpecies <- function(prot, removeEmptyRows = FALSE) {
# FUNCTIONS CALLED HERE, if else might be better since only two options, T and F

# Create cleaned up Species column
Expand All @@ -404,8 +404,8 @@ cleanup_species <- function(prot, remove_empty = FALSE) {
str_replace_all(coll(" ", TRUE), " ")

# !! CHECK !! Species vs Species_old
if (remove_empty) {
prot <- remove_empty(prot = prot, by_column = "Species")
if (removeEmptyRows) {
prot <- removeEmptyRows(prot = prot, by_column = "Species")
}

return(prot)
Expand All @@ -425,9 +425,9 @@ cleanup_species <- function(prot, remove_empty = FALSE) {
#' @param prot A data frame that must contain columns Query and ClustName.
#' @param domains_rename A data frame containing the domain names to be replaced in a column 'old' and the corresponding replacement values in a column 'new'.
#' @param domains_keep A data frame containing the domain names to be retained.
#' @param repeat2s Boolean. If TRUE, repeated domains in 'ClustName' are condensed. Default is TRUE.
#' @param remove_tails Boolean. If TRUE, 'ClustName' will be filtered based on domains to keep/remove. Default is FALSE.
#' @param remove_empty Boolean. If TRUE, rows with empty/unnecessary values in 'ClustName' are removed. Default is FALSE.
#' @param condenseRepeatedDomains Boolean. If TRUE, repeated domains in 'ClustName' are condensed. Default is TRUE.
#' @param removeTails Boolean. If TRUE, 'ClustName' will be filtered based on domains to keep/remove. Default is FALSE.
#' @param removeEmptyRows Boolean. If TRUE, rows with empty/unnecessary values in 'ClustName' are removed. Default is FALSE.
#'
#' @importFrom dplyr filter
#' @importFrom stringr coll str_replace_all
Expand All @@ -437,12 +437,12 @@ cleanup_species <- function(prot, remove_empty = FALSE) {
#'
#' @examples
#' \dontrun{
#' cleanup_clust(prot, TRUE, FALSE, domains_keep, domains_rename)
#' cleanClusters(prot, TRUE, FALSE, domains_keep, domains_rename)
#' }
cleanup_clust <- function(prot,
cleanClusters <- function(prot,
domains_rename, domains_keep,
repeat2s = TRUE, remove_tails = FALSE,
remove_empty = FALSE) {
condenseRepeatedDomains = TRUE, removeTails = FALSE,
removeEmptyRows = FALSE) {
# Create cleaned up ClustName column
prot$ClustName <- prot$ClustName.orig

Expand All @@ -469,19 +469,19 @@ cleanup_clust <- function(prot,

## Optional parameters
# Condense repeats
if (repeat2s) {
prot <- repeat2s(prot, by_column = "ClustName")
if (condenseRepeatedDomains) {
prot <- condenseRepeatedDomains(prot, by_column = "ClustName")
}
# Remove singletons
# if(remove_tails){
# if(removeTails){
# prot <- prot %>% filter(!grepl(".1$", ClustID))
# }
if (remove_tails) {
prot <- remove_tails(prot, by_column = "ClustName")
if (removeTails) {
prot <- removeTails(prot, by_column = "ClustName")
}
# Remove empty rows
if (remove_empty) {
prot <- remove_empty(prot = prot, by_column = "ClustName")
if (removeEmptyRows) {
prot <- removeEmptyRows(prot = prot, by_column = "ClustName")
}


Expand Down Expand Up @@ -509,9 +509,9 @@ cleanup_clust <- function(prot,
#' @param domains_keep A data frame containing the domain names to be retained.
#' @param domains_rename A data frame containing the domain names to be replaced in a column 'old' and the
#' corresponding replacement values in a column 'new'.
#' @param repeat2s Boolean. If TRUE, repeated domains in 'DomArch' are condensed. Default is TRUE.
#' @param remove_tails Boolean. If TRUE, 'ClustName' will be filtered based on domains to keep/remove. Default is FALSE.
#' @param remove_empty Boolean. If TRUE, rows with empty/unnecessary values in 'DomArch' are removed. Default is FALSE.
#' @param condenseRepeatedDomains Boolean. If TRUE, repeated domains in 'DomArch' are condensed. Default is TRUE.
#' @param removeTails Boolean. If TRUE, 'ClustName' will be filtered based on domains to keep/remove. Default is FALSE.
#' @param removeEmptyRows Boolean. If TRUE, rows with empty/unnecessary values in 'DomArch' are removed. Default is FALSE.
#' @param domains_ignore A data frame containing the domain names to be removed in a column called 'domains'
#'
#' @importFrom dplyr pull
Expand All @@ -522,12 +522,12 @@ cleanup_clust <- function(prot,
#'
#' @examples
#' \dontrun{
#' cleanup_domarch(prot, TRUE, FALSE, domains_keep, domains_rename, domains_ignore = NULL)
#' cleanDomainArchitecture(prot, TRUE, FALSE, domains_keep, domains_rename, domains_ignore = NULL)
#' }
cleanup_domarch <- function(prot, old = "DomArch.orig", new = "DomArch",
cleanDomainArchitecture <- function(prot, old = "DomArch.orig", new = "DomArch",
domains_keep, domains_rename,
repeat2s = TRUE, remove_tails = FALSE,
remove_empty = F,
condenseRepeatedDomains = TRUE, removeTails = FALSE,
removeEmptyRows = F,
domains_ignore = NULL) {
old_sym <- sym(old)
new_sym <- sym(new)
Expand Down Expand Up @@ -577,22 +577,22 @@ cleanup_domarch <- function(prot, old = "DomArch.orig", new = "DomArch",

## Optional parameters
# Remove singletons
if (remove_tails) {
prot <- remove_tails(prot = prot, by_column = new)
if (removeTails) {
prot <- removeTails(prot = prot, by_column = new)
}
# Condense repeats
if (repeat2s) {
if (condenseRepeatedDomains) {
## Error in UseMethod("tbl_vars") : no applicable method for 'tbl_vars' applied to an object of class "character"
prot <- repeat2s(prot = prot, by_column = new)
prot <- condenseRepeatedDomains(prot = prot, by_column = new)
}
# Remove empty rows
# ! FUNCTIONS CALLED HERE, if else might be better since only two options, T and F
# ! Make a separate function out of this?
if (remove_empty) {
prot <- remove_empty(prot = prot, by_column = new)
if (removeEmptyRows) {
prot <- removeEmptyRows(prot = prot, by_column = new)
}

prot <- replaceQMs(prot, new)
prot <- replaceQuestionMarks(prot, new)

return(prot)
}
Expand All @@ -610,7 +610,7 @@ cleanup_domarch <- function(prot, old = "DomArch.orig", new = "DomArch",
#' @param prot A data frame that contains columns 'GenContext.orig'
#' @param domains_rename A data frame containing the domain names to be replaced in a column 'old' and the replacement in a column 'new'.
#' Defaults to an empty data frame with 'old' and 'new' columns, such that none of the domains will be renamed.
#' @param repeat2s Boolean. If TRUE, repeated domains in 'GenContext' are condensed. Default is TRUE.
#' @param condenseRepeatedDomains Boolean. If TRUE, repeated domains in 'GenContext' are condensed. Default is TRUE.
#' @param remove_asterisk Boolean. If TRUE, asterisks in 'GenContext' are removed. Default is TRUE.
#'
#' @importFrom stringr str_replace_all
Expand All @@ -620,11 +620,11 @@ cleanup_domarch <- function(prot, old = "DomArch.orig", new = "DomArch",
#'
#' @examples
#' \dontrun{
#' cleanup_gencontext(prot, domains_rename, T, F)
#' cleanGenomicContext(prot, domains_rename, T, F)
#' }
#'
cleanup_gencontext <- function(prot, domains_rename = data.frame("old" = character(0), "new" = character(0), stringsAsFactors = F),
repeat2s = TRUE, remove_asterisk = TRUE) {
cleanGenomicContext <- function(prot, domains_rename = data.frame("old" = character(0), "new" = character(0), stringsAsFactors = F),
condenseRepeatedDomains = TRUE, remove_asterisk = TRUE) {
# Create cleaned up GenContext column
prot$GenContext <- prot$GenContext.orig

Expand All @@ -641,16 +641,16 @@ cleanup_gencontext <- function(prot, domains_rename = data.frame("old" = charact
## Reverse operons | Straighten them out!
prot <- reverse_operon(prot)

prot <- replaceQMs(prot, "GenContext")
prot <- replaceQuestionMarks(prot, "GenContext")
## Optional parameters
# Condense repeats
if (repeat2s) {
prot <- repeat2s(prot, "GenContext")
if (condenseRepeatedDomains) {
prot <- condenseRepeatedDomains(prot, "GenContext")
}

# Remove the Asterisks
if (remove_asterisk) {
prot <- remove_astrk(prot, colname = "GenContext")
prot <- removeAsterisks(prot, colname = "GenContext")
}

return(prot)
Expand All @@ -666,9 +666,9 @@ cleanup_gencontext <- function(prot, domains_rename = data.frame("old" = charact
#'
#' @examples
#' \dontrun{
#' cleanup_GeneDesc()
#' cleanGeneDescription()
#' }
cleanup_GeneDesc <- function(prot, column) {
cleanGeneDescription <- function(prot, column) {
prot[, "GeneDesc"] <- gsub("\\.$", "", prot %>% pull(column))
prot[, "GeneDesc"] <- gsub("%2C", ",", prot %>% pull(column))
return(prot)
Expand All @@ -688,9 +688,9 @@ cleanup_GeneDesc <- function(prot, column) {
#'
#' @examples
#' \dontrun{
#' pick_longer_duplicate()
#' selectLongestDuplicate()
#' }
pick_longer_duplicate <- function(prot, column) {
selectLongestDuplicate <- function(prot, column) {
col <- sym(column)

prot$row.orig <- 1:nrow(prot)
Expand Down Expand Up @@ -736,9 +736,9 @@ pick_longer_duplicate <- function(prot, column) {
#'
#' @examples
#' \dontrun{
#' cleanup_lineage()
#' cleanLineage()
#' }
cleanup_lineage <- function(prot, lins_rename) {
cleanLineage <- function(prot, lins_rename) {
for (i in 1:nrow(lins_rename)) {
prot$Lineage <- gsub(lins_rename$old[i], lins_rename$new[i],
x = prot$Lineage,
Expand Down

0 comments on commit 61680ae

Please sign in to comment.