-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataWranglingFuns.R
101 lines (76 loc) · 4.4 KB
/
dataWranglingFuns.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Extract Species Function ----
# Trevor Nishida
# Function to extract species level information from Bishop Museum HTML files
require(tidyverse)
# Function used to extract the species information from HTML files
# Input HTML file data after reading into R (xml2::read_html())
extractSpecies <- function(rawHTML) {
  # Extract species-level names from a parsed Bishop Museum HTML page.
  #
  # Args:
  #   rawHTML: Parsed HTML document (e.g. the result of xml2::read_html()).
  #
  # Returns:
  #   A tibble with one row per distinct extracted name and two columns:
  #   `species` (the matched name) and `listID` (always "bishop").

  # Harvest the text of every <p> found under the first "body > div" element
  textFromHTML <- rawHTML %>%
    rvest::html_element("body > div") %>%
    rvest::html_elements("p") %>%
    rvest::html_text2()

  # Strip carriage returns and surrounding whitespace, then split the text
  # into one entry per line. quiet = TRUE suppresses scan()'s "Read N items"
  # console message.
  cleanText <- trimws(gsub("\r", "", textFromHTML, fixed = TRUE))
  textLines <- scan(text = cleanText, what = "", sep = "\n", quiet = TRUE)

  # "Genus species" or "Genus (Subgenus) species" style names
  speciesPattern <-
    "(\\S\\w+\\s[[:lower:]]+(?!\\')|\\S\\w+\\s\\(\\w+\\)\\s[[:lower:]]+\\w)"

  # str_extract_all() returns one character vector per line; flatten them.
  # ptype = character() keeps the result a character vector even when
  # nothing matched, so lines without a match simply contribute nothing.
  speciesNames <- textLines %>%
    stringr::str_extract_all(speciesPattern) %>%
    purrr::list_c(ptype = character())

  # The pattern cannot capture the trailing period of "sp.", so restore it.
  # Anchor to the end of the string: an unanchored " sp" would also corrupt
  # epithets that merely start with "sp" (e.g. "spinosa" -> "sp.inosa").
  speciesNames <- stringr::str_replace_all(speciesNames, " sp$", " sp.")

  # Tag each name with its data origin and drop duplicates
  speciesDF <- dplyr::tibble(species = speciesNames, listID = "bishop") %>%
    dplyr::distinct()

  return(speciesDF)
}
# WoRMS Cross reference function ----
# Trevor Nishida
# Adapted from https://marinegeo.github.io/2018-04-24-working-with-worms/
# Script to cross-reference species names against the World Register of Marine Species (WoRMS) database
require(tidyverse)
require(worrms)
# Input a data frame containing a column of species names --> Most likely output from extractSpecies()
# Also takes ColumnsOfInterest --> Character Vector of column names from WoRMS output that is of interest
# Possible column names: AphiaID, url, scientificname, authority, status, unacceptreason, taxonRankID, rank
# valid_AphiaID, valid_name, valid_authority, parentNameUsageID, kingdom, phylum, class, order
# family, genus, citation, lsid, isMarine, isBrackish, isFreshwater, isTerrestrial, isExtinct,
# match_type, modified
# Optionally, fuzzy: A logical flag (TRUE/FALSE, default FALSE) that determines whether fuzzy matching is used for the search
wormsProcess <- function(dataframe, ColumnsOfInterest, fuzzy = FALSE) {
  # Cross-reference species names against the WoRMS database.
  #
  # Args:
  #   dataframe: Data frame with a `species` column. Column names are
  #     lower-cased before use, so "Species" is accepted as well.
  #   ColumnsOfInterest: Character vector of WoRMS column names to keep.
  #     "valid_name" is always needed to build the output, so it is appended
  #     when the caller did not list it.
  #   fuzzy: If TRUE, query with worrms::wm_records_taxamatch(); otherwise
  #     query with worrms::wm_records_names().
  #
  # Returns:
  #   The combined WoRMS records restricted to the requested columns, where
  #   `worms_name` holds the name WoRMS returned as valid_name and
  #   `valid_name` holds that name with its first parenthesised part
  #   (the "(Subgenus)" in "Genus (Subgenus) species") removed.

  # Batch size. Per the original author's notes, WoRMS drops results beyond
  # roughly 50 names per request, so stay well below that.
  n <- 25

  # Normalise the input and keep only usable species names. Rows whose
  # species is NA, "" or " " are dropped by the filter.
  speciesNames <- dataframe %>%
    dplyr::rename_with(tolower) %>%
    dplyr::mutate(species = as.character(species)) %>%
    dplyr::filter(species != "", species != " ") %>%
    dplyr::pull(species)

  # Fail with a clear message instead of a subscript error from an empty loop
  if (length(speciesNames) == 0L) {
    stop("wormsProcess(): no non-empty species names to look up.",
         call. = FALSE)
  }

  # Integer division on the 1-based index: the first batch holds n - 1 names,
  # every later batch at most n.
  batches <- unname(split(speciesNames, seq_along(speciesNames) %/% n))

  # Pick the WoRMS query once; exact name lookup unless fuzzy is TRUE
  if (isTRUE(as.logical(fuzzy))) {
    lookup <- worrms::wm_records_taxamatch
  } else {
    lookup <- worrms::wm_records_names
  }

  # Query each batch (names de-duplicated within the batch) and flatten the
  # per-name results of that batch into one data frame
  wormPH <- lapply(batches, function(batch) {
    dplyr::bind_rows(lookup(name = unique(batch)))
  })
  wormTogether <- dplyr::bind_rows(wormPH)

  # valid_name is renamed below, so make sure it survives the select()
  if (!"valid_name" %in% ColumnsOfInterest) {
    ColumnsOfInterest <- c(ColumnsOfInterest, "valid_name")
  }

  wormFinal <- wormTogether %>%
    dplyr::select(dplyr::all_of(ColumnsOfInterest)) %>%
    dplyr::rename(worms_name = "valid_name") %>%
    # Strip the subgenus from "Genus (Subgenus) species" style names
    dplyr::mutate(
      valid_name = stringr::str_replace(worms_name, " \\s*\\([^\\)]+\\)", "")
    )

  return(wormFinal)
}