-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassdl2classbl.r
108 lines (96 loc) · 6.38 KB
/
classdl2classbl.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# this script will transform a classification download from Arctos to a classification bulkload for Arctos
# load libraries
library(readxl)
library(plyr)
# "not in" operator: the element-wise negation of %in%.
# x %!in% y is TRUE for every element of x that has no match in y.
`%!in%` <- function(x, y) {
  # match() yields NA exactly where %in% would yield FALSE
  is.na(match(x, y))
}
# right(): return the part of each string that follows the first match of
# `char` (`char` is passed on as a regular-expression pattern).
# When there is no match the whole string is returned unchanged.
right <- function(string, char) {
  # position of the first match in each string; -1 when there is none
  first_hit <- as.integer(regexpr(pattern = char, text = string))
  # start one character past the match; a start of 0 keeps the full string
  substr(string, first_hit + 1L, nchar(string))
}
# vlookup(): spreadsheet-style lookup.
#   x = vector of values to return
#   y = value(s) to look up
#   z = vector that is searched for y
# x and z are expected to be columns of the same dataframe so that their
# positions line up. Each element of y is located in z (first match wins) and
# the element of x at that position is returned; unmatched lookups give NA.
vlookup <- function(x, y, z) {
  hit <- match(y, z)
  x[hit]
}
# ---- Configuration ----
# Location of the Arctos classification download (Excel) that is read, and of
# the bulkload CSV that is written when both sanity checks pass.
input_path <- "~/GitHub/arctos-r/input/temp_arctos_tracefossil.xlsx"
output_path <- "~/GitHub/arctos-r/output/class_load.csv"

# ---- Read and prepare the download ----
classification <- read_excel(input_path) # read in classification file
# unique identifier for each name and classification ID combination
classification$unique_id <- paste(
  classification$SCIENTIFIC_NAME, classification$CLASSIFICATION_ID,
  sep = " "
)
# non-classification term 1 is always the source authority (source plus date)
classification$noclass_term_type_1 <- "source_authority"
classification$noclass_term_1 <- paste(
  classification$SOURCE, classification$LASTDATE,
  sep = " "
)

# df is the output table: one row per unique_id
df <- classification[which(!duplicated(classification$unique_id)), ]
df$source <- "Arctos" # source to which classifications will be loaded
df <- df[c(
  "unique_id", "SCIENTIFIC_NAME", "source",
  "noclass_term_type_1", "noclass_term_1"
)]
colnames(df) <- tolower(colnames(df)) # lower case column names

# Columns that are present before any term is spread out; the sanity checks
# exclude them when counting filled cells.
fixed_cols <- c(
  "unique_id", "scientific_name", "source",
  "noclass_term_type_1", "noclass_term_1"
)

# Split the download into classification terms (those with a position) and
# non-classification terms (those without one).
has_position <- !is.na(classification$POSITION_IN_CLASSIFICATION)
justclass <- classification[which(has_position), ]
noclass <- classification[which(!has_position), ]

if (nrow(justclass) + nrow(noclass) == nrow(classification)) {
  # display_name and scientific_name are not loaded as terms
  noclass <- noclass[which(
    noclass$TERM_TYPE != "display_name" & noclass$TERM_TYPE != "scientific_name"
  ), ]
  check <- nrow(noclass) # number of non-classification terms to be placed
  df$source <- "Arctos Ichnology" # add source name
  # scientific name for every unique_id (one vectorised lookup, no row loop)
  df$scientific_name <- vlookup(
    classification$SCIENTIFIC_NAME, df$unique_id, classification$unique_id
  )

  # ---- Non-classification terms ----
  # Every pass gives each term type its own pair of columns and places the
  # first term of that type for each unique_id. Terms that did not fit (the
  # same type repeated within one unique_id) are handled by the next pass in a
  # fresh set of columns.
  used_ids <- c() # TAXON_TERM_IDs that have already been placed
  k <- 1 # column offset; column 1 is taken by source_authority
  while (nrow(noclass) > 0) {
    term_types <- unique(noclass$TERM_TYPE)
    for (j in seq_along(term_types)) {
      rows_j <- noclass[which(noclass$TERM_TYPE == term_types[[j]]), ]
      first_j <- rows_j[which(!duplicated(rows_j$unique_id)), ]
      used_ids <- c(used_ids, first_j$TAXON_TERM_ID)
      df[[paste0("noclass_term_type_", j + k)]] <-
        vlookup(first_j$TERM_TYPE, df$unique_id, first_j$unique_id)
      df[[paste0("noclass_term_", j + k)]] <-
        vlookup(first_j$TERM, df$unique_id, first_j$unique_id)
    }
    # keep only the terms that haven't been added yet
    remaining <- noclass[which(noclass$TAXON_TERM_ID %!in% used_ids), ]
    if (nrow(remaining) == nrow(noclass)) {
      # without this guard a pass that places nothing would loop forever
      stop(
        "no non-classification term could be placed; ",
        "check TERM_TYPE and TAXON_TERM_ID",
        call. = FALSE
      )
    }
    noclass <- remaining
    k <- k + length(term_types)
  }

  # sanity check: every placed term fills two cells (type and term)
  test <- (sum(!is.na(df)) - sum(!is.na(df[fixed_cols]))) / 2
  if (check == test) {
    # ---- Classification terms ----
    check2 <- nrow(justclass) # not used further in this script
    used_ids <- c() # TAXON_TERM_IDs that have already been placed
    k <- 0 # column offset for the current pass
    while (nrow(justclass) > 0) {
      # Work from the positions that actually occur. Columns are numbered
      # position + k, so positions without any term no longer produce all-NA
      # placeholder columns.
      pos_all <- as.numeric(justclass$POSITION_IN_CLASSIFICATION)
      positions <- sort(unique(pos_all))
      for (pos in positions) {
        rows_p <- justclass[which(pos_all == pos), ]
        first_p <- rows_p[which(!duplicated(rows_p$unique_id)), ]
        used_ids <- c(used_ids, first_p$TAXON_TERM_ID)
        df[[paste0("class_term_type_", pos + k)]] <-
          vlookup(first_p$TERM_TYPE, df$unique_id, first_p$unique_id)
        df[[paste0("class_term_", pos + k)]] <-
          vlookup(first_p$TERM, df$unique_id, first_p$unique_id)
      }
      # keep only the duplicate ranks that haven't been added yet
      remaining <- justclass[which(justclass$TAXON_TERM_ID %!in% used_ids), ]
      if (nrow(remaining) == nrow(justclass)) {
        # without this guard a pass that places nothing would loop forever
        stop(
          "no classification term could be placed; ",
          "check POSITION_IN_CLASSIFICATION and TAXON_TERM_ID",
          call. = FALSE
        )
      }
      justclass <- remaining
      # Shift by the span of positions used in this pass. The positions left
      # over are a subset of those just handled, so the next pass always lands
      # in columns after the ones written here. For contiguous positions this
      # equals the number of distinct positions.
      k <- k + (max(positions) - min(positions) + 1)
    }

    # sanity check: filled cells in df versus terms and term types in the
    # download (scientific_name and display_name rows are not loaded)
    test <- sum(!is.na(df)) - sum(!is.na(df[fixed_cols]))
    test2 <- sum(!is.na(classification$TERM)) +
      sum(!is.na(classification$TERM_TYPE)) -
      2 * sum(classification$TERM_TYPE == "scientific_name", na.rm = TRUE) -
      2 * sum(classification$TERM_TYPE == "display_name", na.rm = TRUE)
    if (test2 == test) {
      write.csv(df, output_path, row.names = FALSE)
      print("no data was lost")
    } else {
      print("data was lost in classification terms")
    }
  } else {
    print("data was lost in non-classification terms")
  }
}