forked from ropensci/rppo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
quantile_flag.R
186 lines (155 loc) · 6.27 KB
/
quantile_flag.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#' @title Test and flag outliers using quantiles
#'
#' @description Test and flag outliers using quantiles to set upper and lower
#' limits for the values of a trait for every species in a dataframe.
#'
#' @details
#' The quantile_flag function returns the input dataframe with outliers flagged.
#' For every species it computes a lower and an upper quantile of
#' "measurementValue" for the requested trait and compares each record of that
#' species and trait against those limits.
#' The column "measurementStatus" is created if it does not exist and is set to
#' 'too few records', 'possible juvenile' (value below the lower limit),
#' 'outlier' (value above the upper limit) or 'possible adult, possibly good'.
#' Records whose "measurementStatus" is already listed in \code{status}, and
#' records without a numeric "measurementValue", are neither used to compute
#' the limits nor re-flagged.
#' The columns "sample.size", "upperLimit", "lowerLimit" and "limitMethod" are
#' created if missing and filled for the processed records; a column "index"
#' holding the row names is also added.
#' The function is useful for non-normally and non-log-normally distributed
#' data.
#'
#' @param data (data.frame) a dataframe, assumed to be in FuTRES format. It must
#' contain the columns "scientificName", "measurementType" and
#' "measurementValue" (and "lifeStage" when \code{stage} is given).
#' @param trait (string) a single measurementType of interest, a list of traits
#' can be found using futres_traits()
#' @param taxa (string) one or more species (scientificName) within the dataset.
#' By default, the function goes through every unique species in the dataset.
#' @param stage (string) the life stage of the group (e.g., adult). When given,
#' only records of that life stage are used to compute the limits; all records
#' of the trait are still compared against the limits.
#' @param status (string) a vector of measurementStatus values to ignore when
#' making upper and lower limits.
#' The default is to ignore "too few records" and "outlier".
#' @param quant (numeric) the percentage specifying the quantiles used for the
#' lower and upper limits; must be greater than 0 and at most 50.
#' By default, the function uses 5, i.e. the 0.05 and 0.95 quantiles.
#' @param sample.min (integer) a limit for the number of samples required to
#' test for outliers. Defaults to a minimum of 3 individuals.
#' @export
#' @keywords outlier quantile
#' @return Return value containing a dataframe.
#'
#' @examples
#' wildcat.store <- futres_data(scientificName = "Puma concolor")
#' wildcat <- wildcat.store$data
#'
#' wildcat.quant <- quantile_flag(data = wildcat, trait = "body mass")
#'
#' print(wildcat.quant)
# Quantile flag function
quantile_flag <- function(
  data = NULL,
  trait = NULL,
  taxa = NULL,
  stage = NULL,
  status = NULL,
  quant = NULL,
  sample.min = NULL
) {
  # ---- argument validation ----
  if (is.null(data)) {
    stop("The argument 'data' is missing, please enter a dataframe")
  }
  if (!is.data.frame(data)) {
    stop("Data is not a dataframe")
  }
  if (is.null(trait)) {
    stop("The argument 'trait' is missing, please enter a trait value")
  }
  if (length(trait) != 1L || is.na(trait)) {
    stop("The argument 'trait' must be a single measurementType value")
  }

  required <- c("scientificName", "measurementType", "measurementValue")
  if (!is.null(stage)) {
    required <- c(required, "lifeStage")
  }
  absent <- setdiff(required, colnames(data))
  if (length(absent) > 0) {
    stop("Data is missing required column(s): ",
         paste(absent, collapse = ", "))
  }

  # ---- defaults ----
  n.limit <- if (is.null(sample.min)) 3 else sample.min
  if (is.null(status)) {
    status <- c("too few records", "outlier")
  }
  if (is.null(quant)) {
    quant <- 5
  }
  if (!is.numeric(quant) || length(quant) != 1L || is.na(quant) ||
      quant <= 0 || quant > 50) {
    stop("The argument 'quant' must be a single percentage ",
         "greater than 0 and at most 50")
  }
  # 'quant' is a percentage: 5 means the 0.05 and 0.95 quantiles. Computing
  # both probabilities directly keeps the limits symmetric even when
  # 100 / quant is not a whole number.
  probs <- c(quant, 100 - quant) / 100

  # ---- column set-up ----
  # Add a column only when it is absent, so that statuses and limits from an
  # earlier run are kept (and 'status' can actually exclude flagged records).
  ensure_column <- function(df, name, fill) {
    if (!(name %in% colnames(df))) {
      df[[name]] <- rep(fill, nrow(df))
    }
    df
  }
  # Factors must go through character first, otherwise as.numeric() would
  # return the internal level codes instead of the recorded values.
  as_number <- function(x) {
    if (is.factor(x)) {
      x <- as.character(x)
    }
    as.numeric(x)
  }

  data <- ensure_column(data, "sample.size", NA_real_)
  data <- ensure_column(data, "measurementStatus", "")
  # kept so the returned dataframe has the same columns as before
  data[["index"]] <- rownames(data)
  data <- ensure_column(data, "upperLimit", NA_real_)
  data <- ensure_column(data, "lowerLimit", NA_real_)
  data <- ensure_column(data, "limitMethod", "")

  data[["measurementValue"]] <- as_number(data[["measurementValue"]])
  data[["sample.size"]] <- as_number(data[["sample.size"]])
  data[["upperLimit"]] <- as_number(data[["upperLimit"]])
  data[["lowerLimit"]] <- as_number(data[["lowerLimit"]])
  data[["measurementStatus"]] <- as.character(data[["measurementStatus"]])
  data[["limitMethod"]] <- as.character(data[["limitMethod"]])

  # ---- row masks ----
  sci_name <- as.character(data[["scientificName"]])
  is_trait <- as.character(data[["measurementType"]]) %in% trait
  has_value <- !is.na(data[["measurementValue"]])

  # species to go through; by default every species in the dataset
  sp <- if (is.null(taxa)) unique(sci_name) else as.character(taxa)
  sp <- sp[!is.na(sp)]
  in_scope <- sci_name %in% sp & is_trait

  # records allowed to define the limits: right trait, not already flagged
  # with an ignored status, has a value, and (optionally) the right life stage
  usable <- is_trait & has_value &
    !(data[["measurementStatus"]] %in% status)
  if (!is.null(stage)) {
    usable <- usable & (as.character(data[["lifeStage"]]) %in% stage)
  }

  # ---- per-species sample size and quantile limits ----
  for (taxon in sp) {
    group <- which(sci_name == taxon & is_trait)
    values <- data[["measurementValue"]][which(sci_name == taxon & usable)]
    # with no usable values quantile() yields NA limits
    limits <- stats::quantile(values, probs = probs, na.rm = TRUE,
                              names = FALSE)

    data[["sample.size"]][group] <- length(values)
    data[["lowerLimit"]][group] <- limits[1]
    data[["upperLimit"]][group] <- limits[2]
    data[["limitMethod"]][group] <- "quantile"
  }

  # ---- flag groups that are too small to test ----
  too_few <- which(in_scope & data[["sample.size"]] < n.limit)
  data[["measurementStatus"]][too_few] <- "too few records"

  # ---- flag individual records against their species' limits ----
  # Life stage is deliberately not applied here: limits derived from one life
  # stage are used to screen every record of the trait. Records without a
  # value are left untouched rather than being labelled as good.
  flaggable <- which(in_scope & has_value &
                       !(data[["measurementStatus"]] %in% status))
  value <- data[["measurementValue"]][flaggable]
  lower <- data[["lowerLimit"]][flaggable]
  upper <- data[["upperLimit"]][flaggable]

  flags <- rep("possible adult, possibly good", length(flaggable))
  flags[!is.na(upper) & value > upper] <- "outlier"
  flags[!is.na(lower) & value < lower] <- "possible juvenile"
  data[["measurementStatus"]][flaggable] <- flags

  data
}