-
Notifications
You must be signed in to change notification settings - Fork 3
/
07-vis-nir_modeling.Rmd
322 lines (281 loc) · 13.2 KB
/
07-vis-nir_modeling.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
# Vis–NIR modelling and predictions
------------------------------------------------------------------------
A prediction model based on vis–NIR spectra was fitted for each soil property using the selected optimal calibration subset. Each model was developed using a memory-based learning (MBL) algorithm (from the `resemble` package). Please check the paper for additional details on the MBL algorithm used (see section _Vis–NIR modelling and predictions_, subsections 1-3).
## Calibration and validation
First, define some basic aspects of the MBL which will be common to all the properties for which NIR modeling is going to be performed.
```{r eval = FALSE}
## Sampling-point label of every calibration sample: the ID without its
## first character. mbl uses this grouping variable internally during
## its internal validations.
train$samplegroup <- substr(train$ID, start = 2, stop = 100)

## Dissimilarity metric: partial least squares (pls)
dmetric <- "pls"

## Candidate threshold distances for neighbor selection.
## Note: this grid is fairly dense; it could be made coarser.
diss2test <- seq(from = 0.3, to = 1.5, by = 0.05)

## Minimum and maximum number of neighbors retained for each local model.
## Example: if a given threshold distance selects only 70 neighbors, the
## function is forced to use the specified minimum instead (here 120).
## The maximum is set so that all the samples in the calibration set
## can be included.
kminmax <- c(120, nrow(train$spc))

## Regression method: weighted average pls (wapls)
rmethod <- "wapls1"

## Minimum and maximum number of pls factors for the local wapls
## regressions
my_wapls <- local_fit_wapls(max_pls_c = 20, min_pls_c = 5)

## Additional parameters controlling some aspects of the mbl
ctrl <- mbl_control(
  validation_type = "NNv",
  return_dissimilarity = TRUE
)
```
We also create some `data.frames` where we are going to store the prediction results for the validation set:
```{r eval = FALSE}
## Table of validation statistics, one row per property:
##   RMSE - root mean squared error
##   R2   - squared correlation coefficient
##   ME   - mean error
## The Clay, Silt and Sand rows are for the particle-size predictions
## back-transformed to the original space.
valrestuls <- data.frame(
  Property = c("Ca", "alr_clay", "alr_silt", "Clay", "Silt", "Sand"),
  RMSE = NA,
  R2 = NA,
  ME = NA
)

## Table of residual variances per layer (A, B, and A and B together);
## it will be used later for the geostatistical analyses.
residualvariances <- data.frame(
  Property = c("Ca", "alr_Clay", "alr_Silt"),
  layerA = NA,
  layerB = NA,
  layersAB = NA
)
```
Now we perform MBL and predictions for soil exchangeable Ca^2+^:
```{r eval = FALSE}
## Run the MBL for Ca over all the candidate threshold distances and
## predict the validation set.
##  - MBL fits a wapls model to each sample in Xu using the calibration
##    samples {Yr, Xr}
##  - The internal validation predictions are predictions of the nearest
##    samples of each Xu found in Xr: nearest neighbor validation (NNv)
sbl_ca <- mbl(
  Xr = train$spc,
  Yr = train$Ca,
  Xu = valida$spc,
  k_diss = diss2test,
  k_range = kminmax,
  method = my_wapls,
  diss_method = dmetric,
  diss_usage = "none",
  control = ctrl,
  group = train$samplegroup
)
sbl_ca

## Best threshold distance: the one with the lowest st_rmse in the
## nearest neighbor validation results
nnv_ca <- sbl_ca$validation_results$nearest_neighbor_validation
idx.best.ca <- which.min(nnv_ca$st_rmse)
best.kdiss.ca <- nnv_ca$k_diss[idx.best.ca]

## Predicted values for the validation set at the best threshold distance
pred_valCa <- unlist(get_predictions(sbl_ca)[, ..idx.best.ca])

## Residuals (observed - predicted) and the Ca row of the results table
res_ca <- valida$Ca - pred_valCa
row_ca <- valrestuls$Property == "Ca"

## R2
valrestuls$R2[row_ca] <- cor(valida$Ca, pred_valCa)^2
## RMSE
valrestuls$RMSE[row_ca] <- mean(res_ca^2)^0.5
## ME
valrestuls$ME[row_ca] <- mean(res_ca)

## Residual variances
rv_ca <- residualvariances$Property == "Ca"
## layer A
residualvariances$layerA[rv_ca] <- var(res_ca[valida$layer == "A"])
## layer B
residualvariances$layerB[rv_ca] <- var(res_ca[valida$layer == "B"])
## layers A and B
residualvariances$layersAB[rv_ca] <- var(res_ca)
```
MBL and predictions for $alr(clay)$...
```{r eval = FALSE}
## Run the MBL for alr(clay) over all the candidate threshold distances
## and predict the validation set
sbl_alrclay <- mbl(
  Xr = train$spc,
  Yr = train$alr_Clay,
  Xu = valida$spc,
  k_diss = diss2test,
  k_range = kminmax,
  method = my_wapls,
  diss_method = dmetric,
  diss_usage = "none",
  control = ctrl,
  group = train$samplegroup
)

## Best threshold distance: the one with the lowest st_rmse in the
## nearest neighbor validation results
nnv_alrclay <- sbl_alrclay$validation_results$nearest_neighbor_validation
idx.best.alrclay <- which.min(nnv_alrclay$st_rmse)
best.kdiss.alrclay <- nnv_alrclay$k_diss[idx.best.alrclay]

## Predicted values for the validation set at the best threshold distance
pred_val_alrclay <- unlist(get_predictions(sbl_alrclay)[, ..idx.best.alrclay])

## Residuals (observed - predicted) and the alr_clay row of the results
## table
res_alrclay <- valida$alr_Clay - pred_val_alrclay
row_alrclay <- valrestuls$Property == "alr_clay"

## R2
valrestuls$R2[row_alrclay] <- cor(valida$alr_Clay, pred_val_alrclay)^2
## RMSE
valrestuls$RMSE[row_alrclay] <- mean(res_alrclay^2)^0.5
## ME
valrestuls$ME[row_alrclay] <- mean(res_alrclay)

## Residual variances
rv_alrclay <- residualvariances$Property == "alr_Clay"
## layer A
residualvariances$layerA[rv_alrclay] <- var(res_alrclay[valida$layer == "A"])
## layer B
residualvariances$layerB[rv_alrclay] <- var(res_alrclay[valida$layer == "B"])
## layers A and B
residualvariances$layersAB[rv_alrclay] <- var(res_alrclay)
```
...and $alr(silt)$...
```{r eval = FALSE}
## Run the MBL for alr(silt) over all the candidate threshold distances
## and predict the validation set
sbl_alrsilt <- mbl(Yr = train$alr_Silt,
                   Xr = train$spc,
                   Xu = valida$spc,
                   k_diss = diss2test,
                   k_range = kminmax,
                   diss_method = dmetric,
                   diss_usage = "none",
                   method = my_wapls,
                   control = ctrl,
                   group = train$samplegroup)

## Best threshold distance: the one with the lowest st_rmse in the
## nearest neighbor validation results
idx.best.alrsilt <- which.min(sbl_alrsilt$validation_results$nearest_neighbor_validation$st_rmse)
best.kdiss.alrsilt <- sbl_alrsilt$validation_results$nearest_neighbor_validation$k_diss[idx.best.alrsilt]

## Get the predicted values for the validation set
pred_val_alrsilt <- unlist(get_predictions(sbl_alrsilt)[, ..idx.best.alrsilt])

## Residuals (observed - predicted). The observed values must be
## valida$alr_Silt, i.e. the same variable the model was fitted on
## (train$alr_Silt); every statistic below is derived from it.
res_alrsilt <- valida$alr_Silt - pred_val_alrsilt

## R2
valrestuls$R2[valrestuls$Property == "alr_silt"] <- cor(valida$alr_Silt, pred_val_alrsilt)^2
## RMSE
valrestuls$RMSE[valrestuls$Property == "alr_silt"] <- mean(res_alrsilt^2)^0.5
## ME
valrestuls$ME[valrestuls$Property == "alr_silt"] <- mean(res_alrsilt)

## Residual variances
## layer A
residualvariances$layerA[residualvariances$Property == "alr_Silt"] <- var(res_alrsilt[valida$layer == "A"])
## layer B
residualvariances$layerB[residualvariances$Property == "alr_Silt"] <- var(res_alrsilt[valida$layer == "B"])
## layers A and B
residualvariances$layersAB[residualvariances$Property == "alr_Silt"] <- var(res_alrsilt)
```
Save the table of the variance of the residuals into your working directory. This data will be later used in the spatial analyses.
```{r eval = FALSE}
## Export the residual variances as a tab-separated text file, without
## row names, into the working directory
write.table(
  residualvariances,
  file = "vnir_residual_variances.txt",
  row.names = FALSE,
  sep = "\t"
)
```
To assess the accuracies and precisions of the predictions (in the validation set) for particle-size distribution, we need to back-transform the additive log-ratio transformed variables to the original clay, silt and sand contents:
```{r eval = FALSE}
## Alr back-transformations (results expressed as percentages)
## Terms shared by the three fractions
exp_alrclay <- exp(pred_val_alrclay)
exp_alrsilt <- exp(pred_val_alrsilt)
alr_closure <- 1 + exp_alrclay + exp_alrsilt

## Clay contents
valClay_alr <- 100 * exp_alrclay / alr_closure
## Silt contents
valSilt_alr <- 100 * exp_alrsilt / alr_closure
## Sand contents
valSand_alr <- 100 * 1 / alr_closure
```
Now we can compute the parameters to assess accuracies and precisions:
```{r eval = FALSE}
## Validation statistics (R2, RMSE and ME) of the back-transformed
## particle-size predictions; residuals are observed - predicted

## Clay content
res_clay <- valida$Clay - valClay_alr
row_clay <- valrestuls$Property == "Clay"
valrestuls$R2[row_clay] <- cor(valida$Clay, valClay_alr)^2
valrestuls$RMSE[row_clay] <- mean(res_clay^2)^0.5
valrestuls$ME[row_clay] <- mean(res_clay)

## Silt content
res_silt <- valida$Silt - valSilt_alr
row_silt <- valrestuls$Property == "Silt"
valrestuls$R2[row_silt] <- cor(valida$Silt, valSilt_alr)^2
valrestuls$RMSE[row_silt] <- mean(res_silt^2)^0.5
valrestuls$ME[row_silt] <- mean(res_silt)

## Sand content
res_sand <- valida$Sand - valSand_alr
row_sand <- valrestuls$Property == "Sand"
valrestuls$R2[row_sand] <- cor(valida$Sand, valSand_alr)^2
valrestuls$RMSE[row_sand] <- mean(res_sand^2)^0.5
valrestuls$ME[row_sand] <- mean(res_sand)
```
Examine the `valrestuls` object...
```{r eval = FALSE}
## Print the table of validation statistics (RMSE, R2 and ME per property)
valrestuls
```
## Property predictions in the prediction set
After validating the vis-NIR models we can apply them to the prediction set.
We can start by creating a `data.frame` where the predictions will be stored. In this `data.frame` the predicted values will be stored under the following variable names: `Ca_spec`, `alr_Clay_spec` and `alr_Silt_spec`:
```{r eval = FALSE}
## Placeholder columns (all NA, one row per sample of the prediction set)
## that will hold the vis-NIR predictions of Ca, alr(clay) and alr(silt)
vnirpredictions <- data.frame(Ca_spec = rep(NA, nrow(pred)),
                              alr_Clay_spec = rep(NA, nrow(pred)),
                              alr_Silt_spec = rep(NA, nrow(pred)))

## Add additional relevant information from the original prediction set
vnirpredictions <- cbind(pred[, c("ID",
                                  "POINT_X",
                                  "POINT_Y",
                                  "set",
                                  "Ca",
                                  "Clay",
                                  "Silt",
                                  "Sand",
                                  "alr_Clay",
                                  "alr_Silt")],
                         vnirpredictions)

## Re-encode the set variable to "prediction"
vnirpredictions$set <- factor("prediction")
```
For exchangeable Ca^2+^ predictions...
```{r eval = FALSE}
## Predict Ca in the prediction set, using the optimized threshold
## distance for neighbor selection (best.kdiss.ca)
sbl_ca_pred <- mbl(
  Xr = train$spc,
  Yr = train$Ca,
  Xu = pred$spc,
  k_diss = best.kdiss.ca,
  k_range = kminmax,
  method = my_wapls,
  diss_method = dmetric,
  diss_usage = "none",
  control = ctrl,
  group = train$samplegroup
)

## Store the predicted values in vnirpredictions
vnirpredictions$Ca_spec <- unlist(get_predictions(sbl_ca_pred))
```
For $alr(clay)$ predictions...
```{r eval = FALSE}
## Predict alr(clay) in the prediction set, using the optimized threshold
## distance for neighbor selection (best.kdiss.alrclay)
sbl_alrclay_pred <- mbl(
  Xr = train$spc,
  Yr = train$alr_Clay,
  Xu = pred$spc,
  k_diss = best.kdiss.alrclay,
  k_range = kminmax,
  method = my_wapls,
  diss_method = dmetric,
  diss_usage = "none",
  control = ctrl,
  group = train$samplegroup
)

## Store the predicted values in vnirpredictions
vnirpredictions$alr_Clay_spec <- unlist(get_predictions(sbl_alrclay_pred))
```
For $alr(silt)$ predictions...
```{r eval = FALSE}
## Predict alr(silt) in the prediction set, using the optimized threshold
## distance for neighbor selection (best.kdiss.alrsilt)
sbl_alrsilt_pred <- mbl(
  Xr = train$spc,
  Yr = train$alr_Silt,
  Xu = pred$spc,
  k_diss = best.kdiss.alrsilt,
  k_range = kminmax,
  method = my_wapls,
  diss_method = dmetric,
  diss_usage = "none",
  control = ctrl,
  group = train$samplegroup
)

## Store the predicted values in vnirpredictions
vnirpredictions$alr_Silt_spec <- unlist(get_predictions(sbl_alrsilt_pred))
```
Examine the `vnirpredictions` object...
```{r eval = FALSE}
## Print the table holding the predictions for the prediction set
vnirpredictions
## Column-wise summary of the same table
summary(vnirpredictions)
```