# main.R
# For the variable names, we keep the same notation as in the assignment
# Required packages installation (guarded so the script does not reinstall on every run)
if (!requireNamespace("glmnet", quietly = TRUE)) install.packages("glmnet")
# Global parameters
train_set_size = 0.90
# Global functions
# Computes the Root Mean Square Error of the predicted y
# (the square root distinguishes the RMSE from the plain MSE)
compute_rmse <- function(y, y_est){
  return(sqrt(mean((y - y_est)^2)))
}
# Counts the number of zero entries in a vector
count_zeros <- function(x){
  return(sum(x == 0))
}
# Question 2: Multiple linear regression, least squares estimator
# Computes the closed-form OLS estimate of beta via the normal equations:
# beta_hat = (X'X)^{-1} X'y
estimate_beta <- function(X, y){
  M = t(X) %*% X
  M_inv = solve(M)
  beta = M_inv %*% t(X) %*% y
  return(beta)
}
# Returns the OLS regression prediction of y
predict_y <- function(X, beta){
  return(X %*% beta)
}
# Importing the data
ols_data_set = data.matrix(read.table("mysmalldata.txt", sep=','))
# Computing the train-test slicing coordinates; floor() guards against a
# non-integer row index when the split fraction does not divide nrow exactly
train_row_limit = floor(train_set_size * nrow(ols_data_set))
test_row_start = train_row_limit + 1
# Splitting the data set into a train set and a test set (column 1 is y, the
# remaining columns are the predictors)
X_train = ols_data_set[1:train_row_limit, 2:ncol(ols_data_set)]
y_train = ols_data_set[1:train_row_limit, 1]
X_test = ols_data_set[test_row_start:nrow(ols_data_set), 2:ncol(ols_data_set)]
y_test = ols_data_set[test_row_start:nrow(ols_data_set), 1]
# Estimating the least square OLS regression estimate of beta using the train set
beta_hat = estimate_beta(X=X_train, y=y_train)
# Predicting and computing the error for the train set and the test set
y_est_train = predict_y(X=X_train, beta=beta_hat)
rmse_lse_train = compute_rmse(y=y_train, y_est=y_est_train)
y_est_test = predict_y(X=X_test, beta=beta_hat)
rmse_lse_test = compute_rmse(y=y_test, y_est=y_est_test)
nb_zeros_ols = count_zeros(beta_hat)
# Explained variance
R_squared_train = var(y_est_train) / var(y_train)
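# Reporting the OLS scores (console output added for convenience; all values
# are the variables computed above)
cat("OLS: train RMSE =", rmse_lse_train,
    "| test RMSE =", rmse_lse_test,
    "| zero coefficients =", nb_zeros_ols,
    "| train R^2 =", R_squared_train, "\n")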
# Question 7: Lasso estimator
# Packages
library("glmnet")
# Parameters
lambda = c(0.1)
# Importing the dataset
lasso_data_set = data.matrix(read.table("mydata.txt", sep=','))
# Computing the train-test slicing coordinates
train_row_limit = floor(train_set_size * nrow(lasso_data_set))
test_row_start = train_row_limit + 1
# Splitting the data set into a train set and test set
X_train = lasso_data_set[1:train_row_limit, 2:ncol(lasso_data_set)]
y_train = lasso_data_set[1:train_row_limit, 1]
X_test = lasso_data_set[test_row_start:nrow(lasso_data_set), 2:ncol(lasso_data_set)]
y_test = lasso_data_set[test_row_start:nrow(lasso_data_set), 1]
# Estimating the lasso regression parameters using the glmnet package
# The alpha parameter weights the Lasso penalty against the Ridge penalty; alpha = 1 gives the pure Lasso
fit = glmnet(x=X_train, y=y_train, alpha = 1, lambda = lambda)
# Predicting y using the lasso estimated parameters
y_est_lasso_test = predict(fit, newx = X_test, type = "response", s=lambda)
y_est_lasso_train = predict(fit, newx = X_train, type = "response", s=lambda)
# Computing the score for the lasso regression
rmse_lasso_test = compute_rmse(y = y_test, y_est = y_est_lasso_test)
rmse_lasso_train = compute_rmse(y = y_train, y_est = y_est_lasso_train)
# Counting the number of zeros (fit$df is the number of non-zero coefficients,
# excluding the intercept, so the zero count is taken over the predictors only)
nb_non_zeros_lasso = fit$df
nb_zeros_lasso = (ncol(lasso_data_set) - 1) - nb_non_zeros_lasso
R_squared_lasso = var(y_est_lasso_train) / var(y_train)
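# Optional consistency check (a sketch, assuming a single-lambda fit): the
# sparsity reported in fit$df should match a direct count of non-zero entries
# in the coefficient vector, once the intercept row returned by coef() is dropped
beta_lasso = as.matrix(coef(fit, s = lambda))[-1, ]
stopifnot(sum(beta_lasso != 0) == nb_non_zeros_lasso)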
# Question 10 : Cross Validation
# Manually coded Cross-Validation
# Parameters
lambda_array = c(0.001, 0.01, 0.1, 1, 10)
train_size = 0.8
val_size = 0.1
test_size = 0.1
train_row_limit = floor(train_size * nrow(lasso_data_set))
validation_row_start = train_row_limit + 1
validation_row_end = floor((train_size + val_size) * nrow(lasso_data_set))
test_row_start = validation_row_end + 1
# Train, validation, test split
X_train = lasso_data_set[1:train_row_limit, 2:ncol(lasso_data_set)]
y_train = lasso_data_set[1:train_row_limit, 1]
X_val = lasso_data_set[validation_row_start:validation_row_end, 2:ncol(lasso_data_set)]
y_val = lasso_data_set[validation_row_start:validation_row_end, 1]
X_test = lasso_data_set[test_row_start:nrow(lasso_data_set), 2:ncol(lasso_data_set)]
y_test = lasso_data_set[test_row_start:nrow(lasso_data_set), 1]
rmse_min = Inf
lambda_min = 0
# Estimating the lasso regression parameters using the glmnet package
for (lambda in lambda_array){
  # alpha = 1 selects the pure Lasso penalty (alpha weights Lasso against Ridge)
  fit = glmnet(x = X_train, y = y_train, alpha = 1, lambda = lambda)
  # Predicting y on the validation set using the lasso estimated parameters
  y_est_lasso = predict(fit, newx = X_val, type = "response", s = lambda)
  # Scoring this lambda on the validation set
  rmse_lasso = compute_rmse(y = y_val, y_est = y_est_lasso)
  # Keeping the lambda with the lowest validation error
  if (rmse_lasso < rmse_min){
    rmse_min = rmse_lasso
    lambda_min = lambda
  }
}
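# Alternative sketch (assuming the same grid is interchangeable with the loop
# above): glmnet is designed to fit the whole regularization path in one call,
# which is cheaper than refitting per lambda thanks to warm starts
fit_path = glmnet(x = X_train, y = y_train, alpha = 1,
                  lambda = sort(lambda_array, decreasing = TRUE))
pred_path = predict(fit_path, newx = X_val, type = "response")
# Columns of pred_path follow fit_path$lambda (stored in decreasing order)
rmse_path = apply(pred_path, 2, function(p) compute_rmse(y = y_val, y_est = p))
lambda_min_path = fit_path$lambda[which.min(rmse_path)]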
# Merging the train and validation into a full train set
X_train = lasso_data_set[1:validation_row_end, 2:ncol(lasso_data_set)]
y_train = lasso_data_set[1:validation_row_end, 1]
# Estimating the lasso regression parameters using the glmnet package for the optimal lambda
fit_optimal = glmnet(x=X_train, y=y_train, alpha = 1, lambda = lambda_min)
# Predicting y using the parameters estimated at the optimal lambda
y_est_lasso_cval = predict(fit_optimal, newx = X_test, type = "response", s = lambda_min)
y_est_lasso_cval_train = predict(fit_optimal, newx = X_train, type = "response", s = lambda_min)
# Computing the score for the lasso regression against the test set
rmse_lasso_cval = compute_rmse(y_est_lasso_cval, y_test)
R_squared_lasso_cval_train = var(y_est_lasso_cval_train) / var(y_train)
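# Reporting the manual cross-validation result (console output added for convenience)
cat("Manual CV: best lambda =", lambda_min,
    "| validation RMSE =", rmse_min,
    "| test RMSE =", rmse_lasso_cval, "\n")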
# Alternative method for Cross Validation using the glmnet package
# Importing the dataset
lasso_data_set = data.matrix(read.table("mydata.txt", sep=','))
# Computing the train-test slicing coordinates
train_row_limit = floor(train_set_size * nrow(lasso_data_set))
test_row_start = train_row_limit + 1
# Splitting the data set into a train set and test set
X_train = lasso_data_set[1:train_row_limit, 2:ncol(lasso_data_set)]
y_train = lasso_data_set[1:train_row_limit, 1]
X_test = lasso_data_set[test_row_start:nrow(lasso_data_set), 2:ncol(lasso_data_set)]
y_test = lasso_data_set[test_row_start:nrow(lasso_data_set), 1]
# Cross validation over a grid of 100 lambda values (cv.glmnet performs 10-fold CV by default)
fit_auto = cv.glmnet(X_train, y_train, alpha = 1, nlambda = 100)
# Predicting y at the CV-selected lambda (predict() dispatches to the cv.glmnet
# method; s = "lambda.min" selects the lambda with the lowest cross-validated error)
y_est_lasso_auto = predict(fit_auto, newx = X_test, type = "response", s = "lambda.min")
y_est_lasso_auto_train = predict(fit_auto, newx = X_train, type = "response", s = "lambda.min")
# Plotting the CV curve
plot(fit_auto)
# Computing the score for the lasso regression
rmse_lasso_auto = compute_rmse(y_est_lasso_auto, y_test)
R_squared_lasso_auto_train = var(y_est_lasso_auto_train) / var(y_train)
lambda_min_cv_auto = fit_auto$lambda.min
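# For reference, cv.glmnet also exposes the more conservative lambda.1se (the
# largest lambda whose CV error is within one standard error of the minimum)
cat("cv.glmnet: lambda.min =", fit_auto$lambda.min,
    "| lambda.1se =", fit_auto$lambda.1se,
    "| test RMSE =", rmse_lasso_auto, "\n")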
# Plotting the number of zero coefficients against lambda
fit_auto = glmnet(X_train, y_train, alpha = 1, nlambda = 100)
# Keep each (lambda, number-of-zeros) pair together by ordering on lambda,
# rather than sorting the two vectors independently
ord = order(fit_auto$lambda)
lambdas_array = fit_auto$lambda[ord]
nb_zeros_array = (ncol(X_train) - fit_auto$df)[ord]
plot(lambdas_array, nb_zeros_array, xlab = "lambda", ylab = "number of zero coefficients")