# import the svm library (install with install.packages("e1071"))
library("e1071")
# set random number seed (keeps results consistent every time)
set.seed(888)
# read in existing products
products <- read.csv("./existing product attributes.csv", header=TRUE, sep=",",
as.is=TRUE)
colnames(products) <- c("ProductType", "Product", "Price", "5StarReviews",
"4StarReviews", "3StarReviews", "2StarReviews", "1StarReviews",
"PositiveServiceReview", "NegativeServiceReview", "WouldConsumerRecommend",
"BestSellersRank", "ShippingWeight", "ProductDepth", "ProductWidth",
"ProductHeight", "ProfitMargin", "Volume")
# dropping features due to factor levels mismatch in test/train
products$BestSellersRank <- NULL
products$ProductType <- NULL
# shuffle data
products <- products[sample(nrow(products)),]
# create 10 equally sized folds
folds <- cut(seq(1,nrow(products)),breaks=10,labels=FALSE)
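# cut() tags each (already shuffled) row with a fold number from 1 to 10,
# so which(folds != i) can pull out the training rows for fold i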
#
# example of how to manually perform cross-validation
#
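# each fold is held out once as the test set while the other nine folds
# are used to train the model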
for(i in 1:10){
# Segment the data by fold using the which() function
trainIndexes <- which(folds!=i,arr.ind=TRUE)
trainData <- products[trainIndexes, ]
testData <- products[-trainIndexes, ]
# show run stats
cat('==== Fold no.', i, '====\n')
cat('Training set # of rows: ', nrow(trainData), '\n')
cat('Testing set # of rows: ', nrow(testData), '\n\n')
# train the model
model <- svm(Volume ~ ., data=trainData)
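# svm() defaults to eps-regression with a radial kernel for a numeric response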
print(summary(model))
# make predictions, join the predicted with the actual vals
predictions <- round(predict(model, testData), digits=0)
comparison <- cbind(testData$Volume, predictions)
colnames(comparison) <- c("actual", "predicted")
# calculate absolute percent error and print to terminal
comparison <- cbind(comparison,
round(abs(comparison[,1]-comparison[,2])/comparison[,1]*100.0,
digits=1))
colnames(comparison)[3] <- "absPercentError"
print(comparison)
cat('\nMean abs percent err:',
round(mean(comparison[,3]), digits=1), '%\n\n')
}
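#
# a minimal sketch (not part of the original workflow): re-run the fold loop
# while storing each fold's mean absolute percent error, so a single overall
# cross-validation score can be reported; foldMAPE is an illustrative name and
# the calculation assumes no test-set Volume is zero
foldMAPE <- numeric(10)
for(i in 1:10){
  foldTrain <- products[which(folds != i), ]
  foldTest <- products[which(folds == i), ]
  foldModel <- svm(Volume ~ ., data = foldTrain)
  foldPred <- predict(foldModel, foldTest)
  foldMAPE[i] <- mean(abs(foldTest$Volume - foldPred) / foldTest$Volume * 100)
}
cat('Overall CV mean abs percent err:', round(mean(foldMAPE), digits=1), '%\n\n')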
#
# example of how to tune svm parameters with integrated cross-validation
#
# tune the svm
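# note: trainData here is whatever the last fold of the loop above left behind;
# gamma only affects the non-linear kernels, so with kernel = "linear" the grid
# below effectively searches over cost alone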
svm_tuned <- tune.svm(Volume ~ ., data=trainData, cost = 2^(-1:10),
kernel = "linear", gamma=c(0.5, 1, 2))
cat('\n==== Tuned SVM best settings:\n')
print(svm_tuned)
# taking the output from the tuning step above and creating a final model
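# tune.svm() stores the lowest cross-validated error combination in
# svm_tuned$best.parameters, which is reused below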
svm_tuned_model <- svm(Volume ~ ., data = trainData,
kernel = "linear",
gamma = svm_tuned$best.parameters$gamma,
cost = svm_tuned$best.parameters$cost)
cat('\n==== Final SVM model with best settings used:\n')
print(svm_tuned_model)
# make predictions, join the predicted with the actual vals
predictions_tuned <- round(predict(svm_tuned_model, testData), digits=0)
comparison_tuned <- cbind(testData$Volume, predictions_tuned)
colnames(comparison_tuned) <- c("actual", "predicted")
# calculate absolute percent error and print to terminal
comparison_tuned <- cbind(comparison_tuned,
round(abs(comparison_tuned[,1]-comparison_tuned[,2])
/comparison_tuned[,1]*100.0,
digits=1))
colnames(comparison_tuned)[3] <- "absPercentError"
print(comparison_tuned)
cat('\nMean abs percent err of tuned model:',
round(mean(comparison_tuned[,3]), digits=1), '%\n\n')
#
# example of taking the tuned model and using it to predict new products
#
# read in new products
products_new <- read.csv("./new product attributes.csv", header=TRUE, sep=",",
as.is=TRUE)
colnames(products_new) <- c("ProductType", "Product", "Price", "5StarReviews",
"4StarReviews", "3StarReviews", "2StarReviews", "1StarReviews",
"PositiveServiceReview", "NegativeServiceReview", "WouldConsumerRecommend",
"BestSellersRank", "ShippingWeight", "ProductDepth", "ProductWidth",
"ProductHeight", "ProfitMargin", "Volume")
# dropping features due to factor levels mismatch in test/train
products_new$BestSellersRank <- NULL
products_new$ProductType <- NULL
# Volume is unknown for the new products and imports as character; set a
# numeric 0 placeholder so the column type matches the training data
products_new$Volume <- 0
# make predictions on new products
predictions_tuned_new <- data.frame(round(predict(svm_tuned_model,
products_new), digits=0))
# name the columns for easy reading
colnames(predictions_tuned_new) <- 'projected volume'
# view output
print(predictions_tuned_new)
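#
# optional (not in the original script): write the projections to disk next to
# the product identifiers; the output filename is an assumption
write.csv(cbind(products_new["Product"], predictions_tuned_new),
          "./projected product volumes.csv", row.names=FALSE)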