-
Notifications
You must be signed in to change notification settings - Fork 0
/
spot_the_bot_k_means.py
123 lines (89 loc) · 3.35 KB
/
spot_the_bot_k_means.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: utf-8 -*-
# Spot the bot K-means
# Automatically generated by Colaboratory.
# Original file is located at
#   https://colab.research.google.com/drive/1VRrl7e7gyHfJZWss63YEg9X-DboFQxHl
# (The Python-style """...""" cell markers from the notebook export are not
# valid R and have been converted to comments.)

# install.packages('factoextra', quiet = TRUE)

# Data ----

# Read the feature vectors; columns 3:43 keep the label column plus the
# 40 numeric features used for clustering.
data <- read.table("vectors2.csv", sep = ",", header = TRUE)
data <- data[, 3:43]
data

# 70/30 train/test split. Use nrow(data) rather than the hard-coded 200 so
# the split stays correct if the input file grows or shrinks.
n_obs <- nrow(data)
train_sp <- sample(n_obs, round(0.7 * n_obs), replace = FALSE)
train_df <- data[train_sp, ]
test_df <- data[-train_sp, ]
"""#Simple K-means"""
km_ss <- function(k) {
kmk.out <- kmeans(train_df[, 2:41], k)
return(kmk.out$tot.withinss)
}
# Elbow analysis: fit k-means for K = 1..15 and inspect the drop in
# within-cluster sum of squares.
K <- 15
par(mfcol = c(2, 1))
# vapply instead of sapply: guarantees a plain numeric vector.
k_ss <- vapply(1:K, km_ss, numeric(1))
plot(1:K, k_ss, ylab = 'sum of squares within cluster', xlab = 'K',
     main = 'elbow plot', type = 'b')
# Ratio of successive decreases: values near 1 mean adding one more cluster
# still helps about as much as the previous addition did.
k_ss_compare <- vapply(2:(K - 1),
                       function(x) (k_ss[x] - k_ss[x + 1]) / (k_ss[x - 1] - k_ss[x]),
                       numeric(1))
plot(2:(K - 1), k_ss_compare, ylab = 'decrease',
     main = 'comparative decrease in within cluster SS', xlab = 'K', type = 'b')

# Final fits on the raw features; the K = 2 fit (the last assignment, kept
# in km_raw) is the one evaluated on the test set below.
km_raw <- kmeans(train_df[, 2:41], 3, nstart = 20)
plot(train_df[, 2:3], col = (km_raw$cluster), main = "Clusters, K=3")
km_raw <- kmeans(train_df[, 2:41], 2, nstart = 20)
plot(train_df[, 2:3], col = (km_raw$cluster), main = "Clusters, K=2")
paste("withins", km_raw$withinss, "between", km_raw$betweenss)
pairs(train_df[, 2:6], col = (km_raw$cluster))
# Assign each row of `newdata` to its nearest k-means centroid.
#
# object:  a fitted kmeans result (only $centers is read).
# newdata: matrix / data frame whose columns line up with the centroid columns.
# Returns one cluster index per row of newdata.
km_predict <- function(object, newdata) {
  cent <- object$centers
  pts <- as.matrix(newdata)
  # n x k matrix of squared Euclidean distances from each point to each centroid.
  dist_sq <- vapply(seq_len(nrow(cent)), function(j) {
    rowSums(sweep(pts, 2, cent[j, ])^2)
  }, numeric(nrow(pts)))
  apply(dist_sq, 1, which.min)
}
# Evaluate the raw-feature clustering on the held-out set.
test_df$raw_predict <- km_predict(km_raw, test_df[, 2:41])
test_df

# Count misclassified rows (vectorized; replaces the 1:dim()[1] loop with
# scalar `&` inside if). NOTE(review): k-means cluster labels are arbitrary,
# so the assumed mapping human -> 1 / bot -> 2 may be flipped on any given
# run; when flipped, the reported rate is 100 minus the true error rate.
error <- sum(test_df[[1]] == "human" & test_df$raw_predict == 2) +
  sum(test_df[[1]] == "bot" & test_df$raw_predict == 1)
paste(error * 100 / nrow(test_df), "% error rate")
"""#PCA"""
# library(factoextra)
pr.out = prcomp(train_df[, 2:41], scale=FALSE) #features have same scale
# fviz_pca_var(pr.out, col.var = "steelblue")
pr.var = pr.out$sdev^2
pve = pr.var/sum(pr.var)
par(mfrow = c(2,1))
plot(pve, xlab='PC', ylab='Proportion of variance', type='b')
plot(cumsum(pve), xlab='PC', ylab='Cumulative proportion of variance', type='b')
"""#PCA K-means"""
km_ss <- function(k) {
kmk.out <- kmeans(pr.out$x[, 1:7], k)
return(kmk.out$tot.withinss)
}
K <- 15
par(mfcol = c(2,1))
k_ss <- sapply(1:K, km_ss)
plot(1:K, k_ss, ylab='sum of squares within cluster', xlab='K', main='elbow plot', type='b')
k_ss_compare <- sapply(2:(K-1), function(x) (k_ss[x]-k_ss[x+1])/(k_ss[x-1]-k_ss[x]))
plot(2:(K-1), k_ss_compare, ylab='decrease', main='comparative decrease in within cluster SS', xlab='K', type='b')
km.out=kmeans(pr.out$x[,1:2], 3, nstart = 20)
plot(pr.out$x[, 1:10], col=(km.out$cluster), main="Clusters, K=3")
km.out=kmeans(pr.out$x[,1:2], 2, nstart = 20)
plot(pr.out$x[, 1:10], col=(km.out$cluster), main="Clusters, K=2")
paste("withins", km.out$withinss, "between", km.out$betweenss)
pairs(pr.out$x[, 1:5], col=(km.out$cluster))
# Evaluate the PCA-space clustering on the held-out set.
# BUG FIX: the original refit PCA on the test set (prcomp(test_df...)) —
# components of a separate fit need not match the training rotation (signs
# and axes can differ) — and then predicted with 7 PCs against centroids
# fitted on 2 PCs, silently recycling lengths in the distance computation.
# Project the test rows with the *training* PCA and use the same 2 PCs
# that km.out was fitted on.
pr_test.out <- predict(pr.out, newdata = test_df[, 2:41])
test_df$pca_predict <- km_predict(km.out, pr_test.out[, 1:2])

# NOTE(review): as with the raw-feature evaluation, the human -> 1 /
# bot -> 2 mapping assumes a particular (arbitrary) k-means labeling.
error <- sum(test_df[[1]] == "human" & test_df$pca_predict == 2) +
  sum(test_df[[1]] == "bot" & test_df$pca_predict == 1)
paste(error * 100 / nrow(test_df), "% error rate")