-
Notifications
You must be signed in to change notification settings - Fork 0
/
spot_the_bot_k_means.py
123 lines (89 loc) · 3.35 KB
/
spot_the_bot_k_means.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: utf-8 -*-
# Spot the bot K-means
# Automatically generated by Colaboratory.
# Original file is located at
#   https://colab.research.google.com/drive/1VRrl7e7gyHfJZWss63YEg9X-DboFQxHl
# (The Python-style """...""" cell markers from the notebook export are not
# valid R and have been converted to comments.)

# install.packages('factoextra', quiet = TRUE)

# Data ----

# Read the feature vectors; columns 3:43 keep the label column plus the
# 40 numeric features used for clustering.
data <- read.table("vectors2.csv", sep = ",", header = TRUE)
data <- data[, 3:43]
data

# 70/30 train/test split. Use nrow(data) rather than the hard-coded 200 so
# the split stays correct if the input file grows or shrinks.
n_obs <- nrow(data)
train_sp <- sample(n_obs, round(0.7 * n_obs), replace = FALSE)
train_df <- data[train_sp, ]
test_df <- data[-train_sp, ]
"""#Simple K-means"""
km_ss <- function(k) {
kmk.out <- kmeans(train_df[, 2:41], k)
return(kmk.out$tot.withinss)
}
# Elbow analysis: fit k-means for K = 1..15 and inspect the drop in
# within-cluster sum of squares.
K <- 15
par(mfcol = c(2, 1))
# vapply instead of sapply: guarantees a plain numeric vector.
k_ss <- vapply(1:K, km_ss, numeric(1))
plot(1:K, k_ss, ylab = 'sum of squares within cluster', xlab = 'K',
     main = 'elbow plot', type = 'b')
# Ratio of successive decreases: values near 1 mean adding one more cluster
# still helps about as much as the previous addition did.
k_ss_compare <- vapply(2:(K - 1),
                       function(x) (k_ss[x] - k_ss[x + 1]) / (k_ss[x - 1] - k_ss[x]),
                       numeric(1))
plot(2:(K - 1), k_ss_compare, ylab = 'decrease',
     main = 'comparative decrease in within cluster SS', xlab = 'K', type = 'b')

# Final fits on the raw features; the K = 2 fit (the last assignment, kept
# in km_raw) is the one evaluated on the test set below.
km_raw <- kmeans(train_df[, 2:41], 3, nstart = 20)
plot(train_df[, 2:3], col = (km_raw$cluster), main = "Clusters, K=3")
km_raw <- kmeans(train_df[, 2:41], 2, nstart = 20)
plot(train_df[, 2:3], col = (km_raw$cluster), main = "Clusters, K=2")
paste("withins", km_raw$withinss, "between", km_raw$betweenss)
pairs(train_df[, 2:6], col = (km_raw$cluster))
# Assign each row of `newdata` to its nearest k-means centroid.
#
# object:  a fitted kmeans result (only $centers is read).
# newdata: matrix / data frame whose columns line up with the centroid columns.
# Returns one cluster index per row of newdata.
km_predict <- function(object, newdata) {
  cent <- object$centers
  pts <- as.matrix(newdata)
  # n x k matrix of squared Euclidean distances from each point to each centroid.
  dist_sq <- vapply(seq_len(nrow(cent)), function(j) {
    rowSums(sweep(pts, 2, cent[j, ])^2)
  }, numeric(nrow(pts)))
  apply(dist_sq, 1, which.min)
}
# Evaluate the raw-feature clustering on the held-out set.
test_df$raw_predict <- km_predict(km_raw, test_df[, 2:41])
test_df

# Count misclassified rows (vectorized; replaces the 1:dim()[1] loop with
# scalar `&` inside if). NOTE(review): k-means cluster labels are arbitrary,
# so the assumed mapping human -> 1 / bot -> 2 may be flipped on any given
# run; when flipped, the reported rate is 100 minus the true error rate.
error <- sum(test_df[[1]] == "human" & test_df$raw_predict == 2) +
  sum(test_df[[1]] == "bot" & test_df$raw_predict == 1)
paste(error * 100 / nrow(test_df), "% error rate")
"""#PCA"""
# library(factoextra)
pr.out = prcomp(train_df[, 2:41], scale=FALSE) #features have same scale
# fviz_pca_var(pr.out, col.var = "steelblue")
pr.var = pr.out$sdev^2
pve = pr.var/sum(pr.var)
par(mfrow = c(2,1))
plot(pve, xlab='PC', ylab='Proportion of variance', type='b')
plot(cumsum(pve), xlab='PC', ylab='Cumulative proportion of variance', type='b')
"""#PCA K-means"""
km_ss <- function(k) {
kmk.out <- kmeans(pr.out$x[, 1:7], k)
return(kmk.out$tot.withinss)
}
K <- 15
par(mfcol = c(2,1))
k_ss <- sapply(1:K, km_ss)
plot(1:K, k_ss, ylab='sum of squares within cluster', xlab='K', main='elbow plot', type='b')
k_ss_compare <- sapply(2:(K-1), function(x) (k_ss[x]-k_ss[x+1])/(k_ss[x-1]-k_ss[x]))
plot(2:(K-1), k_ss_compare, ylab='decrease', main='comparative decrease in within cluster SS', xlab='K', type='b')
km.out=kmeans(pr.out$x[,1:2], 3, nstart = 20)
plot(pr.out$x[, 1:10], col=(km.out$cluster), main="Clusters, K=3")
km.out=kmeans(pr.out$x[,1:2], 2, nstart = 20)
plot(pr.out$x[, 1:10], col=(km.out$cluster), main="Clusters, K=2")
paste("withins", km.out$withinss, "between", km.out$betweenss)
pairs(pr.out$x[, 1:5], col=(km.out$cluster))
# Evaluate the PCA-space clustering on the held-out set.
# BUG FIX: the original refit PCA on the test set (prcomp(test_df...)) —
# components of a separate fit need not match the training rotation (signs
# and axes can differ) — and then predicted with 7 PCs against centroids
# fitted on 2 PCs, silently recycling lengths in the distance computation.
# Project the test rows with the *training* PCA and use the same 2 PCs
# that km.out was fitted on.
pr_test.out <- predict(pr.out, newdata = test_df[, 2:41])
test_df$pca_predict <- km_predict(km.out, pr_test.out[, 1:2])

# NOTE(review): as with the raw-feature evaluation, the human -> 1 /
# bot -> 2 mapping assumes a particular (arbitrary) k-means labeling.
error <- sum(test_df[[1]] == "human" & test_df$pca_predict == 2) +
  sum(test_df[[1]] == "bot" & test_df$pca_predict == 1)
paste(error * 100 / nrow(test_df), "% error rate")