Commit

Polish and update not to be dependent on new libraries other than what is in the Docker image
nimarafati committed Mar 6, 2024
1 parent 4189fc5 commit d6fde1d
Showing 5 changed files with 99 additions and 91 deletions.
Binary file added data/HC-agglomerative.png
Binary file added data/HC-divisive.png
Binary file added data/Kmeans_3.gif
Binary file added data/Mahalanobis.png
190 changes: 99 additions & 91 deletions slide_clustering.rmd
@@ -24,12 +24,13 @@ output:
```{r, include = FALSE}
#Load the packages
library(knitr)
library(factoextra)
library(mclust)
library(MASS) # For mvrnorm to generate multivariate normal samples
library(ggplot2)
library(dbscan)
library(cluster)
# Following libraries are used for generating the figures:
#library(factoextra)
#library(mclust)
#library(MASS) # For mvrnorm to generate multivariate normal samples
#library(dbscan)
#library(cluster)
# functions
calculate_wcss <- function(data, k) {
@@ -49,7 +50,7 @@ groups <- rep(c('Group1', 'Group2', 'Group3'), each=3)
```


---
---
name: intro
## Clustering

@@ -91,7 +92,7 @@ name: distance-metrics

- In multidimensional space (raw data)

- in reduced space (i.e. top PCs)
- In reduced space (i.e. top PCs)

---
name: Euclidean
@@ -195,43 +196,40 @@ name: Mahalanobis-Distance

- Unlike the previous approach, which was based on the distance between data points, this method measures the distance between a data point and a distribution.

```{r, echo = F, fig.align='center'}
```{r mahalanobis-plot, echo = F, out.width='60%', fig.align='center'}
knitr::include_graphics('data/Mahalanobis.png')
```
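
A minimal base-R check of this idea (an assumed sketch added here, not part of the original slides): `stats::mahalanobis()` returns the squared distance of a point from a distribution described by its centre and covariance, so for a strongly correlated cloud the point (2, 2) ends up closer, in Mahalanobis terms, than its Euclidean distance suggests.

```{r mahalanobis-sketch, echo = F, eval = F}
# Assumed sketch: compare Euclidean and Mahalanobis distance for one point,
# using only base R (no extra packages).
set.seed(42)
sigma <- matrix(c(1, 0.9, 0.9, 1), ncol = 2)          # strong positive correlation
x     <- matrix(rnorm(600), ncol = 2) %*% chol(sigma) # ~ 300 draws from MVN(0, sigma)
point <- c(2, 2)
euclid <- sqrt(sum((point - colMeans(x))^2))
mahal  <- sqrt(mahalanobis(point, center = colMeans(x), cov = cov(x)))
round(c(Euclidean = euclid, Mahalanobis = mahal), 2)
```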


```{r, echo = F, fig.align='center', eval = F}
# Generate a dataset of 2 variables
set.seed(42)
data <- mvrnorm(n = 300, mu = c(0, 0), Sigma = matrix(c(1, 0.9, 0.9, 1), ncol = 2))
colnames(data) <- c("X1", "X2")
# Define a point
point <- c(2, 2)
point1 <- c(1, 1)
point2 <- c(1, -1)
# Calculate Euclidean distance from the center (0,0)
euclidean_distance <- sqrt(sum((point - colMeans(data))^2))
# Correct calculation of Mahalanobis distance
mahalanobis_distance <- sqrt(t(matrix(point - colMeans(data))) %*% solve(var(data)) %*% matrix(point - colMeans(data)))
# Print the distances
# print(paste("Euclidean Distance:", round(euclidean_distance, 2)))
# print(paste("Mahalanobis Distance:", round(mahalanobis_distance, 2)))
png('data/Mahalanobis.png', width = 1000, height = 1000,res = 150)
# Create a base plot
plot <- ggplot(data.frame(data), aes(x = X1, y = X2)) +
geom_point(color = "lightblue") +
geom_point(color = "blue") +
geom_point(aes(x = 0, y = 0), color = "red", size = 3) +
geom_point(aes(x = point[1], y = point[2]), color = "green", size = 3) +
geom_segment(aes(x = 0, y = 0, xend = point[1], yend = point[2]),
linetype = "dashed", color = "green") +
geom_point(aes(x = point1[1], y = point1[2]), color = "green", size = 3) +
geom_point(aes(x = point2[1], y = point2[2]), color = "darkgreen", size = 3) +
ggtitle("Mahalanobis Distance") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5))
# Display the plot
print(plot)
# Annotate distances
# plot + annotate("text", x = 1, y = 2, label = paste("Euclidean:", round(euclidean_distance, 2)), color = "green") +
# annotate("text", x = 1, y = 1.8, label = paste("Mahalanobis:", round(mahalanobis_distance, 2)), color = "green")
dev.off()
```

---
@@ -241,16 +239,14 @@ name: Centroid-based1

- In this method, the distance between each data point and the cluster centroids is calculated

- Each data point is assigned to a cluster based on its squared distance from centroid.
- Each data point is assigned to a cluster based on its Euclidean distance from the centroid.

- Square distance
- Depending on the number of clusters (K), new centroids are created

<div style="text-align: center;">
<img src="data/kmeans.gif" alt="Alt text for the GIF" style="width: 65%; height: auto;">
<img src="data/kmeans.gif" alt="Alt text for the GIF" style="width: 65%; height: auto;">
</div>
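
As a hedged base-R illustration of this assign-and-update loop (an assumed example, not taken from the original deck; the synthetic data and K = 3 are made up):

```{r kmeans-sketch, echo = F, eval = F}
# Assumed sketch: K-means with K = 3 on a small synthetic data set, base R only.
set.seed(1)
pts <- rbind(matrix(rnorm(100, mean = 0), ncol = 2),
             matrix(rnorm(100, mean = 3), ncol = 2),
             matrix(rnorm(100, mean = 6), ncol = 2))
km <- kmeans(pts, centers = 3, nstart = 20)
km$centers        # final centroid positions
table(km$cluster) # how many points ended up in each cluster
```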



```{r kmeans-create, echo = F, eval = F, fig.align='center'}
# Load necessary libraries
library(ggplot2)
@@ -259,7 +255,7 @@ set.seed(123)
kmeans_result <- kmeans(data, centers=3)
k_values <- 1:5
pdf('data/kmeans_3.pdf', width = 10, height = 8)
png('data/kmeans_3.png', width = 1000, height = 1000)  # width/height in pixels
for(k in k_values){
kmeans_result <- kmeans(data, centers = k, nstart = 20)
df_kmeans <- data.frame(PC1 = data[,1], PC2 = data[,2], Cluster = as.factor(kmeans_result$cluster))
@@ -286,52 +282,69 @@ for(k in k_values){
}
dev.off()
```
---
name: Centroid-based2
## Centroid-based: K-means clustering
- One of the most commonly used clustering methods

- In this method, the distance between each data point and the cluster centroids is calculated

- Each data point is assigned to a cluster based on its Euclidean distance from the centroid.

- Depending on the number of clusters (K), new centroids are created

<div style="text-align: center;">
<img src="data/kmeans_3.gif" alt="Alt text for the GIF" style="width: 65%; height: auto;">
</div>


```{r kmeans-create-k-3, echo = F, eval = F, fig.align='center'}
# # Create a synthetic dataset
# set.seed(123) # For reproducibility
# # data <- data.frame(x = rnorm(90, mean = 0),
# # y = rnorm(90, mean = 0))
set.seed(123) # For reproducibility
data <- data.frame(x = rnorm(90, mean = 0),
y = rnorm(90, mean = 0))
# data <- data.frame(data)
# names(data) <- c("x", "y", "z")
# k <- 3 # Number of clusters
#
# # Initial centroids (randomly selecting 3 points from the dataset in this example)
# set.seed(123) # For reproducibility
# initial_centroids <- data[sample(nrow(data), k), ]
#
# # K-means clustering step-by-step
#
# n_iterations <- k # Number of iterations to perform (for demonstration)
# pdf('data/kmeans_3.pdf', width = 10, height = 8)
# for (i in 1:n_iterations) {
# # Assign points to the nearest centroid
# distances <- as.matrix(dist(rbind(data, initial_centroids)))
# distances <- distances[1:nrow(data), (nrow(data)+1):(nrow(data)+k)]
# cluster_assignment <- apply(distances, 1, which.min)
#
# # Update centroids
# new_centroids <- aggregate(data[,c("x", "y", "z")], by = list(cluster_assignment), FUN = mean)
# new_centroids <- new_centroids[, -1] # Removing the grouping column
#
#
# # Plotting
# p <- ggplot(as.data.frame(data), aes(x = x, y = y)) +
# geom_point(aes(color = factor(cluster_assignment))) +
# geom_point(data = new_centroids, aes(x = x, y = y), color = 'black', size = 5) +
# geom_segment(data = data, aes(xend = new_centroids[cluster_assignment, "x"],
# yend = new_centroids[cluster_assignment, "y"],
# x = x, y = y), color = 'grey', alpha = 0.5) +
# labs(title = paste("Iteration:", i)) +
# theme_minimal()
#
# print(p)
# # For the next iteration
# initial_centroids <- new_centroids
# }
# dev.off()
k <- 3 # Number of clusters
# Initial centroids (randomly selecting 3 points from the dataset in this example)
set.seed(123) # For reproducibility
initial_centroids <- data[sample(nrow(data), k), ]
# K-means clustering step-by-step
n_iterations <- k # Number of iterations to perform (for demonstration)
pdf('data/kmeans_3.pdf', width = 10, height = 8)
for (i in 1:n_iterations) {
# Assign points to the nearest centroid
distances <- as.matrix(dist(rbind(data, initial_centroids)))
distances <- distances[1:nrow(data), (nrow(data)+1):(nrow(data)+k)]
cluster_assignment <- apply(distances, 1, which.min)
# Update centroids
new_centroids <- aggregate(data[,c("x", "y")], by = list(cluster_assignment), FUN = mean)
new_centroids <- new_centroids[, -1] # Removing the grouping column
# Plotting
p <- ggplot(as.data.frame(data), aes(x = x, y = y)) +
geom_point(aes(color = factor(cluster_assignment))) +
geom_point(data = new_centroids, aes(x = x, y = y), color = 'black', size = 5) +
geom_segment(data = data, aes(xend = new_centroids[cluster_assignment, "x"],
yend = new_centroids[cluster_assignment, "y"],
x = x, y = y), color = 'grey', alpha = 0.5) +
labs(title = paste("Iteration:", i)) +
theme_minimal()
print(p)
# For the next iteration
initial_centroids <- new_centroids
}
dev.off()
```
---

name: optimal k
## What is optimal K?

@@ -340,6 +353,8 @@
- Gap statistics.
- Average Silhouette method
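
The chunk below calls a `calculate_wcss()` helper whose body is collapsed in this diff; one plausible base-R definition, shown purely as an assumed sketch (not the actual helper from the setup chunk), is:

```{r calculate-wcss-sketch, echo = F, eval = F}
# Assumed sketch of the helper used below: total within-cluster sum of squares
# (WCSS) for a given K, taken straight from base R's kmeans() fit.
calculate_wcss <- function(data, k) {
  kmeans(data, centers = k, nstart = 20)$tot.withinss
}
```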



```{r optimal-k, echo = F, fig.align='center', fig.height=5, fig.width=10}
set.seed(123) # For reproducibility
data <- rbind(
@@ -357,7 +372,7 @@ wcss_values <- sapply(k_values, function(k) calculate_wcss(data, k))
plot(k_values, wcss_values, type = "b", pch = 16, col = "blue",
xlab = "Number of Clusters (K)", ylab = "Within-Cluster Sum of Squares (WCSS)",
main = "Elbow Method for Optimal K")
# knitr::include_graphics('data/kmeans_3.pdf')
# knitr::include_graphics('data/kmeans_3.png')
k <- 3
kmeans_result <- kmeans(data, centers = k, nstart = 20)
df_kmeans <- data.frame(PC1 = data[,1], PC2 = data[,2], Cluster = as.factor(kmeans_result$cluster))
@@ -366,12 +381,8 @@ plot(df_kmeans$PC1, df_kmeans$PC2, col = df_kmeans$Cluster, pch = 16,
par(mfrow = c(1, 1))
```
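
The average-silhouette option listed above has no accompanying code in this deck; a base-R-only sketch (assumed; the `avg_silhouette()` helper below is made up for illustration) could look like this:

```{r silhouette-sketch, echo = F, eval = F}
# Assumed sketch: average silhouette width for several K, using only base R
# (the cluster package's silhouette() is deliberately not used).
avg_silhouette <- function(data, k) {
  km <- kmeans(data, centers = k, nstart = 20)
  d  <- as.matrix(dist(data))
  n  <- nrow(data)
  sil <- sapply(seq_len(n), function(i) {
    own <- km$cluster[i]
    if (sum(km$cluster == own) < 2) return(0)             # singleton cluster: s(i) = 0
    a <- mean(d[i, km$cluster == own & seq_len(n) != i])  # mean distance within own cluster
    b <- min(sapply(setdiff(unique(km$cluster), own),
                    function(cl) mean(d[i, km$cluster == cl])))  # nearest other cluster
    (b - a) / max(a, b)
  })
  mean(sil)
}
sapply(2:5, function(k) avg_silhouette(data, k))  # larger average silhouette is better
```
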
???
Within-cluster sum of squares (WCSS) is a metric used to quantify the compactness of clusters by measuring the squared distances from each point to its cluster centroid. It serves as a key indicator for determining the optimal number of clusters in a dataset. In the Elbow method, the objective is to identify a suitable number of clusters (k) by locating the point where increases in k result in diminishing reductions in WCSS. This 'elbow' point is considered optimal because beyond it, additional clusters do not significantly enhance the model's performance in terms of intra-cluster compactness.
---

name: DBSCAN
## Density-based clustering: DBSCAN

@@ -424,7 +435,12 @@ name: hclust
name: hclust-agglomorative
## Hierarchical-based clustering
- Agglomerative clustering
```{r hc-agglomerative, echo = F, fig.align='center'}

```{r hc-agglomerative-fig, echo = F, out.width='60%'}
knitr::include_graphics('data/HC-agglomerative.png')
```

```{r hc-agglomerative, echo = F, eval = F, fig.align='center'}
set.seed(42) # For reproducibility
# Generate sample data
# data <- matrix(rnorm(100), ncol = 2)
@@ -436,36 +452,30 @@ rownames(data) <- LETTERS[1:5]
hc_agglomerative <- hclust(dist(data), method = "complete")
# Plot the dendrogram
png('data/HC-agglomerative.png', width = 1000, height = 1000, res = 100)
plot(hc_agglomerative, main = "Agglomerative Hierarchical Clustering")
dev.off()
```
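
To turn the dendrogram above into hard cluster labels, one base-R option (an assumed sketch, not part of the original slides; k = 2 is arbitrary) is `cutree()`:

```{r hc-cutree-sketch, echo = F, eval = F}
# Assumed sketch: cut the agglomerative dendrogram into k = 2 groups and
# colour the points by the resulting labels, base R only.
clusters <- cutree(hc_agglomerative, k = 2)
plot(data, col = clusters, pch = 16,
     main = "Agglomerative clustering cut at k = 2")
text(data, labels = rownames(data), pos = 3)
```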
---
name: hclust-divisive
## Hierarchical-based clustering
- Divisive clustering

```{r hc-divisive-fig, echo = F, out.width='60%'}
knitr::include_graphics('data/HC-divisive.png')
```

```{r hc-dvisive, echo = F, fig.align='center'}
```{r hc-dvisive, echo = F, fig.align='center', eval=FALSE}
# Perform divisive hierarchical clustering
hc_divisive <- diana(data)
# Plot the dendrogram
#plot(hc_divisive, main = "Divisive Hierarchical Clustering")
png('data/HC-divisive.png', width = 1000, height = 1000, res=100)
pltree(hc_divisive, cex = 0.6, main = "Divisive Hierarchical Clustering")
dev.off()
```

```{r hc, eval = F, echo = F, fig.align='center'}
# Perform hierarchical clustering
hc_result <- hclust(dist(data), method = "complete")
# Cut tree into 3 clusters
cutree_result <- cutree(hc_result, k=3)
# Visualization
fviz_cluster(list(data = data, cluster = cutree_result)) + ggtitle("Hierarchical Clustering")
```
---
name: linkage
## Linkage methods.
@@ -484,8 +494,6 @@ name: linear-clustering-summary

- You always need to tune some parameters.

- Most clustering methods will define clusters on random noise.

- K-means performs poorly on unbalanced data.

- On hierarchical clustering, some distance metrics need to be used with a certain
