Introduction

In this practical, we will apply hierarchical and k-means clustering to two synthetic datasets. We use the following packages:

library(MASS)
library(tidyverse)
library(patchwork)
library(ggdendro)

The data can be generated by running the code below.

Take-home exercises

1. The code does not have comments. Add descriptive comments to the code below.

# set the random seed so the simulated data are reproducible
set.seed(123)

# 2 x 2 covariance matrix: unit variances and correlation 0.5
sigma      <- matrix(c(1, .5, .5, 1), 2, 2)

# draw 100 observations from a bivariate normal with mean vector (5, 5)
sim_matrix <- mvrnorm(n = 100, mu = c(5, 5), 
                      Sigma = sigma)
colnames(sim_matrix) <- c("x1", "x2")

# convert the matrix to a tibble and add a random class label (A, B, or C)
sim_df <- 
  sim_matrix %>% 
  as_tibble() %>%
  mutate(class = sample(c("A", "B", "C"), size = 100, 
                        replace = TRUE))

# shift each class slightly in a different direction away from the center,
# creating a dataset with small between-class differences
sim_df_small <- 
  sim_df %>%
  mutate(x2 = case_when(class == "A" ~ x2 + .5,
                        class == "B" ~ x2 - .5,
                        class == "C" ~ x2 + .5),
         x1 = case_when(class == "A" ~ x1 - .5,
                        class == "B" ~ x1 - 0,
                        class == "C" ~ x1 + .5))

# do the same with much larger shifts, creating a dataset with large
# between-class differences
sim_df_large <- 
  sim_df %>%
  mutate(x2 = case_when(class == "A" ~ x2 + 2.5,
                        class == "B" ~ x2 - 2.5,
                        class == "C" ~ x2 + 2.5),
         x1 = case_when(class == "A" ~ x1 - 2.5,
                        class == "B" ~ x1 - 0,
                        class == "C" ~ x1 + 2.5))
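
As a quick sanity check (not part of the exercise), the sample correlation between x1 and x2 should come out close to the 0.5 specified in sigma:

# correlation matrix of the simulated variables; off-diagonal should be near 0.5
cor(sim_matrix)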

2. Prepare two unsupervised datasets by removing the class feature.

df_s <- sim_df_small %>% select(-class)
df_l <- sim_df_large %>% select(-class)

3. For each of these datasets, create a scatterplot. Combine the two plots into a single frame (look up the patchwork package to see how to do this!). What is the difference between the two datasets?

# patchwork defines the "+" operator to combine entire ggplots!
p_s <- df_s %>% ggplot(aes(x = x1, y = x2)) + geom_point() + ggtitle("Small")
p_l <- df_l %>% ggplot(aes(x = x1, y = x2)) + geom_point() + ggtitle("Large")
p_s + p_l

# df_s has a lot of class overlap, df_l has very little overlap

Hierarchical clustering

4. Run a hierarchical clustering on these datasets and display the result as dendrograms. Use Euclidean distance and the complete agglomeration method. Make sure the two plots have the same y-scale. What is the difference between the dendrograms? (Hint: functions you’ll need are hclust, ggdendrogram, and ylim)

dist_s <- dist(df_s, method = "euclidean")
dist_l <- dist(df_l, method = "euclidean")

res_s <- hclust(dist_s, method = "complete")
res_l <- hclust(dist_l, method = "complete")

p_dend_s <- ggdendrogram(res_s, labels = FALSE) + ggtitle("Small") + ylim(0, 10)
p_dend_l <- ggdendrogram(res_l, labels = FALSE) + ggtitle("Large") + ylim(0, 10)
p_dend_s + p_dend_l

# the dataset with large differences segments into 3 classes much higher up.
# Interestingly, the microstructure (lower splits) is almost exactly the same
# because within the three clusters there is no difference between the datasets
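
A quick numerical check (not required by the exercise) is to compare the heights of the last few merges in each tree:

# heights of the final merges; the large-difference data joins its last
# clusters at much greater heights than the small-difference data
tail(res_s$height, 3)
tail(res_l$height, 3)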

5. For the dataset with small differences, also run a complete agglomeration hierarchical clustering with Manhattan distance.

dist_s2 <- dist(df_s, method = "manhattan")
res_s2  <- hclust(dist_s2, method = "complete")

6. Use the cutree() function to obtain the cluster assignments for three clusters and compare these to the 3-cluster Euclidean solution. Do this comparison by creating two scatter plots with cluster assignment mapped to the colour aesthetic. What difference do you see?

clus_1 <- as_factor(cutree(res_s, 3))
clus_2 <- as_factor(cutree(res_s2, 3))

p1 <- df_s %>% 
  ggplot(aes(x = x1, y = x2, colour = clus_1)) + 
  geom_point() + 
  ggtitle("Euclidian") + 
  theme(legend.position = "n") +
  coord_fixed()

p2 <- df_s %>% 
  ggplot(aes(x = x1, y = x2, colour = clus_2)) + 
  geom_point() + 
  ggtitle("Manhattan") + 
  theme(legend.position = "none") +
  coord_fixed()

p1 + p2

# The Manhattan distance clustering prefers more rectangular classes, whereas
# the Euclidean distance clustering prefers circular classes. The difference is
# most prominent in the very center of the plot and for the top right cluster.
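
An optional cross-tabulation of the two assignment vectors shows how many observations switch clusters between the solutions (the exact counts depend on your simulated data):

# rows: Euclidean 3-cluster solution, columns: Manhattan 3-cluster solution
table(euclidean = clus_1, manhattan = clus_2)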

Practical exercises

K-means clustering

7. Create k-means clusterings with 2, 3, 4, and 6 classes on the large difference data. Again, create coloured scatter plots for these clusterings.

# I set the seed for reproducibility
set.seed(45)
# we can do it in a single pipeline and with faceting 
# (of course there are many ways to do this, though)
df_l %>% 
  mutate(
    class_2 = as_factor(kmeans(df_l, 2)$cluster),
    class_3 = as_factor(kmeans(df_l, 3)$cluster),
    class_4 = as_factor(kmeans(df_l, 4)$cluster),
    class_6 = as_factor(kmeans(df_l, 6)$cluster)
  ) %>% 
  pivot_longer(cols = c(class_2, class_3, class_4, class_6), 
               names_to = "class_num", values_to = "cluster") %>% 
  ggplot(aes(x = x1, y = x2, colour = cluster)) +
  geom_point() +
  scale_colour_brewer(type = "qual") + # use an easy-to-distinguish colour scale
  facet_wrap(~class_num)
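
As an optional numerical companion to these plots, the total within-cluster sum of squares gives a rough impression of how compactness improves as the number of clusters increases (exact values depend on the random starts):

# tot.withinss for k = 2, 3, 4, 6; the drop should flatten after k = 3
sapply(c(2, 3, 4, 6), function(k) kmeans(df_l, centers = k)$tot.withinss)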

8. Do the same thing again a few times. Do you see the same results every time? Where do you see differences?

# I set the seed for reproducibility
set.seed(46)
df_l %>% 
  mutate(
    class_2 = as_factor(kmeans(df_l, 2)$cluster),
    class_3 = as_factor(kmeans(df_l, 3)$cluster),
    class_4 = as_factor(kmeans(df_l, 4)$cluster),
    class_6 = as_factor(kmeans(df_l, 6)$cluster)
  ) %>% 
  pivot_longer(cols = c(class_2, class_3, class_4, class_6), 
               names_to = "class_num", values_to = "cluster") %>% 
  ggplot(aes(x = x1, y = x2, colour = cluster)) +
  geom_point() +
  scale_colour_brewer(type = "qual") + # use an easy-to-distinguish colour scale
  facet_wrap(~class_num)

# there is label switching in all plots. There is a different result altogether
# in the class_4 solution in my case.
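
To compare two runs while ignoring label switching, a cross-table of the two assignment vectors works; the adjusted Rand index (for example from the mclust package, if you are willing to install it) summarizes the agreement in a single number. A minimal sketch:

# two fresh 3-cluster runs on the same data
km_a <- kmeans(df_l, 3)$cluster
km_b <- kmeans(df_l, 3)$cluster

# if every row and column has a single large count, the partitions
# agree up to a relabelling of the clusters
table(km_a, km_b)

# optional: adjusted Rand index, where 1 means identical partitions
# install.packages("mclust")
# mclust::adjustedRandIndex(km_a, km_b)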

9. Find a way online to perform bootstrap stability assessment for the 3- and 6-cluster solutions.

# I decided to use the function clusterboot from the fpc package
# NB: this package needs to be installed first!
# install.packages("fpc")
library(fpc)
# you can read the documentation (?clusterboot) to figure out how to use this
# function. Reading documentation takes a while, but it is something you will
# need to do a lot in the real world!
boot_3 <- clusterboot(data = df_l, B = 1000, clustermethod = kmeansCBI, k = 3, 
                      count = FALSE)
boot_6 <- clusterboot(data = df_l, B = 1000, clustermethod = kmeansCBI, k = 6, 
                      count = FALSE)

# the average stability is much lower for the 6-cluster solution than for the
# 3-cluster solution:
boot_3$bootmean
## [1] 0.9844165 0.9730219 0.9739694
boot_6$bootmean
## [1] 0.7844248 0.5383414 0.7593547 0.7230086 0.6897734 0.7091287
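
A quick visualization of these per-cluster stability values (optional, not part of the exercise) makes the comparison easier to see. A commonly used rule of thumb is that mean Jaccard values of roughly 0.75 or higher indicate stable clusters, which again favours the 3-cluster solution.

# bar chart of per-cluster bootstrap (Jaccard) stability
tibble(
  solution = c(rep("3 clusters", 3), rep("6 clusters", 6)),
  cluster  = factor(c(1:3, 1:6)),
  jaccard  = c(boot_3$bootmean, boot_6$bootmean)
) %>% 
  ggplot(aes(x = cluster, y = jaccard)) +
  geom_col() +
  facet_wrap(~solution, scales = "free_x")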

Challenge question

10. Create a function to perform k-medians clustering.

Write this function from scratch: you may use base-R and tidyverse functions. Use Euclidean distance as your distance metric.

Input:
- dataset (as a data frame)
- K (number of clusters)

Output:
- a vector of cluster assignments

Tip: use the unsupervised version of sim_df_large with K = 3 as a tryout dataset.

# squared Euclidean (L2) distance; it is a monotone function of the true
# distance, so it yields the same nearest-median assignments
l2_dist <- function(x, y) c(crossprod(x - y))

kmedians <- function(data, K, max_iter = 1000) {
  # number of observations
  N <- nrow(data)
  
  # random cluster initialization
  clus <- sample(K, N, replace = TRUE)
  
  # start the iteration process
  converged <- FALSE
  iter <- 0
  while (converged == FALSE && iter < max_iter) {
    old_clus <- clus
    iter     <- iter + 1
    
    # compute medians of clusters
    medians <-
      data %>% 
      mutate(k = clus) %>% 
      group_by(k) %>% 
      summarize(across(everything(), median))
    
    # assign observations to closest cluster
    for (n in 1:N) {
      # get nth data point
      data_n <- unlist(data[n,])
      
      # compute distance to each cluster
      dist_n <- numeric(K)
      for (k in 1:K) {
        median_k  <- unlist(medians[k, 2:3]) # cols 2:3 are x1, x2 (col 1 is k)
        dist_n[k] <- l2_dist(data_n, median_k)
      }
      
      # assign obs to cluster with smallest distance
      clus[n] <- which.min(dist_n)
    }
    
    # check for convergence
    if (all(old_clus == clus))
      converged <- TRUE
  }
  
  # print some information
  cat("K-medians with K =", K, "| Iterations:", iter, "| converged:", converged, "\n")
  
  # return the cluster assignments
  return(clus)
}

kmedians(df_l, 3)
## K-medians with K = 3 | Iterations: 5 | converged: TRUE
##   [1] 1 1 1 3 3 1 2 1 2 3 2 2 2 3 3 3 1 1 3 3 1 3 1 2 3 3 1 2 1 2 3 2 1 2 1 1 3
##  [38] 2 3 2 2 2 2 1 2 3 2 3 1 2 3 1 2 1 2 1 1 3 2 1 3 3 1 2 1 2 2 2 3 2 3 3 3 2
##  [75] 2 1 1 3 2 2 1 1 1 1 2 3 3 1 3 2 2 1 2 3 1 3 3 1 2 3
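
As an optional check that the function behaves sensibly, its assignments can be plotted just like the k-means solutions above (labels are arbitrary and results vary with the random initialization):

# scatter plot coloured by the k-medians assignments
df_l %>% 
  mutate(cluster = as_factor(kmedians(df_l, 3))) %>% 
  ggplot(aes(x = x1, y = x2, colour = cluster)) + 
  geom_point()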

11. Add an input parameter smart_init. If this is set to TRUE, initialize cluster assignments using hierarchical clustering (from hclust). Using the unsupervised sim_df_small, look at the number of iterations needed when you use this method vs when you randomly initialize.

kmedians2 <- function(data, K, smart_init = FALSE, max_iter = 1000) {
  N <- nrow(data)
  
  # new part
  if (!smart_init) {
    # random cluster initialization
    clus <- sample(K, N, replace = TRUE)
  } else {
    # use hclust for initialization
    clus <- cutree(hclust(dist(data)), k = K)
  }
  
  # old part
  converged <- FALSE
  iter <- 0
  while (converged == FALSE && iter < max_iter) {
    old_clus <- clus
    iter     <- iter + 1
    medians <-
      data %>% 
      mutate(k = clus) %>% 
      group_by(k) %>% 
      summarize(across(everything(), median))
    for (n in 1:N) {
      data_n <- unlist(data[n,])
      dist_n <- numeric(K)
      for (k in 1:K) {
        median_k  <- unlist(medians[k, 2:3]) # cols 2:3 are x1, x2 (col 1 is k)
        dist_n[k] <- l2_dist(data_n, median_k)
      }
      clus[n] <- which.min(dist_n)
    }
    if (all(old_clus == clus))
      converged <- TRUE
  }
  cat("K-medians with K =", K, "| Iterations:", iter, "| converged:", converged, "\n")
  return(clus)
}

set.seed(123)
kmedians2(df_s, 3, smart_init = FALSE)
## K-medians with K = 3 | Iterations: 6 | converged: TRUE
##   [1] 2 2 1 3 2 1 2 3 3 3 2 3 2 3 3 1 1 3 2 3 3 3 3 3 3 3 1 3 3 2 1 3 1 2 2 1 2
##  [38] 3 3 3 3 3 3 1 2 3 3 3 1 3 3 2 3 1 3 1 3 2 3 2 1 3 2 3 3 3 2 3 1 1 3 3 1 3
##  [75] 3 1 2 3 3 3 2 1 2 2 3 2 1 1 3 2 2 1 3 3 1 3 1 1 3 3
kmedians2(df_s, 3, smart_init = TRUE)
## K-medians with K = 3 | Iterations: 3 | converged: TRUE
##   [1] 3 1 1 2 3 1 3 3 3 2 1 3 3 2 2 1 1 3 1 3 3 3 3 3 2 3 1 3 3 1 2 3 1 1 1 1 1
##  [38] 3 2 3 3 3 3 1 1 3 3 2 1 3 2 1 3 1 3 1 3 1 3 1 2 3 3 2 3 3 3 3 1 1 2 2 1 2
##  [75] 3 1 3 2 3 3 1 1 3 1 3 2 1 1 2 1 1 1 3 3 1 2 1 1 3 3
# Fewer iterations needed! Also different assignments.
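
Because the random initialization differs from run to run, it can help (optional) to repeat the randomly initialized version a few times and compare the printed iteration counts with the single hclust-initialized run above:

# several random initializations; only the printed iteration counts matter here
for (i in 1:5) kmedians2(df_s, 3, smart_init = FALSE)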