library(rClustering)
library(cluster)
library(glue)
library(dplyr)
library(tidyselect)
library(tidyr)
library(stringr)
library(ggplot2)
library(gridExtra)
library(kableExtra)
clusts <- make_clusters(4, c_range = c(-20, 20), density = 500, categorical = F, seed = 1)
clusts %>%
sample_n(10) %>%
kable() %>%
kable_material(c("striped", "hover"), full_width = F)
| y_true | x | y |
|---|---|---|
| 1 | -11.621969 | -3.158619 |
| 1 | -8.281378 | -6.979358 |
| 3 | -12.552332 | 16.030500 |
| 1 | -10.828491 | -3.680739 |
| 1 | -9.308444 | -2.092698 |
| 3 | -11.584022 | 14.231961 |
| 1 | -10.087508 | -8.408464 |
| 1 | -7.257910 | -3.745361 |
| 3 | -10.429119 | 16.351010 |
| 4 | 16.405530 | 6.801611 |
clust_positions <- clusts %>%
group_by(y_true) %>%
summarize_all(list(min = min, max = max, mean = mean)) %>%
select(sort(peek_vars()))
clust_plot_data <- clust_positions %>%
pivot_longer(cols = !y_true) %>%
mutate(coord = if_else(str_detect(name, "^y_"), "y", "x")) %>%
mutate(name = str_sub(name, start = 3)) %>%
pivot_wider(names_from = coord, values_from = value) %>%
mutate(center = factor(name == "mean"))
clust_plot_data %>%
head() %>%
kable() %>%
kable_material(c("striped", "hover"), full_width = F)
| y_true | name | x | y | center |
|---|---|---|---|---|
| 1 | max | -5.920596 | -1.602418 | FALSE |
| 1 | mean | -9.489214 | -5.077230 | TRUE |
| 1 | min | -12.569354 | -8.564183 | FALSE |
| 2 | max | 4.049007 | 17.464265 | FALSE |
| 2 | mean | 2.848125 | 16.245504 | TRUE |
| 2 | min | 1.727480 | 15.115688 | FALSE |
# Shape/size mappings keyed by the levels of the `center` factor
shapes <- setNames(c(16, 3), c(FALSE, TRUE))
sizes <- setNames(c(3, 10), c(FALSE, TRUE))
p1 <- clust_plot_data %>%
ggplot(aes(x = x, y = y, col = y_true, shape = center, size = center)) +
geom_point() +
ggtitle("Cluster Dimensions") +
coord_cartesian(xlim = c(-20, 20), ylim = c(-20, 20)) +
scale_shape_manual(values = shapes) +
scale_size_manual(values = sizes) +
theme(legend.position = "none")
p2 <- clusts %>% ggplot(aes(x = x, y = y, col = y_true)) +
geom_point() +
ggtitle("Cluster Points") +
coord_cartesian(xlim = c(-20, 20), ylim = c(-20, 20)) +
theme(legend.position = "none")
grid.arrange(p1, p2, ncol = 2)
Since the clusters are perfect circles, we can extract each cluster's center by taking the mean of its coordinates and its edges by taking their min and max.
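The same idea can be sanity-checked for a single cluster in base R. A minimal sketch, using the `clusts` data frame from above:
# Sanity check for one cluster: center via the mean, edges via the range
pts <- subset(clusts, y_true == 1, select = c(x, y))
colMeans(pts)       # cluster center
sapply(pts, range)  # row 1 = min, row 2 = max of each coordinate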
# Prepare clusters
clusts <- make_clusters(4, c_range = c(-20, 20), density = 500, categorical = F, seed = 2)
# Prepare arguments
vals <- setNames(data.frame(expand.grid(c(5, 10, 20), c(1, 10, 20))), c("n_iter", "n_start"))
vals$n_clust <- 4
vals$fit_name <- rep(c("A", "B", "C"), 3)
# Loop over argument combinations and fit Kmeans
out <- mapply(get_mean_clusters,
fit_name = vals$fit_name,
n_clust = vals$n_clust,
iter = vals$n_iter,
n_start = vals$n_start,
MoreArgs = list(data = clusts, seed = 1),
SIMPLIFY = F
) %>%
bind_rows() %>%
mutate(y_pred = factor(y_pred))
p1 <- ggplot(out, aes(x = x, y = y, col = y_true)) +
geom_point(size = 0.5) +
ggtitle("Ground Truth") +
theme(legend.position = "none")
p2 <- ggplot(out, aes(x = x, y = y, col = y_pred)) +
geom_point(size = 0.5) +
facet_wrap(~title, labeller = labeller(title = label_wrap_gen(20)), ncol = 3) +
ggtitle("Kmeans", subtitle = "Varying Iterations and Starts") +
theme(legend.position = "none")
grid.arrange(p1, p2,
widths = c(3, 4), heights = c(3, 9),
layout_matrix = rbind(
c(1, NA),
c(2, 2)
)
)
Note how important it is to use multiple starts so the model can converge on a good solution: no amount of iterations will help if the starting centers are very poor. Additionally, Kmeans is an unsupervised learning method, which means it can find distinct clusters, but those clusters are not guaranteed to carry the same labels as the ground truth. The trade-off is runtime: the more iterations and starts specified, the longer the fit takes.
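Because the predicted labels are arbitrary, comparing a fit against the ground truth takes one extra alignment step. A minimal sketch, assuming `out` carries the per-fit `title` column used in the facets above: cross-tabulate the labels and remap each predicted cluster to the true label it overlaps most.
# Align arbitrary Kmeans labels with the ground truth for one fit
fit_a <- subset(out, title == unique(out$title)[1])
tab <- table(fit_a$y_pred, fit_a$y_true)               # rows: predicted, cols: true
label_map <- colnames(tab)[apply(tab, 1, which.max)]   # majority-vote mapping
fit_a$y_aligned <- label_map[as.integer(fit_a$y_pred)]
mean(fit_a$y_aligned == fit_a$y_true)                  # accuracy after alignment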
# Prepare clusters
clusts <- make_clusters(4, c_range = c(-20, 20), density = 500, categorical = 0.9, seed = 2)
# Prepare arguments
vals <- setNames(data.frame(expand.grid(c(5, 10, 20), c(1, 10, 20))), c("n_iter", "n_start"))
vals$n_clust <- 4
vals$fit_name <- rep(c("A", "B", "C"), 3)
# Loop over argument combinations and fit Kprototypes
out <- mapply(get_mean_clusters,
fit_name = vals$fit_name,
n_clust = vals$n_clust,
iter = vals$n_iter,
n_start = vals$n_start,
MoreArgs = list(data = clusts, seed = 1),
SIMPLIFY = F
) %>%
bind_rows() %>%
mutate(y_pred = factor(y_pred))
p1 <- ggplot(out, aes(x = x, y = y, col = y_true)) +
geom_point(size = 0.5) +
ggtitle("Ground Truth") +
theme(legend.position = "none")
p2 <- ggplot(out, aes(x = x, y = y, col = y_pred)) +
geom_point(size = 0.5) +
facet_wrap(~title, labeller = labeller(title = label_wrap_gen(20)), ncol = 3) +
ggtitle("Kprototypes", subtitle = "Varying Iterations and Starts") +
theme(legend.position = "none")
grid.arrange(p1, p2,
widths = c(3, 4), heights = c(3, 9),
layout_matrix = rbind(
c(1, NA),
c(2, 2)
)
)
Note that the Kprototypes algorithm can handle the addition of a categorical variable that would be meaningless to Kmeans. Varying the start locations remains just as important for finding a good solution.
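The fits above run through the `get_mean_clusters` wrapper. For reference, a direct fit might look like the sketch below, assuming the `clustMixType` package supplies the `kproto()` implementation and that the categorical column in `clusts` is stored as a factor.
# Sketch: fit Kprototypes directly on the mixed data (assumes clustMixType)
library(clustMixType)
kp_data <- as.data.frame(clusts %>% select(-y_true))  # numeric x/y plus the categorical column
kp_fit <- kproto(kp_data, k = 4, nstart = 10, iter.max = 20)
table(kp_fit$cluster, clusts$y_true)                  # labels are arbitrary, so cross-tabulate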
# Basic Kmeans function
my_km <- function(data, k, seed){
set.seed(seed)
kmeans(x=data, centers=k, nstart=5, iter.max=100)
}
# Generate 4 cluster solution
n_clust <- 4
clusts <- make_clusters(n_clust, c_range = c(-20, 20), density = 500, categorical = F, seed = 3)
# Scale data for clustering
km_data <- clusts %>% select(x, y) %>% mutate_all(~scale(.) %>% as.vector())
# Loop over [2, 8] clusters
ks <- seq(2, 8)
# Apply Kmeans to data for 2-8 clusters
fits <- lapply(ks, my_km, data=km_data, seed=1)
# Extract SSE from fits
sses <- sapply(fits, function(x) x$tot.withinss)
sse_data <- data.frame(K=ks, SSE=sses)
p1 <- ggplot(clusts, aes(x = x, y = y, col = y_true)) +
geom_point(size = 0.5) +
ggtitle("Ground Truth") +
theme(legend.position = "none")
p2 <- ggplot(sse_data, aes(x=K, y=SSE)) +
geom_line() + geom_label(aes(label=scales::comma(SSE)), size=3) +
scale_x_continuous(breaks=ks) + coord_cartesian(xlim=c(1, 10)) +
scale_y_continuous(breaks=scales::pretty_breaks(n=8), labels=scales::comma_format()) +
ggtitle("Sum of Squares Error for Different K", subtitle=glue("Ground truth: K={n_clust}"))
grid.arrange(p1, p2, ncol = 2)
The SSE elbow plot illustrates the point at which adding more cluster centers stops adding much value. A lower SSE is better; however, SSE generally decreases as K grows, because every additional center can only bring points closer to their nearest center. The point at which the decrease begins to stabilize is a good choice for K. In this case that point is K=4, as the curve flattens out thereafter, which matches the ground truth of four distinct clusters in this synthetically generated data.
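Reading the elbow by eye works here, but a simple numeric heuristic can back it up. The sketch below picks the K where the drop in SSE slows the most (the largest second difference), under the assumption that the curve is roughly convex.
# Heuristic elbow: the K where the SSE decrease slows the most
# diff(diff(sses))[i] is centered at ks[i + 1]
elbow_k <- ks[which.max(diff(diff(sses))) + 1]
elbow_k  # expected to land on 4 for these well-separated clusters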
# Compute pairwise Euclidean distances between all points
euclidean_dist_mat <- daisy(km_data, metric="euclidean")
# get cluster labels from cluster solutions
clust_labs <- setNames(lapply(fits, function(x) x$cluster), glue("cluster_{ks}"))
# extract silhouette scores from cluster solutions
sil_summaries <- lapply(clust_labs, function(x, ...) summary(silhouette(x, ...)),
dist=euclidean_dist_mat)
# use average width as silhouette metric
sil_data <- data.frame(K=ks, avg.sil=sapply(sil_summaries, function(x) x$avg.width))
p1 <- ggplot(clusts, aes(x = x, y = y, col = y_true)) +
geom_point(size = 0.5) +
ggtitle("Ground Truth") +
theme(legend.position = "none")
p2 <- ggplot(sil_data, aes(x=K, y=avg.sil)) +
geom_line() + geom_label(aes(label=round(avg.sil, 2)), size=3) +
scale_x_continuous(breaks=ks) +
scale_y_continuous(name="Average Width", breaks=seq(0,1,0.2), labels=seq(0,1,0.2)) +
coord_cartesian(xlim=c(1,10), ylim=c(0,1)) +
ggtitle("Silhouette Average Width for Different K", subtitle=glue("Ground truth: K={n_clust}"))
grid.arrange(p1, p2, ncol = 2)
The silhouette index ranges over [-1, 1], where -1 indicates points likely assigned to the wrong cluster and 1 indicates perfect separation. In our case the ground truth is K=4; however, from the results it would appear the optimal solution is K=3. Cluster performance assessment is part art and part science given the nature of unsupervised learning: the optimal solution we determine from the data may not always match the ground truth, but it should be close!
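For a single point i the silhouette is s(i) = (b(i) - a(i)) / max(a(i), b(i)), where a(i) is the mean distance to the other points in its own cluster and b(i) is the mean distance to the nearest other cluster. A hand computation for the first point under the K=4 solution makes the definition concrete:
# Manual silhouette for point 1 under the K = 4 solution
labs <- clust_labs[["cluster_4"]]
d <- as.matrix(euclidean_dist_mat)
own <- labs == labs[1]
a_i <- mean(d[1, own][-1])                        # mean distance within own cluster (excluding self)
b_i <- min(tapply(d[1, !own], labs[!own], mean))  # mean distance to the nearest other cluster
(b_i - a_i) / max(a_i, b_i)                       # matches silhouette()'s sil_width for point 1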