HW5 - Analyze Data (Clustering)
2023-10-28
Preamble
# Load the tidyverse without its startup banner. suppressMessages() has to
# wrap the library() call itself; wrapping the package name as a string only
# echoes that string and silences nothing.
suppressMessages(library(tidyverse))
# Fix the RNG seed so every run draws the same sample.
set.seed(7)
# Cluster centers, kept as parallel vectors of x and y coordinates.
centerx <- c(3, -3, 0)
centery <- c(3, 0, -3)
# Number of points drawn for each cluster.
n <- 25
# Simulated coordinates: normal noise around each center. The draws are made
# in the order x1, y1, x2, y2, x3, y3, which determines the part of the random
# stream each vector receives.
x1 <- rnorm(n, mean = centerx[1], sd = 1)
y1 <- rnorm(n, mean = centery[1], sd = 2)
x2 <- rnorm(n, mean = centerx[2], sd = 2)
y2 <- rnorm(n, mean = centery[2], sd = 1)
x3 <- rnorm(n, mean = centerx[3], sd = 2)
y3 <- rnorm(n, mean = centery[3], sd = 2)
# Stack the three clusters into a single two-column dataset.
tib <- tibble(x = c(x1, x2, x3), y = c(y1, y2, y3))
# Starting centroids handed to the clustering routines.
init_centers_tib <- tibble(x = c(-2, 0, 2), y = c(-1, 3, 0))
Question 1
Write code to visualize the dataset and the initial clusters with the following guidelines: 1.
Use ggplot with geom_point layers to produce a scatter plot. 2. Overlay the initial centers on
top of the data points with both a custom color and shape so that the initial centers are
visually noticeable.
# Question 1: scatter plots of the data, without and with the initial centers.
library(ggplot2)
# Plain scatter plot of the simulated observations.
tib |>
  ggplot(aes(x = x, y = y)) +
  geom_point() +
  ggtitle("Scatterplot for Initial Clusters") +
  theme_minimal()
# Same scatter plot with the initial centers drawn on top as large green
# diamonds (shape 18) so they stand out from the data points.
tib |>
  ggplot(aes(x = x, y = y)) +
  geom_point() +
  geom_point(
    data = init_centers_tib,
    colour = "green",
    shape = 18,
    size = 4
  ) +
  ggtitle("The Scatterplot with Marked Initial Centers") +
  theme_minimal()
Question 2
Write your own kmeans function with the following guidelines: 1. Call your custom
function k_means. Note the underscore as we do not want to mask the built in kmeans
function. 2. This should take two arguments: • tib: your actual data. • centers: your initial
centroids. 3. The output should be a vector of cluster assignments that correspond with
each observation of the original dataset tib.
# k-means clustering (Lloyd's algorithm) from user-supplied starting centroids.
#
# Arguments:
#   tib     - data frame, tibble or matrix of numeric coordinates, one row per
#             observation.
#   centers - initial centroids, one row per cluster, with the same columns
#             (in the same order) as `tib`.
#
# Returns an integer vector with one entry per row of `tib`: the row index in
# `centers` of the cluster that observation is finally assigned to.
# A cluster that loses all of its members keeps its previous centroid.
k_means <- function(tib, centers) {
  # Work on plain numeric matrices so row/column arithmetic is predictable.
  points <- as.matrix(tib)
  centers <- unname(as.matrix(centers))
  stopifnot(
    is.numeric(points),
    is.numeric(centers),
    !anyNA(points),
    !anyNA(centers),
    nrow(centers) >= 1L,
    ncol(points) == ncol(centers)
  )

  num_observations <- nrow(points)
  num_centroids <- nrow(centers)
  if (num_observations == 0L) {
    return(integer(0))
  }

  cluster_assignments <- integer(num_observations)

  repeat {
    # Squared Euclidean distance from every observation (rows) to every
    # centroid (columns). sweep() subtracts the centroid coordinate by
    # coordinate; a bare `vector - matrix` would recycle the vector down the
    # columns and mix the x and y coordinates.
    sq_dist <- vapply(
      seq_len(num_centroids),
      function(j) rowSums(sweep(points, 2L, centers[j, ])^2),
      numeric(num_observations)
    )
    # vapply() collapses to a plain vector when there is one observation.
    sq_dist <- matrix(sq_dist, nrow = num_observations)

    # Nearest centroid per observation; ties go to the lowest index.
    new_assignments <- apply(sq_dist, 1L, which.min)

    # Converged once no observation changes cluster.
    if (identical(new_assignments, cluster_assignments)) {
      break
    }
    cluster_assignments <- new_assignments

    # Move each centroid to the mean of its members. drop = FALSE keeps a
    # one-member cluster as a matrix so colMeans() still works; an empty
    # cluster is left where it was instead of becoming NaN.
    for (j in seq_len(num_centroids)) {
      members <- points[cluster_assignments == j, , drop = FALSE]
      if (nrow(members) > 0L) {
        centers[j, ] <- colMeans(members)
      }
    }
  }

  cluster_assignments
}
# Cluster the data twice from the same starting centroids: once with the
# hand-written k_means() and once with the built-in kmeans().
custom_res <- k_means(tib, centers = init_centers_tib)
builtin_res <- kmeans(tib, centers = init_centers_tib)
# Show the cluster assignments produced by the custom implementation.
print(custom_res)
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 3 1 1 1 1 1 1
1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 1 3 3 3 3 3 3 3 3 1 3 3 3 3 3 1 3 3
3 3
# Print the object stored in builtin_res (the result of the kmeans() call).
print(builtin_res)
## K-means clustering with 3 clusters of sizes 27, 25, 23
##
## Cluster means:
## x y
## 1 -2.7811405 -0.3899246
## 2 3.4640362 3.0267708
## 3 0.8813379 -2.6080654
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 3 1 1 1 1 1 1
1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 1 3 3 3 3 3 3 3 3 1 3 3 3 3 3 1 3 3
3 3
##
## Within cluster sum of squares by cluster:
## [1] 107.89646 87.12228 113.80475
## (between_SS / total_SS = 74.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
"tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Attach both sets of cluster labels to the original observations.
results_tib <- mutate(
  tib,
  custom_cluster = custom_res,
  builtin_cluster = builtin_res$cluster
)
# Dimensions of the subset where the two labelings disagree; zero rows means
# both implementations assigned every observation to the same cluster.
dim(filter(results_tib, custom_cluster != builtin_cluster))
## [1] 0 4
The number of rows is 0 because every cluster assignment from the custom k_means function
matches the corresponding assignment from the built-in kmeans function.
Question 3
# Convert the custom cluster labels to a factor so ggplot maps them to a
# discrete colour scale. The result must be assigned back to results_tib;
# otherwise the conversion is discarded and later plots still see numbers.
results_tib <- results_tib |>
  mutate(
    custom_cluster = as_factor(custom_cluster)
  )
# Preview the updated tibble.
results_tib
## # A tibble: 75 × 4
## x y custom_cluster builtin_cluster
## <dbl> <dbl> <fct> <int>
## 1 5.29 3.37 2 2
## 2 1.80 4.50 2 2
## 3 2.31 4.18 2 2
## 4 2.59 1.03 2 2
## 5 2.03 2.45 2 2
## 6 2.05 1.26 2 2
## 7 3.75 4.44 2 2
## 8 2.88 3.22 2 2
## 9 3.15 2.84 2 2
## 10 5.19 2.16 2 2
## # ℹ 65 more rows
# Scatter plot of the observations coloured by their custom cluster label.
results_tib |>
  ggplot(aes(x = x, y = y, colour = custom_cluster)) +
  ggtitle("Graphical Representation of Custom Clusters") +
  geom_point() +
  theme_minimal()
The clusters appear to be located around their respective centroids.