Introduction to Unsupervised Learning

Unsupervised Learning

  • Supervised learning has the goal of making predictions with a set of known labels for the response variable.
  • In unsupervised learning, we try to find structure in the data of the response variable without predetermined labels.

Goal: organize the items available in the Animal Crossing video game

Data set: Animal Crossing

Source: VillagerDB, MetaCritic, and TidyTuesday

library("ggrepel")
library("tidyverse")

# critic <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/critic.tsv')
# user_reviews <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/user_reviews.tsv')
items <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/items.csv')
# villagers <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/villagers.csv')

Exploratory Data Analysis

variables

dim(items)
## [1] 4565   16
str(items)
## tibble [4,565 x 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ num_id       : num [1:4565] 12 14 17 19 20 21 23 24 26 26 ...
##  $ id           : chr [1:4565] "3d-glasses" "a-tee" "abstract-wall" "academy-uniform" ...
##  $ name         : chr [1:4565] "3D Glasses" "A Tee" "Abstract Wall" "Academy Uniform" ...
##  $ category     : chr [1:4565] "Accessories" "Tops" "Wallpaper" "Dresses" ...
##  $ orderable    : logi [1:4565] NA NA TRUE NA FALSE TRUE ...
##  $ sell_value   : num [1:4565] 122 140 390 520 2000 375 420 330 8990 8990 ...
##  $ sell_currency: chr [1:4565] "bells" "bells" "bells" "bells" ...
##  $ buy_value    : num [1:4565] 490 560 1560 2080 NA ...
##  $ buy_currency : chr [1:4565] "bells" "bells" "bells" "bells" ...
##  $ sources      : chr [1:4565] NA NA "Nook's Cranny" NA ...
##  $ customizable : logi [1:4565] NA NA NA NA NA FALSE ...
##  $ recipe       : num [1:4565] NA NA NA NA NA NA NA NA NA NA ...
##  $ recipe_id    : chr [1:4565] NA NA NA NA ...
##  $ games_id     : chr [1:4565] "nh" "nh" "nh" "nh" ...
##  $ id_full      : chr [1:4565] "item-3d-glasses" "item-a-tee" NA "item-academy-uniform" ...
##  $ image_url    : chr [1:4565] "https://villagerdb.com/images/items/full/3d-glasses.b8e1b5a.png" "https://villagerdb.com/images/items/full/a-tee.a1af443.png" NA "https://villagerdb.com/images/items/full/academy-uniform.4475eb5.png" ...
##  - attr(*, "problems")= tibble [2 x 5] (S3: tbl_df/tbl/data.frame)
##   ..$ row     : int [1:2] 4472 4473
##   ..$ col     : chr [1:2] "customizable" "customizable"
##   ..$ expected: chr [1:2] "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE"
##   ..$ actual  : chr [1:2] "Yes" "Yes"
##   ..$ file    : chr [1:2] "'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/items.csv'" "'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/items.csv'"
##  - attr(*, "spec")=
##   .. cols(
##   ..   num_id = col_double(),
##   ..   id = col_character(),
##   ..   name = col_character(),
##   ..   category = col_character(),
##   ..   orderable = col_logical(),
##   ..   sell_value = col_double(),
##   ..   sell_currency = col_character(),
##   ..   buy_value = col_double(),
##   ..   buy_currency = col_character(),
##   ..   sources = col_character(),
##   ..   customizable = col_logical(),
##   ..   recipe = col_double(),
##   ..   recipe_id = col_character(),
##   ..   games_id = col_character(),
##   ..   id_full = col_character(),
##   ..   image_url = col_character()
##   .. )
table(items$category)
## 
## Accessories     Bottoms        Bugs     Dresses        Fish    Flooring 
##         123         142          80         239         101         324 
##     Flowers     Fossils       Fruit   Furniture        Hats       Music 
##         109          76          12        1264         230          95 
##      Photos   Seashells       Shoes       Socks       Tools        Tops 
##         840           8          89          60          74         289 
##   Umbrellas     Usables   Wallpaper 
##          67          88         255
summary(items$buy_value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      40     870    1300    6960    2700 1200000    1014
# table(items$buy_currency)
summary(items$sell_value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       5     240     390    2261    1000  300000      36
summary(items$num_id)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12    1722    3569    3662    5607    7443
summary(items$recipe)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.000   3.000   4.711   6.000  90.000    3977

numerical variables

items %>%
  ggplot(aes(x = buy_value, y = sell_value, color = category)) +
  geom_point() +
  coord_trans(x = "log10") + 
  labs(title = "Items in Animal Crossing",
       subtitle = "Sell Values vs Buy Values",
       caption = "Math 32") +
  scale_x_continuous(name = "Buy Value",
                     breaks = c(0, 10, 100, 1000, 10000, 100000, 1000000),
                     labels = c("0", "10", "100", "1000", "10000", "100000", "1000000")) +
  scale_y_continuous(name = "Sell Value",
                     breaks = c(0, 50000, 100000, 150000, 200000, 250000, 300000),
                     labels = c("0", "50000", "100000", "150000", "200000", "250000", "300000")) +
  theme_minimal()
## Warning: Removed 1035 rows containing missing values (geom_point).

K-Means Clustering

first look

df_numerical <- items %>%
  select(num_id, buy_value, sell_value, recipe) %>%
  filter(complete.cases(.))

summary(df_numerical$buy_value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     240    3780    7080   16912   16480  320000
summary(df_numerical$sell_value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      60     945    1770    4228    4120   80000
clusters4 <- kmeans(df_numerical, 4)

items_with_clusters <- cbind(df_numerical, clusters4$cluster)
colnames(items_with_clusters) <- c("ID", "buy", "sell", "recipe", "cluster")
items_with_clusters$cluster <- factor(items_with_clusters$cluster)
x_breaks <- c(0, 10, 100, 1000, 10000, 100000, 1000000)
y_breaks <- c(0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000)

items_with_clusters %>%
  ggplot() +
  geom_point(aes(x = buy, y = sell)) +
  geom_point(aes(x = buy_value, y = sell_value), 
             data = data.frame(clusters4$centers),
             size = 5,
             color = "blue") +
  coord_trans(x = "log10") +
  labs(title = "Items in Animal Crossing",
       subtitle = "Cluster Centers",
       caption = "Math 32") +
  scale_x_continuous(name = "Buy Value",
                     breaks = x_breaks,
                     labels = paste(x_breaks)) +
  scale_y_continuous(name = "Sell Value",
                     breaks = y_breaks,
                     labels = paste(y_breaks)) +
  theme_minimal()

items_with_clusters %>%
  ggplot(group = cluster) +
  geom_point(aes(x = buy, y = sell, color = cluster), size = 3) +
  coord_trans(x = "log10") +
  labs(title = "Items in Animal Crossing",
       subtitle = "4 Clusters",
       caption = "Math 32") +
  scale_x_continuous(name = "Buy Value",
                     breaks = x_breaks,
                     labels = paste(x_breaks)) +
  scale_y_continuous(name = "Sell Value",
                     breaks = y_breaks,
                     labels = paste(y_breaks)) +
  theme_minimal()

selecting the amount of clusters

Selecting the number of clusters is rather subjective.

  • too few clusters: items that are “far” from each other are still grouped together
  • too many clusters: harder to describe and label groups later

We start with an $d$-dimensional data set of numerical variables and prescribe a number $k$ for the number of clusters and run the kmeans algorithm.

  • Each cluster $C{k}$ has $n{k}$ points labeled $x_{i}$ in $d$-dimensional space
  • Each cluster has a cluster center $\mu_{k}$
  • Each cluster has a within-sum-of-squares

$$WSS = \displaystyle\sum{x{i} \in C{k}} (x{i} - \mu_{k})^{2}$$ Thus, our metric for the clustering will be the reported total-within-sum-of-squares

$$TWSS = \displaystyle\sum{j=1}^{k}\sum{x{i} \in C{k}} (x{i} - \mu{k})^{2}$$ For example, the total-within-sum-of-squares for the 4-cluster results above is 3.1486354\times 10^{10}.

scree plot

It is common practice to build a scree plot and look for the “elbow”.

clusters1 <- kmeans(df_numerical, 1)
clusters2 <- kmeans(df_numerical, 2)
clusters3 <- kmeans(df_numerical, 3)
clusters4 <- kmeans(df_numerical, 4)
clusters5 <- kmeans(df_numerical, 5)
clusters6 <- kmeans(df_numerical, 6)
clusters7 <- kmeans(df_numerical, 7)
clusters8 <- kmeans(df_numerical, 8)
clusters9 <- kmeans(df_numerical, 9)

TSS_vector <- c(clusters1$tot.withinss, clusters2$tot.withinss, clusters3$tot.withinss, 
                clusters4$tot.withinss, clusters5$tot.withinss, clusters6$tot.withinss, 
                clusters7$tot.withinss, clusters8$tot.withinss, clusters9$tot.withinss)

k_vals <- 1:9
df_for_scree_plot <- data.frame(k_vals, TSS_vector)
df_for_scree_plot %>%
  ggplot(aes(x = k_vals, y = TSS_vector)) +
  geom_point(size = 3) +
  geom_line() +
  labs(title = "Scree Plot",
       subtitle = "(find the 'elbow')",
       caption = "Math 32",
       x = "number of clusters",
       y = "total-within-sum-of-squares") +
  scale_x_discrete(name = "number of clusters",
                   breaks = paste(1:9),
                   labels = paste(1:9),
                   limits = 1:9) +
  theme_minimal()

Conclusion

It appears that the optimal number of clusters for our data set is $k = 3$.

items_with_clusters <- cbind(df_numerical, clusters3$cluster)
colnames(items_with_clusters) <- c("ID", "buy", "sell", "recipe", "cluster")
items_with_clusters$cluster <- factor(items_with_clusters$cluster)

df_categorical <- items %>%
  select(num_id, name, category) %>%
  filter(complete.cases(.))
colnames(df_categorical) <- c("ID", "name", "category")

main_df <- df_categorical %>%
  inner_join(items_with_clusters, by = "ID")
main_df %>%
  ggplot(group = cluster) +
  geom_point(aes(x = buy, y = sell, color = cluster), size = 3) +
  coord_trans(x = "log10") +
  labs(title = "Items in Animal Crossing",
       subtitle = "Optimal Clustering",
       caption = "Math 32") +
  scale_x_continuous(name = "Buy Value",
                     breaks = x_breaks,
                     labels = paste(x_breaks)) +
  scale_y_continuous(name = "Sell Value",
                     breaks = y_breaks,
                     labels = paste(y_breaks)) +
  theme_minimal()

Appendix: label a sample of points from each cluster

label_df <- main_df %>%
  group_by(cluster) %>%
  sample_n(2)

main_df %>%
  ggplot(group = cluster) +
  geom_point(aes(x = buy, y = sell, color = cluster), size = 3) +
  geom_label_repel(aes(x = buy, y = sell, label = name), 
                   data = label_df,
                   nudge_x = -1000,
                   nudge_y = 10000) +
  coord_trans(x = "log10") +
  labs(title = "Items in Animal Crossing",
       subtitle = "Optimal Clustering",
       caption = "Math 32") +
  scale_x_continuous(name = "Buy Value",
                     breaks = x_breaks,
                     labels = paste(x_breaks)) +
  scale_y_continuous(name = "Sell Value",
                     breaks = y_breaks,
                     labels = paste(y_breaks)) +
  theme_minimal()

Related