Introduction to Machine Learning

Goals for today

  • introduce machine learning (ideas and terminology)
  • introduce tidymodels package
  • practice with a TidyTuesday data set
library("tidymodels")
library("tidyverse")

Data: Tour de France

Source: TidyTuesday data set from April 7, 2020

tdf_winners <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-07/tdf_winners.csv')
str(tdf_winners)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 106 obs. of  19 variables:
##  $ edition      : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ start_date   : Date, format: "1903-07-01" "1904-07-02" ...
##  $ winner_name  : chr  "Maurice Garin" "Henri Cornet" "Louis Trousselier" "René Pottier" ...
##  $ winner_team  : chr  "La Française" "Conte" "Peugeot–Wolber" "Peugeot–Wolber" ...
##  $ distance     : num  2428 2428 2994 4637 4488 ...
##  $ time_overall : num  94.6 96.1 NA NA NA ...
##  $ time_margin  : num  2.99 2.27 NA NA NA ...
##  $ stage_wins   : num  3 1 5 5 2 5 6 4 2 3 ...
##  $ stages_led   : num  6 3 10 12 5 13 13 3 13 13 ...
##  $ height       : num  1.62 NA NA NA NA NA 1.78 NA NA NA ...
##  $ weight       : num  60 NA NA NA NA NA 88 NA NA NA ...
##  $ age          : num  32 19 24 27 24 25 22 22 26 23 ...
##  $ born         : Date, format: "1871-03-03" "1884-08-04" ...
##  $ died         : Date, format: "1957-02-19" "1941-03-18" ...
##  $ full_name    : chr  NA NA NA NA ...
##  $ nickname     : chr  "The Little Chimney-sweep" "Le rigolo (The joker)" "Levaloy / Trou-trou" NA ...
##  $ birth_town   : chr  "Arvier" "Desvres" "Paris" "Moret-sur-Loing" ...
##  $ birth_country: chr  "Italy" "France" "France" "France" ...
##  $ nationality  : chr  " France" " France" " France" " France" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   edition = col_double(),
##   ..   start_date = col_date(format = ""),
##   ..   winner_name = col_character(),
##   ..   winner_team = col_character(),
##   ..   distance = col_double(),
##   ..   time_overall = col_double(),
##   ..   time_margin = col_double(),
##   ..   stage_wins = col_double(),
##   ..   stages_led = col_double(),
##   ..   height = col_double(),
##   ..   weight = col_double(),
##   ..   age = col_double(),
##   ..   born = col_date(format = ""),
##   ..   died = col_date(format = ""),
##   ..   full_name = col_character(),
##   ..   nickname = col_character(),
##   ..   birth_town = col_character(),
##   ..   birth_country = col_character(),
##   ..   nationality = col_character()
##   .. )
colnames(tdf_winners)
##  [1] "edition"       "start_date"    "winner_name"   "winner_team"  
##  [5] "distance"      "time_overall"  "time_margin"   "stage_wins"   
##  [9] "stages_led"    "height"        "weight"        "age"          
## [13] "born"          "died"          "full_name"     "nickname"     
## [17] "birth_town"    "birth_country" "nationality"

Cleaning Data

df <- tdf_winners %>%
  select(c(distance, time_overall, 
           height, weight, age)) %>%
  filter(complete.cases(.)) %>%
  filter(height >= 1.7) %>%
  mutate(pace = distance / time_overall) %>%
  select(c(pace, height, weight, age))
dim(df)
## [1] 62  4

Data Visualization

df %>%
  ggplot(aes(x = height, y = pace)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Are taller bicyclists faster?",
       subtitle = "featuring Tour de France winners",
       caption = "Source: TidyTuesday",
       x = "height (meters)",
       y = "pace (km/hr)") +
  theme_minimal()

df %>%
  ggplot(aes(x = age, y = pace)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Are older bicyclists faster?",
       subtitle = "featuring Tour de France winners",
       caption = "Source: TidyTuesday",
       x = "age",
       y = "pace (km/hr)") +
  theme_minimal()

df %>%
  ggplot(aes(x = weight, y = pace)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Are heavier bicyclists faster?",
       subtitle = "featuring Tour de France winners",
       caption = "Source: TidyTuesday",
       x = "weight (kg)",
       y = "pace (km/hr)") +
  theme_minimal()

First Model

“With tidymodels, we start by specifying the functional form of the model that we want using the parsnip package.”

linear_reg()
## Linear Regression Model Specification (regression)

model engine

“However, now that the type of model has been specified, a method for fitting or training the model can be stated using the engine. The engine value is often a mash-up of the software that can be used to fit or train the model as well as the estimation method.”

linear_reg() %>% 
  set_engine("lm") #linear model
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm

fitting a model

lm_fit <- linear_reg() %>% 
  set_engine("lm") %>%
  fit(pace ~ height + weight + age, data = df)
lm_fit
## parsnip model object
## 
## Fit time:  0ms 
## 
## Call:
## stats::lm(formula = formula, data = data)
## 
## Coefficients:
## (Intercept)       height       weight          age  
##      3.8455      21.0987      -0.1387       0.2113

examining a model

tidy(lm_fit)
## # A tibble: 4 x 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)    3.85    12.3        0.313  0.755 
## 2 height        21.1      8.06       2.62   0.0112
## 3 weight        -0.139    0.0685    -2.03   0.0474
## 4 age            0.211    0.0979     2.16   0.0350

interaction terms

lm_fit_with_interaction <- linear_reg() %>% 
  set_engine("lm") %>%
  fit(pace ~ height + weight + age + height:weight + height:age + weight:age +
        height:weight:age,
      data = df)
lm_fit_with_interaction
## parsnip model object
## 
## Fit time:  0ms 
## 
## Call:
## stats::lm(formula = formula, data = data)
## 
## Coefficients:
##       (Intercept)             height             weight                age  
##          924.8499          -444.1560           -15.6339           -27.8628  
##     height:weight         height:age         weight:age  height:weight:age  
##            7.9297            13.9352             0.4802            -0.2425
tidy(lm_fit_with_interaction)
## # A tibble: 8 x 5
##   term              estimate std.error statistic p.value
##   <chr>                <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)        925.     2272.        0.407   0.686
## 2 height            -444.     1287.       -0.345   0.731
## 3 weight             -15.6      32.8      -0.477   0.635
## 4 age                -27.9      80.3      -0.347   0.730
## 5 height:weight        7.93     18.5       0.428   0.670
## 6 height:age          13.9      45.5       0.306   0.761
## 7 weight:age           0.480     1.16      0.414   0.680
## 8 height:weight:age   -0.243     0.656    -0.370   0.713

Predictions

new data

  • SpongeBob is a 26-year-old, 1.77 m tall bicyclist who weighs 55 kg
  • Patrick is a 25-year-old, 1.81 m tall bicyclist who weighs 75 kg
  • Squidward is a 31-year-old, 1.89 m tall bicyclist who weighs 65 kg
new_contestants <- data.frame(name = c("SpongeBob", "Patrick", "Squidward"),
                              age = c(26, 25, 31),
                              height = c(1.77, 1.81, 1.89),
                              weight = c(55, 75, 65))

mean_predictions <- predict(lm_fit, new_data = new_contestants)
mean_predictions
## # A tibble: 3 x 1
##   .pred
##   <dbl>
## 1  39.1
## 2  36.9
## 3  41.3

confidence intervals

CI_predictions <- predict(lm_fit,
                          new_data = new_contestants,
                          type = "conf_int")
CI_predictions
## # A tibble: 3 x 2
##   .pred_lower .pred_upper
##         <dbl>       <dbl>
## 1        37.1        41.0
## 2        35.9        38.0
## 3        39.0        43.5

error bars

plot_df <- new_contestants %>%
  bind_cols(mean_predictions) %>%
  bind_cols(CI_predictions)
plot_df
##        name age height weight    .pred .pred_lower .pred_upper
## 1 SpongeBob  26   1.77     55 39.05386    37.07966    41.02807
## 2   Patrick  25   1.81     75 36.91179    35.85758    37.96601
## 3 Squidward  31   1.89     65 41.25491    38.97189    43.53794
plot_df %>%
  ggplot(aes(x = name)) +
  geom_errorbar(aes(ymin = .pred_lower,
                    ymax = .pred_upper),
                color = "red",
                width = 0.32) +
  geom_point(aes(y = .pred), color = "blue", size = 5) +
  labs(title = "Tour de Under the Sea",
       subtitle = "Welcome the new contestants!",
       caption = "Math 32",
       x = "",
       y = "pace (km/hr)") +
  theme_minimal()

Data Splitting

  • “The training set is used to estimate parameters, compare models and feature engineering techniques, tune models, etc.”

  • “The test set is held in reserve until the end of the project, at which point there should only be one or two models under serious consideration. It is used as an unbiased source for measuring final model performance.”

data_split <- initial_split(df)
train_df <- training(data_split)
test_df <- testing(data_split)
print(paste("The number of observations in the training set is:", nrow(train_df)))
## [1] "The number of observations in the training set is: 47"
print(paste("The number of observations in the testing set is:", nrow(test_df)))
## [1] "The number of observations in the testing set is: 15"

visualizing the split

data_split <- initial_split(df)
train_df <- training(data_split)
test_df <- testing(data_split)

train_df %>%
  ggplot(aes(x = height, y = pace)) +
  geom_point(aes(color = "training set"), 
             # color = "black"
             ) +
  geom_smooth(method = "lm",
              aes(x = height, y = pace),
              color = "black",
              data = train_df,
               se = FALSE) +
  geom_point(aes(x = height, y = pace, color = "testing set"),
             # color = "red",
             data = test_df,
             size = 3) +
  labs(title = "Training and Testing Sets",
       subtitle = "approx 75-25 percent split",
       caption = "Math 32",
       x = "height (meters)",
       y = "pace (km/hr)") +
  scale_color_manual(name = "Data Split",
                     breaks = c("training set", "testing set"),
                     values = c("training set" = "black",
                                "testing set" = "red")) +
  theme_minimal()

Tip: run the last code block several times (keyboard shortcut: CTRL-SHFT-ENTER) for the full effect!