ML Models Exercise

Load and Summarize Data

library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
✔ broom        1.0.12     ✔ recipes      1.3.1 
✔ dials        1.4.2      ✔ rsample      1.3.2 
✔ dplyr        1.2.0      ✔ tailor       0.1.0 
✔ ggplot2      4.0.2      ✔ tidyr        1.3.2 
✔ infer        1.1.0      ✔ tune         2.0.1 
✔ modeldata    1.5.1      ✔ workflows    1.3.0 
✔ parsnip      1.4.1      ✔ workflowsets 1.1.1 
✔ purrr        1.2.1      ✔ yardstick    1.3.2 
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ recipes::step()  masks stats::step()
library(ggplot2)
library(dplyr)  
library(GGally)
library(glmnet)
Loading required package: Matrix

Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':

    expand, pack, unpack
Loaded glmnet 4.1-10
set.seed(1234)

# Load the cleaned dataset (assumes clean-data.rds is in the working directory)
clean_df <- readRDS("clean-data.rds")

# Quick structural overview: rows, columns, types, and first values
glimpse(clean_df)
Rows: 120
Columns: 7
$ Y    <dbl> 2690.52, 2638.81, 2149.61, 1788.89, 3126.37, 2336.89, 3007.20, 27…
$ DOSE <dbl> 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0,…
$ AGE  <dbl> 42, 24, 31, 46, 41, 27, 23, 20, 23, 28, 46, 22, 43, 50, 19, 26, 3…
$ SEX  <fct> 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,…
$ RACE <fct> 2, 2, 1, 1, 2, 2, 1, 88, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1…
$ WT   <dbl> 94.3, 80.4, 71.8, 77.4, 64.3, 74.1, 87.9, 61.9, 65.3, 103.5, 83.0…
$ HT   <dbl> 1.769997, 1.759850, 1.809847, 1.649993, 1.560052, 1.829862, 1.850…

Changing the Race Variable

# Collapse the rare race codes ("7" and "88") into a single "3" category.
# Factor -> character first so the comparison is on the underlying labels.
clean_df <- clean_df %>%
  mutate(
    RACE = as.character(RACE),
    RACE = if_else(RACE %in% c("7", "88"), "3", RACE),
    RACE = factor(RACE)
  )

# Check the resulting category counts
table(clean_df$RACE)

 1  2  3 
74 36 10 

Correlation Plot for Continuous Variables

# Keep only the continuous variables for the correlation matrix
cont_df <- select(clean_df, Y, AGE, WT, HT)

# Pairwise scatterplots and correlations
cont_df %>% ggpairs()

Adding BMI

# Derive body-mass index from weight and height (HT appears to be in meters)
clean_df <- clean_df %>%
  mutate(BMI = WT / HT^2)

# Sanity-check the distribution of the new variable
summary(clean_df$BMI)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  18.69   24.54   26.38   26.63   29.70   32.21 

Cross-validation and Recipe

# 10-fold cross-validation splits over the full dataset
folds <- vfold_cv(clean_df, v = 10)

Linear Regression (model 1)

# Ordinary least squares via the "lm" engine
lm_spec <- linear_reg() %>% set_engine("lm")

# Workflow: regress Y on all remaining columns
lm_wf <- workflow() %>%
  add_model(lm_spec) %>%
  add_formula(Y ~ .)

# Evaluate with 10-fold CV, tracking RMSE and R-squared
lm_cv <- fit_resamples(
  lm_wf,
  resamples = folds,
  metrics = metric_set(rmse, rsq)
)

collect_metrics(lm_cv)
# A tibble: 2 × 6
  .metric .estimator    mean     n std_err .config        
  <chr>   <chr>        <dbl> <int>   <dbl> <chr>          
1 rmse    standard   603.       10 60.3    pre0_mod0_post0
2 rsq     standard     0.603    10  0.0727 pre0_mod0_post0

LASSO Regression (model 2)

# LASSO: mixture = 1 is a pure L1 penalty; penalty fixed at 0.1 for now
lasso_spec <- linear_reg(penalty = 0.1, mixture = 1) %>%
  set_engine("glmnet")

lasso_wf <- workflow() %>%
  add_model(lasso_spec) %>%
  add_formula(Y ~ .)

# Same 10-fold CV evaluation as the linear model
lasso_cv <- fit_resamples(
  lasso_wf,
  resamples = folds,
  metrics = metric_set(rmse, rsq)
)

collect_metrics(lasso_cv)
# A tibble: 2 × 6
  .metric .estimator    mean     n std_err .config        
  <chr>   <chr>        <dbl> <int>   <dbl> <chr>          
1 rmse    standard   603.       10 60.2    pre0_mod0_post0
2 rsq     standard     0.603    10  0.0727 pre0_mod0_post0

Random Forest (model 3)

rngseed <- 1234

# Random forest via ranger; a seed is passed to the engine so the
# tree-building randomness is reproducible
rf_spec <- rand_forest() %>%
  set_mode("regression") %>%
  set_engine("ranger", seed = rngseed)

rf_wf <- workflow() %>%
  add_model(rf_spec) %>%
  add_formula(Y ~ .)

# Same 10-fold CV evaluation as the other models
rf_cv <- fit_resamples(
  rf_wf,
  resamples = folds,
  metrics = metric_set(rmse, rsq)
)

collect_metrics(rf_cv)
# A tibble: 2 × 6
  .metric .estimator    mean     n std_err .config        
  <chr>   <chr>        <dbl> <int>   <dbl> <chr>          
1 rmse    standard   683.       10 66.7    pre0_mod0_post0
2 rsq     standard     0.495    10  0.0700 pre0_mod0_post0

First Fit

# create recipe
rngseed <- 1234
set.seed(rngseed)

# Model Y on all other columns; convert factor predictors (SEX, RACE)
# into dummy/indicator variables so glmnet and ranger can use them
rec <- recipe(Y ~ ., data = clean_df) %>%
  step_dummy(all_nominal_predictors())

Linear Model

# OLS spec and workflow built on the dummy-encoding recipe
lm_spec <- linear_reg() %>% set_engine("lm")

lm_wf <- workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(rec)

# Single fit on the full dataset
lm_fit <- fit(lm_wf, data = clean_df)

LASSO Model

# LASSO with the same fixed penalty, now using the recipe
lasso_spec <- linear_reg(penalty = 0.1, mixture = 1) %>%
  set_engine("glmnet")

lasso_wf <- workflow() %>%
  add_model(lasso_spec) %>%
  add_recipe(rec)

# Single fit on the full dataset
lasso_fit <- fit(lasso_wf, data = clean_df)

Random Forest Model

# Random forest, engine-seeded for reproducibility, using the recipe
rf_spec <- rand_forest() %>%
  set_mode("regression") %>%
  set_engine("ranger", seed = rngseed)

rf_wf <- workflow() %>%
  add_model(rf_spec) %>%
  add_recipe(rec)

# Single fit on the full dataset
rf_fit <- fit(rf_wf, data = clean_df)

Predictions on Full Dataset

# Predict on the (training) data and keep the observed outcome alongside
# each set of predictions for plotting and RMSE computation
lm_pred <- bind_cols(
  predict(lm_fit, new_data = clean_df),
  select(clean_df, Y)
)

lasso_pred <- bind_cols(
  predict(lasso_fit, new_data = clean_df),
  select(clean_df, Y)
)

rf_pred <- bind_cols(
  predict(rf_fit, new_data = clean_df),
  select(clean_df, Y)
)

RMSE for Each Model

# Training-set RMSE for each fitted model, labeled for the summary table
lm_rmse <- lm_pred %>%
  rmse(truth = Y, estimate = .pred) %>%
  mutate(model = "Linear model")

lasso_rmse <- lasso_pred %>%
  rmse(truth = Y, estimate = .pred) %>%
  mutate(model = "LASSO")

rf_rmse <- rf_pred %>%
  rmse(truth = Y, estimate = .pred) %>%
  mutate(model = "Random forest")

# One row per model
bind_rows(lm_rmse, lasso_rmse, rf_rmse) %>%
  select(model, .estimate)
# A tibble: 3 × 2
  model         .estimate
  <chr>             <dbl>
1 Linear model       572.
2 LASSO              572.
3 Random forest      380.

Observed vs Predicted Linear Model

# Observed vs predicted for the linear model; the dashed y = x line
# marks perfect prediction
p_lm <- ggplot(lm_pred, aes(x = Y, y = .pred)) +
  geom_point(alpha = 0.7) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(
    x = "Observed Y",
    y = "Predicted Y",
    title = "Linear Model: Observed vs Predicted"
  ) +
  theme_minimal()
p_lm

Observed vs Predicted LASSO Model

# Observed vs predicted for the LASSO; dashed y = x line is perfect fit
p_lasso <- ggplot(lasso_pred, aes(x = Y, y = .pred)) +
  geom_point(alpha = 0.7) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(
    x = "Observed Y",
    y = "Predicted Y",
    title = "LASSO: Observed vs Predicted"
  ) +
  theme_minimal()
p_lasso

Observed vs Predicted Random Forest Model

# Observed vs predicted for the random forest; dashed y = x line is perfect fit
p_rf <- ggplot(rf_pred, aes(x = Y, y = .pred)) +
  geom_point(alpha = 0.7) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(
    x = "Observed Y",
    y = "Predicted Y",
    title = "Random Forest: Observed vs Predicted"
  ) +
  theme_minimal()
p_rf

The linear model and the LASSO model gave very similar results. This may be because the LASSO penalty was small, so the coefficients were only shrunk slightly.

Tuning Models

AI tools were used to help me write this section.

## LASSO Model
# Same LASSO spec as before, but the penalty is left free for tuning
lasso_tune_spec <- linear_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")

lasso_tune_wf <- workflow() %>%
  add_model(lasso_tune_spec) %>%
  add_recipe(rec)

# 50 candidate penalties, log-spaced from 1e-5 to 1e2
lasso_grid <- tibble(penalty = 10^seq(-5, 2, length.out = 50))

# apparent() wraps the full dataset as a single resample where the
# assessment set equals the training set (no held-out data)
app_resamples <- apparent(clean_df)

# Tune the LASSO over the penalty grid, saving predictions so the
# metric can be computed by hand below
lasso_tune_res <- tune_grid(
  lasso_tune_wf,
  resamples = app_resamples,
  grid = lasso_grid,
  control = control_grid(save_pred = TRUE)
)

# Pull out the saved predictions
lasso_preds <- collect_predictions(lasso_tune_res)

# RMSE for each penalty value
lasso_metrics <- lasso_preds %>%
  group_by(penalty) %>%
  rmse(truth = Y, estimate = .pred)

lasso_metrics
# A tibble: 50 × 4
     penalty .metric .estimator .estimate
       <dbl> <chr>   <chr>          <dbl>
 1 0.00001   rmse    standard        572.
 2 0.0000139 rmse    standard        572.
 3 0.0000193 rmse    standard        572.
 4 0.0000268 rmse    standard        572.
 5 0.0000373 rmse    standard        572.
 6 0.0000518 rmse    standard        572.
 7 0.0000720 rmse    standard        572.
 8 0.0001    rmse    standard        572.
 9 0.000139  rmse    standard        572.
10 0.000193  rmse    standard        572.
# ℹ 40 more rows
# Manual plot of RMSE vs penalty (autoplot wasn't working);
# log-scale x-axis matches the log-spaced penalty grid
p_lasso_tune <- ggplot(lasso_metrics, aes(x = penalty, y = .estimate)) +
  geom_line() +
  geom_point() +
  scale_x_log10() +
  labs(
    title = "LASSO tuning results",
    x = "Penalty",
    y = "RMSE"
  ) +
  theme_minimal()
p_lasso_tune

When tuning the LASSO we see that the RMSE value is lowest for small penalties because there is little to no shrinkage of coefficients. As the penalty increases, the LASSO shrinks the coefficients closer to 0. This reduces the influence of predictors and leads to underfitting, so the RMSE increases. The RMSE never drops below the value from the linear model because the linear regression model already provides the best fit for the data when minimizing squared error.

Random Forest

# RF tuning spec: tune mtry and min_n, with a fixed number of trees
rf_tune_spec <- rand_forest(
  mtry = tune(),
  min_n = tune(),
  trees = 300
) %>%
  set_mode("regression") %>%
  set_engine("ranger", seed = rngseed)

rf_tune_wf <- workflow() %>%
  add_model(rf_tune_spec) %>%
  add_recipe(rec)

# 7 x 7 regular grid: mtry in 1..7, min_n in 1..21
rf_grid <- grid_regular(
  mtry(range = c(1, 7)),
  min_n(range = c(1, 21)),
  levels = 7
)

# apparent() gives a single train-on-all / assess-on-all resample
app_resamples <- apparent(clean_df)

# Tune the RF over the grid, saving predictions for manual RMSE
rf_tune_res <- tune_grid(
  rf_tune_wf,
  resamples = app_resamples,
  grid = rf_grid,
  control = control_grid(save_pred = TRUE),
  metrics = metric_set(rmse)
)

rf_preds <- collect_predictions(rf_tune_res)

# RMSE for each (mtry, min_n) combination
rf_metrics <- rf_preds %>%
  group_by(mtry, min_n) %>%
  rmse(truth = Y, estimate = .pred)

# Heatmap of RMSE across the (mtry, min_n) tuning grid
p_rf_tune <- ggplot(rf_metrics, aes(x = mtry, y = min_n, fill = .estimate)) +
  geom_tile() +
  labs(
    title = "Random Forest tuning results",
    x = "mtry",
    y = "min_n",
    fill = "RMSE"
  ) +
  theme_minimal()
p_rf_tune

Tuning with CV

set.seed(1234)

# 5-fold cross-validation repeated 5 times (25 resamples total)
cv_folds <- vfold_cv(clean_df, v = 5, repeats = 5)

LASSO Tuning with CV

# Re-tune the LASSO penalty, this time with repeated 5-fold CV
# (default metrics: rmse and rsq)
lasso_tune_res_cv <- tune_grid(
  lasso_tune_wf,
  resamples = cv_folds,
  grid = lasso_grid
)

# plot metric profiles across the penalty grid
autoplot(lasso_tune_res_cv)

Random Forest Tuning with CV

# Re-tune the random forest with repeated 5-fold CV.
# Fix: the original call had a trailing comma after `grid = rf_grid,`,
# which R treats as an empty argument and rejects with
# "argument ... is empty" when the call is evaluated.
rf_tune_res_cv <- tune_grid(
  rf_tune_wf,
  resamples = cv_folds,
  grid = rf_grid
)

# plot
autoplot(rf_tune_res_cv)

When using CV, the model is evaluated on data that was not used for training. This provides a more realistic estimate of model performance. As a result, RMSE values increase.

When tuning without CV, the random forest appeared to perform best because it overfit the training data. When using CV, this overfitting is penalized, and the random forest's performance decreases. The LASSO model generalizes better to new data and therefore achieves a lower RMSE with CV.

The LASSO still performs best at small penalty values. High penalties shrink coefficients too much, which causes underfitting. A small penalty allows the model to keep most of the predictors while still applying slight shrinkage.

Based on the results of tuning with CV, the LASSO model with a small penalty performs best, as it achieves the lowest RMSE and generalizes better to unseen data compared to the random forest.