Fitting Exercise

Load and summarize data

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(GGally)
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
✔ broom        1.0.12     ✔ rsample      1.3.2 
✔ dials        1.4.2      ✔ tailor       0.1.0 
✔ infer        1.1.0      ✔ tidyr        1.3.2 
✔ modeldata    1.5.1      ✔ tune         2.0.1 
✔ parsnip      1.4.1      ✔ workflows    1.3.0 
✔ purrr        1.2.1      ✔ workflowsets 1.1.1 
✔ recipes      1.3.1      ✔ yardstick    1.3.2 
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard()  masks scales::discard()
✖ dplyr::filter()   masks stats::filter()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
# Read the raw Mavoglurant PK dataset. show_col_types = FALSE suppresses the
# column-specification message that read_csv prints by default (as the readr
# message itself suggests); the data read in are unchanged.
mavo <- read_csv("Mavoglurant.csv", show_col_types = FALSE)
Rows: 2678 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (17): ID, CMT, EVID, EVI2, MDV, DV, LNDV, AMT, TIME, DOSE, OCC, RATE, AG...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick structural overview: column names, types, and leading values.
glimpse(mavo)
Rows: 2,678
Columns: 17
$ ID   <dbl> 793, 793, 793, 793, 793, 793, 793, 793, 793, 793, 793, 793, 793, …
$ CMT  <dbl> 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,…
$ EVID <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
$ EVI2 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
$ MDV  <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
$ DV   <dbl> 0.00, 491.00, 605.00, 556.00, 310.00, 237.00, 147.00, 101.00, 72.…
$ LNDV <dbl> 0.000, 6.196, 6.405, 6.321, 5.737, 5.468, 4.990, 4.615, 4.282, 3.…
$ AMT  <dbl> 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 0, 0, 0, …
$ TIME <dbl> 0.000, 0.200, 0.250, 0.367, 0.533, 0.700, 1.200, 2.200, 3.200, 4.…
$ DOSE <dbl> 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2…
$ OCC  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ RATE <dbl> 75, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 150, 0, 0, 0, 0,…
$ AGE  <dbl> 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 2…
$ SEX  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ RACE <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
$ WT   <dbl> 94.3, 94.3, 94.3, 94.3, 94.3, 94.3, 94.3, 94.3, 94.3, 94.3, 94.3,…
$ HT   <dbl> 1.769997, 1.769997, 1.769997, 1.769997, 1.769997, 1.769997, 1.769…
# Confirm dimensions reported above: 2678 rows x 17 columns.
dim(mavo)
[1] 2678   17

Convert data to plot

# Spaghetti plot: one concentration-time profile (DV vs TIME) per subject,
# colored by dose group.
ggplot(mavo, aes(x = TIME, y = DV, group = ID, color = factor(DOSE))) +
  geom_line(alpha = 0.6) +
  labs(color = "Dose") +
  theme_minimal()

#only keeping observations with OCC = 1
# (i.e., keep the first dosing occasion; rows from later occasions are dropped)
mavo_occ1 <- mavo %>%
  filter(OCC == 1)

# Sanity check on the filter: every remaining row should have OCC == 1.
table(mavo_occ1$OCC)

   1 
1665 

Creates a data frame with time that is not equal to 0 and computes the sum of DV

# Drop the time-0 (dosing) records before summing observations.
mavo_no0 <- mavo_occ1 %>%
  filter(TIME != 0)

# Sum DV within each subject: a crude total-exposure outcome Y, one row per ID.
Y_df <- mavo_no0 %>%
  group_by(ID) %>%
  summarize(Y = sum(DV, na.rm = TRUE))

Creates a data frame with time = 0

# Baseline rows (TIME == 0) carry the dose and covariate information.
baseline_df <- mavo_occ1 %>%
  filter(TIME == 0)

Joins the two data frames together

# Attach each subject's computed exposure Y to their baseline row.
final_df <- baseline_df %>%
  left_join(Y_df, by = "ID")

# 120 subjects x 18 columns (the 17 original columns plus Y).
dim(final_df)
[1] 120  18

Converts race and sex to factor variables and cleans the data

# Recode SEX and RACE as factors and keep only the modeling variables.
clean_df <- final_df %>%
  mutate(
    SEX = factor(SEX),
    RACE = factor(RACE)
  ) %>%
  select(Y, DOSE, AGE, SEX, RACE, WT, HT)

# Persist the cleaned analysis data for reuse in later exercises.
saveRDS(clean_df, file = "clean-data.rds")

summary(clean_df)
       Y               DOSE            AGE        SEX     RACE   
 Min.   : 826.4   Min.   :25.00   Min.   :18.00   1:104   1 :74  
 1st Qu.:1700.5   1st Qu.:25.00   1st Qu.:26.00   2: 16   2 :36  
 Median :2349.1   Median :37.50   Median :31.00           7 : 2  
 Mean   :2445.4   Mean   :36.46   Mean   :33.00           88: 8  
 3rd Qu.:3050.2   3rd Qu.:50.00   3rd Qu.:40.25                  
 Max.   :5606.6   Max.   :50.00   Max.   :50.00                  
       WT               HT       
 Min.   : 56.60   Min.   :1.520  
 1st Qu.: 73.17   1st Qu.:1.700  
 Median : 82.10   Median :1.770  
 Mean   : 82.55   Mean   :1.759  
 3rd Qu.: 90.10   3rd Qu.:1.813  
 Max.   :115.30   Max.   :1.930  

Summary of data

# Overall (ungrouped) summary statistics for the cleaned data.
clean_df %>%
  summarize(
    n = n(),
    mean_Y = mean(Y),
    sd_Y = sd(Y),
    mean_age = mean(AGE),
    mean_WT = mean(WT),
    mean_HT = mean(HT)
  )
# A tibble: 1 × 6
      n mean_Y  sd_Y mean_age mean_WT mean_HT
  <int>  <dbl> <dbl>    <dbl>   <dbl>   <dbl>
1   120  2445.  962.       33    82.6    1.76

This summary table shows that there are 120 individuals, and it shows the mean and standard deviation of drug exposure, and the mean of age, WT, and HT.

Summary grouped by dose

  clean_df %>%
  group_by(DOSE) %>%
  summarize(
    n = n(),
    mean_Y = mean(Y),
    sd_Y = sd(Y),
    mean_WT = mean(WT),
    mean_AGE = mean(AGE)
  )
# A tibble: 3 × 6
   DOSE     n mean_Y  sd_Y mean_WT mean_AGE
  <dbl> <int>  <dbl> <dbl>   <dbl>    <dbl>
1  25      59  1783.  601.    81.5     32.1
2  37.5    12  2464.  488.    81.1     36.1
3  50      49  3239.  787.    84.2     33.4

This summary table shows the relationship between dose and exposure. As dose increases, total exposure increases.

Summary grouped by sex

  clean_df %>%
  group_by(SEX) %>%
  summarize(
    n = n(),
    mean_Y = mean(Y),
    mean_WT = mean(WT)
  )
# A tibble: 2 × 4
  SEX       n mean_Y mean_WT
  <fct> <int>  <dbl>   <dbl>
1 1       104  2478.    84.1
2 2        16  2236.    72.5

This summary table shows the data grouped by sex. Sex 1 has 104 individuals and sex 2 has 16 individuals.

Plot y versus dose

# Boxplot of total exposure by dose group (dose treated as a factor).
ggplot(clean_df, aes(x = factor(DOSE), y = Y)) +
  geom_boxplot() +
  labs(x = "Dose", y = "Total Drug (Y)") +
  theme_minimal()

This boxplot shows total drug exposure versus dose; higher doses are associated with higher drug exposure.

Plot y versus age

  ggplot(clean_df, aes(x = AGE, y = Y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

This scatterplot shows total drug exposure versus age; there is no relationship between the two variables.

Distribution of Y

# Distribution of the outcome Y.
ggplot(clean_df, aes(x = Y)) +
  geom_histogram(bins = 20) +
  theme_minimal()

This histogram shows total drug exposure skewed to the right. This means that most individuals have exposure values between 1500 and 3500, but a few individuals have high exposure values.

Distribution of HT

  ggplot(clean_df, aes(x = HT)) +
  geom_histogram(bins = 15) +
  theme_minimal()

The histogram shows height with a roughly normal distribution, with most individuals between 1.60 and 1.90 meters.

Correlation matrix

# Pairwise Pearson correlations among the numeric variables.
numeric_df <- clean_df %>%
  select(Y, AGE, WT, HT)

cor(numeric_df)
              Y         AGE         WT         HT
Y    1.00000000  0.01256372 -0.2128719 -0.1583297
AGE  0.01256372  1.00000000  0.1196740 -0.3518581
WT  -0.21287194  0.11967399  1.0000000  0.5997505
HT  -0.15832972 -0.35185806  0.5997505  1.0000000

This correlation matrix shows the strength and direction (positive or negative) of the relationship between total drug exposure, age, height, and weight. +1 means a perfect positive relationship, 0 means no relationship, and -1 means a perfect negative relationship.

Correlation matrix plot plot

# Pairs plot of the same numeric variables, colored by dose group.
ggpairs(clean_df,
        columns = c("Y", "AGE", "WT", "HT"),
        aes(color = factor(DOSE)))

This correlation matrix plot shows the same relationships as the correlation matrix table.

Model fitting

# Reproducible 80/20 train/test split, stratified by SEX to preserve the
# class balance in both partitions.
set.seed(123)

data_split <- initial_split(clean_df, prop = 0.8, strata = SEX)

train_data <- training(data_split)
test_data  <- testing(data_split)

Fitting the linear model to the continuous outcome of Y using dose as the main predictor

# Model specification: ordinary least squares via the "lm" engine.
lm_spec <- linear_reg() %>%
  set_engine("lm")

# Workflow: predict total exposure Y from DOSE only.
lm_dose <- workflow() %>%
  add_model(lm_spec) %>%
  add_formula(Y ~ DOSE)

lm_dose_fit <- fit(lm_dose, data = train_data)

# Predict on the held-out test set and attach the observed values.
pred_dose <- predict(lm_dose_fit, test_data) %>%
  bind_cols(test_data)

# RMSE, R-squared, and MAE for the dose-only model.
# (Fixed: removed stray leading indentation on this top-level assignment.)
metrics_dose <- pred_dose %>%
  metrics(truth = Y, estimate = .pred)

metrics_dose
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard     725.   
2 rsq     standard       0.556
3 mae     standard     546.   

Fitting the linear model to the continuous outcome of Y using all the predictors

# Linear model using all available predictors of Y.
lm_all <- workflow() %>%
  add_model(lm_spec) %>%
  add_formula(Y ~ DOSE + AGE + SEX + RACE + WT + HT)

lm_all_fit <- fit(lm_all, data = train_data)

# Test-set predictions with observed values attached.
pred_all <- predict(lm_all_fit, test_data) %>%
  bind_cols(test_data)

# RMSE, R-squared, and MAE for the all-predictor model.
metrics_all <- pred_all %>%
  metrics(truth = Y, estimate = .pred)

metrics_all
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard     744.   
2 rsq     standard       0.511
3 mae     standard     562.   

Fitting the logistic model to the categorical outcome of sex using dose as the main predictor

# Logistic regression (glm engine) predicting SEX from DOSE only.
log_spec <- logistic_reg() %>%
  set_engine("glm")

log_dose <- workflow() %>%
  add_model(log_spec) %>%
  add_formula(SEX ~ DOSE)

log_dose_fit <- fit(log_dose, data = train_data)

# Collect both class probabilities and hard class predictions for the test set.
pred_log_dose <- predict(log_dose_fit, test_data, type = "prob") %>%
  bind_cols(
    predict(log_dose_fit, test_data),
    test_data
  )

# Accuracy and Cohen's kappa on the test set.
pred_log_dose %>%
  metrics(truth = SEX, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary          0.84
2 kap      binary          0   
# ROC-AUC based on the predicted probability of class "1".
pred_log_dose %>%
  roc_auc(truth = SEX, .pred_1)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.494

Fitting the logistic model to the categorical outcome of sex using all the predictors

# Logistic regression predicting SEX from all other predictors (Y excluded).
log_all <- workflow() %>%
  add_model(log_spec) %>%
  add_formula(SEX ~ DOSE + AGE + RACE + WT + HT)

log_all_fit <- fit(log_all, data = train_data)

# Class probabilities plus hard class predictions for the test set.
pred_log_all <- predict(log_all_fit, test_data, type = "prob") %>%
  bind_cols(
    predict(log_all_fit, test_data),
    test_data
  )

# Accuracy and Cohen's kappa on the test set.
pred_log_all %>%
  metrics(truth = SEX, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary             1
2 kap      binary             1
# ROC-AUC for the all-predictor logistic model.
pred_log_all %>%
  roc_auc(truth = SEX, .pred_1)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary             1

The first linear model is using dose as the only predictor of total drug exposure. It shows that the higher the dose, the higher the drug exposure is.

The second linear model is using all variables (age, dose, sex, race, height, and weight) as predictors of total drug exposure. It showed that dose was the strongest predictor of total drug exposure.

The first logistic model uses dose as the only predictor of sex. This model produced a ROC-AUC near 0.5 (no better than chance) because there is no relationship between dose and sex; the accuracy of 0.84 simply reflects always predicting the majority class (kappa = 0).

The second logistic model uses all the variables as predictors of sex. This model has higher ROC-AUC and accuracy values compared to the first model because height and weight can potentially be predictors of sex.

Model Improvement

Excluding race as a variable

# Seed used for all splits/resampling below.
# (Fixed: use <- for top-level assignment rather than =, per R convention.)
rngseed <- 1234

# Rebuild the analysis data without RACE (its coding is ambiguous: the
# earlier summary shows levels 7 and 88, which look like missing-data codes).
clean_df <- final_df %>%
  mutate(
    SEX = factor(SEX)
  ) %>%
  select(Y, DOSE, AGE, SEX, WT, HT)

dim(clean_df)
[1] 120   6
summary(clean_df)
       Y               DOSE            AGE        SEX           WT        
 Min.   : 826.4   Min.   :25.00   Min.   :18.00   1:104   Min.   : 56.60  
 1st Qu.:1700.5   1st Qu.:25.00   1st Qu.:26.00   2: 16   1st Qu.: 73.17  
 Median :2349.1   Median :37.50   Median :31.00           Median : 82.10  
 Mean   :2445.4   Mean   :36.46   Mean   :33.00           Mean   : 82.55  
 3rd Qu.:3050.2   3rd Qu.:50.00   3rd Qu.:40.25           3rd Qu.: 90.10  
 Max.   :5606.6   Max.   :50.00   Max.   :50.00           Max.   :115.30  
       HT       
 Min.   :1.520  
 1st Qu.:1.700  
 Median :1.770  
 Mean   :1.759  
 3rd Qu.:1.813  
 Max.   :1.930  

Setting test and train data

# Reproducible 75/25 train/test split (no stratification this time).
set.seed(rngseed)

data_split <- initial_split(clean_df, prop = 0.75)

train_data <- training(data_split)
test_data  <- testing(data_split)

nrow(train_data)
[1] 90
nrow(test_data)
[1] 30

Linear models

# OLS model specification, reused by both workflows below.
lm_spec <- linear_reg() %>%
  set_engine("lm")

Linear Model with Dose

# Dose-only model: fit on the training data and evaluate on the SAME
# training data (these are in-sample metrics, not test-set metrics).
lm_dose_workflow <- workflow() %>%
  add_model(lm_spec) %>%
  add_formula(Y ~ DOSE)

lm_dose_fit <- fit(lm_dose_workflow, data = train_data)

lm_dose_pred <- predict(lm_dose_fit, new_data = train_data) %>%
  bind_cols(train_data %>% select(Y))

metrics(lm_dose_pred, truth = Y, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard     703.   
2 rsq     standard       0.451
3 mae     standard     546.   

Linear Model with all Predictors

# All-predictor model: fit on the training data and report in-sample metrics.
lm_all_workflow <- workflow() %>%
  add_model(lm_spec) %>%
  add_formula(Y ~ DOSE + AGE + SEX + WT + HT)

lm_all_fit <- fit(lm_all_workflow, data = train_data)

lm_all_pred <- predict(lm_all_fit, new_data = train_data) %>%
  bind_cols(train_data %>% select(Y))

metrics(lm_all_pred, truth = Y, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard     627.   
2 rsq     standard       0.562
3 mae     standard     486.   

Null model

# Null model: predict the training mean of Y for every observation.
mean_y <- mean(train_data$Y)

null_pred <- train_data %>%
  mutate(.pred = mean_y)

# Baseline RMSE against which the fitted models are compared.
rmse_null <- null_pred %>%
  rmse(truth = Y, estimate = .pred)

rmse_null
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        948.

The null model, which predicts the mean outcome for all observations, had the highest RMSE at 948. This RMSE value indicates that the predictive performance of this model is poor. The model that used dose as the only predictor had a lower RMSE value (703), which suggests that dose explains some of the variability in the outcome. The model that uses all predictors had the lowest RMSE (627), which indicates the best performance.

# Re-seed so the fold assignment is reproducible.
set.seed(rngseed)

# creates 10 fold CV split
folds <- vfold_cv(train_data, v = 10)

Model using Dose

# Dose-only workflow (re-declared here for the cross-validation step).
lm_dose_workflow <- workflow() %>%
  add_model(lm_spec) %>%
  add_formula(Y ~ DOSE)

Model Using All Predictors

# All-predictor workflow (re-declared here for the cross-validation step).
lm_all_workflow <- workflow() %>%
  add_model(lm_spec) %>%
  add_formula(Y ~ DOSE + AGE + SEX + WT + HT)

Cross Validation

  dose_cv <- fit_resamples(
  lm_dose_workflow,
  resamples = folds,
  metrics = metric_set(rmse)
)

all_cv <- fit_resamples(
  lm_all_workflow,
  resamples = folds,
  metrics = metric_set(rmse)
)

RMSE and Standard Validation

# Average RMSE (and its standard error) across the 10 folds for each model.
dose_cv_results <- collect_metrics(dose_cv)
all_cv_results  <- collect_metrics(all_cv)

dose_cv_results
# A tibble: 1 × 6
  .metric .estimator  mean     n std_err .config        
  <chr>   <chr>      <dbl> <int>   <dbl> <chr>          
1 rmse    standard    691.    10    67.5 pre0_mod0_post0
all_cv_results
# A tibble: 1 × 6
  .metric .estimator  mean     n std_err .config        
  <chr>   <chr>      <dbl> <int>   <dbl> <chr>          
1 rmse    standard    646.    10    64.8 pre0_mod0_post0

The RMSE values from the cross-validation are a little higher than those from the training data, which is expected because cross-validation evaluates each model on held-out folds rather than the data used for fitting. This gives us a more realistic idea of the model's performance on new data. The model that used all the predictors still had the lowest RMSE value, followed by the model that used dose as the only predictor, and the null model had the highest RMSE value. The standard error values suggest that there is some variability in how the models perform across folds, but the differences between models are consistent.

Different Value for the Random Seed

# Repeat the 10-fold CV with a different seed to gauge how sensitive the
# RMSE estimates are to the particular fold assignment.
set.seed(999)

folds2 <- vfold_cv(train_data, v = 10)

dose_cv2 <- fit_resamples(lm_dose_workflow, resamples = folds2, metrics = metric_set(rmse))
all_cv2  <- fit_resamples(lm_all_workflow, resamples = folds2, metrics = metric_set(rmse))

collect_metrics(dose_cv2)
# A tibble: 1 × 6
  .metric .estimator  mean     n std_err .config        
  <chr>   <chr>      <dbl> <int>   <dbl> <chr>          
1 rmse    standard    705.    10    45.7 pre0_mod0_post0
collect_metrics(all_cv2)
# A tibble: 1 × 6
  .metric .estimator  mean     n std_err .config        
  <chr>   <chr>      <dbl> <int>   <dbl> <chr>          
1 rmse    standard    648.    10    52.7 pre0_mod0_post0

Changing the random seed resulted in slightly different RMSE values, which is due to variation in how the data were split into folds. The pattern stayed the same, with the model that used all the predictors having the lowest RMSE value and the null model having the highest RMSE value.

This section added by Riley Herber

Predictions for each model

# Build labeled prediction sets (training-data predictions) for all three
# models so they can be stacked and plotted together.
# Dose model predictions
dose_pred <- predict(lm_dose_fit, new_data = train_data) %>%
  bind_cols(train_data %>% select(Y)) %>%
  mutate(model = "Dose Model")

# All predictor model predictions
all_pred <- predict(lm_all_fit, new_data = train_data) %>%
  bind_cols(train_data %>% select(Y)) %>%
  mutate(model = "All Predictors Model")

# Null model predictions
null_pred <- train_data %>%
  mutate(.pred = mean(train_data$Y),
         model = "Null Model") %>%
  select(Y, .pred, model)

Combine predictions into one dataframe

# Stack the three models' predictions into one long data frame for plotting.
predictions_df <- bind_rows(
  dose_pred %>% select(Y, .pred, model),
  all_pred %>% select(Y, .pred, model),
  null_pred
)

Observed vs Predicted

# Observed vs predicted for all three models; the dashed 45-degree line
# marks perfect prediction.
ggplot(predictions_df, aes(x = Y, y = .pred, color = model, shape = model)) +
  geom_point(alpha = 0.7) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  coord_cartesian(xlim = c(0, 5000), ylim = c(0, 5000)) +
  labs(
    x = "Observed Values",
    y = "Predicted Values",
    color = "Model",
    shape = "Model"
  ) +
  theme_minimal()

For the model that includes only dose as a predictor, the predicted values fall along three horizontal lines because the DOSE variable only takes three distinct values in the dataset.

Plot All Predictors Model

# Residuals (predicted minus observed) vs predicted values for the full model.
all_residuals <- predict(lm_all_fit, new_data = train_data) %>%
  bind_cols(train_data %>% select(Y)) %>%
  mutate(residual = .pred - Y)

ggplot(all_residuals, aes(x = .pred, y = residual)) +
  geom_point(alpha = 0.7) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  coord_cartesian(ylim = c(-2000, 2000)) +
  labs(
    x = "Predicted Values",
    y = "Residuals (Predicted - Observed)",
    title = "Residuals vs Predicted Values for Full Model"
  ) +
  theme_minimal()

Model predictions and uncertainty

# rsample and purrr are already attached via tidymodels above; the explicit
# library() calls here are redundant but harmless.
library(rsample)
library(purrr)

set.seed(rngseed) 

# 100 bootstrap resamples
dat_bs <- bootstraps(train_data, times = 100)

Fit the model on each bootstrap and make predictions for the original training data

# store predictions
# For each bootstrap resample: refit the full model on the resample, then
# predict on the ORIGINAL training data so predictions are comparable
# across all 100 fits.
# (Fixed: the anonymous function body had inconsistent indentation.)
pred_list <- map(dat_bs$splits, function(split) {
  dat_sample <- rsample::analysis(split)

  # fit full model on bootstrap sample
  fit_bs <- lm(Y ~ DOSE + AGE + SEX + WT + HT, data = dat_sample)

  # predict on original training data
  predict(fit_bs, newdata = train_data)
})

Combine predictions into a matrix

# Matrix of predictions: rows = 100 bootstrap fits, columns = 90 train rows.
pred_bs <- do.call(rbind, pred_list)
dim(pred_bs) 
[1] 100  90

Compute median and 89% confidence intervals

# Per-observation 5.5%, 50%, and 94.5% quantiles across the 100 bootstrap
# fits (an 89% interval), transposed to one row per training observation.
preds_ci <- apply(pred_bs, 2, quantile, probs = c(0.055, 0.5, 0.945)) %>%
  t() %>%
  as.data.frame()

colnames(preds_ci) <- c("lower", "median", "upper")
# Attach observed Y and the original (non-bootstrap) model predictions.
preds_ci$observed <- train_data$Y
preds_ci$predicted <- predict(lm_all_fit, new_data = train_data)$.pred

Plot observed vs. predicted with uncertainty

# ggplot2 is already attached at the top of the document; redundant but safe.
library(ggplot2)

# Observed vs predicted with 89% bootstrap intervals: black = original model
# predictions, blue = bootstrap medians, red bars = 89% intervals.
ggplot(preds_ci, aes(x = observed)) +
  geom_point(aes(y = predicted), color = "black", alpha = 0.7) +        # original point estimate
  geom_point(aes(y = median), color = "blue", alpha = 0.5) +             # median from bootstrap
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0, color = "red", alpha = 0.3) +  # CI
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  coord_cartesian(xlim = c(0, 5000), ylim = c(0, 5000)) +
  labs(
    x = "Observed Values",
    y = "Predicted Values",
    title = "Observed vs Predicted Values with 89% Bootstrap Confidence Intervals"
  ) +
  theme_minimal()

The plot of observed versus predicted values with 89% bootstrap confidence intervals shows that the full linear model predicts total drug exposure reasonably well. The black points represent the original model predictions, and most lie close to the 45° line, indicating general agreement between observed and predicted values. The blue points, representing the median predictions from 100 bootstrap samples, largely overlap with the original predictions, demonstrating that the model is stable across resampled datasets. The red vertical lines indicate the 89% confidence intervals, which are relatively narrow for most observations, reflecting low prediction uncertainty, but are wider for a few individuals, likely corresponding to less typical combinations of predictor values. Overall, the pattern suggests that the model captures the main trends in the data, and the bootstrap confirms both the stability and the uncertainty of the predictions.

Final Evaluation

Predictions for Training Data Using Model 2

# In-sample predictions from the full model, labeled for plotting.
train_pred_all <- predict(lm_all_fit, new_data = train_data) %>%
  bind_cols(train_data %>% select(Y)) %>%
  mutate(dataset = "Training")

Predictions for Test Data Using Model 2

# Out-of-sample predictions from the same fitted model on the test set.
test_pred_all <- predict(lm_all_fit, new_data = test_data) %>%
  bind_cols(test_data %>% select(Y)) %>%
  mutate(dataset = "Test")

Combine Training and Testing Data Predictions

# Combine the labeled training and test predictions into one data frame.
final_pred_df <- bind_rows(train_pred_all, test_pred_all)

Plot for Observed vs Predicted for Both Datasets

# Observed vs predicted for train and test points together; the dashed line
# marks perfect prediction. Overlap of the two sets suggests no overfitting.
ggplot(final_pred_df, aes(x = Y, y = .pred, color = dataset, shape = dataset)) +
  geom_point(alpha = 0.7, size = 2.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  coord_cartesian(xlim = c(0, 5000), ylim = c(0, 5000)) +
  labs(
    x = "Observed Values",
    y = "Predicted Values",
    color = "Dataset",
    shape = "Dataset",
    title = "Observed vs Predicted Values for Training and Test Data"
  ) +
  theme_minimal()

This plot compares observed vs predicted values for both the training and test data using model 2. The test data points and the training data points are mixed in with each other, which suggests that the model generalizes well to new data and does not overfit.

Overall Model Assessment

The null model gives us a baseline to compare against, because it does not use any predictors and just predicts the same average value every time. Model 1 uses dose as the only predictor, so the main question is whether dose alone improves prediction. Model 1 has a lower RMSE than the null model, so dose does help explain some of the variation in the outcome. Model 2 includes all predictors, and this model has an even lower RMSE value compared to model 1. The lower RMSE value shows that using all predictors is more accurate. The test data predictions are mixed in with the training data predictions, which means that the model is generalizing to new data rather than overfitting.