set.seed(123) #random number generatorData Exercise (option 2)
Generate synthetic data
n <- 1200
#setting n equal to 1200Demographics / baseline predictors
age <- round(rnorm(n, mean = 35, sd = 12)) #creates age as a variable
age <- pmax(pmin(age, 70), 18) #limits the age range between 18 and 70
sex <- factor(rbinom(n, 1, 0.55), labels = c("F", "M"))
study_hours <- pmax(rnorm(n, mean = 12, sd = 5), 0) #generates random study times
exercise_hours <- pmax(4 - 0.12 * study_hours + rnorm(n, 0, 1.8), 0)
tutoring <- rbinom(n, 1, plogis(-1 + 0.10 * study_hours))
ability <- rnorm(n, 0, 1)Outcomes
score <- 55 +
2.4 * study_hours +
1.2 * exercise_hours +
3.0 * (sex == "M") +
6.5 * ability +
0.08 * age - 0.0015 * age^2 +
rnorm(n, 0, 10)
# Binary outcome based on a logistic model:
p_pass <- plogis(-10 + 0.14 * score + 0.35 * tutoring)
passed <- rbinom(n, 1, p_pass)# outcome to illustrate a different model (stress, coffee, exercise, ability, study):
stress <- 0.6 * scale(study_hours)[,1] - 0.7 * scale(exercise_hours)[,1] - 0.4 * ability + rnorm(n, 0, 0.6)
lambda_coffee <- exp(1.2 + 0.35 * stress)
coffee <- rpois(n, lambda_coffee)
# data frame
dat <- data.frame(
age, sex, study_hours, exercise_hours,
tutoring = factor(tutoring, labels = c("no", "yes")),
ability, score, passed = factor(passed, labels = c("no", "yes")),
stress, coffee
)Data
# Summary
summary(dat) age sex study_hours exercise_hours tutoring
Min. :18.00 F:555 Min. : 0.000 Min. :0.000 no :520
1st Qu.:27.00 M:645 1st Qu.: 8.719 1st Qu.:1.228 yes:680
Median :35.00 Median :11.792 Median :2.584
Mean :35.63 Mean :11.980 Mean :2.601
3rd Qu.:43.00 3rd Qu.:15.421 3rd Qu.:3.742
Max. :70.00 Max. :29.105 Max. :7.874
ability score passed stress
Min. :-3.26215 Min. : 35.86 no : 187 Min. :-4.526330
1st Qu.:-0.63826 1st Qu.: 77.97 yes:1013 1st Qu.:-0.917046
Median : 0.01796 Median : 89.25 Median : 0.082226
Mean : 0.01690 Mean : 89.08 Mean :-0.002875
3rd Qu.: 0.71944 3rd Qu.:100.89 3rd Qu.: 0.850127
Max. : 3.71572 Max. :137.91 Max. : 4.401843
coffee
Min. : 0.000
1st Qu.: 2.000
Median : 3.000
Mean : 3.607
3rd Qu.: 5.000
Max. :18.000
# Creates a dataset called nums_vars
num_vars <- dat[, c("age","study_hours","exercise_hours","ability","score","stress","coffee")]
round(cor(num_vars), 2) age study_hours exercise_hours ability score stress coffee
age 1.00 -0.04 0.02 0.02 -0.05 -0.04 0.00
study_hours -0.04 1.00 -0.30 -0.01 0.68 0.64 0.42
exercise_hours 0.02 -0.30 1.00 0.03 -0.10 -0.69 -0.43
ability 0.02 -0.01 0.03 1.00 0.39 -0.33 -0.27
score -0.05 0.68 -0.10 0.39 1.00 0.26 0.13
stress -0.04 0.64 -0.69 -0.33 0.26 1.00 0.65
coffee 0.00 0.42 -0.43 -0.27 0.13 0.65 1.00
# Tables for categorical vars
table(dat$sex)
F M
555 645
table(dat$tutoring)
no yes
520 680
prop.table(table(dat$passed))
no yes
0.1558333 0.8441667
op <- par(mfrow = c(2, 2), mar = c(4,4,2,1))
# Score vs study
plot(dat$study_hours, dat$score,
xlab = "Study hours", ylab = "Score", main = "Score vs Study hours")
abline(lm(score ~ study_hours, dat), lwd = 2)
# Exercise vs study
plot(dat$study_hours, dat$exercise_hours,
xlab = "Study hours", ylab = "Exercise hours", main = "Exercise vs Study")
abline(lm(exercise_hours ~ study_hours, dat), lwd = 2)
# Score by tutoring
barplot(prop.table(table(dat$passed, dat$tutoring), 2)["yes", ],
ylab = "P(passed = yes)", main = "Pass rate by tutoring", ylim = c(0,1))
# Coffee vs stress
plot(dat$stress, dat$coffee,
xlab = "Stress", ylab = "Coffee (count)", main = "Coffee vs Stress")Fit models
# Linear models
m1 <- lm(score ~ study_hours + exercise_hours + sex + age, data = dat)
summary(m1)
Call:
lm(formula = score ~ study_hours + exercise_hours + sex + age,
data = dat)
Residuals:
Min 1Q Median 3Q Max
-44.371 -8.312 0.147 8.338 36.751
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 56.89963 1.67358 33.999 < 2e-16 ***
study_hours 2.37927 0.07229 32.912 < 2e-16 ***
exercise_hours 1.11540 0.20654 5.400 8.01e-08 ***
sexM 3.63287 0.68279 5.321 1.23e-07 ***
age -0.03307 0.03027 -1.092 0.275
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 11.79 on 1195 degrees of freedom
Multiple R-squared: 0.4892, Adjusted R-squared: 0.4875
F-statistic: 286.1 on 4 and 1195 DF, p-value: < 2.2e-16
m2 <- lm(score ~ study_hours + exercise_hours + sex + age + I(age^2), data = dat)
summary(m2)
Call:
lm(formula = score ~ study_hours + exercise_hours + sex + age +
I(age^2), data = dat)
Residuals:
Min 1Q Median 3Q Max
-44.191 -8.256 0.181 8.311 36.639
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 58.776583 3.245917 18.108 < 2e-16 ***
study_hours 2.379755 0.072312 32.909 < 2e-16 ***
exercise_hours 1.113877 0.206597 5.392 8.41e-08 ***
sexM 3.614299 0.683501 5.288 1.47e-07 ***
age -0.142163 0.164445 -0.865 0.387
I(age^2) 0.001445 0.002142 0.675 0.500
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 11.79 on 1194 degrees of freedom
Multiple R-squared: 0.4894, Adjusted R-squared: 0.4873
F-statistic: 228.9 on 5 and 1194 DF, p-value: < 2.2e-16
m3 <- lm(score ~ study_hours + exercise_hours + sex + age + I(age^2) + ability, data = dat)
summary(m3)
Call:
lm(formula = score ~ study_hours + exercise_hours + sex + age +
I(age^2) + ability, data = dat)
Residuals:
Min 1Q Median 3Q Max
-33.494 -6.496 -0.213 7.211 33.395
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 58.3425915 2.7150404 21.489 < 2e-16 ***
study_hours 2.3757184 0.0604842 39.278 < 2e-16 ***
exercise_hours 0.9945639 0.1728839 5.753 1.11e-08 ***
sexM 3.9906764 0.5719395 6.977 4.97e-12 ***
age -0.1034551 0.1375568 -0.752 0.452
I(age^2) 0.0008021 0.0017915 0.448 0.654
ability 6.4810807 0.2859615 22.664 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 9.865 on 1193 degrees of freedom
Multiple R-squared: 0.6431, Adjusted R-squared: 0.6413
F-statistic: 358.2 on 6 and 1193 DF, p-value: < 2.2e-16
# Comparing all the variables
AIC(m1, m2, m3) df AIC
m1 6 9334.197
m2 7 9335.739
m3 8 8908.055