Data Exercise (option 2)

set.seed(123) #random number generator

Generate synthetic data

n <- 1200 
#setting n equal to 1200

Demographics / baseline predictors

age <- round(rnorm(n, mean = 35, sd = 12)) #creates age as a variable
age <- pmax(pmin(age, 70), 18)  #limits the age range between 18 and 70                
sex <- factor(rbinom(n, 1, 0.55), labels = c("F", "M"))

study_hours <- pmax(rnorm(n, mean = 12, sd = 5), 0) #generates random study times
exercise_hours <- pmax(4 - 0.12 * study_hours + rnorm(n, 0, 1.8), 0)
tutoring <- rbinom(n, 1, plogis(-1 + 0.10 * study_hours))
ability <- rnorm(n, 0, 1)

Outcomes

score <- 55 +
  2.4 * study_hours +
  1.2 * exercise_hours +
  3.0 * (sex == "M") +
  6.5 * ability +
  0.08 * age - 0.0015 * age^2 +                 
  rnorm(n, 0, 10)

# Binary outcome based on a logistic model:
p_pass <- plogis(-10 + 0.14 * score + 0.35 * tutoring)
passed <- rbinom(n, 1, p_pass)

# outcome to illustrate a different model (stress, coffee, exercise, ability, study):
stress <- 0.6 * scale(study_hours)[,1] - 0.7 * scale(exercise_hours)[,1] - 0.4 * ability + rnorm(n, 0, 0.6)
lambda_coffee <- exp(1.2 + 0.35 * stress)
coffee <- rpois(n, lambda_coffee)

# data frame
dat <- data.frame(
  age, sex, study_hours, exercise_hours,
  tutoring = factor(tutoring, labels = c("no", "yes")),
  ability, score, passed = factor(passed, labels = c("no", "yes")),
  stress, coffee
)

Data

# Summary
summary(dat)

      age        sex      study_hours     exercise_hours  tutoring 
 Min.   :18.00   F:555   Min.   : 0.000   Min.   :0.000   no :520  
 1st Qu.:27.00   M:645   1st Qu.: 8.719   1st Qu.:1.228   yes:680  
 Median :35.00           Median :11.792   Median :2.584            
 Mean   :35.63           Mean   :11.980   Mean   :2.601            
 3rd Qu.:43.00           3rd Qu.:15.421   3rd Qu.:3.742            
 Max.   :70.00           Max.   :29.105   Max.   :7.874            
    ability             score        passed         stress         
 Min.   :-3.26215   Min.   : 35.86   no : 187   Min.   :-4.526330  
 1st Qu.:-0.63826   1st Qu.: 77.97   yes:1013   1st Qu.:-0.917046  
 Median : 0.01796   Median : 89.25              Median : 0.082226  
 Mean   : 0.01690   Mean   : 89.08              Mean   :-0.002875  
 3rd Qu.: 0.71944   3rd Qu.:100.89              3rd Qu.: 0.850127  
 Max.   : 3.71572   Max.   :137.91              Max.   : 4.401843  
     coffee      
 Min.   : 0.000  
 1st Qu.: 2.000  
 Median : 3.000  
 Mean   : 3.607  
 3rd Qu.: 5.000  
 Max.   :18.000

# Creates a dataset called nums_vars
num_vars <- dat[, c("age","study_hours","exercise_hours","ability","score","stress","coffee")]
round(cor(num_vars), 2)

                 age study_hours exercise_hours ability score stress coffee
age             1.00       -0.04           0.02    0.02 -0.05  -0.04   0.00
study_hours    -0.04        1.00          -0.30   -0.01  0.68   0.64   0.42
exercise_hours  0.02       -0.30           1.00    0.03 -0.10  -0.69  -0.43
ability         0.02       -0.01           0.03    1.00  0.39  -0.33  -0.27
score          -0.05        0.68          -0.10    0.39  1.00   0.26   0.13
stress         -0.04        0.64          -0.69   -0.33  0.26   1.00   0.65
coffee          0.00        0.42          -0.43   -0.27  0.13   0.65   1.00

# Tables for categorical vars
table(dat$sex)


  F   M 
555 645

table(dat$tutoring)


 no yes 
520 680

prop.table(table(dat$passed))


       no       yes 
0.1558333 0.8441667

op <- par(mfrow = c(2, 2), mar = c(4,4,2,1))

# Score vs study
plot(dat$study_hours, dat$score,
     xlab = "Study hours", ylab = "Score", main = "Score vs Study hours")
abline(lm(score ~ study_hours, dat), lwd = 2)

# Exercise vs study 
plot(dat$study_hours, dat$exercise_hours,
     xlab = "Study hours", ylab = "Exercise hours", main = "Exercise vs Study")
abline(lm(exercise_hours ~ study_hours, dat), lwd = 2)

# Score by tutoring
barplot(prop.table(table(dat$passed, dat$tutoring), 2)["yes", ],
        ylab = "P(passed = yes)", main = "Pass rate by tutoring", ylim = c(0,1))

# Coffee vs stress
plot(dat$stress, dat$coffee,
     xlab = "Stress", ylab = "Coffee (count)", main = "Coffee vs Stress")

Fit models

# Linear models
m1 <- lm(score ~ study_hours + exercise_hours + sex + age, data = dat)
summary(m1)


Call:
lm(formula = score ~ study_hours + exercise_hours + sex + age, 
    data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-44.371  -8.312   0.147   8.338  36.751 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    56.89963    1.67358  33.999  < 2e-16 ***
study_hours     2.37927    0.07229  32.912  < 2e-16 ***
exercise_hours  1.11540    0.20654   5.400 8.01e-08 ***
sexM            3.63287    0.68279   5.321 1.23e-07 ***
age            -0.03307    0.03027  -1.092    0.275    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 11.79 on 1195 degrees of freedom
Multiple R-squared:  0.4892,    Adjusted R-squared:  0.4875 
F-statistic: 286.1 on 4 and 1195 DF,  p-value: < 2.2e-16

m2 <- lm(score ~ study_hours + exercise_hours + sex + age + I(age^2), data = dat)
summary(m2)


Call:
lm(formula = score ~ study_hours + exercise_hours + sex + age + 
    I(age^2), data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-44.191  -8.256   0.181   8.311  36.639 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    58.776583   3.245917  18.108  < 2e-16 ***
study_hours     2.379755   0.072312  32.909  < 2e-16 ***
exercise_hours  1.113877   0.206597   5.392 8.41e-08 ***
sexM            3.614299   0.683501   5.288 1.47e-07 ***
age            -0.142163   0.164445  -0.865    0.387    
I(age^2)        0.001445   0.002142   0.675    0.500    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 11.79 on 1194 degrees of freedom
Multiple R-squared:  0.4894,    Adjusted R-squared:  0.4873 
F-statistic: 228.9 on 5 and 1194 DF,  p-value: < 2.2e-16

m3 <- lm(score ~ study_hours + exercise_hours + sex + age + I(age^2) + ability, data = dat)
summary(m3)


Call:
lm(formula = score ~ study_hours + exercise_hours + sex + age + 
    I(age^2) + ability, data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-33.494  -6.496  -0.213   7.211  33.395 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    58.3425915  2.7150404  21.489  < 2e-16 ***
study_hours     2.3757184  0.0604842  39.278  < 2e-16 ***
exercise_hours  0.9945639  0.1728839   5.753 1.11e-08 ***
sexM            3.9906764  0.5719395   6.977 4.97e-12 ***
age            -0.1034551  0.1375568  -0.752    0.452    
I(age^2)        0.0008021  0.0017915   0.448    0.654    
ability         6.4810807  0.2859615  22.664  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9.865 on 1193 degrees of freedom
Multiple R-squared:  0.6431,    Adjusted R-squared:  0.6413 
F-statistic: 358.2 on 6 and 1193 DF,  p-value: < 2.2e-16

# Comparing all the variables
AIC(m1, m2, m3)

   df      AIC
m1  6 9334.197
m2  7 9335.739
m3  8 8908.055