Code
# Load libraries
library(tidyverse)
library(cmdstanr)
library(tidybayes)
library(gt)
library(ggridges)
# Set ggplot theme
theme_set(theme_classic())

Missing data can happen for many reasons in the educational context.
The surface area for missing values to occur increases as the student progresses through the educational system. A student in year 12 has more prior information than a student in year 2. Given our aim of predicting achievement, it makes sense to account for missing values and use all information available to quantify our uncertainty.
Let’s start by loading our simulated data and artificially adding NA values with 5% probability.
# Read the simulated teacher/grade data, rounding each year's score to two
# decimal places, then nudge exact 0s and 1s just inside the unit interval so
# the Beta likelihood used by Stan has positive density everywhere.
grades <- read_csv(file = "data/sim_grades_teacher.csv") |>
  transmute(
    student,
    teacher_8, teacher_9, teacher_10,
    teacher_8_id, teacher_9_id, teacher_10_id,
    y8 = round(y8, 2),
    y9 = round(y9, 2),
    y10 = round(y10, 2)
  ) |>
  mutate(
    across(
      starts_with("y"),
      ~ case_when(
        .x == 0 ~ 0.001,
        .x == 1 ~ 0.999,
        TRUE ~ .x
      )
    )
  )
# Mask each year's grade completely at random (MCAR) with 5% probability.
# The columns are visited in the same order as before (y8, y9, y10), so the
# RNG stream — and therefore the induced missingness — is unchanged.
na_prob <- 0.05
set.seed(2025)
grades <- grades |>
  mutate(
    across(
      c(y8, y9, y10),
      ~ ifelse(runif(n()) < na_prob, NA, .x)
    )
  )
grades |>
select(starts_with("y")) |>
  summary()

       y8             y9            y10
Min. :0.4200 Min. :0.1100 Min. :0.0400
1st Qu.:0.7700 1st Qu.:0.6800 1st Qu.:0.5525
Median :0.8600 Median :0.8000 Median :0.7700
Mean :0.8355 Mean :0.7644 Mean :0.7055
3rd Qu.:0.9300 3rd Qu.:0.9000 3rd Qu.:0.8700
Max. :0.9900 Max. :0.9900 Max. :0.9990
NA's :13 NA's :15 NA's :8
We see that there are several missing values for each year. Keep in mind that the missing values were assigned completely at random, which is likely not true in reality — for example, recent migration can influence both missingness and achievement.
# Data for the Stan model. Missingness is encoded by passing, for each year,
# the count of observed values, their row indices, and the observed values
# themselves; the model treats the remaining entries as parameters.
data_list <- list(
  N = nrow(grades),
  J = length(unique(c(
    grades$teacher_8_id, grades$teacher_9_id, grades$teacher_10_id
  ))),
  teacher_8 = grades$teacher_8_id,
  teacher_9 = grades$teacher_9_id,
  teacher_10 = grades$teacher_10_id,
  # Year 8 predictor (x8)
  N_x8_obs = sum(!is.na(grades$y8)),
  x8_idx_obs = which(!is.na(grades$y8)),
  x8_obs = grades$y8[!is.na(grades$y8)],
  # Year 9 predictor (x9)
  N_x9_obs = sum(!is.na(grades$y9)),
  x9_idx_obs = which(!is.na(grades$y9)),
  x9_obs = grades$y9[!is.na(grades$y9)],
  # Year 10 outcome (y10)
  N_y10_obs = sum(!is.na(grades$y10)),
  y10_idx_obs = which(!is.na(grades$y10)),
  y10_obs = grades$y10[!is.na(grades$y10)]
)
# Compile the Stan Model
missing_data_mod <- cmdstan_model(
  stan_file = "stan_models/missing_data_model.stan"
)
# Fit the model. NOTE: the argument to $sample() is `data`; the original
# `dat = data_list` only worked through R's partial argument matching.
fit_missing_data_mod <- missing_data_mod$sample(
  data = data_list,
  seed = 2025
)
# Save the fitted model so it can be reloaded after rm() below
fit_missing_data_mod$save_object(file = "stan_models/fit_missing_data_mod.RDS")
# HMC diagnostics: divergences, treedepth saturation, E-BFMI
fit_missing_data_mod$cmdstan_diagnose()
rm("missing_data_mod", "fit_missing_data_mod")

$num_divergent
[1] 0 0 0 0
$num_max_treedepth
[1] 0 0 0 0
$ebfmi
[1] 1.0176588 0.9743219 1.0268019 0.9228442
# A draws_df: 1000 iterations, 4 chains, and 6 variables
treedepth__ divergent__ energy__ accept_stat__ stepsize__ n_leapfrog__
1 4 0 -371 0.95 0.26 15
2 4 0 -378 0.98 0.26 15
3 4 0 -382 0.72 0.26 15
4 5 0 -379 0.78 0.26 31
5 4 0 -377 0.95 0.26 15
6 4 0 -378 0.91 0.26 15
7 5 0 -384 0.95 0.26 31
8 4 0 -384 0.79 0.26 15
9 4 0 -383 0.95 0.26 31
10 4 0 -385 0.97 0.26 15
# ... with 3990 more draws
# ... hidden reserved variables {'.chain', '.iteration', '.draw'}
| variable | mean | median | sd | mad | q5 | q95 | rhat | ess_bulk | ess_tail |
|---|---|---|---|---|---|---|---|---|---|
| alpha8 | 1.41 | 1.41 | 0.03 | 0.03 | 1.36 | 1.46 | 1.00 | 4,284.92 | 3,373.34 |
| alpha9 | 1.20 | 1.20 | 0.03 | 0.03 | 1.15 | 1.25 | 1.00 | 4,528.76 | 3,185.14 |
| alpha10 | 0.95 | 0.95 | 0.03 | 0.03 | 0.90 | 1.00 | 1.00 | 5,886.07 | 3,056.30 |
| phi8 | 9.17 | 9.15 | 0.81 | 0.80 | 7.82 | 10.51 | 1.00 | 4,923.34 | 3,205.91 |
| phi9 | 5.79 | 5.79 | 0.48 | 0.47 | 5.03 | 6.61 | 1.00 | 5,820.20 | 3,097.43 |
| phi10 | 3.58 | 3.58 | 0.30 | 0.29 | 3.12 | 4.10 | 1.00 | 5,478.89 | 3,050.12 |
| beta1 | 4.63 | 4.58 | 1.29 | 1.30 | 2.58 | 6.80 | 1.00 | 4,486.62 | 2,879.51 |
| beta2 | 1.93 | 1.90 | 0.88 | 0.92 | 0.52 | 3.42 | 1.00 | 3,407.02 | 1,688.83 |
| beta3 | 4.59 | 4.53 | 1.36 | 1.31 | 2.46 | 6.86 | 1.00 | 4,118.10 | 2,283.64 |
| zeta[1] | 0.39 | 0.38 | 0.16 | 0.16 | 0.13 | 0.66 | 1.00 | 2,695.41 | 2,424.11 |
| zeta[2] | −0.12 | −0.12 | 0.14 | 0.14 | −0.34 | 0.11 | 1.00 | 2,640.87 | 3,038.87 |
| zeta[3] | 0.10 | 0.10 | 0.12 | 0.12 | −0.09 | 0.29 | 1.00 | 2,804.22 | 3,134.81 |
| zeta[4] | 0.28 | 0.28 | 0.12 | 0.12 | 0.09 | 0.46 | 1.00 | 2,803.96 | 2,366.86 |
| zeta[5] | 0.12 | 0.12 | 0.12 | 0.12 | −0.08 | 0.31 | 1.00 | 2,919.75 | 2,683.19 |
| zeta[6] | −0.19 | −0.19 | 0.09 | 0.09 | −0.34 | −0.04 | 1.00 | 3,232.45 | 2,848.76 |
| zeta[7] | 0.05 | 0.05 | 0.09 | 0.10 | −0.11 | 0.20 | 1.00 | 3,666.73 | 3,147.67 |
| zeta[8] | −0.03 | −0.04 | 0.15 | 0.15 | −0.29 | 0.22 | 1.00 | 2,903.86 | 2,953.83 |
| tau_8[1] | 0.43 | 0.43 | 0.14 | 0.14 | 0.21 | 0.67 | 1.00 | 2,927.19 | 2,964.60 |
| tau_8[2] | −0.14 | −0.13 | 0.11 | 0.11 | −0.32 | 0.05 | 1.00 | 3,085.46 | 2,941.72 |
| tau_8[3] | 0.11 | 0.11 | 0.08 | 0.08 | −0.02 | 0.24 | 1.00 | 2,669.63 | 3,247.33 |
| tau_8[4] | 0.31 | 0.31 | 0.07 | 0.07 | 0.19 | 0.43 | 1.00 | 2,508.82 | 2,496.61 |
| tau_8[5] | 0.12 | 0.12 | 0.15 | 0.15 | −0.14 | 0.37 | 1.00 | 3,443.50 | 2,897.22 |
| tau_8[6] | −0.19 | −0.19 | 0.13 | 0.13 | −0.41 | 0.04 | 1.00 | 4,530.75 | 3,134.58 |
| tau_8[7] | 0.05 | 0.05 | 0.14 | 0.13 | −0.18 | 0.27 | 1.00 | 4,485.29 | 3,604.99 |
| tau_8[8] | −0.03 | −0.03 | 0.18 | 0.18 | −0.34 | 0.27 | 1.00 | 3,088.52 | 3,007.85 |
| tau_9[1] | 0.39 | 0.39 | 0.19 | 0.19 | 0.08 | 0.70 | 1.00 | 3,225.19 | 2,780.08 |
| tau_9[2] | −0.12 | −0.12 | 0.17 | 0.17 | −0.40 | 0.15 | 1.00 | 3,021.66 | 2,983.54 |
| tau_9[3] | 0.10 | 0.09 | 0.15 | 0.16 | −0.15 | 0.34 | 1.00 | 3,283.30 | 3,150.36 |
| tau_9[4] | 0.28 | 0.28 | 0.15 | 0.15 | 0.03 | 0.53 | 1.00 | 3,489.71 | 2,733.06 |
| tau_9[5] | 0.13 | 0.13 | 0.08 | 0.09 | −0.01 | 0.27 | 1.00 | 2,705.47 | 2,838.07 |
| tau_9[6] | −0.23 | −0.22 | 0.07 | 0.07 | −0.35 | −0.11 | 1.00 | 3,962.98 | 2,720.92 |
| tau_9[7] | 0.08 | 0.08 | 0.09 | 0.09 | −0.07 | 0.23 | 1.00 | 3,950.09 | 3,355.42 |
| tau_9[8] | −0.03 | −0.04 | 0.18 | 0.18 | −0.33 | 0.26 | 1.00 | 3,039.28 | 3,087.84 |
| tau_10[1] | 0.39 | 0.38 | 0.19 | 0.18 | 0.09 | 0.71 | 1.00 | 2,948.67 | 2,661.36 |
| tau_10[2] | −0.12 | −0.12 | 0.17 | 0.17 | −0.40 | 0.16 | 1.00 | 3,266.31 | 3,309.91 |
| tau_10[3] | 0.10 | 0.10 | 0.15 | 0.15 | −0.15 | 0.35 | 1.00 | 3,219.80 | 3,315.76 |
| tau_10[4] | 0.28 | 0.28 | 0.15 | 0.15 | 0.03 | 0.53 | 1.00 | 3,109.16 | 2,584.59 |
| tau_10[5] | 0.12 | 0.12 | 0.16 | 0.16 | −0.15 | 0.38 | 1.00 | 3,450.83 | 3,037.69 |
| tau_10[6] | −0.17 | −0.17 | 0.09 | 0.09 | −0.32 | −0.02 | 1.00 | 4,754.00 | 3,355.92 |
| tau_10[7] | 0.02 | 0.02 | 0.09 | 0.08 | −0.13 | 0.16 | 1.00 | 4,552.03 | 3,226.64 |
| tau_10[8] | −0.04 | −0.04 | 0.13 | 0.13 | −0.26 | 0.19 | 1.00 | 3,162.64 | 3,130.35 |
# The fitted model object was removed with rm() after saving; reload it from
# the saved RDS if it is no longer in the session, otherwise this pipeline
# errors on a missing object.
if (!exists("fit_missing_data_mod")) {
  fit_missing_data_mod <- readRDS("stan_models/fit_missing_data_mod.RDS")
}
# One row per posterior draw per student, joined back to the observed grades
post_long <- fit_missing_data_mod |>
  spread_draws(y10_hat[student]) |>
  inner_join(grades, by = join_by(student))
# Per-student summaries of the posterior predictive draws y10_hat: location,
# spread, a central 95% interval, and grade-band probabilities.
# NOTE(review): inside summarise() later expressions reuse earlier results
# (residuals uses y10 and y10_hat_median), so the statement order matters.
grades_pred_summary <- post_long |>
group_by(student) |>
summarise(
# Number of posterior draws for this student
n = n(),
# Observed grades are constant within a student; first() collapses them
y8 = first(y8),
y9 = first(y9),
y10 = first(y10),
y10_hat_mean = mean(y10_hat),
y10_hat_median = median(y10_hat),
# NA whenever the observed year-10 grade itself is missing
residuals = y10 - y10_hat_median,
y10_hat_sd = sd(y10_hat),
# Central 95% posterior predictive interval
lower_95 = quantile(y10_hat, 0.025),
upper_95 = quantile(y10_hat, 0.975),
# Probability of falling in each (half-open) grade band
p_fail = mean(y10_hat < 0.5),
p_1 = mean(y10_hat >= 0.5 & y10_hat < 0.6),
p_2 = mean(y10_hat >= 0.6 & y10_hat < 0.7),
p_3 = mean(y10_hat >= 0.7 & y10_hat < 0.8),
p_4 = mean(y10_hat >= 0.8),
# Cumulative ("or more") versions of the band probabilities.
# NOTE(review): p_1_om duplicates p_pass, and p_4_om duplicates p_4 —
# presumably kept for downstream column-name convenience; confirm.
p_1_om = mean(y10_hat >= 0.5),
p_2_om = mean(y10_hat >= 0.6),
p_3_om = mean(y10_hat >= 0.7),
p_4_om = mean(y10_hat >= 0.8),
p_pass = mean(y10_hat >= 0.5)
)
# Attach the per-student summaries to every draw row. Including the grade
# columns in the join key avoids duplicated y8/y9/y10 columns with .x/.y
# suffixes in the result.
post_long <- inner_join(
  post_long,
  grades_pred_summary,
  by = join_by(student, y8, y9, y10)
)
# Randomly select 25 student IDs (reproducible via the seed; slice_sample is
# kept so the RNG draws — and hence the selection — match the original)
set.seed(2025)
sampled_students <- slice_sample(grades_pred_summary, n = 25)
random_25_students <- sampled_students$student
# Which of the 25 sampled students have at least one missing value?
grades |>
  filter(
    student %in% random_25_students,
    if_any(everything(), is.na)
  )
# A tibble: 5 × 10
student teacher_8 teacher_9 teacher_10 teacher_8_id teacher_9_id teacher_10_id
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 23 B E G 2 5 7
2 27 C F F 3 6 6
3 159 C E G 3 5 7
4 193 B E F 2 5 6
5 205 D G H 4 7 8
# ℹ 3 more variables: y8 <dbl>, y9 <dbl>, y10 <dbl>
# Ridgeline plot: each sampled student's posterior predictive distribution for
# the year-10 result, coloured by grade band, with HDIs and the observed value.
post_long |>
  filter(student %in% random_25_students) |>
  ggplot(aes(x = y10_hat, y = fct_reorder(factor(student), y10_hat_mean))) +
  geom_density_ridges_gradient(
    # after_stat() replaces the deprecated ..density.. / ..x.. notation
    # (deprecated since ggplot2 3.4.0)
    aes(height = after_stat(density), fill = after_stat(x)),
    stat = "density",
    scale = 0.95,
    rel_min_height = 0.01
  ) +
  stat_pointinterval(.width = c(0.66, 0.90), point_interval = median_hdi) +
  # Observed year-10 result (NA for students whose grade is missing)
  geom_point(aes(x = y10), color = "blue", size = 2) +
  # Pass/fail threshold
  geom_vline(xintercept = 0.5, linetype = "dashed") +
  scale_x_continuous(
    breaks = seq(0, 1, 0.1), limits = c(0, 1),
    labels = scales::percent
  ) +
  # Red below the pass mark, shading to green for high achievement
  scale_fill_gradientn(
    colours = c("red", "red", "yellow", "lightgreen", "forestgreen"),
    values = c(0, 0.45, 0.6, 0.8, 1),
    limits = c(0, 1),
    guide = "none"
  ) +
  labs(
    x = "Posterior Predicted Year 10 Result",
    y = "Student",
    title = "Posterior Predictive Distributions for 25 Random Students",
    subtitle = "66% and 90% median HDI ; Blue point = true observed value"
  ) +
  theme(
    # Vertical grid lines only; the redundant global panel.grid.major setting
    # (immediately overridden per axis) has been removed
    panel.grid.major.x = element_line(color = "gray90"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank()
  )
# ggsave("posterior_plot_teacher.png", width = 8, height = 10)

Instead, as Gelman et al. (2013) recommended, we can handle missing data using a two-step process:
1. Do multiple imputation using a specialized program, such as the mice library.
2. Use brms or rstan (or other Bayesian methods) to analyze each imputed data set.

See: https://psyc-bayes-notes.netlify.app/missing-data