6  Missing Data

Code
# Load libraries
library(tidyverse)   # data wrangling (dplyr/tidyr/readr) and ggplot2
library(cmdstanr)    # interface to CmdStan for compiling/fitting Stan models
library(tidybayes)   # tidy posterior draws (spread_draws, stat_pointinterval)
library(gt)          # formatted summary tables
library(ggridges)    # ridgeline density plots

# Set ggplot theme
theme_set(theme_classic())  # applies to every plot in this chapter

Missing data can happen for many reasons in the educational context.

The surface area for missing values to occur increases as the student progresses through the educational system. A student in year 12 has more prior information than a student in year 2. Given our aim of predicting achievement, it makes sense to account for missing values and use all information available to quantify our uncertainty.

Let’s start by loading our simulated data and artificially adding NA values with 5% probability.

Code
# Read the simulated grade data and keep only the columns used in this
# chapter: student id, teacher labels/ids per year, and the year-8/9/10
# results rounded to two decimals.
grades <- read_csv(file = "data/sim_grades_teacher.csv") |>
  transmute(
    student,
    teacher_8, teacher_9, teacher_10,
    teacher_8_id, teacher_9_id, teacher_10_id,
    y8 = round(y8, 2),
    y9 = round(y9, 2),
    y10 = round(y10, 2)) |>
  # Remove zeros or ones for the Beta distribution and Stan
  # (the Beta density is undefined at the boundaries, so nudge exact
  # 0s and 1s just inside the open interval (0, 1))
  mutate(
    across(
      starts_with("y"), ~ case_when(
        .x == 0 ~ 0.001,
        .x == 1 ~ 0.999,
        TRUE ~ .x
      )))

# Probability that any single grade is set to missing
na_prob <- 0.05

# Inject missingness completely at random (MCAR): each grade column
# independently gets NA with probability `na_prob`. Using across()
# replaces three copy-pasted ifelse() calls; columns are processed in
# order (y8, y9, y10) inside the single mutate(), so the stream of
# runif() draws -- and therefore the seeded result -- is identical to
# the column-by-column version.
set.seed(2025)
grades <- grades |>
  mutate(
    across(
      starts_with("y"),
      ~ ifelse(runif(n()) < na_prob, NA, .x)
    )
  )

# Quick check: distribution and NA counts per year
grades |>
  select(starts_with("y")) |>
  summary()
       y8               y9              y10        
 Min.   :0.4200   Min.   :0.1100   Min.   :0.0400  
 1st Qu.:0.7700   1st Qu.:0.6800   1st Qu.:0.5525  
 Median :0.8600   Median :0.8000   Median :0.7700  
 Mean   :0.8355   Mean   :0.7644   Mean   :0.7055  
 3rd Qu.:0.9300   3rd Qu.:0.9000   3rd Qu.:0.8700  
 Max.   :0.9900   Max.   :0.9900   Max.   :0.9990  
 NA's   :13       NA's   :15       NA's   :8       

We see that there are several missing values for each year. Keep in mind that these missing values were assigned completely at random, which is unlikely to be true in reality. For example, a student's recent migration can influence both the probability of a missing grade and the grade itself, so real missingness is rarely independent of achievement.

6.1 Treat Missing Values As Unknown Parameters

Code
# Observed-value masks, computed once per year instead of repeating
# !is.na(...) three times for each of count / index / value slots.
obs_8  <- !is.na(grades$y8)
obs_9  <- !is.na(grades$y9)
obs_10 <- !is.na(grades$y10)

# Data list for Stan. Missing grades are treated as parameters on the
# Stan side, so only the observed values and their row indices are sent.
data_list <- list(
  N = nrow(grades),
  # Total number of distinct teachers across all three years
  J = length(unique(c(grades$teacher_8_id, grades$teacher_9_id, grades$teacher_10_id))),

  teacher_8 = grades$teacher_8_id,
  teacher_9 = grades$teacher_9_id,
  teacher_10 = grades$teacher_10_id,

  # Year 8 grades enter the model as a covariate (x)
  N_x8_obs = sum(obs_8),
  x8_idx_obs = which(obs_8),
  x8_obs = grades$y8[obs_8],

  # Year 9 grades likewise
  N_x9_obs = sum(obs_9),
  x9_idx_obs = which(obs_9),
  x9_obs = grades$y9[obs_9],

  # Year 10 grades are the outcome (y)
  N_y10_obs = sum(obs_10),
  y10_idx_obs = which(obs_10),
  y10_obs = grades$y10[obs_10]
)
Code
# Compile the Stan Model
# Compile the Stan model
missing_data_mod <- cmdstan_model(
  stan_file = "stan_models/missing_data_model.stan"
  )

# Fit the model. The argument is spelled out as `data` in full: the
# original `dat =` only worked through R's partial argument matching.
fit_missing_data_mod <- missing_data_mod$sample(
  data = data_list,
  seed = 2025
  )

# Save the fitted model so later chunks can reload it without re-sampling
fit_missing_data_mod$save_object(file = "stan_models/fit_missing_data_mod.RDS")

# CmdStan's built-in diagnostics (divergences, treedepth, E-BFMI, R-hat)
fit_missing_data_mod$cmdstan_diagnose()

# Drop the in-memory objects; the saved RDS is reloaded in the next chunk
rm("missing_data_mod", "fit_missing_data_mod")
Code
# Load the Stan model (fit object saved to disk by the sampling chunk)
fit_missing_data_mod <- readRDS("stan_models/fit_missing_data_mod.RDS")
Code
# Chain-level sampler diagnostics: divergences, max-treedepth hits, E-BFMI
fit_missing_data_mod$diagnostic_summary()
$num_divergent
[1] 0 0 0 0

$num_max_treedepth
[1] 0 0 0 0

$ebfmi
[1] 1.0176588 0.9743219 1.0268019 0.9228442
Code
# Per-draw sampler diagnostics as a draws_df (treedepth, divergence, energy, ...)
fit_missing_data_mod$sampler_diagnostics(format = "df")
# A draws_df: 1000 iterations, 4 chains, and 6 variables
   treedepth__ divergent__ energy__ accept_stat__ stepsize__ n_leapfrog__
1            4           0     -371          0.95       0.26           15
2            4           0     -378          0.98       0.26           15
3            4           0     -382          0.72       0.26           15
4            5           0     -379          0.78       0.26           31
5            4           0     -377          0.95       0.26           15
6            4           0     -378          0.91       0.26           15
7            5           0     -384          0.95       0.26           31
8            4           0     -384          0.79       0.26           15
9            4           0     -383          0.95       0.26           31
10           4           0     -385          0.97       0.26           15
# ... with 3990 more draws
# ... hidden reserved variables {'.chain', '.iteration', '.draw'}
Code
# Posterior summary (mean, median, sd, mad, quantiles, R-hat, ESS) for
# the key model parameters, rendered as a gt table with two decimals.
params_of_interest <- c(
  "alpha8", "alpha9", "alpha10",
  "phi8", "phi9", "phi10",
  "beta1", "beta2", "beta3",
  "zeta",
  "tau_8", "tau_9", "tau_10"
)

fit_missing_data_mod$summary(variables = params_of_interest) |>
  gt() |>
  fmt_number(columns = where(is.numeric), decimals = 2)
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
alpha8 1.41 1.41 0.03 0.03 1.36 1.46 1.00 4,284.92 3,373.34
alpha9 1.20 1.20 0.03 0.03 1.15 1.25 1.00 4,528.76 3,185.14
alpha10 0.95 0.95 0.03 0.03 0.90 1.00 1.00 5,886.07 3,056.30
phi8 9.17 9.15 0.81 0.80 7.82 10.51 1.00 4,923.34 3,205.91
phi9 5.79 5.79 0.48 0.47 5.03 6.61 1.00 5,820.20 3,097.43
phi10 3.58 3.58 0.30 0.29 3.12 4.10 1.00 5,478.89 3,050.12
beta1 4.63 4.58 1.29 1.30 2.58 6.80 1.00 4,486.62 2,879.51
beta2 1.93 1.90 0.88 0.92 0.52 3.42 1.00 3,407.02 1,688.83
beta3 4.59 4.53 1.36 1.31 2.46 6.86 1.00 4,118.10 2,283.64
zeta[1] 0.39 0.38 0.16 0.16 0.13 0.66 1.00 2,695.41 2,424.11
zeta[2] −0.12 −0.12 0.14 0.14 −0.34 0.11 1.00 2,640.87 3,038.87
zeta[3] 0.10 0.10 0.12 0.12 −0.09 0.29 1.00 2,804.22 3,134.81
zeta[4] 0.28 0.28 0.12 0.12 0.09 0.46 1.00 2,803.96 2,366.86
zeta[5] 0.12 0.12 0.12 0.12 −0.08 0.31 1.00 2,919.75 2,683.19
zeta[6] −0.19 −0.19 0.09 0.09 −0.34 −0.04 1.00 3,232.45 2,848.76
zeta[7] 0.05 0.05 0.09 0.10 −0.11 0.20 1.00 3,666.73 3,147.67
zeta[8] −0.03 −0.04 0.15 0.15 −0.29 0.22 1.00 2,903.86 2,953.83
tau_8[1] 0.43 0.43 0.14 0.14 0.21 0.67 1.00 2,927.19 2,964.60
tau_8[2] −0.14 −0.13 0.11 0.11 −0.32 0.05 1.00 3,085.46 2,941.72
tau_8[3] 0.11 0.11 0.08 0.08 −0.02 0.24 1.00 2,669.63 3,247.33
tau_8[4] 0.31 0.31 0.07 0.07 0.19 0.43 1.00 2,508.82 2,496.61
tau_8[5] 0.12 0.12 0.15 0.15 −0.14 0.37 1.00 3,443.50 2,897.22
tau_8[6] −0.19 −0.19 0.13 0.13 −0.41 0.04 1.00 4,530.75 3,134.58
tau_8[7] 0.05 0.05 0.14 0.13 −0.18 0.27 1.00 4,485.29 3,604.99
tau_8[8] −0.03 −0.03 0.18 0.18 −0.34 0.27 1.00 3,088.52 3,007.85
tau_9[1] 0.39 0.39 0.19 0.19 0.08 0.70 1.00 3,225.19 2,780.08
tau_9[2] −0.12 −0.12 0.17 0.17 −0.40 0.15 1.00 3,021.66 2,983.54
tau_9[3] 0.10 0.09 0.15 0.16 −0.15 0.34 1.00 3,283.30 3,150.36
tau_9[4] 0.28 0.28 0.15 0.15 0.03 0.53 1.00 3,489.71 2,733.06
tau_9[5] 0.13 0.13 0.08 0.09 −0.01 0.27 1.00 2,705.47 2,838.07
tau_9[6] −0.23 −0.22 0.07 0.07 −0.35 −0.11 1.00 3,962.98 2,720.92
tau_9[7] 0.08 0.08 0.09 0.09 −0.07 0.23 1.00 3,950.09 3,355.42
tau_9[8] −0.03 −0.04 0.18 0.18 −0.33 0.26 1.00 3,039.28 3,087.84
tau_10[1] 0.39 0.38 0.19 0.18 0.09 0.71 1.00 2,948.67 2,661.36
tau_10[2] −0.12 −0.12 0.17 0.17 −0.40 0.16 1.00 3,266.31 3,309.91
tau_10[3] 0.10 0.10 0.15 0.15 −0.15 0.35 1.00 3,219.80 3,315.76
tau_10[4] 0.28 0.28 0.15 0.15 0.03 0.53 1.00 3,109.16 2,584.59
tau_10[5] 0.12 0.12 0.16 0.16 −0.15 0.38 1.00 3,450.83 3,037.69
tau_10[6] −0.17 −0.17 0.09 0.09 −0.32 −0.02 1.00 4,754.00 3,355.92
tau_10[7] 0.02 0.02 0.09 0.08 −0.13 0.16 1.00 4,552.03 3,226.64
tau_10[8] −0.04 −0.04 0.13 0.13 −0.26 0.19 1.00 3,162.64 3,130.35
Code
# Long table of posterior draws: one row per (draw, student), with the
# student's observed data joined on.
post_long <- fit_missing_data_mod |>
  spread_draws(y10_hat[student]) |>
  inner_join(grades, by = join_by(student))

# Per-student summaries of the posterior predictive year-10 result
grades_pred_summary <- post_long |>
  group_by(student) |>
  summarise(
    n = n(),                          # posterior draws per student
    y8 = first(y8),                   # observed grades (constant within student)
    y9 = first(y9),
    y10 = first(y10),
    y10_hat_mean = mean(y10_hat),
    y10_hat_median = median(y10_hat),
    residuals = y10 - y10_hat_median, # NA whenever y10 itself is missing
    y10_hat_sd = sd(y10_hat),
    lower_95 = quantile(y10_hat, 0.025),
    upper_95 = quantile(y10_hat, 0.975),
    # Posterior probability of landing in each grade band
    p_fail = mean(y10_hat < 0.5),
    p_1 = mean(y10_hat >= 0.5 & y10_hat < 0.6),
    p_2 = mean(y10_hat >= 0.6 & y10_hat < 0.7),
    p_3 = mean(y10_hat >= 0.7 & y10_hat < 0.8),
    p_4 = mean(y10_hat >= 0.8),
    # "Or more" (cumulative) versions of the band probabilities
    p_1_om = mean(y10_hat >= 0.5),
    p_2_om = mean(y10_hat >= 0.6),
    p_3_om = mean(y10_hat >= 0.7),
    p_4_om = mean(y10_hat >= 0.8),    # NOTE(review): identical to p_4
    p_pass = mean(y10_hat >= 0.5)     # NOTE(review): identical to p_1_om
  )

# Attach the per-student summaries back onto the draw-level table.
# The y8/y9/y10 keys contain NAs; dplyr joins match NA to NA by default
# (na_matches = "na"), so rows with missing grades are retained.
post_long <- post_long |>
  inner_join(grades_pred_summary, by = join_by(student, y8, y9, y10))

# Randomly select 25 student IDs
set.seed(2025)
random_25_students <- grades_pred_summary |>
  slice_sample(n = 25) |>
  pull(student)

# Of the 25 sampled students, show those with at least one missing value
grades |>
  filter(student %in% random_25_students)  |>
  filter(if_any(everything(), is.na))
# A tibble: 5 × 10
  student teacher_8 teacher_9 teacher_10 teacher_8_id teacher_9_id teacher_10_id
    <dbl> <chr>     <chr>     <chr>             <dbl>        <dbl>         <dbl>
1      23 B         E         G                     2            5             7
2      27 C         F         F                     3            6             6
3     159 C         E         G                     3            5             7
4     193 B         E         F                     2            5             6
5     205 D         G         H                     4            7             8
# ℹ 3 more variables: y8 <dbl>, y9 <dbl>, y10 <dbl>
Code
# Ridgeline posterior predictive distributions for the 25 sampled
# students, ordered by posterior mean and shaded by predicted value;
# the observed year-10 result is overlaid as a blue point (students
# with a missing y10 get no point).
post_long |>
  filter(student %in% random_25_students) |>
  ggplot(aes(x = y10_hat, y = fct_reorder(factor(student), y10_hat_mean))) +
  geom_density_ridges_gradient(
    # after_stat() replaces the ..density.. / ..x.. pronouns, which are
    # deprecated since ggplot2 3.4.0
    aes(height = after_stat(density), fill = after_stat(x)),
    stat = "density",
    scale = 0.95,
    rel_min_height = 0.01
  ) +
  stat_pointinterval(.width = c(0.66, 0.90), point_interval = median_hdi) +
  geom_point(aes(x = y10), color = "blue", size = 2) +
  # Dashed line at the pass mark
  geom_vline(xintercept = 0.5, linetype = "dashed") +
  scale_x_continuous(
    breaks = seq(0, 1, 0.1), limits = c(0, 1),
    labels = scales::percent
  ) +
  # Red below the pass mark, shading through yellow to green above it
  scale_fill_gradientn(
    colours = c("red", "red", "yellow", "lightgreen", "forestgreen"),
    values = c(0, 0.45, 0.6, 0.8, 1),
    limits = c(0, 1),
    guide = "none"
  ) +
  labs(
    x = "Posterior Predicted Year 10 Result",
    y = "Student",
    title = "Posterior Predictive Distributions for 25 Random Students",
    subtitle = "66% and 90% median HDI ; Blue point = true observed value"
  ) +
  # Keep vertical gridlines only. The original set panel.grid.major and
  # panel.grid.major.x to the same value (redundant) and ended with a
  # trailing comma that only parses thanks to rlang's tolerance.
  theme(
    panel.grid.major.x = element_line(color = "gray90"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank()
  )

#ggsave("posterior_plot_teacher.png", width = 8, height = 10)
Figure 6.1: geom_density_ridges_gradient()

6.2 Impute Data First

Instead, as Gelman et al. (2013) recommended, we can handle missing data using a two-step process:

1. Do multiple imputation using a specialized program. 2. Use brms or rstan (or other Bayesian methods) to analyze each imputed data set.

mice library

https://psyc-bayes-notes.netlify.app/missing-data