3  Simulated Data

A great way to start modeling data is to simulate artificial data that couldn’t be distinguished from observed data.

Code
# Load libraries
library(tidyverse)
library(GGally)
library(patchwork)
library(ggpmisc)

# Set ggplot theme
theme_set(theme_classic())

# Convert a proportion (e.g. 0.825) to a percentage (82.5),
# rounded to `digits` decimal places.
to_percent <- function(x, digits = 2){
  round(100 * x, digits = digits)
}

3.1 Simple Generative Model (DAG A)

Given all our prior knowledge about the data-generating process of school grades, let’s try to write down the model from Figure 2.1 both in math and then in code to simulate data.

\[x_{8i} \sim \text{Beta}(0.8 \cdot 10, (1-0.8) \cdot 10)\] \[\mu_{9i} = x_{8i} - 0.03\] \[x_{9i} \sim N(\mu_{9i}, 0.02)\] \[\mu_{10i} = x_{9i} - 0.05\] \[y_{10i} \sim N(\mu_{10i}, 0.02)\] There’s a lot to unpack here. The first equation says that year 8 results follow a Beta distribution with a mean \(\mu\) of 80% and a precision parameter \(\phi\) of 10. Let’s visualize this distribution for different values of \(\phi \in [10, 15, 20, 25, 30]\).

Code
# Theoretical Beta densities for a fixed mean of 0.8 and several precision
# values phi. Building the layers in a loop avoids five copy-pasted
# stat_function() calls; the phi = 10 curve is highlighted in orange,
# the others use the default black.
phi_vals <- c(10, 15, 20, 25, 30)
beta_layers <- lapply(phi_vals, function(phi) {
  stat_function(
    fun = dbeta,
    args = list(shape1 = 0.8 * phi, shape2 = (1 - 0.8) * phi),
    color = if (phi == 10) "darkorange" else "black"
  )
})

Reduce(`+`, beta_layers, ggplot()) +
  scale_x_continuous(breaks = seq(0, 1, 0.1), labels = scales::percent) +
  labs(
    x = "Theoretical Distribution of year 8 Math Results",
    y = "Probability Density"
  )
Figure 3.1: Theoretical distribution of year 8 math results.

We see that the orange curve in Figure 3.1 above with a precision parameter of 10 is a reasonable starting point. Very few students fail year 8 math.

We can generate random numbers from this distribution using the R function rbeta(). The second equation simply shifts the year 8 results by 3% to reflect our knowledge that high school is typically more difficult. The third equation states that year 9 results are normally distributed with a mean that’s three percent lower than their result in year 8 and a standard deviation of 2%. It acts as random noise and we know that roughly 95% of the observations will be \(\pm4\)% of the mean. Note that we assume a 5% drop between year 9 and year 10 since year 10 is the first academic stream course.

Let’s simulate 250 observations using our mathematical model.

Code
set.seed(2025)

N <- 250

# DAG A: year 8 drives year 9, which drives year 10.
# The RNG calls happen in the same order as before (rbeta, rnorm, rnorm),
# so the same seed reproduces the same draws.
sim_grades <- tibble(
  y8 = rbeta(n = N, shape1 = 0.8 * 10, shape2 = (1 - 0.8) * 10)
) |>
  mutate(
    mu9 = y8 - 0.03,
    y9 = rnorm(n = N, mean = mu9, sd = 0.02),
    mu10 = y9 - 0.05,
    y10 = rnorm(n = N, mean = mu10, sd = 0.02)
  )
Code
# Reshape to one row per (student, year), then overlay the three densities.
long_grades <- sim_grades |>
  pivot_longer(
    cols = starts_with("y"),
    names_to = "year",
    values_to = "grade"
  ) |>
  mutate(
    year = factor(
      year,
      levels = c("y8", "y9", "y10"),
      labels = c("year 8", "MTH1W", "MPM2D")
    )
  )

ggplot(long_grades, aes(x = grade, fill = year)) +
  geom_density(alpha = 0.2) +
  geom_vline(xintercept = 0.5, linetype = "dashed") +
  scale_x_continuous(breaks = seq(0, 1, 0.1), labels = scales::percent) +
  labs(
    x = "Simulated Grades",
    y = "Probability Density"
  )
Figure 3.2: Overlapping densities of simulated data.
Code
# Custom lower panel for ggpairs: open-circle points, an OLS fit,
# the y = x identity line, and the fitted equation as a label.
lower_fun <- function(data, mapping, ...) {
  # Shared percent-scale breaks for both axes
  pct_breaks <- seq(-1, 1, 0.25)
  ggplot(data = data, mapping = mapping) +
    geom_point(shape = 1, alpha = 0.4, size = 1.5) +
    geom_smooth(method = "lm", formula = 'y ~ x', se = FALSE, color = "steelblue") +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
    stat_poly_eq(
      formula = y ~ x,
      aes(label = after_stat(eq.label)),
      parse = TRUE,
      size = 3
    ) +
    scale_x_continuous(labels = scales::percent, breaks = pct_breaks) +
    scale_y_continuous(labels = scales::percent, breaks = pct_breaks)
}

# Pair plot of the simulated grade columns
sim_grades |>
  select(starts_with("y")) |>
  ggpairs(lower = list(continuous = lower_fun))
Figure 3.3: Summary statistics of the simulated data.
Code
# Five-number summaries of every simulated column
summary(sim_grades)
       y8              mu9               y9              mu10       
 Min.   :0.3812   Min.   :0.3512   Min.   :0.3350   Min.   :0.2850  
 1st Qu.:0.6945   1st Qu.:0.6645   1st Qu.:0.6683   1st Qu.:0.6183  
 Median :0.8145   Median :0.7845   Median :0.7819   Median :0.7319  
 Mean   :0.7876   Mean   :0.7576   Mean   :0.7571   Mean   :0.7071  
 3rd Qu.:0.8886   3rd Qu.:0.8586   3rd Qu.:0.8651   3rd Qu.:0.8151  
 Max.   :0.9874   Max.   :0.9574   Max.   :0.9870   Max.   :0.9370  
      y10        
 Min.   :0.3091  
 1st Qu.:0.6189  
 Median :0.7307  
 Mean   :0.7077  
 3rd Qu.:0.8097  
 Max.   :0.9518  

The distributions in Figure 3.2 roughly match my experience. All of them are left skewed with few students failing. As expected, we see that the correlation between year 9 and year 10 is stronger than the one between year 8 and year 10. That said, both correlations are very strong so we can add a bit more noise to the process.

The noise problem can easily be addressed, but, as we are about to see, the fact that the normal distribution allows grades bigger than 100% is a fatal issue for this model. We'll see how we can address this problem in Section 3.2. Let's change the standard deviations to 5% instead of 2% and see what we get.

Code
set.seed(2025)

# Same generative model as before, but the normal noise grows from
# sd = 0.02 to sd = 0.05 to weaken the year-to-year correlations.
sim_grades <- tibble(
  y8 = rbeta(n = N, shape1 = 0.8 * 10, shape2 = (1 - 0.8) * 10)
) |>
  mutate(
    mu9 = y8 - 0.03,
    y9 = rnorm(n = N, mean = mu9, sd = 0.05),
    mu10 = y9 - 0.05,
    y10 = rnorm(n = N, mean = mu10, sd = 0.05)
  )

# Long format: one row per (student, year), then overlay the densities.
long_grades <- sim_grades |>
  pivot_longer(
    cols = starts_with("y"),
    names_to = "year",
    values_to = "grade"
  ) |>
  mutate(
    year = factor(
      year,
      levels = c("y8", "y9", "y10"),
      labels = c("year 8", "MTH1W", "MPM2D")
    )
  )

long_grades |>
  ggplot(aes(x = grade, fill = year)) +
  geom_density(alpha = 0.2) +
  geom_vline(xintercept = 0.5, linetype = "dashed") +
  scale_x_continuous(breaks = seq(0, 1, 0.1), labels = scales::percent) +
  labs(
    x = "Simulated Grades",
    y = "Probability Density"
  )

Code
# Pair plot of the grade columns with the custom lower panel
ggpairs(
  select(sim_grades, starts_with("y")),
  lower = list(continuous = lower_fun)
)

Code
# Five-number summaries of every simulated column
summary(sim_grades)
       y8              mu9               y9              mu10       
 Min.   :0.3812   Min.   :0.3512   Min.   :0.3107   Min.   :0.2607  
 1st Qu.:0.6945   1st Qu.:0.6645   1st Qu.:0.6645   1st Qu.:0.6145  
 Median :0.8145   Median :0.7845   Median :0.7739   Median :0.7239  
 Mean   :0.7876   Mean   :0.7576   Mean   :0.7564   Mean   :0.7064  
 3rd Qu.:0.8886   3rd Qu.:0.8586   3rd Qu.:0.8640   3rd Qu.:0.8140  
 Max.   :0.9874   Max.   :0.9574   Max.   :1.0609   Max.   :1.0109  
      y10        
 Min.   :0.2674  
 1st Qu.:0.6017  
 Median :0.7296  
 Mean   :0.7079  
 3rd Qu.:0.8159  
 Max.   :1.0948  

We see that our correlations are more realistic. However, more students in year 9 and 10 received grades over 100%. The top student in year 9 finished with 106% while the top student in year 10 finished with 109%! This is simply because, unlike the beta distribution, the normal distribution is not bounded between zero and one.

3.2 A More Complex Generative Model (DAG A)

Since we need data to be bounded, let’s use a beta distribution to simulate the artificial data for all three grades.

\[x_{8i} \sim \text{Beta}(0.8 \cdot \phi, \, (1 - 0.8) \cdot \phi)\]

\[\mu_{9i} = x_{8i} - 0.03\] \[x_{9i} \sim \text{Beta}(\mu_{9i} \cdot \phi, \, (1 - \mu_{9i}) \cdot \phi)\] \[\mu_{10i} = x_{9i} - 0.05\] \[y_{10i} \sim \text{Beta}(\mu_{10i} \cdot \phi, \, (1 - \mu_{10i}) \cdot \phi)\]

Note that in this model, year 10 results only depend on year 9 results which themselves depend on year 8 results. There's no direct link between year 8 and year 10 grades, exactly as in DAG A of Figure 2.1.

Code
set.seed(2025)

# All three grades now come from Beta distributions, so every simulated
# result is guaranteed to stay inside [0, 1].
phi <- 10  # shared precision parameter
sim_grades <- tibble(
  y8 = rbeta(n = N, shape1 = 0.8 * phi, shape2 = (1 - 0.8) * phi)
) |>
  mutate(
    mu9 = y8 - 0.03,
    y9 = rbeta(n = N, shape1 = mu9 * phi, shape2 = (1 - mu9) * phi),
    mu10 = y9 - 0.05,
    y10 = rbeta(n = N, shape1 = mu10 * phi, shape2 = (1 - mu10) * phi)
  )

# Long format for the overlaid density plot
long_grades <- sim_grades |>
  pivot_longer(
    cols = starts_with("y"),
    names_to = "year",
    values_to = "grade"
  ) |>
  mutate(
    year = factor(
      year,
      levels = c("y8", "y9", "y10"),
      labels = c("year 8", "MTH1W", "MPM2D")
    )
  )

long_grades |>
  ggplot(aes(x = grade, fill = year)) +
  geom_density(alpha = 0.2) +
  geom_vline(xintercept = 0.5, linetype = "dashed") +
  scale_x_continuous(breaks = seq(0, 1, 0.1), labels = scales::percent) +
  labs(
    x = "Simulated Grades",
    y = "Probability Density"
  )

Code
# Pair plot of the grade columns with the custom lower panel
ggpairs(
  select(sim_grades, starts_with("y")),
  lower = list(continuous = lower_fun)
)

Code
# Five-number summaries of every simulated column
summary(sim_grades)
       y8              mu9               y9              mu10       
 Min.   :0.3812   Min.   :0.3512   Min.   :0.1973   Min.   :0.1473  
 1st Qu.:0.6945   1st Qu.:0.6645   1st Qu.:0.6414   1st Qu.:0.5914  
 Median :0.8145   Median :0.7845   Median :0.8014   Median :0.7514  
 Mean   :0.7876   Mean   :0.7576   Mean   :0.7596   Mean   :0.7096  
 3rd Qu.:0.8886   3rd Qu.:0.8586   3rd Qu.:0.9092   3rd Qu.:0.8592  
 Max.   :0.9874   Max.   :0.9574   Max.   :0.9985   Max.   :0.9485  
      y10        
 Min.   :0.0380  
 1st Qu.:0.5845  
 Median :0.7480  
 Mean   :0.7132  
 3rd Qu.:0.8848  
 Max.   :0.9999  

We’ve fixed the issue of students getting over 100%. The correlations are realistic. What doesn’t match my prior knowledge is the shape of the distribution of grades. The peaks of the year 9 and 10 distributions are in the nineties which is even higher than for year 8 students. Ideally, the mode of the distributions would shift to the left with each subsequent year instead of stretching its left tail to achieve a lower mean.

3.4 Linear Link (DAG B)

Let’s add the term \(\beta_3 \cdot (x_{9i} - \bar{x}_9)\) to Equation 3.1 to introduce a direct effect of year 8 on year 10 as depicted in Figure 2.3.

\[x_{8i} \sim \text{Beta}(0.8 \cdot \phi, \, (1 - 0.8) \cdot \phi)\]

\[\mu_{9i} = \alpha_9 + \beta_1 \cdot (x_{8i} - \bar{x}_8)\]

\[x_{9i} \sim \text{Beta}(\mu_{9i} \cdot \phi_9,\ (1 - \mu_{9i}) \cdot \phi_9)\]

\[\mu_{10i} = \alpha_{10} + \beta_2 \cdot (x_{8i} - \bar{x}_8) + \beta_3 \cdot (x_{9i} - \bar{x}_9)\]

\[y_{10i} \sim \text{Beta}(\mu_{10i} \cdot \phi_{10},\ (1 - \mu_{10i}) \cdot \phi_{10})\] We see that \(\mu_{10}\) now directly depends on both year 9 and year 8 results. We can also interpret the \(\beta\) slope parameters as causal effects if we assume that DAG B is the true data-generating model. Here’s how to interpret the coefficients :

  1. The direct effect of year 8 on year 9 is \(\beta_1\).
  2. The direct effect of year 8 on year 10 is \(\beta_2\).
  3. The direct effect of year 9 on year 10 is \(\beta_3\).
  4. The indirect effect of year 8 on year 10 is \(\beta_1 \times \beta_3\).
  5. Total effect of year 8 on year 10 is \(\beta_2 + \beta_1 \times \beta_3\).

The difficult thing is to set reasonable priors for these \(\beta\) parameters. We expect for example that grades are positively correlated and, hence, the slopes should be positive. We also expect the direct effect of year 8 on year 9, \(\beta_1\), to be close to one. We expect the direct effect of year 8 on year 10, \(\beta_2\), to be relatively small since the total effect of year 8 on year 10 should be mostly mediated by year 9.

Let’s consider a student who finished with 70% in year 8 and 67% in year 9. We expect this student to score around 62% in year 10. We can use the code below to try out different values of \(\beta_2\) and \(\beta_3\) to see which range of parameters produces reasonable results.

Code
# Constants of the linear link model
mean_x8 <- 0.80
mean_x9 <- 0.77
alpha_10 <- 0.72

# Candidate slope values to scan over
beta_2 <- seq(0, 2, by = 0.01)
beta_3 <- seq(0, 2, by = 0.01)

# Year 8 grades to explore (0.80 is deliberately absent: it is the year 8
# mean, so mu10 would collapse to alpha_10 regardless of the slopes)
x8_vals <- c(0.55, 0.6, 0.65, 0.7, 0.75, 0.85, 0.90, 0.95, 1)

# Build one heatmap of plausible (beta_2, beta_3) pairs for a given x8
generate_plot <- function(x8) {
  # Deterministic expectations under the simple drop model
  x9 <- x8 - 0.03
  x10_expected <- x9 - 0.05

  # Evaluate mu10 on the full slope grid and keep only combinations
  # within +/- 10 points of the expected grade, with beta_3 > beta_2
  param_grid <- expand.grid(beta_2 = beta_2, beta_3 = beta_3) |>
    mutate(
      mu10 = alpha_10 + beta_2 * (x8 - mean_x8) + beta_3 * (x9 - mean_x9)
    ) |>
    filter(
      mu10 >= x10_expected - 0.1,
      mu10 <= x10_expected + 0.1,
      beta_3 > beta_2
    )

  ggplot(param_grid, aes(x = beta_2, y = beta_3)) +
    geom_tile(aes(fill = mu10)) +
    geom_contour(
      aes(z = mu10),
      breaks = x10_expected + 0.02 * c(-1, 1),
      color = "red",
      linewidth = 1
    ) +
    geom_abline() +
    scale_fill_viridis_c(labels = scales::percent_format(accuracy = 1)) +
    labs(
      title = paste0("x8 = ", scales::percent(x8, accuracy = 1)),
      x = expression(beta[2]),
      y = expression(beta[3]),
      fill = expression(mu[10])
    )
}

# One panel per x8 value, laid out in a 3-column grid
plots <- map(x8_vals, generate_plot)
wrap_plots(plots, ncol = 3)
Figure 3.7: Visualizing plausible values for the slope parameters.

Why did we skip 80% in Figure 3.7? It’s because 80% is the average of year 8 so we’d only output the expected grade in year 10 of 72%. The previous plot tells us that plausible values for \(\beta_2\) are between 0 and 0.5 and the ones for \(\beta_3\) are between 0.5 and 1.5. Thus, we could set the following priors for \(\beta_2\) and \(\beta_3\).

  • \(\beta_2 \sim N(\mu = 0.25, \sigma = 0.1)\)
  • \(\beta_3 \sim N(\mu = 1, \sigma = 0.3)\)
Code
# Densities of the two slope priors on the same axes:
# beta_2 ~ N(0.25, 0.1) and beta_3 ~ N(1, 0.3)
ggplot() +
  stat_function(fun = dnorm, args = list(mean = 0.25, sd = 0.1), xlim = c(0, 2)) +
  stat_function(fun = dnorm, args = list(mean = 1, sd = 0.3), xlim = c(0, 2)) +
  labs(
    x = "Beta",
    y = "Probability Density"
  )

Code
set.seed(2025)

# Linear link (DAG B): mu9 and mu10 are centered linear predictors, and
# year 10 depends directly on BOTH year 8 (slope 0.2) and year 9 (slope 0.9).
sim_grades <- tibble(
  y8 = rbeta(n = N, shape1 = 0.8 * 10, shape2 = (1 - 0.8) * 10)
) |>
  mutate(
    mu9 = 0.77 + 0.9 * (y8 - mean(y8)),
    y9 = rbeta(n = N, shape1 = mu9 * 10, shape2 = (1 - mu9) * 10),
    mu10 = 0.72 + 0.2 * (y8 - mean(y8)) + 0.9 * (y9 - mean(y9)),
    y10 = rbeta(n = N, shape1 = mu10 * 10, shape2 = (1 - mu10) * 10)
  )

# Long format for the overlaid density plot
long_grades <- sim_grades |>
  pivot_longer(
    cols = starts_with("y"),
    names_to = "year",
    values_to = "grade"
  ) |>
  mutate(
    year = factor(
      year,
      levels = c("y8", "y9", "y10"),
      labels = c("year 8", "MTH1W", "MPM2D")
    )
  )

long_grades |>
  ggplot(aes(x = grade, fill = year)) +
  geom_density(alpha = 0.2) +
  geom_vline(xintercept = 0.5, linetype = "dashed") +
  scale_x_continuous(breaks = seq(0, 1, 0.1), labels = scales::percent) +
  labs(
    x = "Simulated Grades",
    y = "Probability Density"
  )

Code
# Pair plot of the grade columns with the custom lower panel
ggpairs(
  select(sim_grades, starts_with("y")),
  lower = list(continuous = lower_fun)
)

Code
# Five-number summaries of every simulated column
summary(sim_grades)
       y8              mu9               y9              mu10       
 Min.   :0.3812   Min.   :0.4042   Min.   :0.2155   Min.   :0.1773  
 1st Qu.:0.6945   1st Qu.:0.6863   1st Qu.:0.6690   1st Qu.:0.6108  
 Median :0.8145   Median :0.7943   Median :0.8123   Median :0.7548  
 Mean   :0.7876   Mean   :0.7700   Mean   :0.7747   Mean   :0.7200  
 3rd Qu.:0.8886   3rd Qu.:0.8610   3rd Qu.:0.9110   3rd Qu.:0.8608  
 Max.   :0.9874   Max.   :0.9499   Max.   :0.9982   Max.   :0.9554  
      y10         
 Min.   :0.04241  
 1st Qu.:0.59311  
 Median :0.77903  
 Mean   :0.72853  
 3rd Qu.:0.89926  
 Max.   :0.99979  

The simulated data from this model produces realistic results which gives us more confidence of our choice of priors.

3.6 Logit Link (DAG B)

Let’s add the term \(\beta_3 \cdot (x_{9i} - \bar{x}_9)\) to Equation 3.3 to introduce a direct effect of year 8 on year 10 as depicted in Figure 2.3.

\[x_{8i} \sim \text{Beta}(0.8 \cdot \phi, \, (1 - 0.8) \cdot \phi)\]

\[\text{logit}(\mu_{9i}) = \alpha_9 + \beta_1 \cdot (x_{8i} - \bar{x}_8)\]

\[x_{9i} \sim \text{Beta}(\mu_{9i} \cdot \phi_9,\ (1 - \mu_{9i}) \cdot \phi_9)\]

\[\text{logit}(\mu_{10i}) = \alpha_{10} + \beta_2 \cdot (x_{8i} - \bar{x}_8) + \beta_3 \cdot (x_{9i} - \bar{x}_9)\]

\[y_{10i} \sim \text{Beta}(\mu_{10i} \cdot \phi_{10},\ (1 - \mu_{10i}) \cdot \phi_{10})\] How to set reasonable values for \(\beta_2\) and \(\beta_3\)?

Code
# Reference student and model constants for the logit link
x8 <- 0.71
x9 <- 0.68
mean_x8 <- 0.80
mean_x9 <- 0.77
alpha_10 <- 0.94

# Candidate slope values (coarser grid: logit-scale slopes are larger)
beta_2 <- seq(0, 10, by = 0.5)
beta_3 <- seq(0, 10, by = 0.5)

# Every slope combination, with mu10 computed on the logit scale and
# mapped back to a probability via the inverse logit
grid <- expand.grid(beta_2 = beta_2, beta_3 = beta_3) |>
  mutate(
    logit_mu10 = alpha_10 + beta_2 * (x8 - mean_x8) + beta_3 * (x9 - mean_x9),
    mu10 = plogis(logit_mu10)  # inverse logit
  )

# Keep combinations placing mu10 in a plausible band, with beta_3 > beta_2
plausible_betas <- grid |>
  filter(mu10 <= 0.68, mu10 >= 0.58, beta_3 > beta_2)

# Heatmap with contours at the edges of the region mu10 in [0.58, 0.68]
ggplot(grid, aes(x = beta_2, y = beta_3, fill = mu10)) +
  geom_tile() +
  geom_contour(aes(z = mu10), breaks = c(0.58, 0.68), color = "white") +
  geom_abline(color = "white") +
  scale_fill_viridis_c(labels = scales::percent_format(accuracy = 1)) +
  labs(
    title = expression(paste("Prior Predictive: ", mu[10], " as a function of ", beta[2], " and ", beta[3])),
    x = expression(beta[2]),
    y = expression(beta[3]),
    fill = expression(mu[10])
  ) +
  theme_minimal()

Code
summary(plausible_betas)
     beta_2          beta_3        logit_mu10          mu10       
 Min.   :0.000   Min.   :1.500   Min.   :0.3550   Min.   :0.5878  
 1st Qu.:0.500   1st Qu.:3.000   1st Qu.:0.4000   1st Qu.:0.5987  
 Median :1.000   Median :3.500   Median :0.4900   Median :0.6201  
 Mean   :1.035   Mean   :3.814   Mean   :0.5036   Mean   :0.6229  
 3rd Qu.:1.500   3rd Qu.:4.500   3rd Qu.:0.5800   3rd Qu.:0.6411  
 Max.   :3.000   Max.   :6.500   Max.   :0.7150   Max.   :0.6715  
Code
set.seed(2025)

# Logit link (DAG B): the linear predictors live on the logit scale and
# plogis() keeps mu9 and mu10 inside (0, 1) by construction.
sim_grades <- tibble(
  y8 = rbeta(n = N, shape1 = 0.8 * 10, shape2 = (1 - 0.8) * 10)
) |>
  mutate(
    mu9 = plogis(1.21 + 5 * (y8 - mean(y8))),
    y9 = rbeta(n = N, shape1 = mu9 * 10, shape2 = (1 - mu9) * 10),
    mu10 = plogis(0.94 + 1.2 * (y8 - mean(y8)) + 4.35 * (y9 - mean(y9))),
    y10 = rbeta(n = N, shape1 = mu10 * 10, shape2 = (1 - mu10) * 10)
  )

#write_csv(sim_grades, file = "sim_grades.csv")

# Long format for the overlaid density plot
long_grades <- sim_grades |>
  pivot_longer(
    cols = starts_with("y"),
    names_to = "year",
    values_to = "grade"
  ) |>
  mutate(
    year = factor(
      year,
      levels = c("y8", "y9", "y10"),
      labels = c("year 8", "MTH1W", "MPM2D")
    )
  )

long_grades |>
  ggplot(aes(x = grade, fill = year)) +
  geom_density(alpha = 0.2) +
  geom_vline(xintercept = 0.5, linetype = "dashed") +
  scale_x_continuous(breaks = seq(0, 1, 0.1), labels = scales::percent) +
  labs(
    x = "Simulated Grades",
    y = "Probability Density"
  )

Code
# Pair plot of the grade columns with the custom lower panel
ggpairs(
  select(sim_grades, starts_with("y")),
  lower = list(continuous = lower_fun)
)

Code
# Five-number summaries of every simulated column
summary(sim_grades)
       y8              mu9               y9              mu10       
 Min.   :0.3812   Min.   :0.3053   Min.   :0.1413   Min.   :0.1070  
 1st Qu.:0.6945   1st Qu.:0.6781   1st Qu.:0.6502   1st Qu.:0.6061  
 Median :0.8145   Median :0.7933   Median :0.7844   Median :0.7588  
 Mean   :0.7876   Mean   :0.7529   Mean   :0.7567   Mean   :0.6976  
 3rd Qu.:0.8886   3rd Qu.:0.8475   3rd Qu.:0.8935   3rd Qu.:0.8340  
 Max.   :0.9874   Max.   :0.9011   Max.   :0.9993   Max.   :0.9012  
      y10         
 Min.   :0.03041  
 1st Qu.:0.58798  
 Median :0.73727  
 Mean   :0.70746  
 3rd Qu.:0.87299  
 Max.   :0.99867