Chapter 3 MR-BRT
3.1 Prepare simulated data
library(dplyr)
library(mrbrt002, lib.loc = path_to_r_version4_packages)
set.seed(1)
<- 10
k_studies <- 5
n_per_study # k_studies <- 40
# n_per_study <- 40
<- 4
tau_1 <- 1
sigma_1 <- 0.6
tau_2 <- 0.2
sigma_2
<- data.frame(study_id = as.factor(1:k_studies)) %>%
df_sim_study mutate(
study_effect1 = rnorm(n = k_studies, mean = 0, sd = tau_1),
study_effect2 = rnorm(n = k_studies, mean = 0, sd = tau_2)
# study_colors = brewer.pal(n = k_studies, "Spectral")
)
<- do.call("rbind", lapply(1:nrow(df_sim_study), function(i) {
df_sim1 rep(i, n_per_study), ] })) %>%
df_sim_study[mutate(
x1 = runif(n = nrow(.), min = 0, max = 10),
y1 = 0.9*x1 + study_effect1 + rnorm(nrow(.), mean = 0, sd = sigma_1),
y1_se = sigma_1,
y2_true = 2 * sin(0.43*x1-2.9),
y2 = y2_true - min(y2_true) + study_effect2 + rnorm(nrow(.), mean = 0, sd = sigma_2),
y2_se = sigma_2,
is_outlier = FALSE) %>%
arrange(x1)
<- df_sim1 %>%
df_sim2 rbind(., .[(nrow(.)-7):(nrow(.)-4), ] %>% mutate(y1=y1-18, y2=y2-4, is_outlier = TRUE))
# function for plotting uncertainty intervals
<- function(dat, x_var, lo_var, hi_var, color = "darkblue", opacity = 0.2) {
add_ui polygon(
x = c(dat[, x_var], rev(dat[, x_var])),
y = c(dat[, lo_var], rev(dat[, hi_var])),
col = adjustcolor(col = color, alpha.f = opacity), border = FALSE
) }
3.2 Fitting a standard mixed effects model
<- MRData()
dat1 $load_df(
dat1data = df_sim1, col_obs = "y1", col_obs_se = "y1_se",
col_covs = list("x1"), col_study_id = "study_id" )
<- MRBRT(
mod1 data = dat1,
cov_models = list(
LinearCovModel("intercept", use_re = TRUE),
LinearCovModel("x1") ) )
$fit_model(inner_print_level = 5L, inner_max_iter = 1000L)
mod1
<- data.frame(x1 = seq(0, 10, by = 0.1))
df_pred1
<- MRData()
dat_pred1
$load_df(
dat_pred1data = df_pred1,
col_covs=list('x1')
)
To save a model object, use the py_save_object()
function. To load a saved model object, use py_load_object
.
# py_save_object(object = mod1, filename = file.path(path_to_misc_outputs, "mod1.pkl"), pickle = "dill")
# mod1 <- py_load_object(filename = file.path(path_to_misc_outputs, "mod1.pkl"), pickle = "dill")
3.2.1 – Point prediction only
If you don’t need uncertainty estimates, this option is the fastest.
$pred0 <- mod1$predict(data = dat_pred1)
df_pred1with(df_sim1, plot(x1, y1))
with(df_pred1, lines(x1, pred0))
### – Uncertainty from fixed effects only (using asymptotic statistics) {#section1_2}
If we can assume that the posterior distributions of the parameters are Gaussian (i.e. no hard constraints specified in the model), we can obtain samples from the fixed effects using a fast algorithm.
<- 1000L
n_samples1
<- core$other_sampling$sample_simple_lme_beta(
samples1 sample_size = n_samples1,
model = mod1
)
<- mod1$create_draws(
draws1 data = dat_pred1,
beta_samples = samples1,
gamma_samples = matrix(rep(0, n_samples1), ncol = 1),
random_study = FALSE )
$pred1 <- mod1$predict(data = dat_pred1)
df_pred1$pred1_lo <- apply(draws1, 1, function(x) quantile(x, 0.025))
df_pred1$pred1_hi <- apply(draws1, 1, function(x) quantile(x, 0.975))
df_pred1with(df_sim1, plot(x1, y1))
with(df_pred1, lines(x1, pred1))
add_ui(df_pred1, "x1", "pred1_lo", "pred1_hi")
3.2.2 – Uncertainty from fixed effects only (using fit-refit)
We use the fit-refit method for obtaining samples from the parameters when there are uniform priors (a.k.a. constraints) specified in the model, which can make the posterior distributions of parameters not Gaussian. This code using mod1$sample_soln
gives the same result as above.
<- 1000L
n_samples2
<- mod1$sample_soln(sample_size = n_samples2)
samples2_fitrefit
<- mod1$create_draws(
draws2_fitrefit data = dat_pred1,
beta_samples = samples2_fitrefit[[1]],
gamma_samples = samples2_fitrefit[[2]],
random_study = FALSE )
$pred2 <- mod1$predict(data = dat_pred1)
df_pred1$pred2_lo <- apply(draws2_fitrefit, 1, function(x) quantile(x, 0.025))
df_pred1$pred2_hi <- apply(draws2_fitrefit, 1, function(x) quantile(x, 0.975))
df_pred1with(df_sim1, plot(x1, y1))
with(df_pred1, lines(x1, pred2))
add_ui(df_pred1, "x1", "pred2_lo", "pred2_hi")
3.2.3 – Uncertainty from fixed effects and between-study heterogeneity
When we require samples of gamma (uncertainty in the parameter that estimates between-study heterogeneity), we need to use the sample_soln
approach.
<- 1000L
n_samples3 <- mod1$sample_soln(sample_size = n_samples3)
samples3
<- mod1$create_draws(
draws3 data = dat_pred1,
beta_samples = samples3[[1]],
gamma_samples = samples3[[2]],
random_study = TRUE )
# if a single value of gamma is sufficient (not sampling from the uncertainty of gamma),
# you can pass in the point estimate for gamma like this:
#
# draws3 <- mod1$create_draws(
# data = dat_pred1,
# beta_samples = samples3[[1]],
# gamma_samples = matrix(rep(mod1$gamma_soln, n_samples3), ncol = 1),
# random_study = TRUE )
$pred3 <- mod1$predict(dat_pred1)
df_pred1$pred3_lo <- apply(draws3, 1, function(x) quantile(x, 0.025))
df_pred1$pred3_hi <- apply(draws3, 1, function(x) quantile(x, 0.975))
df_pred1with(df_sim1, plot(x1, y1))
with(df_pred1, lines(x1, pred3))
add_ui(df_pred1, "x1", "pred3_lo", "pred3_hi")
3.2.4 – Predicting out on the random effects
To incorporate the estimated random effects into point predictions, specify col_study_id
in the data object you pass to the predict()
function, and set predict_for_study = TRUE
and sort_by_data_id = TRUE
in the predict()
function.
Important! If sort_by_data_id = TRUE
is not specified, the predictions might not line up with the rows of the prediction data frame.
# 1. convert prediction frame into a MRData() object
# 2. extract the (potentially) sorted data frame with the to_df() function
# 3. add the predictions to this new data frame
<- MRData()
dat_sim1_tmp $load_df(data = df_sim1, col_covs = list("x1"), col_study_id = "study_id")
dat_sim1_tmp
$pred_re <- mod1$predict(
df_sim1data = dat_sim1_tmp,
predict_for_study = TRUE,
sort_by_data_id = TRUE
)
with(df_sim1, plot(x1, y1))
for (id in unique(df_sim1$study_id)) {
with(filter(df_sim1, study_id == id), lines(x1, y1, lty = 2))
}
with(df_sim1, plot(x1, y1))
for (id in unique(df_sim1$study_id)) {
# id <- 4 # dev
<- filter(arrange(df_sim1, x1), study_id == id)
df_tmp with(df_tmp, lines(x1, pred_re, lty = 1))
}
# to get the random effects by group...
<- mod1$re_soln
re <- data.frame(level = names(mod1$re_soln), re = unlist(mod1$re_soln))
df_re
# with(df_sim1, plot(x1, y1))
3.3 Priors
3.3.0.1 - Setting a prior on gamma
<- MRBRT(
mod3 data = dat1,
cov_models = list(
LinearCovModel("intercept", use_re = TRUE, prior_gamma_gaussian = array(c(0, 0.02))),
LinearCovModel("x1") ) )
$fit_model(inner_print_level = 5L, inner_max_iter = 1000L)
mod3
<- MRData()
dat_sim1_tmp $load_df(data = df_sim1, col_covs = list("x1"), col_study_id = "study_id")
dat_sim1_tmp$pred_re_prior <- mod3$predict(data = dat_sim1_tmp, predict_for_study = TRUE, sort_by_data_id = TRUE)
df_sim1
with(df_sim1, plot(x1, y1))
for (id in unique(df_sim1$study_id)) {
<- filter(arrange(df_sim1, x1), study_id == id)
df_tmp with(df_tmp, lines(x1, pred_re_prior, lty = 1))
}
3.3.1 - Setting a prior on beta
With great power comes great responsibility…
<- MRBRT(
mod4 data = dat1,
cov_models = list(
LinearCovModel("intercept", use_re = TRUE),
LinearCovModel("x1", prior_beta_uniform = array(c(-3.1, -2.9))) ) )
$fit_model(inner_print_level = 5L, inner_max_iter = 1000L)
mod4$pred4 <- mod4$predict(data = dat_pred1)
df_pred1with(df_sim1, plot(x1, y1))
with(df_pred1, lines(x1, pred4))
3.4 Removing the effects of outliers with trimming
<- MRData()
dat2 $load_df(
dat2data = df_sim2, col_obs = "y1", col_obs_se = "y1_se",
col_covs = list("x1"), col_study_id = "study_id" )
<- data.frame(x1 = seq(0, 10, by = 2))
df_pred2 <- MRData()
dat_pred2
$load_df(
dat_pred2data = df_pred2,
col_covs=list('x1')
)
<- MRBRT(
mod2 data = dat2,
cov_models = list(
LinearCovModel("intercept", use_re = TRUE),
LinearCovModel("x1") ),
inlier_pct = 0.9)
$fit_model(inner_print_level = 5L, inner_max_iter = 1000L)
mod2$pred4 <- mod2$predict(data = dat_pred2)
df_pred2
# get a data frame with estimated weights
<- cbind(mod2$data$to_df(), data.frame(w = mod2$w_soln))
df_mod2
with(df_mod2, plot(
x = x1, y = obs,
col = ifelse(w == 1, "black", "red"),
pch = ifelse(w == 1, 1, 16)
))with(df_pred2, lines(x1, pred4))
3.5 Splines
3.5.1 - Setting priors and shape constraints on splines
spline_l_linear
andspline_r_linear
make the left and right tail segments linear, respectivelyprior_spline_monotonicity
forces the spline to be “increasing” or “decreasing”prior_spline_convexity
makes the spline “convex” or “concave”prior_spline_maxder_gaussian = array(c(0, 0.03))
has the effect of putting a dampening prior on the spline, making the highest-order derivative of each segment subject to a \(N(0, 0.03^2)\) Gaussian prior. This makes a cubic spline more quadratic, quadratic more linear, etc.When
spline_l_linear
andspline_r_linear
are set toTRUE
, the value given toprior_spline_maxder_gaussian
for those segments is a direct prior on the slope of the segment. For example,prior_spline_maxder_gaussian = rbind(c(0,0,0,0,-1), c(Inf,Inf,Inf,Inf,0.0001))
makes the slope of the right tail very close to-1
whenspline_r_linear = TRUE
. Note that the function takes a matrix as an input, where the first row is a vector a prior means and the second row is a vector of prior standard deviations. The number of columns must match the number of knots minus 1.
<- MRData()
dat3 $load_df(
dat3data = df_sim1, col_obs = "y2", col_obs_se = "y2_se",
col_covs = list("x1"), col_study_id = "study_id" )
<- MRBRT(
mod5 data = dat3,
cov_models = list(
LinearCovModel("intercept", use_re = TRUE),
LinearCovModel(
alt_cov = "x1",
use_spline = TRUE,
# spline_knots = array(c(0, 0.25, 0.5, 0.75, 1)),
spline_knots = array(seq(0, 1, by = 0.2)),
spline_degree = 2L,
spline_knots_type = 'domain',
spline_r_linear = TRUE,
spline_l_linear = FALSE
# prior_spline_monotonicity = 'increasing'
# prior_spline_convexity = "convex"
# prior_spline_maxder_gaussian = array(c(0, 0.01))
# prior_spline_maxder_gaussian = rbind(c(0,0,0,0,-1), c(Inf,Inf,Inf,Inf,0.0001))
)
)
)
$fit_model(inner_print_level = 5L, inner_max_iter = 1000L)
mod5
<- data.frame(x1 = seq(0, 10, by = 0.1))
df_pred3 <- MRData()
dat_pred3 $load_df(
dat_pred3data = df_pred3,
col_covs=list('x1')
)
$pred5 <- mod5$predict(data = dat_pred3)
df_pred3with(df_sim1, plot(x1, y2))
with(df_pred3, lines(x1, pred5))
# # visualize knot locations
for (k in mod5$cov_models[[2]]$spline_knots) abline(v = k, col = "gray")
3.5.2 - Ensemble splines
See this link about arguments for the sample_knots()
function (or do py_help(utils$sample_knots)
in an interactive session): https://github.com/ihmeuw-msca/mrtool/blob/740f605264732f0faa604614a987fc38c7d88f83/src/mrtool/core/utils.py#L293.
<- MRData()
dat4 $load_df(
dat4data = df_sim1, col_obs = "y2", col_obs_se = "y2_se",
col_covs = list("x1"), col_study_id = "study_id" )
<- utils$sample_knots(
knots_samples num_intervals = 3L,
knot_bounds = rbind(c(0.0, 0.4), c(0.6, 1.0)),
num_samples = 20L
)
<- LinearCovModel(
ensemble_cov_model1 alt_cov = "x1",
use_spline = TRUE,
spline_knots = array(c(0, 0.33, 0.66, 1)),
spline_degree = 3L,
spline_knots_type = 'frequency'
)
<- MRBeRT(
mod5a data = dat3,
ensemble_cov_model = ensemble_cov_model1,
ensemble_knots = knots_samples,
cov_models = list(
LinearCovModel("intercept", use_re = TRUE)
)
)
$fit_model(inner_print_level = 5L, inner_max_iter = 1000L)
mod5a
<- data.frame(x1 = seq(0, 10, by = 0.1))
df_pred3 <- MRData()
dat_pred3 $load_df(
dat_pred3data = df_pred3,
col_covs=list('x1')
)
# this predicts a weighted average from the spline ensemble
$pred5a <- mod5a$predict(data = dat_pred3)
df_pred3
with(df_sim1, plot(x1, y2))
with(df_pred3, lines(x1, pred5a))
# # view all splines in the ensemble
#
# pred5a <- mod5a$predict(data = dat_pred3, return_avg = FALSE)
#
# with(df_sim1, plot(x1, y2))
# for (i in 1:nrow(pred5a)) {
# tmp <- pred5a[i, ]
# with(df_pred3, lines(x1, tmp))
# }
3.6 The ratio model
This model is useful when a covariate is represented as a range. For example, often population-level data are indexed by age group. We can represent this in the model as alt_cov = c("age_start", "age_end")
. Or if a relative risk estimate corresponds to a comparison of two BMI ranges, we specify the model as alt_cov = c("b_0", "b_1")
and ref_cov = c("a_0", "a_1")
. When predicting out, we need to set a constant reference level and varying alternative level.
For LogCovModel
, our regression model can be represented as
\(y = ln(1 + X_{alt}\beta) - ln(1 + X_{ref}\beta)\),
where \(X_{alt}\) and \(X_{ref}\) are the design matrix for the alternative and reference groups. They could be either covariates or the spline design matrices from the covariates.
When we want to include the random effects, we always assume a random “slope,”
\(y = (\frac{ln(1 + X_{alt}\beta) - ln(1 + X_{ref}\beta)}{X_{alt}-X_{ref}} + u)(X_{alt}-X_{ref})\)
Ratio models do not need an intercept specified, as we are considering relative risk where measurement is invariant with the scaling of the curve.
In the ratio model, if we set use_re=TRUE
, we automatically turn on the use_re_mid_point
, since for log relative risk, we always consider the random effects as the random average slope.
We use spline to parameterize the relative risk curve in the linear space, and all the shape constraints are in linear space.
# creating simulated data with exposure ranges
set.seed(1)
<- do.call("rbind", lapply(1:nrow(df_sim1), function(i) {
df_sim3 # i <- 1 # dev
<- 0.1
x_offset
<- sample(1:nrow(df_sim1), 1)
row_j <- df_sim1[i, ]
df_i <- df_sim1[row_j, ]
df_j
<- data.frame(
df_k row_i = i,
row_j = row_j,
a_0 = df_i$x1 - x_offset, a_1 = df_i$x1 + x_offset,
b_0 = df_j$x1 - x_offset, b_1 = df_j$x1 + x_offset,
log_rr = df_j$y2 - df_i$y2,
log_rr_se = 1
)
return(df_k)
}))
head(df_sim3)
<- MRData()
dat1 $load_df(
dat1data = df_sim3, col_obs = "log_rr", col_obs_se = "log_rr_se",
col_covs = list("a_0", "a_1", "b_0", "b_1") )
<- MRBRT(
mod6 data = dat1,
cov_models = list(
LogCovModel(
alt_cov = c("b_0", "b_1"),
ref_cov = c("a_0", "a_1"),
use_re = TRUE,
use_spline = TRUE,
# spline_knots = array(seq(0, 1, by = 0.2)),
spline_knots = array(c(0, 0.25, 0.5, 0.75, 1)),
spline_degree = 3L,
spline_knots_type = 'frequency',
name = "exposure"
)
)
)
$fit_model(inner_print_level = 5L, inner_max_iter = 1000L)
mod6
# df_pred6 <- data.frame(exposure = seq(0, 10, by = 2), study_id = "4")
<- data.frame(
df_pred6 b_0 = seq(0, 10, by = 0.1),
b_1 = seq(0, 10, by = 0.1) ) %>%
mutate(a_0 = 0, a_1 = 0 )
<- MRData()
dat_pred6
$load_df(
dat_pred6data = df_pred6,
col_covs=list('a_0', 'a_1', 'b_0', 'b_1')
# col_covs=list("exposure")
)
<- mod6$predict(dat_pred6)
pred6
with(df_sim3, plot(b_0, log_rr))
with(df_pred6, lines(b_0, pred6))
3.7 Automated covariate selection
# create simulated data
set.seed(123)
<- c(0, 0, 0, 1.2, 0, 1.5, 0)
beta <- rep(0, 7)
gamma <- 100
num_obs = 0.1
obs_se <- c("A", "B", "C")
studies
<- cbind(
covs rep(1, num_obs),
do.call("cbind", lapply(1:(length(beta)-1), function(i) rnorm(num_obs)))
)
<- covs %*% beta + rnorm(n = num_obs, mean = 0, sd = obs_se)
obs <- as.data.frame(do.call("cbind", list(obs, rep(obs_se, num_obs), covs))) %>%
df_sim4 select(-V3)
<- paste0("cov", 1:(length(beta)-1))
cov_names names(df_sim4) <- c("obs", "obs_se", cov_names)
$study_id <- sample(studies, num_obs, replace = TRUE)
df_sim4
<- MRData()
mrdata
$load_df(
mrdatadata = df_sim4,
col_obs = 'obs',
col_obs_se = 'obs_se',
col_study_id = 'study_id',
col_covs = as.list(cov_names)
)
<- cov_names[!cov_names == "cov1"]
candidate_covs
<- CovFinder(
covfinder data = mrdata,
covs = as.list(candidate_covs),
pre_selected_covs = list("cov1"),
normalized_covs = FALSE,
num_samples = 1000L,
power_range = list(-4, 4),
power_step_size = 1,
laplace_threshold = 1e-5
)
$select_covs(verbose = TRUE)
covfinder
$selected_covs covfinder
## [1] "cov1" "cov3" "cov5"
3.8 Calculating an evidence score
For documentation on the Scorelator
function, see py_help(evidence_score$Scorelator)
or go to https://github.com/ihmeuw-msca/MRTool/blob/5078ded1f22a99fd733ffd15de258cc407a8f4fd/src/mrtool/evidence_score/scorelator.py.
The code below creates a dataset for simulating the effect of a dichotomous risk factor, fits a MR-BRT model on it, and calculates an evidence score. Note that continuous risk factors will want to specify score_type = "area"
in the evidence_score$Scorelator()
function.
# get simulated data, run model and get draws
# for passing to the Scorelator function
<- read.csv("example_data/linear_sim_evidence_score_0_0.5.csv")
df_sim7_in
= round(quantile(df_sim7_in$b_1, probs = 0.3), 2)
cutoff
<- df_sim7_in %>%
df_sim7 mutate(
a_mid = (a_0 + a_1) / 2,
b_mid = (b_0 + b_1) / 2,
ref = ifelse(a_mid <= cutoff, 0, 1),
alt = ifelse(b_mid <= cutoff, 0, 1) ) %>%
filter(ref == 0 & alt == 1)
<- MRData()
dat_sim7
$load_df(
dat_sim7data = df_sim7,
col_obs = "rr_log",
col_obs_se = "rr_log_se",
col_covs = list("ref", "alt"),
col_study_id = "study_id"
)
<- list(
cov_models7 LinearCovModel(
alt_cov = "intercept",
use_re = TRUE,
prior_beta_uniform = array(c(0.0, Inf))
)
)
<- MRBRT(
mod7 data = dat_sim7,
cov_models = cov_models7
)$fit_model(inner_max_iter = 1000L, inner_print_level = 5L)
mod7
<- data.frame(intercept = 1, ref = 0, alt = 1)
df_pred7 <- MRData()
dat_pred7 $load_df(data = df_pred7, col_covs = list("ref", "alt"))
dat_pred7
<- mod7$sample_soln(sample_size = 1000L)
samples7 <- samples7[[1]]
beta_samples7 <- samples7[[2]]
gamma_samples7
<- mod7$create_draws(
y_draws7 data = dat_pred7,
beta_samples = beta_samples7,
gamma_samples = gamma_samples7,
random_study = TRUE
)
# using the scorelator
# need to run 'repl_python()' to open an interactive Python interpreter,
# then immediately type 'exit' to get back to the R interpreter
# -- this helps to load a required Python package
repl_python()
# -- type 'exit' or hit escape
<- import("mrtool.evidence_score.scorelator")
evidence_score <- evidence_score$Scorelator(
scorelator7 ln_rr_draws = t(y_draws7),
exposures = array(1),
score_type = "point"
)
<- scorelator7$get_evidence_score(path_to_diagnostic=file.path(path_to_misc_outputs, "tmp_score.pdf")) score
3.9 Frequently asked questions
- How do I access the help documentation?
In an interactive R session, type py_help(...)
where ...
is the name of the function. This will show the Python docstring with information about required/optional arguments and a description of what they mean.
- Where should I go with questions?
The #friends_of_mr_brt Slack channel is a good place to start for troubleshooting questions. It’s maintained by the Mathematical Sciences and Computational Algorithms (MSCA) team. If something requires a more in-depth discussion, feel free to sign up for a slot at office hours (#mscm-office-hours Slack channel).
- What does MR-BRT stand for?
Meta-regression with Bayesian priors, Regularization and Trimming