Step 5 - Moving Block Bootstrap (MBB) for ITS confidence intervals¶
Goal: understand how MBB turns model residuals into confidence intervals around the counterfactual.
Sections:
- 5a. Inspect residuals and their autocorrelation (the reason we need blocks).
- 5b. Single simulation in slow motion - block resampling made visible.
- 5c. Build the
pred_matrixmanually - watch the fan of counterfactuals accumulate. - 5d. Compare CI methods: quantile vs symmetric_sd.
- 5e. Block length sensitivity - how
block_lengthaffects CI width.
%matplotlib inline
from IPython.display import display
import logging
import math
import warnings
from pathlib import Path
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(name)s - %(message)s",
datefmt="%H:%M:%S",
)
logging.getLogger("cmdstanpy").setLevel(logging.WARNING)
logging.getLogger("its2s").setLevel(logging.WARNING)
OUT_DIR = Path.cwd() / "figures"
OUT_DIR.mkdir(exist_ok=True)
INTERVENTION = "2022-03-15"
TEST_DAYS = 365
HOLDOUT_DAYS = 42
Setup - load data, split, and fit the same Prophet+XGB model as step 4¶
from its2s.data_prep import prepare_splits
from its2s.models.prophet_xgb import ProphetXGBHybridModel
from its2s.settings import get_model_config, load_config
df = pd.read_csv("data/dummy_data.csv", parse_dates=["ds"])
config = load_config()
splits = prepare_splits(df, INTERVENTION, test_days=TEST_DAYS, holdout_days=HOLDOUT_DAYS)
print("Fitting Prophet+XGB on training data ...")
model = ProphetXGBHybridModel(params=get_model_config(config, "prophet_xgb"))
fit_result = model.fit(splits.train_df, target_col="y", date_col="ds")
print("Fit complete.")
Fitting Prophet+XGB on training data ...
Fit complete.
5a. Raw residuals - the raw material for MBB¶
MBB resamples residuals in contiguous blocks to preserve autocorrelation. Non-zero autocorrelation is exactly why we can't use iid resampling.
residuals = fit_result.residuals
fitted_values = fit_result.fitted_values
train_dates = splits.train_df["ds"]
print(f" n residuals : {len(residuals)}")
print(f" mean : {residuals.mean():.4f} (should be near 0)")
print(f" std : {residuals.std():.4f}")
print(f" min / max : {residuals.min():.3f} / {residuals.max():.3f}")
lag1_acf = np.corrcoef(residuals[1:], residuals[:-1])[0, 1]
lag7_acf = np.corrcoef(residuals[7:], residuals[:-7])[0, 1]
lag14_acf = np.corrcoef(residuals[14:], residuals[:-14])[0, 1]
print(f"\n Autocorrelation of residuals:")
print(f" lag-1 : {lag1_acf:.3f}")
print(f" lag-7 : {lag7_acf:.3f}")
print(f" lag-14 : {lag14_acf:.3f}")
n residuals : 1227
mean : -0.0001 (should be near 0)
std : 1.5196
min / max : -5.353 / 4.784
Autocorrelation of residuals:
lag-1 : 0.008
lag-7 : -0.004
lag-14 : -0.073
fig, axes = plt.subplots(2, 1, figsize=(13, 5), sharex=True)
axes[0].plot(train_dates, splits.train_df["y"].values, color="#333333",
linewidth=0.6, alpha=0.8, label="Observed y")
axes[0].plot(train_dates, fitted_values, color="#2166AC",
linewidth=0.9, alpha=0.8, label="Fitted values")
axes[0].set_ylabel("y")
axes[0].legend(fontsize=8)
axes[0].set_title("3a. Fitted values vs observed (training period)", fontsize=10)
axes[1].plot(train_dates, residuals, color="#B2182B", linewidth=0.6, alpha=0.8)
axes[1].axhline(0, color="black", linewidth=0.8, linestyle="--")
axes[1].set_ylabel("Residual")
axes[1].set_title(f"Residuals (std={residuals.std():.3f}, lag-1 ACF={lag1_acf:.3f})", fontsize=10)
axes[1].xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
plt.tight_layout()
plt.savefig(OUT_DIR / "step5a_residuals.png", dpi=150)
display(fig)
5b. Single MBB simulation in slow motion¶
One bootstrap world = a fresh set of residuals built by stitching contiguous blocks, then added back to the fitted values to form a plausible alternative history.
from its2s.bootstrap.mbb import _resample_blocks
BLOCK_LENGTH = 14
sim_rng = np.random.default_rng(0)
resampled = _resample_blocks(residuals, BLOCK_LENGTH, sim_rng)
# Recover block start indices (deterministic with same seed)
viz_rng = np.random.default_rng(0)
n = len(residuals)
n_blocks = math.ceil(n / BLOCK_LENGTH)
max_start = n - BLOCK_LENGTH
starts = viz_rng.integers(0, max_start + 1, size=n_blocks)
print(f" block_length : {BLOCK_LENGTH}")
print(f" n_blocks : {n_blocks} (to cover {n} residuals)")
print(f" First 5 block start indices: {starts[:5].tolist()}")
perturbed_y = fitted_values + resampled
print(f"\n Original training y std : {splits.train_df["y"].values.std():.3f}")
print(f" Perturbed training y std : {perturbed_y.std():.3f}")
block_length : 14 n_blocks : 88 (to cover 1227 residuals) First 5 block start indices: [1032, 773, 620, 327, 373] Original training y std : 7.393 Perturbed training y std : 7.324
fig, axes = plt.subplots(2, 1, figsize=(13, 5), sharex=True)
colors_cycle = ["#4C72B0", "#DD8452", "#55A868", "#C44E52", "#8172B3"]
for b_idx, start in enumerate(starts[:8]):
x_start = train_dates.iloc[start]
x_end = train_dates.iloc[min(start + BLOCK_LENGTH - 1, n - 1)]
axes[0].axvspan(x_start, x_end,
color=colors_cycle[b_idx % len(colors_cycle)],
alpha=0.25)
axes[0].plot(train_dates, residuals, color="#333333", linewidth=0.7, label="Original residuals")
axes[0].plot(train_dates, resampled, color="#B2182B", linewidth=0.7, alpha=0.7, label="Resampled residuals")
axes[0].axhline(0, color="black", linewidth=0.6, linestyle="--")
axes[0].set_ylabel("Residual")
axes[0].legend(fontsize=8)
axes[0].set_title(
f"3b. Original vs resampled residuals (block_length={BLOCK_LENGTH}, shaded = first 8 blocks picked)",
fontsize=10,
)
axes[1].plot(train_dates, splits.train_df["y"].values, color="#333333",
linewidth=0.6, label="Original training y")
axes[1].plot(train_dates, perturbed_y, color="#2166AC",
linewidth=0.6, alpha=0.7, label="Perturbed training y")
axes[1].set_ylabel("y")
axes[1].legend(fontsize=8)
axes[1].set_title("Perturbed training series (fitted + resampled residuals)", fontsize=10)
axes[1].xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
plt.tight_layout()
plt.savefig(OUT_DIR / "step5b_block_resampling.png", dpi=150)
display(fig)
5c. Build pred_matrix manually - the fan of counterfactual worlds¶
Each simulation refits the model on a perturbed training series and forecasts over full_predict_df. Stacking N simulations gives an (n_dates x n_sims) matrix - the empirical distribution of counterfactual trajectories.
from its2s.bootstrap.mbb import _single_mbb_sim
N_SIM = 100
base_rng = np.random.default_rng(42)
sim_seeds = base_rng.integers(0, 2**31, size=N_SIM)
n_dates = len(splits.full_predict_df)
pred_matrix = np.full((n_dates, N_SIM), np.nan)
print(f" Running {N_SIM} simulations ...")
for i in range(N_SIM):
preds = _single_mbb_sim(
i, model, splits.train_df, splits.full_predict_df,
fitted_values, residuals,
BLOCK_LENGTH, "y", "ds", None, int(sim_seeds[i]),
)
pred_matrix[:, i] = preds
print(f" pred_matrix shape : {pred_matrix.shape} (n_dates x n_sims)")
print(f" Spread at day 0 : std = {np.nanstd(pred_matrix[0, :]):.3f}")
mid = min(180, n_dates - 1)
print(f" Spread at day {mid:>3}: std = {np.nanstd(pred_matrix[mid, :]):.3f}")
print(" -> uncertainty typically grows further from the training period")
Running 100 simulations ...
pred_matrix shape : (349, 100) (n_dates x n_sims) Spread at day 0 : std = 0.525 Spread at day 180: std = 0.579 -> uncertainty typically grows further from the training period
pred_dates = pd.to_datetime(splits.full_predict_df["ds"])
intervention_ts = pd.Timestamp(INTERVENTION)
fig, ax = plt.subplots(figsize=(13, 5))
for i in range(N_SIM):
ax.plot(pred_dates, pred_matrix[:, i], color="#B2182B", linewidth=0.3, alpha=0.12)
conf_lo = np.nanpercentile(pred_matrix, 2.5, axis=1)
conf_hi = np.nanpercentile(pred_matrix, 97.5, axis=1)
point_pred = model.predict(splits.full_predict_df, target_col="y", date_col="ds")
ax.fill_between(pred_dates, conf_lo, conf_hi,
color="#B2182B", alpha=0.25, label="95% CI (quantile)")
ax.plot(pred_dates, point_pred.predicted, color="#B2182B", linewidth=1.4,
label="Point prediction")
for part in [splits.train_df, splits.test_df, splits.holdout_df]:
ax.plot(part["ds"], part["y"], color="#333333", linewidth=0.6, alpha=0.7)
ax.plot([], [], color="#333333", linewidth=0.6, label="Observed")
ax.axvline(intervention_ts, color="#4DAF4A", linestyle="--", linewidth=1.2, label="Intervention")
ax.set_xlabel("Date")
ax.set_ylabel("y")
ax.set_title(
f"3c. The bootstrap fan - {N_SIM} counterfactual trajectories (each faint red line = one simulation)",
fontsize=10,
)
ax.legend(fontsize=8, loc="upper left")
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
plt.tight_layout()
plt.savefig(OUT_DIR / "step5c_bootstrap_fan.png", dpi=150)
display(fig)
5d. CI method comparison: quantile vs symmetric_sd¶
- quantile: take the 2.5th/97.5th percentiles of
pred_matrixat each date - adapts to asymmetry. - symmetric_sd: point prediction +/-
z * sd(pred_matrix)- simpler, but assumes symmetry.
If the bootstrap distribution is symmetric, both agree. If skewed, quantile CIs are more honest.
from its2s.bootstrap.base import BaseBootstrap
ci_lo_q, ci_hi_q = BaseBootstrap.calculate_ci(pred_matrix, point_pred.predicted,
method="quantile", level=0.95)
ci_lo_sd, ci_hi_sd = BaseBootstrap.calculate_ci(pred_matrix, point_pred.predicted,
method="symmetric_sd", level=0.95)
width_q = (ci_hi_q - ci_lo_q ).mean()
width_sd = (ci_hi_sd - ci_lo_sd).mean()
print(f" Mean CI width - quantile : {width_q:.3f}")
print(f" Mean CI width - symmetric_sd : {width_sd:.3f}")
holdout_col_idx = (pred_dates >= intervention_ts).argmax()
boot_dist = pred_matrix[holdout_col_idx, :]
skew = float(pd.Series(boot_dist).skew())
print(f" Bootstrap distribution skewness at first holdout date: {skew:.3f}")
Mean CI width - quantile : 2.316 Mean CI width - symmetric_sd : 2.382 Bootstrap distribution skewness at first holdout date: 0.129
fig, ax = plt.subplots(figsize=(13, 4))
ax.plot(pred_dates, point_pred.predicted, color="#333333", linewidth=1.2,
label="Point prediction", zorder=3)
ax.fill_between(pred_dates, ci_lo_q, ci_hi_q,
color="#4C72B0", alpha=0.30, label=f"Quantile CI (width={width_q:.2f})")
ax.fill_between(pred_dates, ci_lo_sd, ci_hi_sd,
color="#DD8452", alpha=0.30, label=f"Symmetric SD CI (width={width_sd:.2f})")
ax.axvline(intervention_ts, color="#4DAF4A", linestyle="--", linewidth=1.2, label="Intervention")
ax.set_xlabel("Date")
ax.set_ylabel("y")
ax.set_title("3d. Quantile CI vs symmetric SD CI", fontsize=10)
ax.legend(fontsize=8, loc="upper left")
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
plt.tight_layout()
plt.savefig(OUT_DIR / "step5d_ci_methods.png", dpi=150)
display(fig)
5e. Block length sensitivity¶
MBB's one tuning knob is block_length:
- too short: residual autocorrelation is broken -> CIs under-estimate uncertainty.
- too long: too few independent blocks -> quantile estimates become noisy.
Note that block_length is applied to the training residuals (~4 years of daily data here), not the 42-day holdout - so even block_length=28 remains well-defined.
BLOCK_LENGTHS = [3, 7, 14, 28]
bl_colors = ["#4C72B0", "#55A868", "#B2182B", "#8172B3"]
ci_results = {}
for bl in BLOCK_LENGTHS:
print(f" Running n_sim={N_SIM} with block_length={bl} ...")
bl_rng = np.random.default_rng(42)
bl_seeds = bl_rng.integers(0, 2**31, size=N_SIM)
bl_matrix = np.full((n_dates, N_SIM), np.nan)
for i in range(N_SIM):
preds = _single_mbb_sim(
i, model, splits.train_df, splits.full_predict_df,
fitted_values, residuals,
bl, "y", "ds", None, int(bl_seeds[i]),
)
bl_matrix[:, i] = preds
lo, hi = BaseBootstrap.calculate_ci(bl_matrix, point_pred.predicted,
method="quantile", level=0.95)
ci_results[bl] = (lo, hi, (hi - lo).mean())
print(f" mean CI width = {ci_results[bl][2]:.3f}")
print("\n Summary:")
print(f" {'block_length':>14} {'mean CI width':>14}")
for bl, (_, _, w) in ci_results.items():
print(f" {bl:>14} {w:>14.3f}")
Running n_sim=100 with block_length=3 ...
mean CI width = 2.359 Running n_sim=100 with block_length=7 ...
mean CI width = 2.357 Running n_sim=100 with block_length=14 ...
mean CI width = 2.316 Running n_sim=100 with block_length=28 ...
mean CI width = 2.337
Summary:
block_length mean CI width
3 2.359
7 2.357
14 2.316
28 2.337
fig, ax = plt.subplots(figsize=(13, 5))
for part in [splits.train_df, splits.test_df, splits.holdout_df]:
ax.plot(part["ds"], part["y"], color="#333333", linewidth=0.5, alpha=0.6)
ax.plot([], [], color="#333333", linewidth=0.6, label="Observed")
ax.plot(pred_dates, point_pred.predicted, color="#333333", linewidth=1.2,
linestyle="--", label="Point prediction", zorder=4)
for bl, color in zip(BLOCK_LENGTHS, bl_colors):
lo, hi, w = ci_results[bl]
ax.fill_between(pred_dates, lo, hi, color=color, alpha=0.20,
label=f"block_length={bl} (mean width={w:.2f})")
ax.axvline(intervention_ts, color="black", linestyle="--", linewidth=1.0, label="Intervention")
ax.set_xlabel("Date")
ax.set_ylabel("y")
ax.set_title("3e. MBB CI width vs block length (all at 95%, n_sim=100)", fontsize=10)
ax.legend(fontsize=8, loc="upper left")
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
plt.tight_layout()
plt.savefig(OUT_DIR / "step5e_block_length_sensitivity.png", dpi=150)
display(fig)
Key takeaways¶
- MBB resamples residuals in contiguous blocks (not iid) to preserve the autocorrelation of the time series.
pred_matrixis(n_dates x n_sims)- each column is one counterfactual trajectory from a bootstrap world.- CIs are percentiles of
pred_matrixacross simulations at each date - no parametric assumptions needed. quantileis preferable tosymmetric_sdwhen the bootstrap distribution is skewed.block_lengthis MBB's one tuning knob (see step 3 for the broadertune_model()framework that tunes the model hyperparameters;block_lengthitself is tuned separately by sensitivity analysis here in 5e); 14 (two weeks) is a sensible default for daily data with weekly seasonality.