# bootstrap_empirical_null_eta_only.py
# Parametric-bootstrap empirical null for manager effects (eta only).


import json
import numpy as np
import pandas as pd
import statsmodels.api as sm
from joblib import Parallel, delayed
import multiprocessing as mp

# Optional progress bar
try:
    from tqdm import tqdm
    from tqdm_joblib import tqdm_joblib
    TQDM_OK = True
except Exception:
    TQDM_OK = False


# -------------------
# CONFIG
# -------------------
INFILE   = r"C:\Users\dmkah\OneDrive\Documents\x5\mwar_bs_ready_11.24.csv"
OUT_BASE = r"C:\Users\dmkah\OneDrive\Documents\x5"
OUT_LONG = OUT_BASE + r"\bootstrap_en_eta_long.csv"
OUT_META = OUT_BASE + r"\bootstrap_en_meta.json"

B        = 20000       # bootstrap reps
SEED     = 20250925
MAXITER  = 1000
CLIP     = 1e-12
N_JOBS   = 16         # set to an int; -1 to use all cores

def _simulate_one(rep_idx, games_i, p0, Xf_mat, weights, L0, g_by_mgr, MAXITER, child_seed):
    """    One H0 replicate:
      - simulate Binomial(games_i, p0)
      - fit full model (var_weights=games)
      - compute per-manager eta at z=0 via L0 @ beta
      - center to games-weighted mean zero
    Returns: (rep_idx, eta_b) where eta_b has length = n_managers
    """
    rng = np.random.default_rng(child_seed)
    w_b = rng.binomial(n=games_i, p=p0)
    y_b = w_b / games_i

    try:
        mdl_b = sm.GLM(y_b, Xf_mat, family=sm.families.Binomial(), var_weights=weights)
        res_b = mdl_b.fit(maxiter=MAXITER)
        beta_b = np.asarray(res_b.params, dtype="float64")
        eta_b  = L0 @ beta_b
        # games-weighted mean-zero centering across managers
        mean_w = np.average(eta_b, weights=g_by_mgr)
        eta_b  = eta_b - mean_w
        return rep_idx, eta_b
    except Exception:
        # on non-convergence, return NaNs (same length)
        m = L0.shape[0]
        return rep_idx, np.full(m, np.nan, dtype="float64")

def main():
    # 1) Load
    df = pd.read_csv(INFILE)
    need = {"mgrID","mgr_n","yearID","w","games","z_wpg"}
    miss = need - set(df.columns)
    if miss:
        raise ValueError(f"Missing columns: {miss}")

    for c in ["mgr_n","w","games","z_wpg"]:
        df[c] = pd.to_numeric(df[c], errors="raise")

    if (df["games"] <= 0).any():
        raise ValueError("All rows must have games > 0 for Binomial GLM with proportions.")

    y_rate  = (df["w"] / df["games"]).astype("float64")
    weights = df["games"].astype("float64").to_numpy()
    games_i = df["games"].to_numpy().astype(int)

    # Manager levels and baseline
    mgr_levels   = np.array(sorted(df["mgr_n"].unique().astype(int)), dtype=int)
    baseline_mgr = int(df["mgr_n"].min())

    # 2) Build designs
    # Full model: const + z_wpg + manager dummies (baseline omitted in params)
    mgr_dum  = pd.get_dummies(df["mgr_n"].astype(int), prefix="mgr", drop_first=False)
    base_col = f"mgr_{baseline_mgr}"
    if base_col not in mgr_dum.columns:
        base_col = sorted(mgr_dum.columns)[0]
    mgr_dum = mgr_dum.drop(columns=[base_col])

    X_full = pd.concat(
        [pd.Series(1.0, index=df.index, name="const"),
         df[["z_wpg"]].astype("float64"),
         mgr_dum.astype("float64")],
        axis=1
    )
    Xf_cols = list(X_full.columns)
    Xf_mat  = X_full.to_numpy(dtype="float64", copy=False)

    # Null model: const + z_wpg (no manager FEs)
    X_null = pd.concat(
        [pd.Series(1.0, index=df.index, name="const"),
         df[["z_wpg"]].astype("float64")],
        axis=1
    )
    Xn_mat = X_null.to_numpy(dtype="float64", copy=False)

    # L0 maps full-model params to "eta at z=0" for each manager
    m = len(mgr_levels)
    p = Xf_mat.shape[1]
    L0 = np.zeros((m, p), dtype="float64")
    # intercept contributes 1 for everyone at z=0
    L0[:, Xf_cols.index("const")] = 1.0
    # add each manager's dummy (baseline has no column)
    name_to_col = {name: j for j, name in enumerate(Xf_cols)}
    for i, mgr in enumerate(mgr_levels):
        name = f"mgr_{mgr}"
        j = name_to_col.get(name, None)
        if j is None:
            # baseline gets no extra column; that's fine
            continue
        L0[i, j] = 1.0

    # Precompute total games by manager, aligned to mgr_levels
    g_by_mgr = (df.groupby("mgr_n")["games"].sum()
                  .reindex(mgr_levels)
                  .to_numpy(dtype="float64"))

    # 3) Fit null to get p0; clip for stability
    mdl0 = sm.GLM(y_rate, Xn_mat, family=sm.families.Binomial(), var_weights=weights)
    fit0 = mdl0.fit(maxiter=MAXITER)
    beta0 = np.asarray(fit0.params, dtype="float64")
    eta0  = Xn_mat @ beta0
    p0    = 1.0 / (1.0 + np.exp(-eta0))
    p0    = np.clip(p0, CLIP, 1.0 - CLIP)

    # 4) Child seeds
    ss = np.random.SeedSequence(SEED)
    child_seeds = [int(s.generate_state(1)[0]) for s in ss.spawn(B)]

    # 5) Parallel bootstrap
    print(f"Running parametric bootstrap (eta only) with B={B}, jobs={N_JOBS if N_JOBS != -1 else mp.cpu_count()}...")
    if TQDM_OK:
        pbar = tqdm(total=B, desc="Bootstrap EN")
        with tqdm_joblib(pbar):
            results = Parallel(n_jobs=N_JOBS, backend="loky", verbose=0)(
                delayed(_simulate_one)(
                    b, games_i, p0, Xf_mat, weights, L0, g_by_mgr, MAXITER, child_seeds[b]
                ) for b in iterator
            )
        pbar.close()
    else:
        results = Parallel(n_jobs=N_JOBS, backend="loky", verbose=5)(
            delayed(_simulate_one)(
                b, games_i, p0, Xf_mat, weights, L0, g_by_mgr, MAXITER, child_seeds[b]
            ) for b in iterator
        )

    # 6) Pack long frame: (rep, mgr_n, eta_b)
    reps, etas = zip(*results)
    rep_col = np.repeat(np.array(reps, dtype=np.int32), m)
    eta_col = np.concatenate(etas)
    mgr_col = np.tile(mgr_levels, B)

    long_df = pd.DataFrame({"rep": rep_col, "mgr_n": mgr_col, "eta_b": eta_col})
    long_df = long_df.merge(df[["mgrID","mgr_n"]].drop_duplicates(), on="mgr_n", how="left")

    # 7) Save
    long_df.to_csv(OUT_LONG, index=False)
    meta = {
        "B": B,
        "seed": SEED,
        "maxiter": MAXITER,
        "clip": CLIP,
        "n_obs": int(df.shape[0]),
        "n_mgr": int(m),
        "baseline_mgr": int(baseline_mgr),
        "design_full_p": int(p),
        "missing_dummy_levels": [],
        "null_spec": "intercept + z_wpg (no manager FEs); simulate Binomial(games_i, p0_i)",
        "stat": "eta_b = L0 @ beta_full, then games-weighted mean-zero by manager each rep",
        "files": {"long_csv": OUT_LONG}
    }
    with open(OUT_META, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    print(f"Saved: {OUT_LONG}")
    print(f"Saved: {OUT_META}")

if __name__ == "__main__":
    mp.freeze_support()
    main()
