import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Set font to match Stata
plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica', 'DejaVu Sans']
plt.rcParams['font.family'] = 'sans-serif'

# Load posterior draws
print("Loading posterior draws...")
df = pd.read_csv(r'[DF_6]')

# Convert to wins per 162 games, centered on average manager
df['prob_raw'] = 1 / (1 + np.exp(-df['u']))
mean_prob = df['prob_raw'].mean()  # Empirical mean across all draws
df['w162'] = (df['prob_raw'] - mean_prob) * 162

print(f"Loaded {len(df)} posterior draws for {df['mgrID'].nunique()} managers")
print()

# ========== PER-MANAGER TAIL PROBABILITIES ==========
print("Calculating tail probabilities for each manager...")

manager_stats = []
for mgr in sorted(df['mgrID'].unique()):
    mgr_draws = df[df['mgrID'] == mgr]['w162'].values
    
    # +/- 2.0 thresholds
    p_ge_pos2 = np.mean(mgr_draws >= 2.0)
    p_le_neg2 = np.mean(mgr_draws <= -2.0)
    p_in_rope2 = np.mean((mgr_draws >= -2.0) & (mgr_draws <= 2.0))
    
    # +/- 1.5 thresholds
    p_ge_pos1_5 = np.mean(mgr_draws >= 1.5)
    p_le_neg1_5 = np.mean(mgr_draws <= -1.5)
    p_in_rope1_5 = np.mean((mgr_draws >= -1.5) & (mgr_draws <= 1.5))
    
    manager_stats.append({
        'mgrID': mgr,
        'p_ge_pos2': p_ge_pos2,
        'p_le_neg2': p_le_neg2,
        'p_in_rope2': p_in_rope2,
        'p_ge_pos1_5': p_ge_pos1_5,
        'p_le_neg1_5': p_le_neg1_5,
        'p_in_rope1_5': p_in_rope1_5,
        'mean_w162': mgr_draws.mean(),
        'median_w162': np.median(mgr_draws)
    })

manager_df = pd.DataFrame(manager_stats)

# Save per-manager statistics
manager_stats_path = r'C:\Users\dmk38\Documents\x5\manager_tail_probabilities.csv'
manager_df.to_csv(manager_stats_path, index=False)
print(f"✓ Manager tail probabilities saved to: {manager_stats_path}")
print()

# Show top managers by P(>= +2)
print("Top 10 managers by P(>= +2.0 wins/162):")
top_pos2 = manager_df.nlargest(10, 'p_ge_pos2')
print(top_pos2[['mgrID', 'p_ge_pos2', 'p_ge_pos1_5', 'mean_w162']].to_string(index=False))
print()

# Show top managers by P(<= -2)
print("Top 10 managers by P(<= -2.0 wins/162):")
top_neg2 = manager_df.nlargest(10, 'p_le_neg2')
print(top_neg2[['mgrID', 'p_le_neg2', 'p_le_neg1_5', 'mean_w162']].to_string(index=False))
print()

# ========== POPULATION-LEVEL ANALYSIS ==========

# All draws pooled together represent the population distribution
all_draws = df['w162'].values

# Calculate population incidence for both thresholds
print("Population incidence (±2.0 threshold):")
p_ge_pos2 = np.mean(all_draws >= 2.0)
p_le_neg2 = np.mean(all_draws <= -2.0)
p_in_rope2 = np.mean((all_draws >= -2.0) & (all_draws <= 2.0))

N_managers = df['mgrID'].nunique()
expected_pos2 = p_ge_pos2 * N_managers
expected_neg2 = p_le_neg2 * N_managers

print(f"  >= +2.0 wins/162: {p_ge_pos2:.4f}   (E[count]={expected_pos2:.2f}, N={N_managers})")
print(f"  <= -2.0 wins/162: {p_le_neg2:.4f}   (E[count]={expected_neg2:.2f}, N={N_managers})")
print(f"  in ROPE [-2,+2]:  {p_in_rope2:.4f}   (E[count]={p_in_rope2*N_managers:.2f}, N={N_managers})")
print()

print("Population incidence (±1.5 threshold):")
p_ge_pos1_5 = np.mean(all_draws >= 1.5)
p_le_neg1_5 = np.mean(all_draws <= -1.5)
p_in_rope1_5 = np.mean((all_draws >= -1.5) & (all_draws <= 1.5))

expected_pos1_5 = p_ge_pos1_5 * N_managers
expected_neg1_5 = p_le_neg1_5 * N_managers

print(f"  >= +1.5 wins/162: {p_ge_pos1_5:.4f}   (E[count]={expected_pos1_5:.2f}, N={N_managers})")
print(f"  <= -1.5 wins/162: {p_le_neg1_5:.4f}   (E[count]={expected_neg1_5:.2f}, N={N_managers})")
print(f"  in ROPE [-1.5,+1.5]: {p_in_rope1_5:.4f}   (E[count]={p_in_rope1_5*N_managers:.2f}, N={N_managers})")
print()

# 2) Calculate 95% HDI for population
def calculate_hdi(draws, credible_mass=0.95):
    sorted_draws = np.sort(draws)
    n = len(sorted_draws)
    interval_size = int(np.ceil(credible_mass * n))
    n_intervals = n - interval_size + 1
    interval_widths = sorted_draws[interval_size-1:] - sorted_draws[:n_intervals]
    min_idx = np.argmin(interval_widths)
    hdi_lower = sorted_draws[min_idx]
    hdi_upper = sorted_draws[min_idx + interval_size - 1]
    return hdi_lower, hdi_upper

hdi_lower, hdi_upper = calculate_hdi(all_draws, credible_mass=0.95)
print(f"95% HDI: [{hdi_lower:.2f}, {hdi_upper:.2f}] wins/162")
print()

# 3) Create population density using KDE
print("Estimating population density...")
kde = gaussian_kde(all_draws)
w_grid = np.linspace(-10, 10, 4001)
f_pop = kde(w_grid)

# 4) Plot - Blue density with gray ROPE band
print("Creating plot...")
fig, ax = plt.subplots(figsize=(10, 6))

# Fill entire density in blue
ax.fill_between(w_grid, 0, f_pop, color='#00a3ff', alpha=0.6, edgecolor='none', zorder=2)

# Draw density outline
ax.plot(w_grid, f_pop, 'k-', linewidth=1.5, zorder=3)

# Overlay gray band for ROPE region (-2 to +2)
ax.axvspan(-2, 2, color='gray', alpha=0.3, zorder=0)

# Add 95% HDI bar ABOVE the axis
density_max = f_pop.max()
ci_y = 0.02 * density_max
ax.plot([hdi_lower, hdi_upper], [ci_y, ci_y], 'k-', linewidth=3, zorder=5, solid_capstyle='butt')

# Formatting
ax.set_xlim(-8, 8)
ax.set_ylim(bottom=0)
ax.set_xticks(np.arange(-8, 9, 1))
ax.tick_params(axis='x', labelsize=16.5)
ax.tick_params(axis='y', left=False, labelleft=False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_linewidth(1.5)
ax.spines['bottom'].set_position(('data', 0))

plt.tight_layout()
plt.savefig(r'C:\Users\dmk38\Documents\x5\population_density.svg', format='svg', bbox_inches='tight')
print("\n✓ Plot saved to: C:\\Users\\dmk38\\Documents\\x5\\population_density.svg")
plt.show()

# ========== FINAL SUMMARY ==========
print("\n" + "="*70)
print("FINAL SUMMARY: POPULATION FRACTIONS BEYOND THRESHOLDS")
print("="*70)

print("\nPopulation Summary Statistics:")
print(f"  Mean: {all_draws.mean():.3f} wins/162")
print(f"  Median: {np.median(all_draws):.3f} wins/162")
print(f"  SD: {all_draws.std():.3f} wins/162")
print(f"  95% HDI: [{hdi_lower:.2f}, {hdi_upper:.2f}] wins/162")
print()

print("Population Fractions (from all posterior draws):")
print("-" * 70)
print(f"{'Threshold':<20} {'Fraction':<12} {'Expected Count':<15} {'Actual %'}")
print("-" * 70)
print(f"{'≥ +2.0 wins/162':<20} {p_ge_pos2:<12.4f} {expected_pos2:<15.2f} {100*p_ge_pos2:.2f}%")
print(f"{'≤ -2.0 wins/162':<20} {p_le_neg2:<12.4f} {expected_neg2:<15.2f} {100*p_le_neg2:.2f}%")
print(f"{'In [-2.0, +2.0]':<20} {p_in_rope2:<12.4f} {p_in_rope2*N_managers:<15.2f} {100*p_in_rope2:.2f}%")
print()
print(f"{'≥ +1.5 wins/162':<20} {p_ge_pos1_5:<12.4f} {expected_pos1_5:<15.2f} {100*p_ge_pos1_5:.2f}%")
print(f"{'≤ -1.5 wins/162':<20} {p_le_neg1_5:<12.4f} {expected_neg1_5:<15.2f} {100*p_le_neg1_5:.2f}%")
print(f"{'In [-1.5, +1.5]':<20} {p_in_rope1_5:<12.4f} {p_in_rope1_5*N_managers:<15.2f} {100*p_in_rope1_5:.2f}%")
print("-" * 70)

print("\nManager-Level Statistics (based on individual manager probabilities):")
print("-" * 70)
# How many managers have >50% probability of being beyond each threshold
n_high_prob_pos2 = (manager_df['p_ge_pos2'] > 0.5).sum()
n_high_prob_neg2 = (manager_df['p_le_neg2'] > 0.5).sum()
n_high_prob_pos1_5 = (manager_df['p_ge_pos1_5'] > 0.5).sum()
n_high_prob_neg1_5 = (manager_df['p_le_neg1_5'] > 0.5).sum()

print(f"Managers with >50% prob of being ≥ +2.0:  {n_high_prob_pos2:>4} ({100*n_high_prob_pos2/N_managers:.1f}%)")
print(f"Managers with >50% prob of being ≤ -2.0:  {n_high_prob_neg2:>4} ({100*n_high_prob_neg2/N_managers:.1f}%)")
print(f"Managers with >50% prob of being ≥ +1.5:  {n_high_prob_pos1_5:>4} ({100*n_high_prob_pos1_5/N_managers:.1f}%)")
print(f"Managers with >50% prob of being ≤ -1.5:  {n_high_prob_neg1_5:>4} ({100*n_high_prob_neg1_5/N_managers:.1f}%)")
print("="*70)