import numpy as np
import pandas as pd
from scipy import stats
from dataclasses import dataclass
from pathlib import Path
import matplotlib.pyplot as plt

# ============= ANALYTICAL SOLUTION FOR WEIGHTED QUADRATIC LOSS =============

def truncnorm_mean(mu, sigma, a, b):
    """
    Mean of truncated normal N(mu, sigma^2) on interval [a, b]
    
    E[θ | a ≤ θ ≤ b] where θ ~ N(mu, sigma^2)
    """
    if sigma < 1e-10:
        return mu
    
    # Standardize
    alpha = (a - mu) / sigma if not np.isinf(a) else -np.inf
    beta = (b - mu) / sigma if not np.isinf(b) else np.inf
    
    # PDF and CDF at boundaries
    phi_alpha = stats.norm.pdf(alpha) if not np.isinf(alpha) else 0
    phi_beta = stats.norm.pdf(beta) if not np.isinf(beta) else 0
    Phi_alpha = stats.norm.cdf(alpha) if not np.isinf(alpha) else 0
    Phi_beta = stats.norm.cdf(beta) if not np.isinf(beta) else 1
    
    denom = Phi_beta - Phi_alpha
    if denom < 1e-10:
        return mu
    
    # Truncated mean formula
    return mu + sigma * (phi_alpha - phi_beta) / denom


def compute_weighted_bayes_estimate(MAP_w162, sd_w162, thresholds, default_weight=1.0):
    """
    Compute Bayes-optimal estimate under piecewise weighted quadratic loss
    (Works directly on wins per 162 scale)
    
    Parameters:
    -----------
    MAP_w162 : float
        MAP estimate (wins per 162 games)
    sd_w162 : float
        Standard deviation (wins per 162 games)
    thresholds : list of tuples
        Each tuple: (direction, threshold_wins, weight)
        direction: 'up' or 'down'
        threshold_wins: threshold in wins per 162 games
        weight: error penalty factor
    default_weight : float
        Default weight for regions with no threshold
    
    Returns:
    --------
    theta_hat_optimal : float
        Bayes-optimal estimate (wins per 162 games)
    """
    
    if not thresholds:
        return MAP_w162
    
    # Extract all threshold values and sort them
    threshold_vals = sorted(set(t[1] for t in thresholds))
    
    # Create regions: (-∞, t1], (t1, t2], ..., (tn, ∞)
    regions = []
    regions.append((-np.inf, threshold_vals[0]))
    for i in range(len(threshold_vals) - 1):
        regions.append((threshold_vals[i], threshold_vals[i+1]))
    regions.append((threshold_vals[-1], np.inf))
    
    # For each region, determine weight
    def get_region_weight(lower, upper):
        """
        Determine weight for region (lower, upper]
        
        An 'up' threshold at t applies if lower >= t (entire region is >= t)
        A 'down' threshold at t applies if upper <= t (entire region is <= t)
        """
        weight = default_weight
        for direction, threshold_val, penalty in thresholds:
            if direction == 'up' and lower >= threshold_val:
                weight = max(weight, penalty)
            elif direction == 'down' and upper <= threshold_val:
                weight = max(weight, penalty)
        return weight
    
    # Compute weighted expectation: E[w(θ)·θ] / E[w(θ)]
    numerator = 0
    denominator = 0
    
    for lower, upper in regions:
        # Probability of this region
        if np.isinf(lower):
            p_lower = 0
        else:
            p_lower = stats.norm.cdf(lower, MAP_w162, sd_w162)
        
        if np.isinf(upper):
            p_upper = 1
        else:
            p_upper = stats.norm.cdf(upper, MAP_w162, sd_w162)
        
        p_region = p_upper - p_lower
        
        if p_region < 1e-10:
            continue
        
        # Truncated mean for this region
        mu_region = truncnorm_mean(MAP_w162, sd_w162, lower, upper)
        
        # Weight for this region
        w_region = get_region_weight(lower, upper)
        
        numerator += w_region * mu_region * p_region
        denominator += w_region * p_region
    
    if denominator < 1e-10:
        return MAP_w162
    
    return numerator / denominator


def get_user_thresholds():
    """Interactively collect threshold specifications from user"""
    print("\n" + "="*70)
    print("THRESHOLD CONFIGURATION")
    print("="*70)
    print("\nYou will define thresholds where errors are more costly.")
    print("  - 'UP' threshold: Errors costly when true θ ≥ threshold")
    print("  - 'DOWN' threshold: Errors costly when true θ ≤ threshold")
    
    while True:
        try:
            n = int(input("\nHow many thresholds would you like to define? (1 or more): "))
            if n >= 1:
                break
            print("Please enter a number >= 1")
        except ValueError:
            print("Please enter a valid integer")
    
    thresholds = []
    
    for i in range(n):
        print(f"\n--- Threshold {i+1} of {n} ---")
        
        # Direction
        while True:
            direction = input("Weight threshold UP or DOWN? (up/down): ").strip().lower()
            if direction in ['up', 'down']:
                break
            print("Please enter 'up' or 'down'")
        
        # Threshold value in wins
        while True:
            try:
                wins_threshold = float(input(f"Threshold value (wins per 162 games, e.g., 2.0 or -2.0): "))
                break
            except ValueError:
                print("Please enter a valid number")
        
        # Error penalty factor
        while True:
            try:
                weight = float(input(f"Size of error penalty (factor, e.g., 2.0 for 2x costly): "))
                if weight >= 1.0:
                    break
                print("Weight should be >= 1.0")
            except ValueError:
                print("Please enter a valid number")
        
        thresholds.append({
            'direction': direction,
            'wins_threshold': wins_threshold,
            'weight': weight
        })
        
        print(f"  ✓ {'UP' if direction == 'up' else 'DOWN'} threshold at {wins_threshold:+.1f} wins, weight {weight}x")
    
    return thresholds


# ============= WEIGHTED LOSS ESTIMATION CLASS =============

@dataclass
class WLConfig:
    """Configuration for weighted loss estimation"""
    data_dir: str = r"C:\Users\dmk38\Documents\x5"
    thresholds: list = None


class WeightedLossEstimator:
    """Apply weighted quadratic loss to existing MAP estimates"""
    
    def __init__(self, config: WLConfig = None):
        self.config = config or WLConfig()
        self.thresholds_tuples = []
        self.results = None
        
        # Convert threshold dicts to tuples for the calculation function
        if self.config.thresholds:
            for t in self.config.thresholds:
                self.thresholds_tuples.append(
                    (t['direction'], t['wins_threshold'], t['weight'])
                )
    
    def apply_weighted_loss(self, data: pd.DataFrame):
        """
        Apply weighted loss to MAP estimates
        
        Parameters:
        -----------
        data : pd.DataFrame with columns:
            - mgrID : manager identifier
            - MAP_w162 : MAP estimate (wins per 162 games)
            - sd_w162 : standard deviation (wins per 162 games)
        """
        df = data.copy()
        
        print("\nApplying weighted quadratic loss...")
        
        # Compute adjusted estimates
        df['wins_adjusted'] = df.apply(
            lambda row: compute_weighted_bayes_estimate(
                row['MAP_w162'],
                row['sd_w162'],
                self.thresholds_tuples
            ),
            axis=1
        )
        
        # Compute adjustment
        df['adjustment'] = df['wins_adjusted'] - df['MAP_w162']
        
        # Identify which zones each manager is in
        for i, t in enumerate(self.config.thresholds):
            # Check if MAP estimate is in the decision zone
            if t['direction'] == 'up':
                df[f'in_zone_{i+1}'] = df['MAP_w162'] >= t['wins_threshold']
            else:  # down
                df[f'in_zone_{i+1}'] = df['MAP_w162'] <= t['wins_threshold']
            
            # Compute probability of being in this zone
            if t['direction'] == 'up':
                df[f'prob_zone_{i+1}'] = 1 - stats.norm.cdf(
                    t['wins_threshold'], df['MAP_w162'], df['sd_w162']
                )
            else:  # down
                df[f'prob_zone_{i+1}'] = stats.norm.cdf(
                    t['wins_threshold'], df['MAP_w162'], df['sd_w162']
                )
        
        # Overall indicator for being in any decision zone
        zone_cols = [c for c in df.columns if c.startswith('in_zone_')]
        df['in_any_decision_zone'] = df[zone_cols].any(axis=1)
        
        # Identify threshold crossers (moved across threshold due to penalty)
        df['threshold_crosser'] = False
        for i, t in enumerate(self.config.thresholds):
            if t['direction'] == 'up':
                df['threshold_crosser'] |= (
                    (df['MAP_w162'] < t['wins_threshold']) & 
                    (df['wins_adjusted'] >= t['wins_threshold'])
                )
            else:  # down
                df['threshold_crosser'] |= (
                    (df['MAP_w162'] > t['wins_threshold']) & 
                    (df['wins_adjusted'] <= t['wins_threshold'])
                )
        
        # Compute effective shrinkage change
        with np.errstate(divide='ignore', invalid='ignore'):
            df['shrinkage_ratio'] = np.where(
                np.abs(df['MAP_w162']) > 1e-6,
                df['wins_adjusted'] / df['MAP_w162'],
                1.0
            )
        
        self.results = df
        
        print(f"\nSummary:")
        print(f"  Total managers: {len(df)}")
        for i, t in enumerate(self.config.thresholds):
            n_in_zone = df[f'in_zone_{i+1}'].sum()
            direction_str = "≥" if t['direction'] == 'up' else "≤"
            print(f"  Threshold {i+1} (θ {direction_str} {t['wins_threshold']:+.1f} wins): {n_in_zone} managers")
        print(f"  Threshold crossers (moved into zones due to penalty): {df['threshold_crosser'].sum()}")
        print(f"  Mean |adjustment|: {np.abs(df['adjustment']).mean():.3f} wins")
        print(f"  Max adjustment: {df['adjustment'].max():.3f} wins")
        print(f"  Min adjustment: {df['adjustment'].min():.3f} wins")
        
        return self
    
    def get_results(self) -> pd.DataFrame:
        """Return results DataFrame"""
        return self.results
    
    def plot_diagnostics(self, save_path: Path = None):
        """Visualize adjustments"""
        results = self.results
        
        fig, axes = plt.subplots(2, 2, figsize=(16, 10))
        
        # Identify threshold crossers
        threshold_crossers = results['threshold_crosser']
        
        # Plot 1: Adjustment by position
        ax = axes[0, 0]
        
        ax.scatter(results.loc[~threshold_crossers, 'MAP_w162'], 
                   results.loc[~threshold_crossers, 'adjustment'],
                   alpha=0.4, s=15, color='gray', zorder=1, label='Standard')
        ax.scatter(results.loc[threshold_crossers, 'MAP_w162'],
                   results.loc[threshold_crossers, 'adjustment'],
                   alpha=0.7, s=35, color='red', zorder=2, label='Crossed threshold')
        
        # Zone boundaries
        for t in self.config.thresholds:
            ax.axvline(t['wins_threshold'], color='red', linestyle='--', linewidth=2, alpha=0.7)
        ax.axhline(0, color='black', linestyle='-', alpha=0.3, linewidth=1)
        
        ax.set_xlabel('MAP Estimate (wins per 162 games)', fontsize=11)
        ax.set_ylabel('Adjustment (wins)', fontsize=11)
        ax.set_title('Adjustments by Position', fontsize=12, fontweight='bold')
        
        # Plot 2: Before vs After
        ax = axes[0, 1]
        
        ax.scatter(results.loc[~threshold_crossers, 'MAP_w162'],
                   results.loc[~threshold_crossers, 'wins_adjusted'],
                   alpha=0.4, s=15, color='gray', zorder=1)
        ax.scatter(results.loc[threshold_crossers, 'MAP_w162'],
                   results.loc[threshold_crossers, 'wins_adjusted'],
                   alpha=0.7, s=35, color='red', zorder=2)
        
        for t in self.config.thresholds:
            ax.axvline(t['wins_threshold'], color='red', linestyle='--', linewidth=1.5, alpha=0.7)
            
        
        ax.plot([results['MAP_w162'].min(), results['MAP_w162'].max()],
                [results['MAP_w162'].min(), results['MAP_w162'].max()],
                'k--', alpha=0.5, linewidth=1.5, label='45° line')
        
        ax.set_xlabel('MAP Estimate', fontsize=11)
        ax.set_ylabel('Adjusted Estimate (Weighted loss)', fontsize=11)
        ax.set_title('Before vs After', fontsize=12, fontweight='bold')
        
        # Plot 3: Probability of being in decision zones
        ax = axes[1, 0]
        
        prob_cols = [c for c in results.columns if c.startswith('prob_zone_')]
        if prob_cols:
            max_prob = results[prob_cols].max(axis=1)
            scatter = ax.scatter(results['MAP_w162'],
                                results['adjustment'],
                                c=max_prob,
                                cmap='Reds',
                                s=25,
                                alpha=0.6,
                                vmin=0, vmax=1)
            plt.colorbar(scatter, ax=ax, label='Max zone probability')
        
        for t in self.config.thresholds:
            ax.axvline(t['wins_threshold'], color='red', linestyle='--', linewidth=1.5, alpha=0.7)
        ax.axhline(0, color='black', linestyle='-', alpha=0.3, linewidth=1)
        
        ax.set_xlabel('MAP Estimate (wins per 162 games)', fontsize=11)
        ax.set_ylabel('Adjustment (wins)', fontsize=11)
        ax.set_title('Adjustments by Zone Probability', fontsize=12, fontweight='bold')
        
        # Plot 4: Distribution of adjustments
        ax = axes[1, 1]
        ax.hist(results['adjustment'], bins=50, alpha=0.7, color='blue', edgecolor='black')
        ax.axvline(0, color='black', linestyle='-', linewidth=2)
        ax.set_xlabel('Adjustment (wins)', fontsize=11)
        ax.set_ylabel('Count', fontsize=11)
        ax.set_title('Distribution of Adjustments', fontsize=12, fontweight='bold')
        
        plt.tight_layout()
        
        if save_path:
            plt.savefig(save_path, format='svg', bbox_inches='tight')
            print(f"\nDiagnostic plots saved to {save_path}")
        
        return fig


# ============= MAIN EXECUTION =============

if __name__ == "__main__":
    data_dir = Path(r"[your drive partition]")
    
    print("="*70)
    print("WEIGHTED QUADRATIC LOSS ESTIMATION")
    print("="*70)
    
    # Get user-specified thresholds
    user_thresholds = get_user_thresholds()
    
    print("\nLoading data...")
    print("Expected columns: mgrID, MAP_w162, sd_w162")
    print("(Already on wins per 162 games scale)")
    
    # Read data
    data_file = "[DF_8]"  
    data = pd.read_csv(data_dir / data_file)
    
    # Verify required columns
    required_cols = ['mgrID', 'MAP_w162', 'sd_w162']
    missing = [c for c in required_cols if c not in data.columns]
    if missing:
        print(f"\nERROR: Missing required columns: {missing}")
        print(f"Available columns: {list(data.columns)}")
        exit()
    
    print(f"\nLoaded {len(data)} manager-seasons")
    print(f"MAP range: [{data['MAP_w162'].min():.2f}, {data['MAP_w162'].max():.2f}] wins/162")
    print(f"SD range: [{data['sd_w162'].min():.2f}, {data['sd_w162'].max():.2f}] wins/162")
    
    # Initialize with user configuration
    config = WLConfig(thresholds=user_thresholds)
    
    # Apply weighted loss
    estimator = WeightedLossEstimator(config)
    estimator.apply_weighted_loss(data)
    results = estimator.get_results()
    
    # Show results
    print(f"\n{'='*70}")
    print(f"TOP ADJUSTMENTS")
    print(f"{'='*70}")
    
    # Largest positive adjustments
    top_pos = results.nlargest(10, 'adjustment')[
        ['mgrID', 'MAP_w162', 'wins_adjusted', 'adjustment', 'in_any_decision_zone']
    ]
    print("\nLargest positive adjustments:")
    print(top_pos.to_string(index=False, float_format=lambda x: f'{x:.3f}'))
    
    # Largest negative adjustments
    top_neg = results.nsmallest(10, 'adjustment')[
        ['mgrID', 'MAP_w162', 'wins_adjusted', 'adjustment', 'in_any_decision_zone']
    ]
    print("\nLargest negative adjustments:")
    print(top_neg.to_string(index=False, float_format=lambda x: f'{x:.3f}'))
    
    # Create plots
    print(f"\n{'='*70}")
    print(f"CREATING DIAGNOSTIC PLOTS")
    print(f"{'='*70}")
    
    output_name = data_file.replace('.csv', '').replace('.dta', '')
    fig = estimator.plot_diagnostics(save_path=data_dir / f"{output_name}_weighted_loss.svg")
    plt.show()
    
    # Save results
    results.to_csv(data_dir / f"{output_name}_weighted_loss_results.csv", index=False)
    print(f"\nResults saved to {data_dir / f'{output_name}_weighted_loss_results.csv'}")