import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve
import logging
import os

# Root logging setup: timestamped records at INFO level and above.
# Everything in this module logs through the module-level `logger`.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def calculate_optimal_threshold(results_path):
    """
    Calculate the optimal decision threshold using Youden's J statistic.

    Reads a CSV of predictions, builds the ROC curve for the binary task
    "Pneumothorax vs. everything else", and picks the threshold maximizing
    J = sensitivity + specificity - 1 (equivalently TPR - FPR).

    Args:
        results_path: Path to a CSV file containing a ``pneumothorax_score``
            column (model score) and a ``true_label`` column. Rows whose
            label equals ``'Pneumothorax'`` are positives; every other row,
            including rows with a missing label, is a negative. Rows with a
            missing score are ignored.

    Returns:
        The J-maximizing threshold as a float (scores >= threshold are
        classified positive), or None when the file does not exist, lacks a
        required column, has no scored rows, contains only one class, or any
        other error occurs. Failures are logged rather than raised.
    """
    if not os.path.exists(results_path):
        logger.error(f"Results file not found: {results_path}")
        return None

    try:
        df = pd.read_csv(results_path)
        logger.info(f"Loaded {len(df)} predictions from {results_path}")

        # Fail with a clear message instead of a bare KeyError further down.
        missing_columns = [
            col for col in ('pneumothorax_score', 'true_label')
            if col not in df.columns
        ]
        if missing_columns:
            logger.error(f"Results file is missing required column(s): {missing_columns}")
            return None

        df = df.dropna(subset=['pneumothorax_score'])
        if df.empty:
            logger.error("No valid predictions found.")
            return None

        # A missing label compares unequal to 'Pneumothorax', so such rows
        # count as negatives below; make that visible instead of silent.
        unlabeled = int(df['true_label'].isna().sum())
        if unlabeled:
            logger.warning(
                f"{unlabeled} row(s) have no true_label and are treated as negatives."
            )

        # Binary labels: 1 = Pneumothorax, 0 = anything else.
        y_true = (df['true_label'] == 'Pneumothorax').astype(int)
        y_scores = df['pneumothorax_score']

        # With a single class, TPR or FPR is undefined (NaN) and the argmax
        # below would silently return a meaningless threshold.
        if y_true.nunique() < 2:
            logger.error(
                "Cannot compute ROC curve: both Pneumothorax and "
                "non-Pneumothorax samples are required."
            )
            return None

        fpr, tpr, thresholds = roc_curve(y_true, y_scores)

        # Youden's J = Sensitivity + Specificity - 1
        # Sensitivity = TPR
        # Specificity = 1 - FPR
        # J = TPR + (1 - FPR) - 1 = TPR - FPR
        j_scores = tpr - fpr
        best_idx = int(np.argmax(j_scores))
        best_threshold = float(thresholds[best_idx])

        # roc_curve always starts at (FPR=0, TPR=0) with a sentinel threshold
        # above every score. argmax returns the first maximum, so index 0 wins
        # only when no real threshold achieves J > 0.
        if best_idx == 0:
            logger.warning(
                "No threshold performs better than chance (J <= 0); the returned "
                "threshold classifies no sample as positive."
            )

        logger.info(f"Optimal Threshold (Youden's J): {best_threshold:.4f}")
        logger.info(f"Sensitivity: {tpr[best_idx]:.4f}")
        logger.info(f"Specificity: {1 - fpr[best_idx]:.4f}")

        return best_threshold

    except Exception as e:
        # Boundary handler: callers rely on None-on-failure, but keep the
        # traceback in the log so the root cause is not lost.
        logger.exception(f"Failed to calculate threshold: {e}")
        return None

if __name__ == "__main__":
    # Script entry point: evaluate the default predictions CSV. The outcome is
    # reported through the log output; the return value is not used here.
    calculate_optimal_threshold("results/kaggle_predictions.csv")