"""Pick a classification threshold for pneumothorax scores via Youden's J."""

import logging
import os

import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Schema the predictions CSV is expected to follow.
_SCORE_COLUMN = 'pneumothorax_score'
_LABEL_COLUMN = 'true_label'
_POSITIVE_LABEL = 'Pneumothorax'


def calculate_optimal_threshold(results_path):
    """Return the score threshold that maximises Youden's J statistic.

    The CSV at ``results_path`` must contain a ``pneumothorax_score`` column
    (model score) and a ``true_label`` column. Rows whose score is missing
    are dropped. A row counts as positive when ``true_label`` equals
    ``'Pneumothorax'``; every other value (including a missing label) counts
    as negative.

    Youden's J = Sensitivity + Specificity - 1 = TPR - FPR, evaluated at
    every operating point returned by ``sklearn.metrics.roc_curve``.

    Args:
        results_path: Path to the predictions CSV file.

    Returns:
        The threshold ``t`` such that predicting positive for
        ``score >= t`` maximises J, or ``None`` when the file is missing,
        required columns are absent, no scored rows remain, only one class
        is present, or any other error occurs (the error is logged).
    """
    if not os.path.exists(results_path):
        logger.error(f"Results file not found: {results_path}")
        return None

    try:
        df = pd.read_csv(results_path)
        logger.info(f"Loaded {len(df)} predictions from {results_path}")

        # Report a schema problem explicitly instead of an opaque KeyError.
        missing = [col for col in (_SCORE_COLUMN, _LABEL_COLUMN) if col not in df.columns]
        if missing:
            logger.error(f"Results file is missing required column(s): {', '.join(missing)}")
            return None

        df = df.dropna(subset=[_SCORE_COLUMN])
        if df.empty:
            logger.error("No valid predictions found.")
            return None

        # Binary labels: 1 for pneumothorax, 0 for everything else.
        y_true = (df[_LABEL_COLUMN] == _POSITIVE_LABEL).astype(int)
        y_scores = df[_SCORE_COLUMN]

        # With a single class roc_curve yields an all-NaN TPR or FPR, so
        # every J score is NaN and argmax would silently pick index 0.
        if y_true.nunique() < 2:
            logger.error(
                "Cannot compute a ROC curve: both positive and negative "
                "examples are required."
            )
            return None

        fpr, tpr, thresholds = roc_curve(y_true, y_scores)

        # Youden's J = Sensitivity + Specificity - 1
        #            = TPR + (1 - FPR) - 1 = TPR - FPR
        j_scores = tpr - fpr

        # thresholds[0] is an artificial sentinel (np.inf, or max(score) + 1
        # on older scikit-learn) meaning "predict nothing as positive"; its
        # J is always 0. Skip it so that a tie at J == 0 cannot return a
        # threshold that is not a real score.
        best_idx = int(np.argmax(j_scores[1:])) + 1
        best_threshold = thresholds[best_idx]

        logger.info(f"Optimal Threshold (Youden's J): {best_threshold:.4f}")
        logger.info(f"Sensitivity: {tpr[best_idx]:.4f}")
        logger.info(f"Specificity: {1 - fpr[best_idx]:.4f}")

        return best_threshold

    except Exception as e:
        # Deliberate catch-all boundary: callers rely on None rather than an
        # exception. logger.exception keeps the traceback for diagnosis.
        logger.exception(f"Failed to calculate threshold: {e}")
        return None


if __name__ == "__main__":
    results_file = "results/kaggle_predictions.csv"
    calculate_optimal_threshold(results_file)