# -*- coding: utf-8 -*-
"""
DACTRL — Statistical Tests & Bootstrap Confidence Intervals
============================================================
Post-processing only — no model rerun needed.

Tests performed:
  1. Wilcoxon signed-rank: DACTRL-TSM K=10 vs each baseline (per-patient F1)
  2. Wilcoxon: DACTRL-TSM K=10 vs K=0 (self-improvement)
  3. Bootstrap CI (N=10000) on DACTRL-TSM F1 and AUC at each K
  4. Effect size (Cohen's d) for each pairwise comparison
  5. Paired t-test (parametric check)

Inputs (from completed experiment runs):
  results/dactrl_seeg_clean_eval/tables/clean_eval_per_patient.csv
  results/dactrl_auc_results/tables/auc_f1_per_patient.csv
  results/simple_baselines/tables/simple_baselines_per_patient.csv

Outputs:
  results/dactrl_stats_bootstrap/tables/wilcoxon_tests.csv
  results/dactrl_stats_bootstrap/tables/bootstrap_ci.csv
  results/dactrl_stats_bootstrap/tables/effect_sizes.csv
  results/dactrl_stats_bootstrap/figures/bootstrap_f1_auc.png
  results/dactrl_stats_bootstrap/figures/per_patient_comparison.png
"""
import os
import sys
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats

# PYTHONIOENCODING is only read at interpreter start-up, so setting it here
# affects child processes only; main() additionally reconfigures sys.stdout.
# Remember whether the user chose an encoding so that choice is respected.
_IOENCODING_PRESET = 'PYTHONIOENCODING' in os.environ
os.environ.setdefault('PYTHONIOENCODING', 'utf-8')

try:
    import matplotlib
    matplotlib.use('Agg')  # file-only backend; must be chosen before pyplot
    import matplotlib.pyplot as plt
except ImportError:
    # The statistical helpers stay importable without matplotlib; the figure
    # functions raise a clear error if it is missing.
    matplotlib = None
    plt = None

OUT_ROOT = Path(r"D:\Projects\phd\PSEG\pges_toolkit\results\dactrl_stats_bootstrap")
FIG_DIR = OUT_ROOT / "figures"
TAB_DIR = OUT_ROOT / "tables"
for _out_dir in (OUT_ROOT, FIG_DIR, TAB_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

RESULTS = Path(r"D:\Projects\phd\PSEG\pges_toolkit\results")
N_BOOT = 10_000
RNG = np.random.RandomState(42)

# K values (support examples per class) reported in the bootstrap table.
K_VALUES = (0, 2, 5, 10, 20)
# Comparators with fewer matched patients than this are not tested.
MIN_PAIRED_PATIENTS = 5

# Column order of wilcoxon_tests.csv; also keeps the frame well-formed
# (and the effect-size selection valid) when no comparison could be run.
_WILCOXON_COLUMNS = [
    'comparison', 'TSM_K10_mean', 'comparator_mean', 'delta',
    'wilcoxon_stat', 'p_onesided', 'p_twosided', 'significance',
    'cohens_d', 'ttest_t', 'ttest_p', 'n_patients',
]


def log(msg):
    """Print *msg* prefixed with the current wall-clock time, flushing at once."""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}", flush=True)


def cohens_d(a, b):
    """Return Cohen's d between samples *a* and *b*.

    The denominator is the root of the average of the two sample variances
    (``ddof=1``), i.e. the independent-samples pooled SD, not the SD of the
    paired differences. A tiny epsilon avoids division by zero when both
    samples are constant.
    """
    pooled_std = np.sqrt((np.std(a, ddof=1) ** 2 + np.std(b, ddof=1) ** 2) / 2)
    return (np.mean(a) - np.mean(b)) / (pooled_std + 1e-10)


def bootstrap_ci(values, n_boot=N_BOOT, ci=0.95, rng=RNG):
    """Return ``(mean, lo, hi)``: sample mean and percentile-bootstrap CI.

    Args:
        values: 1-D sequence of observations.
        n_boot: number of bootstrap resamples.
        ci: confidence level, e.g. ``0.95``.
        rng: ``numpy.random.RandomState`` used for resampling. The default is
            the module-wide ``RNG``, so successive calls advance one shared,
            seeded stream.

    Returns:
        Three floats. All are NaN for an empty input (no resampling is done
        and the RNG state is left untouched).
    """
    sample = np.asarray(values)
    n = len(sample)
    if n == 0:
        nan = float('nan')
        return nan, nan, nan
    # One rng.choice call per resample keeps the random stream identical to
    # earlier runs of this script, so published intervals stay reproducible.
    boot_means = [rng.choice(sample, size=n, replace=True).mean()
                  for _ in range(n_boot)]
    lo = np.percentile(boot_means, (1 - ci) / 2 * 100)
    hi = np.percentile(boot_means, (1 + ci) / 2 * 100)
    return float(np.mean(sample)), float(lo), float(hi)


def _force_utf8_stdout():
    """Best-effort switch of stdout to UTF-8 (the log text is non-ASCII)."""
    reconfigure = getattr(sys.stdout, 'reconfigure', None)
    if reconfigure is None:
        return
    try:
        reconfigure(encoding='utf-8')
    except (ValueError, OSError):
        # Stream cannot be reconfigured; keep whatever encoding it has.
        pass


def _require_pyplot():
    """Raise a clear error when figures are requested without matplotlib."""
    if plt is None:
        raise ImportError("matplotlib is required to render the figures")


def _f1_by_patient(clean, k):
    """Return the DACTRL-TSM ``F1_mean`` column at shot count *k*, indexed by pid."""
    return clean[clean['K'] == k].set_index('pid')['F1_mean']


def _significance_label(p):
    """Map a p-value to the star notation used in the tables (NaN -> 'ns')."""
    if p < 0.001:
        return "***"
    if p < 0.01:
        return "**"
    if p < 0.05:
        return "*"
    return "ns"


def _paired_tests(reference, comparator):
    """Run the paired tests; return ``(W, p_one, p_two, t, t_p)``.

    ``W`` and ``p_one`` come from the one-sided ('greater') Wilcoxon test and
    ``p_two`` from the two-sided one. When every paired difference is zero the
    tests are undefined (SciPy releases that reject such input raise for
    ``zero_method='wilcox'``), so NaNs are returned instead of calling SciPy.
    """
    if np.array_equal(reference, comparator):
        nan = float('nan')
        return nan, nan, nan, nan, nan
    w_stat, p_one = stats.wilcoxon(reference, comparator,
                                   alternative='greater', zero_method='wilcox')
    _, p_two = stats.wilcoxon(reference, comparator,
                              alternative='two-sided', zero_method='wilcox')
    t_stat, t_p = stats.ttest_rel(reference, comparator)
    return w_stat, p_one, p_two, t_stat, t_p


def _bootstrap_table(clean, auc):
    """Bootstrap F1 (and AUC when available) at each K; return a DataFrame."""
    has_auc = 'AUC' in auc.columns
    rows = []
    for k in K_VALUES:
        f1_vals = clean[clean['K'] == k]['F1_mean'].values
        auc_vals = auc[auc['K'] == k]['AUC'].values if has_auc else None
        f1_mean, f1_lo, f1_hi = bootstrap_ci(f1_vals)
        row = {'K': k, 'F1_mean': f1_mean, 'F1_CI_lo': f1_lo, 'F1_CI_hi': f1_hi}
        if auc_vals is not None and len(auc_vals) > 0:
            auc_mean, auc_lo, auc_hi = bootstrap_ci(auc_vals)
            row.update({'AUC_mean': auc_mean,
                        'AUC_CI_lo': auc_lo, 'AUC_CI_hi': auc_hi})
            log(f" K={k:2d} F1={f1_mean:.4f} [{f1_lo:.4f}, {f1_hi:.4f}] "
                f"AUC={auc_mean:.4f} [{auc_lo:.4f}, {auc_hi:.4f}]")
        else:
            log(f" K={k:2d} F1={f1_mean:.4f} [{f1_lo:.4f}, {f1_hi:.4f}]")
        rows.append(row)
    return pd.DataFrame(rows)


def _wilcoxon_table(tsm_k10, comparators, pids):
    """Compare TSM K=10 with every comparator on the patients they share."""
    rows = []
    for name, comp_series in comparators.items():
        common = [p for p in pids if p in comp_series.index]
        if len(common) < MIN_PAIRED_PATIENTS:
            log(f" TSM_K10 vs {name}: skipped, "
                f"only {len(common)} matched patients")
            continue
        if tsm_k10.index.has_duplicates or comp_series.index.has_duplicates:
            log(f" WARNING: duplicate pid rows for {name}; "
                f"per-patient pairing is ambiguous")
        comp_arr = comp_series[common].values
        tsm_matched = tsm_k10[common].values

        stat, pval, pval2, tstat, tpval = _paired_tests(tsm_matched, comp_arr)
        d = cohens_d(tsm_matched, comp_arr)
        tsm_mean = np.mean(tsm_matched)
        comp_mean = np.mean(comp_arr)
        direction = "TSM>" if tsm_mean > comp_mean else "TSM<"
        # Stars are based on the two-sided p-value.
        sig = _significance_label(pval2)

        log(f" TSM_K10 vs {name:<20} {direction} W={stat:.0f} "
            f"p(1-sided)={pval:.4f} p(2-sided)={pval2:.4f} {sig} d={d:.3f}")
        rows.append({
            'comparison': f'TSM_K10_vs_{name}',
            'TSM_K10_mean': tsm_mean,
            'comparator_mean': comp_mean,
            'delta': tsm_mean - comp_mean,
            'wilcoxon_stat': stat,
            'p_onesided': pval,
            'p_twosided': pval2,
            'significance': sig,
            'cohens_d': d,
            'ttest_t': tstat,
            'ttest_p': tpval,
            'n_patients': len(common),
        })
    return pd.DataFrame(rows, columns=_WILCOXON_COLUMNS)


def _plot_bootstrap_ci(boot_df, n_patients):
    """Figure 1: mean F1 / AUC versus K with the bootstrap CI as a band."""
    _require_pyplot()
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    k_vals = boot_df['K'].values
    panels = [
        (('F1_mean', 'F1_CI_lo', 'F1_CI_hi'), '#2166ac', 'F1 Score'),
        (('AUC_mean', 'AUC_CI_lo', 'AUC_CI_hi'), '#d6604d', 'AUC-ROC'),
    ]
    for ax, ((m, lo, hi), color, label) in zip(axes, panels):
        if m not in boot_df.columns:
            continue  # metric was never computed (e.g. no AUC column)
        ax.plot(k_vals, boot_df[m], 'o-', color=color, linewidth=2, markersize=8)
        ax.fill_between(k_vals, boot_df[lo], boot_df[hi], alpha=0.25,
                        color=color, label='95% Bootstrap CI')
        for k, mean in zip(k_vals, boot_df[m]):
            if not np.isfinite(mean):
                continue  # nothing to label where a K has no data
            ax.annotate(f'{mean:.3f}', (k, mean), textcoords='offset points',
                        xytext=(0, 10), ha='center', fontsize=8, color=color)
        ax.set_xlabel('K (support examples per class)', fontsize=11)
        ax.set_ylabel(label, fontsize=11)
        ax.set_title(f'DACTRL-TSM {label}\nwith 95% Bootstrap CI (N={N_BOOT})',
                     fontsize=11)
        ax.set_ylim(0.5, 1.05)
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.3)
    # Report the actual cohort size instead of a hard-coded patient count.
    fig.suptitle('DACTRL-TSM: Bootstrap Confidence Intervals '
                 f'(LOSO, N={n_patients})', fontsize=13)
    fig.tight_layout()
    fig.savefig(FIG_DIR / "bootstrap_f1_auc.png", dpi=150, bbox_inches='tight')
    plt.close(fig)


def _plot_per_patient(base, tsm_k10, tsm_k0, pids):
    """Figure 2: grouped bars of per-patient F1 for TSM and three baselines."""
    _require_pyplot()
    baseline_colors = {'SVM_Kshot': '#d73027', 'XGBoost': '#fc8d59',
                       'RandomForest': '#4dac26'}
    methods_to_plot = [
        ('DACTRL-TSM K=10', tsm_k10, '#2166ac'),
        ('DACTRL-TSM K=0', tsm_k0, '#74add1'),
    ]
    for method, color in baseline_colors.items():
        sub = base[base['method'] == method].set_index('pid')['F1']
        methods_to_plot.append((method, sub, color))

    fig, ax = plt.subplots(figsize=(11, 6))
    x = np.arange(len(pids))
    width = 0.18
    for slot, (label, series, color) in enumerate(methods_to_plot):
        # Patients missing from a method leave a gap (NaN bar).
        vals = [series.get(p, float('nan')) for p in pids]
        ax.bar(x + slot * width, vals, width, label=label, color=color,
               alpha=0.8, edgecolor='white')
    ax.set_xticks(x + width * 2)  # centre of the five-bar group
    ax.set_xticklabels(pids, rotation=45, ha='right')
    ax.set_ylim(0, 1.1)
    ax.set_ylabel('F1 Score', fontsize=11)
    ax.set_title('Per-Patient F1: DACTRL-TSM vs Baselines', fontsize=12)
    ax.legend(fontsize=9, loc='lower right')
    ax.grid(True, axis='y', alpha=0.3)
    fig.tight_layout()
    fig.savefig(FIG_DIR / "per_patient_comparison.png", dpi=150,
                bbox_inches='tight')
    plt.close(fig)


def main():
    """Load the per-patient results, run all tests, and write tables/figures."""
    if not _IOENCODING_PRESET:
        _force_utf8_stdout()

    log("=" * 60)
    log("DACTRL — Statistical Tests & Bootstrap CI")
    log("=" * 60)

    # Load data
    clean = pd.read_csv(RESULTS / "dactrl_seeg_clean_eval/tables/clean_eval_per_patient.csv")
    auc = pd.read_csv(RESULTS / "dactrl_auc_results/tables/auc_f1_per_patient.csv")
    base = pd.read_csv(RESULTS / "simple_baselines/tables/simple_baselines_per_patient.csv")

    tsm_k10 = _f1_by_patient(clean, 10)
    tsm_k0 = _f1_by_patient(clean, 0)
    tsm_k2 = _f1_by_patient(clean, 2)
    tsm_k20 = _f1_by_patient(clean, 20)

    pids = sorted(tsm_k10.index)
    log(f"N patients: {len(pids)}")

    # 1. Bootstrap CI on F1 and AUC at each K
    log(f"\n=== Bootstrap CI (95%, N={N_BOOT}) ===")
    boot_df = _bootstrap_table(clean, auc)
    boot_df.to_csv(TAB_DIR / "bootstrap_ci.csv", index=False)

    # 2. Wilcoxon tests: TSM K=10 vs its other K settings and the baselines
    log("\n=== Wilcoxon Signed-Rank Tests (TSM K=10 vs comparators) ===")
    comparators = {'TSM_K0': tsm_k0, 'TSM_K2': tsm_k2, 'TSM_K20': tsm_k20}
    # Baselines are matched to TSM per patient via the pid index.
    for method in base['method'].unique():
        comparators[method] = base[base['method'] == method].set_index('pid')['F1']
    wilcox_df = _wilcoxon_table(tsm_k10, comparators, pids)
    wilcox_df.to_csv(TAB_DIR / "wilcoxon_tests.csv", index=False)

    # 3. Effect sizes summary
    log("\n=== Effect Sizes (Cohen's d, TSM K=10 vs comparators) ===")
    eff_df = wilcox_df[['comparison', 'delta', 'cohens_d', 'significance']].copy()
    eff_df.to_csv(TAB_DIR / "effect_sizes.csv", index=False)
    log(eff_df.to_string(index=False))

    # Figures
    _plot_bootstrap_ci(boot_df, len(pids))
    _plot_per_patient(base, tsm_k10, tsm_k0, pids)

    log(f"\nSaved -> {TAB_DIR}")
    log("Done.")


if __name__ == '__main__':
    main()