# Colab setup: clone the repository when opened directly from GitHub,
# then install the small set of packages that may be missing.
from pathlib import Path
import subprocess
import sys

# Remote repository and the directory name used for the local checkout.
REPO_URL = "https://github.com/Kirscher/when-explanations-lie.git"
REPO_DIR = Path("when-explanations-lie")
# True only when the google.colab module is already loaded in this
# interpreter; that is how the notebook detects a Colab runtime.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    # Clone only when no `src` directory is visible, neither in the current
    # directory nor inside an earlier checkout.
    if not Path("src").exists() and not (REPO_DIR / "src").exists():
        subprocess.run(["git", "clone", "--depth", "1", REPO_URL, str(REPO_DIR)], check=True)
    # Move into the checkout so the later `src` imports resolve.
    # NOTE(review): the directory name is hard-coded in the magic below and
    # must be kept in sync with REPO_DIR by hand.
    if (REPO_DIR / "src").exists():
        %cd when-explanations-lie

    # Probe each optional dependency by importing it; the dict maps the
    # importable module name to the pip package name to install.
    missing_packages = []
    for module_name, package_name in {
        "medmnist": "medmnist",
        "captum": "captum",
    }.items():
        try:
            __import__(module_name)
        except ModuleNotFoundError:
            missing_packages.append(package_name)

    # NOTE: the extra packages listed after the interpolation are installed
    # only when at least one of the probed packages is missing.
    if missing_packages:
        %pip -q install {" ".join(missing_packages)} tqdm scikit-learn nbformat nbconvert

from pathlib import Path
import copy
import sys

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Locate the repository root: when the notebook runs from a directory named
# "notebooks" the root is its parent, otherwise it is the working directory.
NOTEBOOK_DIR = Path.cwd()
if NOTEBOOK_DIR.name == "notebooks":
    REPO_ROOT = NOTEBOOK_DIR.parent
else:
    REPO_ROOT = NOTEBOOK_DIR
# Put the root first on the import path so the `src` package is found.
sys.path[:0] = [str(REPO_ROOT)]

import medmnist
from medmnist import INFO
import sklearn

from src.data import (
    DATA_FLAG,
    RandomLabelDataset,
    ShortcutDataset,
    count_labels,
    load_pneumoniamnist,
    make_subset,
    marker_mask_for_label,
)
from src.explainers import GradCAM, integrated_gradients, occlusion_sensitivity, smoothgrad, vanilla_gradient
from src.metrics import (
    binary_clinical_metrics,
    confusion_matrix_np,
    deletion_auc,
    deletion_curve,
    map_correlation,
    perturb_image,
    shortcut_region_attribution_fraction,
)
from src.models import SmallMedicalCNN, evaluate, find_example, reset_module_parameters, train_one_epoch
from src.utils import get_device, print_versions, set_seed
from src.visualization import (
    denorm,
    overlay_explanation,
    plot_confusion_matrix,
    plot_explanation_grid,
    plot_metric_bar,
    show_batch,
    show_image_tensor,
)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seed through the project helper; deterministic=False is passed explicitly.
# NOTE(review): what `deterministic` toggles is defined in src.utils — confirm there.
set_seed(42, deterministic=False)
# Device selection is delegated to the project helper.
device = get_device()
print("Using device:", device)
# Report library versions alongside the two extra modules passed in here.
print_versions({"scikit-learn": sklearn, "MedMNIST": medmnist})

Using device: cuda
Python: 3.12.3 (main, Mar 23 2026, 19:04:32) [GCC 13.3.0]
PyTorch: 2.11.0+cu130
NumPy: 2.4.4
scikit-learn: 1.8.0
MedMNIST: 3.0.2

# Global run configuration. Flip FAST_DEV_RUN to True for a quick smoke run
# on capped splits; leave it False for the full experiment.
FAST_DEV_RUN = False
IMAGE_SIZE = 64
BATCH_SIZE = 128
NUM_WORKERS = 0
LR = 1e-3

# Split caps: None means "use the whole split".
MAX_TRAIN_SAMPLES = 2048 if FAST_DEV_RUN else None
MAX_VAL_SAMPLES = 512 if FAST_DEV_RUN else None
MAX_TEST_SAMPLES = 512 if FAST_DEV_RUN else None

# Training budgets for the three models trained in this notebook.
EPOCHS_BASELINE = 5 if FAST_DEV_RUN else 10
EPOCHS_RANDOM_LABEL = 3 if FAST_DEV_RUN else 5
EPOCHS_SHORTCUT = 5 if FAST_DEV_RUN else 8

# Sampling budgets for the attribution methods.
SMOOTHGRAD_SAMPLES = 12 if FAST_DEV_RUN else 20
IG_STEPS = 24 if FAST_DEV_RUN else 32

# Echo the settings that determine how long the run takes.
print(
    dict(
        FAST_DEV_RUN=FAST_DEV_RUN,
        EPOCHS_BASELINE=EPOCHS_BASELINE,
        EPOCHS_RANDOM_LABEL=EPOCHS_RANDOM_LABEL,
        EPOCHS_SHORTCUT=EPOCHS_SHORTCUT,
    )
)

{'FAST_DEV_RUN': False, 'EPOCHS_BASELINE': 10, 'EPOCHS_RANDOM_LABEL': 5, 'EPOCHS_SHORTCUT': 8}

# Look up the MedMNIST metadata entry for the dataset flag exported by src.data.
info = INFO[DATA_FLAG]
# Only the first 500 characters of the description are shown; the trailing
# "..." is printed unconditionally, even when nothing was cut off.
print("Dataset:", info["description"][:500], "...")

# Build datasets and loaders through the project helper, forwarding the
# image size, batch settings and optional per-split caps from the config cell.
bundle = load_pneumoniamnist(
    medmnist_module=medmnist,
    info=info,
    image_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    max_train_samples=MAX_TRAIN_SAMPLES,
    max_val_samples=MAX_VAL_SAMPLES,
    max_test_samples=MAX_TEST_SAMPLES,
)

# Unpack the bundle into notebook-level names used by every later cell.
train_ds, val_ds, test_ds = bundle.train_ds, bundle.val_ds, bundle.test_ds
train_loader, val_loader, test_loader = bundle.train_loader, bundle.val_loader, bundle.test_loader
CLASS_NAMES = bundle.class_names
NUM_CLASSES = bundle.num_classes

print("Classes:", CLASS_NAMES)
print("Used split sizes:", len(train_ds), len(val_ds), len(test_ds))
show_batch(train_loader, CLASS_NAMES, n=12)

# Bar chart of per-class counts in the training split, then the same counts
# printed as a {class name: count} dict.
counts = count_labels(train_ds, NUM_CLASSES)
plt.figure(figsize=(5, 3))
plt.bar(CLASS_NAMES, counts)
plt.title("Training class distribution")
plt.ylabel("Number of images")
plt.xticks(rotation=20)
plt.show()
print(dict(zip(CLASS_NAMES, counts.tolist())))

Dataset: The PneumoniaMNIST is based on a prior dataset of 5,856 pediatric chest X-Ray images. The task is binary-class classification of pneumonia against normal. We split the source training set with a ratio of 9:1 into training and validation set and use its source validation set as the test set. The source images are gray-scale, and their sizes are (384−2,916)×(127−2,713). We center-crop the images and resize them into 1×28×28. ...

Classes: ['normal', 'pneumonia']
Used split sizes: 4708 524 624

{'normal': 1214, 'pneumonia': 3494}

# Baseline classifier trained on the unmodified training split.
baseline_model = SmallMedicalCNN(num_classes=NUM_CLASSES, image_size=IMAGE_SIZE).to(device)
optimizer = torch.optim.Adam(baseline_model.parameters(), lr=LR)
# `criterion` is reused by the later training cells as well.
criterion = nn.CrossEntropyLoss()
# Per-epoch curves; one entry is appended to each list per epoch.
history = {"train_loss": [], "val_loss": [], "train_acc": [], "val_acc": []}

for epoch in range(EPOCHS_BASELINE):
    # One pass over the training loader, then a validation pass.
    train_loss, train_acc = train_one_epoch(baseline_model, train_loader, optimizer, criterion, device)
    val_out = evaluate(baseline_model, val_loader, device, criterion)
    history["train_loss"].append(train_loss)
    history["train_acc"].append(train_acc)
    history["val_loss"].append(val_out["loss"])
    history["val_acc"].append(val_out["acc"])
    print(
        f"Epoch {epoch + 1:02d}/{EPOCHS_BASELINE} | "
        f"train loss {train_loss:.3f}, train acc {train_acc:.3f} | "
        f"val loss {val_out['loss']:.3f}, val acc {val_out['acc']:.3f}"
    )

# Accuracy curves for both splits (the x-axis is the zero-based epoch index).
plt.figure(figsize=(6, 4))
plt.plot(history["train_acc"], marker="o", label="train")
plt.plot(history["val_acc"], marker="o", label="validation")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Baseline training curve")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# Final held-out evaluation; `test_out` is reused by the clinical-metrics cell.
test_out = evaluate(baseline_model, test_loader, device, criterion)
print(f"Test accuracy: {test_out['acc']:.3f}")

Epoch 01/10 | train loss 0.451, train acc 0.778 | val loss 0.242, val acc 0.939

Epoch 02/10 | train loss 0.203, train acc 0.894 | val loss 0.164, val acc 0.945

Epoch 03/10 | train loss 0.157, train acc 0.937 | val loss 0.144, val acc 0.948

Epoch 04/10 | train loss 0.121, train acc 0.957 | val loss 0.106, val acc 0.962

Epoch 05/10 | train loss 0.120, train acc 0.955 | val loss 0.096, val acc 0.968

Epoch 06/10 | train loss 0.106, train acc 0.962 | val loss 0.086, val acc 0.968

Epoch 07/10 | train loss 0.097, train acc 0.963 | val loss 0.083, val acc 0.964

Epoch 08/10 | train loss 0.085, train acc 0.972 | val loss 0.074, val acc 0.971

Epoch 09/10 | train loss 0.081, train acc 0.971 | val loss 0.069, val acc 0.971

Epoch 10/10 | train loss 0.072, train acc 0.975 | val loss 0.074, val acc 0.966

# Convert the test-set outputs to NumPy for the metric helpers.
# NOTE(review): calling .numpy() directly presumes `evaluate` returns CPU tensors — confirm.
labels_np = test_out["labels"].numpy()
preds_np = test_out["preds"].numpy()
probs_np = test_out["probs"].numpy()

# Confusion matrix over the test split.
cm = confusion_matrix_np(labels_np, preds_np, NUM_CLASSES)
plot_confusion_matrix(cm, CLASS_NAMES, title="Baseline test confusion matrix")

# Clinical metrics with class index 1 treated as the positive class; each
# returned value is printed to three decimals.
clinical_metrics = binary_clinical_metrics(labels_np, preds_np, probs_np, positive_class=1)
for name, value in clinical_metrics.items():
    print(f"{name}: {value:.3f}")

accuracy: 0.875
sensitivity_recall_pneumonia: 0.987
specificity_normal: 0.688
balanced_accuracy: 0.838
auroc: 0.941

def find_or_fallback(model, loader, desired_label):
    """Fetch an example of ``desired_label``, preferring a correctly classified one.

    ``find_example`` is first asked for ``correct=True``. If that raises
    ``ValueError``, the lookup is repeated with ``correct=False``. A
    ``ValueError`` from the second lookup propagates to the caller.

    Args:
        model: Model forwarded to ``find_example``.
        loader: Data loader forwarded to ``find_example``.
        desired_label: Label the returned example must have.

    Returns:
        Whatever ``find_example`` returns; callers in this notebook unpack it
        as ``(x, y, pred, prob)``.
    """
    lookup = {"desired_label": desired_label}
    try:
        example = find_example(model, loader, device, correct=True, **lookup)
    except ValueError:
        example = find_example(model, loader, device, correct=False, **lookup)
    return example

# Pick one test example per class (label 1 and label 0); these tensors are
# reused by all attribution cells below.
x_pneumonia, y_pneumonia, pred_pneumonia, prob_pneumonia = find_or_fallback(baseline_model, test_loader, desired_label=1)
x_normal, y_normal, pred_normal, prob_normal = find_or_fallback(baseline_model, test_loader, desired_label=0)

# Report true label, predicted label and class probabilities, then show both images.
print("Pneumonia example:", CLASS_NAMES[y_pneumonia], "pred=", CLASS_NAMES[pred_pneumonia], "probs=", prob_pneumonia.numpy())
print("Normal example:", CLASS_NAMES[y_normal], "pred=", CLASS_NAMES[pred_normal], "probs=", prob_normal.numpy())
show_image_tensor(x_pneumonia, f"True: {CLASS_NAMES[y_pneumonia]} | Pred: {CLASS_NAMES[pred_pneumonia]}")
show_image_tensor(x_normal, f"True: {CLASS_NAMES[y_normal]} | Pred: {CLASS_NAMES[pred_normal]}")

Pneumonia example: pneumonia pred= pneumonia probs= [3.4789142e-07 9.9999964e-01]
Normal example: normal pred= normal probs= [0.99732214 0.00267792]

# Vanilla gradient for the predicted class, in the default form and with
# signed=True; `vg` is the reference map for the later sanity checks.
vg = vanilla_gradient(baseline_model, x_pneumonia, pred_pneumonia)
vg_signed = vanilla_gradient(baseline_model, x_pneumonia, pred_pneumonia, signed=True)
overlay_explanation(x_pneumonia, vg, "Vanilla gradient")
overlay_explanation(x_pneumonia, vg_signed, "Signed gradient (direction retained)")

# SmoothGrad with the sample count from the config cell.
sg = smoothgrad(baseline_model, x_pneumonia, pred_pneumonia, n_samples=SMOOTHGRAD_SAMPLES, noise_std=0.08)
overlay_explanation(x_pneumonia, sg, "SmoothGrad")

# Integrated Gradients; `baseline_mode` is also read by the Captum comparison
# and the shortcut-model cells further down.
baseline_mode = "gray"  # options: "gray", "black"
ig = integrated_gradients(
    baseline_model,
    x_pneumonia,
    pred_pneumonia,
    baseline_mode=baseline_mode,
    steps=IG_STEPS,
)
overlay_explanation(x_pneumonia, ig, f"Integrated Gradients ({baseline_mode} baseline)")

from captum.attr import IntegratedGradients as CaptumIntegratedGradients

# Rebuild the same baseline for Captum: all zeros for "gray", all -1 for "black".
# NOTE(review): this presumes inputs are normalized so that 0 is mid-gray and
# -1 is black — confirm against the transforms in src.data.
x_for_captum = x_pneumonia.to(device)
if baseline_mode == "gray":
    captum_baseline = torch.zeros_like(x_for_captum)
elif baseline_mode == "black":
    captum_baseline = torch.full_like(x_for_captum, -1.0)
else:
    raise ValueError("baseline_mode must be 'gray' or 'black'")

# Same target class and step count as the tutorial implementation; the
# convergence delta is requested so it can be printed below.
captum_ig = CaptumIntegratedGradients(baseline_model)
captum_attr, captum_delta = captum_ig.attribute(
    x_for_captum,
    baselines=captum_baseline,
    target=pred_pneumonia,
    n_steps=IG_STEPS,
    method="riemann_right",
    return_convergence_delta=True,
)
# Reduce to one 2-D map: absolute value, max over dim 1, then the first batch item.
captum_map = captum_attr.detach().abs().max(dim=1)[0][0].cpu()

print("Correlation between tutorial IG and Captum IG:", map_correlation(ig, captum_map))
print("Captum convergence delta:", captum_delta.detach().cpu().numpy())
plot_explanation_grid(
    x_pneumonia,
    {"Tutorial IG": ig, "Captum IG": captum_map},
    "Integrated Gradients: transparent implementation vs Captum",
)

Correlation between tutorial IG and Captum IG: 0.9998345971107483
Captum convergence delta: [0.06916523]

# Occlusion sensitivity with 8-pixel patches moved in steps of 4.
occ = occlusion_sensitivity(baseline_model, x_pneumonia, pred_pneumonia, patch_size=8, stride=4, fill_value=0.0)
overlay_explanation(x_pneumonia, occ, "Occlusion sensitivity")

# Grad-CAM on features[6] of the baseline model.
gradcam = GradCAM(baseline_model, baseline_model.features[6])
try:
    gcam = gradcam(x_pneumonia, pred_pneumonia)
finally:
    # Always detach the Grad-CAM helper, even when the call above raises, so
    # nothing stays attached to baseline_model for the later cells.
    # NOTE(review): presumably remove() unregisters hooks — confirm in src.explainers.
    gradcam.remove()
overlay_explanation(x_pneumonia, gcam, "Grad-CAM")

# All baseline-model maps, keyed by display name; the deletion-curve and
# summary cells iterate over this dict.
heatmaps_baseline = {
    "Gradient": vg,
    "SmoothGrad": sg,
    "Integrated Gradients": ig,
    "Occlusion": occ,
    "Grad-CAM": gcam,
}
plot_explanation_grid(x_pneumonia, heatmaps_baseline, "Baseline model explanations")

# Sanity check: a freshly constructed network that is never trained.
random_model = SmallMedicalCNN(num_classes=NUM_CLASSES, image_size=IMAGE_SIZE).to(device)
random_model.eval()

with torch.no_grad():
    random_pred = random_model(x_pneumonia).argmax(dim=1).item()

# Two attributions from the untrained model: one for its own predicted class
# and one for the class predicted by the trained baseline (the "fixed target").
vg_random_pred = vanilla_gradient(random_model, x_pneumonia, random_pred)
vg_random_fixed_target = vanilla_gradient(random_model, x_pneumonia, pred_pneumonia)

print("Trained prediction:", CLASS_NAMES[pred_pneumonia])
print("Random model prediction:", CLASS_NAMES[random_pred])
# When both models predict the same class the two correlations are identical.
print("Gradient map correlation trained vs random, random target:", map_correlation(vg, vg_random_pred))
print("Gradient map correlation trained vs random, fixed target:", map_correlation(vg, vg_random_fixed_target))

plot_explanation_grid(
    x_pneumonia,
    {"Trained model": vg, "Random model, fixed target": vg_random_fixed_target},
    "Model randomization test",
)

Trained prediction: pneumonia
Random model prediction: pneumonia
Gradient map correlation trained vs random, random target: 0.04296400025486946
Gradient map correlation trained vs random, fixed target: 0.04296400025486946

# Deep copies keep baseline_model untouched: one copy has only its
# `classifier` module reset, the other only its `features` module.
classifier_randomized = copy.deepcopy(baseline_model)
reset_module_parameters(classifier_randomized.classifier)
classifier_randomized.eval()

features_randomized = copy.deepcopy(baseline_model)
reset_module_parameters(features_randomized.features)
features_randomized.eval()

# Attribute both copies for the class predicted by the original model.
vg_classifier_rand = vanilla_gradient(classifier_randomized, x_pneumonia, pred_pneumonia)
vg_features_rand = vanilla_gradient(features_randomized, x_pneumonia, pred_pneumonia)

print("Correlation original vs classifier-randomized, fixed target:", map_correlation(vg, vg_classifier_rand))
print("Correlation original vs features-randomized, fixed target:", map_correlation(vg, vg_features_rand))

plot_explanation_grid(
    x_pneumonia,
    {"Original": vg, "Classifier randomized": vg_classifier_rand, "Features randomized": vg_features_rand},
    "Layer randomization test",
)

Correlation original vs classifier-randomized, fixed target: 0.3412981927394867
Correlation original vs features-randomized, fixed target: 0.08532992750406265

# Sanity check: train on a subset of at most 512 training images wrapped in
# RandomLabelDataset.
# NOTE(review): presumably the wrapper replaces the labels with random ones — confirm in src.data.
random_label_base = make_subset(train_ds, max_samples=min(512, len(train_ds)), seed=7)
random_label_ds = RandomLabelDataset(random_label_base, num_classes=NUM_CLASSES, seed=123)
random_label_loader = DataLoader(random_label_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

random_label_model = SmallMedicalCNN(num_classes=NUM_CLASSES, image_size=IMAGE_SIZE).to(device)
optimizer = torch.optim.Adam(random_label_model.parameters(), lr=LR)

for epoch in range(EPOCHS_RANDOM_LABEL):
    loss, acc = train_one_epoch(random_label_model, random_label_loader, optimizer, criterion, device)
    print(f"Label-noise epoch {epoch + 1:02d}/{EPOCHS_RANDOM_LABEL} | train loss {loss:.3f}, train acc {acc:.3f}")

# Switch to inference mode before predicting and attributing. No evaluate()
# call follows training here, so the model may still be in training mode; the
# other randomization checks in this notebook call .eval() explicitly too.
random_label_model.eval()

with torch.no_grad():
    pred_rl = random_label_model(x_pneumonia).argmax(dim=1).item()
# The attribution targets the baseline model's predicted class (fixed target).
vg_rl = vanilla_gradient(random_label_model, x_pneumonia, pred_pneumonia)
print("Label-noise model prediction:", CLASS_NAMES[pred_rl])
print("Correlation baseline vs label-noise explanation, fixed target:", map_correlation(vg, vg_rl))
plot_explanation_grid(x_pneumonia, {"Baseline": vg, "Label-noise model": vg_rl}, "Label-noise sanity check")

Label-noise epoch 01/5 | train loss 0.701, train acc 0.531

Label-noise epoch 02/5 | train loss 0.697, train acc 0.486

Label-noise epoch 03/5 | train loss 0.694, train acc 0.496

Label-noise epoch 04/5 | train loss 0.690, train acc 0.537

Label-noise epoch 05/5 | train loss 0.688, train acc 0.535
Label-noise model prediction: normal
Correlation baseline vs label-noise explanation, fixed target: 0.03335356339812279

# Wrap every split in ShortcutDataset with an 8-pixel marker.
# NOTE(review): marker placement per label is defined in src.data — confirm there.
shortcut_train_ds = ShortcutDataset(train_ds, marker_size=8)
shortcut_val_ds = ShortcutDataset(val_ds, marker_size=8)
shortcut_test_ds = ShortcutDataset(test_ds, marker_size=8)
# Shared loader settings; pinned memory is enabled only when CUDA is available.
loader_kwargs = {"batch_size": BATCH_SIZE, "num_workers": NUM_WORKERS, "pin_memory": torch.cuda.is_available()}
# Only the training loader is shuffled.
shortcut_train_loader = DataLoader(shortcut_train_ds, shuffle=True, **loader_kwargs)
shortcut_val_loader = DataLoader(shortcut_val_ds, shuffle=False, **loader_kwargs)
shortcut_test_loader = DataLoader(shortcut_test_ds, shuffle=False, **loader_kwargs)
show_batch(shortcut_train_loader, CLASS_NAMES, n=12)

# Same architecture and optimizer settings as the baseline, trained on marked images.
shortcut_model = SmallMedicalCNN(num_classes=NUM_CLASSES, image_size=IMAGE_SIZE).to(device)
optimizer = torch.optim.Adam(shortcut_model.parameters(), lr=LR)
shortcut_history = {"train_acc": [], "val_acc": []}

for epoch in range(EPOCHS_SHORTCUT):
    train_loss, train_acc = train_one_epoch(shortcut_model, shortcut_train_loader, optimizer, criterion, device)
    val_out = evaluate(shortcut_model, shortcut_val_loader, device, criterion)
    shortcut_history["train_acc"].append(train_acc)
    shortcut_history["val_acc"].append(val_out["acc"])
    print(f"Shortcut epoch {epoch + 1:02d}/{EPOCHS_SHORTCUT} | train acc {train_acc:.3f} | val acc {val_out['acc']:.3f}")

# Accuracy curves on the marked train and validation splits.
plt.figure(figsize=(6, 4))
plt.plot(shortcut_history["train_acc"], marker="o", label="marked train")
plt.plot(shortcut_history["val_acc"], marker="o", label="marked validation")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Shortcut model training curve")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

Shortcut epoch 01/8 | train acc 0.912 | val acc 1.000

Shortcut epoch 02/8 | train acc 0.999 | val acc 1.000

Shortcut epoch 03/8 | train acc 1.000 | val acc 1.000

Shortcut epoch 04/8 | train acc 1.000 | val acc 1.000

Shortcut epoch 05/8 | train acc 1.000 | val acc 1.000

Shortcut epoch 06/8 | train acc 1.000 | val acc 1.000

Shortcut epoch 07/8 | train acc 1.000 | val acc 1.000

Shortcut epoch 08/8 | train acc 1.000 | val acc 1.000

# Evaluate the shortcut model twice: on the marked test loader and on the
# original, unmarked test loader, to expose its reliance on the marker.
shortcut_marked_test = evaluate(shortcut_model, shortcut_test_loader, device, criterion)
shortcut_clean_test = evaluate(shortcut_model, test_loader, device, criterion)
print(f"Shortcut model accuracy on marked test images: {shortcut_marked_test['acc']:.3f}")
print(f"Shortcut model accuracy on clean test images:  {shortcut_clean_test['acc']:.3f}")

Shortcut model accuracy on marked test images: 1.000
Shortcut model accuracy on clean test images:  0.671

# Pick a marked test example with label 1 and show it.
x_short, y_short, pred_short, prob_short = find_or_fallback(shortcut_model, shortcut_test_loader, desired_label=1)
print("Shortcut example true:", CLASS_NAMES[y_short], "pred:", CLASS_NAMES[pred_short], "probs:", prob_short.numpy())
show_image_tensor(x_short, "Marked shortcut image")

# Same five attribution methods as for the baseline model, each targeting the
# shortcut model's own predicted class.
vg_short = vanilla_gradient(shortcut_model, x_short, pred_short)
sg_short = smoothgrad(shortcut_model, x_short, pred_short, n_samples=SMOOTHGRAD_SAMPLES, noise_std=0.08)
ig_short = integrated_gradients(shortcut_model, x_short, pred_short, baseline_mode=baseline_mode, steps=IG_STEPS)
occ_short = occlusion_sensitivity(shortcut_model, x_short, pred_short, patch_size=8, stride=4)
gradcam_short = GradCAM(shortcut_model, shortcut_model.features[6])
try:
    gcam_short = gradcam_short(x_short, pred_short)
finally:
    # Always detach the Grad-CAM helper, even when the call above raises, so
    # nothing stays attached to shortcut_model for the later cells.
    # NOTE(review): presumably remove() unregisters hooks — confirm in src.explainers.
    gradcam_short.remove()

# Keyed by the same display names as the baseline maps so the summary cell
# can line the two sets up.
heatmaps_shortcut = {
    "Gradient": vg_short,
    "SmoothGrad": sg_short,
    "Integrated Gradients": ig_short,
    "Occlusion": occ_short,
    "Grad-CAM": gcam_short,
}
plot_explanation_grid(x_short, heatmaps_shortcut, "Shortcut model explanations")

Shortcut example true: pneumonia pred: pneumonia probs: [1.0583699e-12 1.0000000e+00]

def remove_markers(x, marker_size=8, fill_value=0.0):
    """Return a copy of ``x`` with both candidate marker corners overwritten.

    The top-left and bottom-right ``marker_size`` x ``marker_size`` squares of
    the last two indexed axes are set to ``fill_value``. The input tensor is
    not modified.

    Args:
        x: Tensor indexed with four indices; presumably laid out as
            (batch, channel, height, width) — confirm against the caller.
        marker_size: Side length of each square in pixels. ``0`` leaves the
            image unchanged.
        fill_value: Value written into the two squares.

    Returns:
        A clone of ``x`` with the two corner squares replaced.

    Raises:
        ValueError: If ``marker_size`` is negative.
    """
    if marker_size < 0:
        raise ValueError(f"marker_size must be non-negative, got {marker_size}")
    cleaned = x.clone()
    if marker_size == 0:
        # Without this guard the slice `-0:` would select the whole axis and
        # the entire image would be overwritten.
        return cleaned
    cleaned[:, :, :marker_size, :marker_size] = fill_value
    cleaned[:, :, -marker_size:, -marker_size:] = fill_value
    return cleaned

# Blank both candidate marker corners and compare the shortcut model's class
# probabilities with and without the marker.
x_short_clean = remove_markers(x_short, marker_size=8, fill_value=0.0)
with torch.no_grad():
    probs_with = F.softmax(shortcut_model(x_short), dim=1)[0].cpu()
    probs_without = F.softmax(shortcut_model(x_short_clean), dim=1)[0].cpu()

print("With marker probs:   ", {CLASS_NAMES[i]: float(probs_with[i]) for i in range(NUM_CLASSES)})
print("Without marker probs:", {CLASS_NAMES[i]: float(probs_without[i]) for i in range(NUM_CLASSES)})

# Side-by-side view of channel 0 of the first batch item, before and after removal.
plt.figure(figsize=(8, 3.5))
plt.subplot(1, 2, 1)
plt.imshow(denorm(x_short[0, 0].detach().cpu()), cmap="gray")
plt.title("With marker")
plt.axis("off")
plt.subplot(1, 2, 2)
plt.imshow(denorm(x_short_clean[0, 0].detach().cpu()), cmap="gray")
plt.title("Marker removed")
plt.axis("off")
plt.show()

With marker probs:    {'normal': 1.0580105585036859e-12, 'pneumonia': 1.0}
Without marker probs: {'normal': 0.6006954312324524, 'pneumonia': 0.3993045687675476}

# Mask of the marker region for this example's true label, at the model's input size.
mask_short = marker_mask_for_label(y_short, IMAGE_SIZE, IMAGE_SIZE, marker_size=8)
# For every attribution method, the fraction of attribution inside that mask.
fractions = {name: shortcut_region_attribution_fraction(h, mask_short) for name, h in heatmaps_shortcut.items()}
plot_metric_bar(
    fractions,
    ylabel="Fraction of attribution in marker",
    title="How much attribution falls in the non-clinical shortcut?",
    # The upper y-limit is 20% above the largest bar, but never below 0.2.
    ylim=(0, max(0.2, max(fractions.values()) * 1.2)),
)
# Bare expression: displayed as the cell's output in the notebook.
fractions

{'Gradient': 0.2587019184277326,
 'SmoothGrad': 0.2724421616220347,
 'Integrated Gradients': 0.5697475097826117,
 'Occlusion': 0.9999995758673301,
 'Grad-CAM': 0.0200277234358911}

# Reference points for the correlation metric: a map compared with itself,
# then the baseline map against the two sanity-check maps computed earlier.
print("Self-correlation:", map_correlation(vg, vg))
print("Baseline vs model-randomized fixed target:", map_correlation(vg, vg_random_fixed_target))
print("Baseline vs label-noise fixed target:", map_correlation(vg, vg_rl))

Self-correlation: 0.9999999403953552
Baseline vs model-randomized fixed target: 0.04296400025486946
Baseline vs label-noise fixed target: 0.03335356339812279

def plot_deletion_curves(model, x, heatmaps, target_class, title="Deletion curves", steps=20, fill_value=0.0):
    """Plot one deletion curve per attribution map and return their AUCs.

    For each entry of ``heatmaps`` the project helper ``deletion_curve`` is
    called, the resulting curve is drawn on a shared figure, and its area is
    computed with ``deletion_auc``.

    Args:
        model: Model forwarded to ``deletion_curve``.
        x: Input forwarded to ``deletion_curve``.
        heatmaps: Mapping from display name to attribution map.
        target_class: Class whose probability is tracked.
        title: Figure title.
        steps: Number of deletion steps forwarded to ``deletion_curve``
            (previously hard-coded to 20).
        fill_value: Replacement value forwarded to ``deletion_curve``
            (previously hard-coded to 0.0).

    Returns:
        Dict mapping each display name to its deletion AUC.
    """
    aucs = {}
    plt.figure(figsize=(7, 4.5))
    for name, heatmap in heatmaps.items():
        xs, ys = deletion_curve(model, x, heatmap, target_class, steps=steps, fill_value=fill_value)
        aucs[name] = deletion_auc(xs, ys)
        # The AUC goes into the legend label so curves can be compared at a glance.
        plt.plot(xs, ys, marker="o", markersize=3, label=f"{name} AUC={aucs[name]:.3f}")
    plt.xlabel("Fraction of pixels deleted")
    plt.ylabel("Target class probability")
    plt.title(title)
    plt.legend(fontsize=8)
    plt.grid(alpha=0.3)
    plt.show()
    return aucs

# Deletion curves for both models, each on its own example and its own
# predicted class; the returned AUC dicts feed the summary cell.
baseline_deletion_aucs = plot_deletion_curves(
    baseline_model, x_pneumonia, heatmaps_baseline, pred_pneumonia, title="Deletion curves: baseline model"
)
shortcut_deletion_aucs = plot_deletion_curves(
    shortcut_model, x_short, heatmaps_shortcut, pred_short, title="Deletion curves: shortcut model"
)
print("Baseline deletion AUC:", baseline_deletion_aucs)
print("Shortcut deletion AUC:", shortcut_deletion_aucs)

Baseline deletion AUC: {'Gradient': 0.9978689438139554, 'SmoothGrad': 0.9985082889324985, 'Integrated Gradients': 0.9898751714426908, 'Occlusion': 0.5917592504847562, 'Grad-CAM': 0.996942297759233}
Shortcut deletion AUC: {'Gradient': 0.8972911030869, 'SmoothGrad': 0.8860569059252157, 'Integrated Gradients': 0.8641085574490717, 'Occlusion': 0.2962369700023828, 'Grad-CAM': 0.7814376527858258}

def explanation_stability(model, x, method_fn, target_class, n=8, noise_std=0.03):
    """Measure how much an attribution map changes under small input noise.

    The map for the unperturbed input is compared, via ``map_correlation``,
    with the maps of ``n`` perturbed copies produced by ``perturb_image``.
    The function also records how often the model's argmax prediction on the
    perturbed input matches its prediction on the original.

    Args:
        model: Model being explained.
        x: Input tensor; ``argmax(dim=1).item()`` is applied to the model
            output, so a single-example batch is expected.
        method_fn: Callable ``(model, x, target_class) -> tensor`` returning
            an attribution map.
        target_class: Class index passed to ``method_fn``.
        n: Number of perturbed copies; must be at least 1.
        noise_std: Noise level forwarded to ``perturb_image``.

    Returns:
        Dict with ``mean_corr``, ``std_corr`` and ``same_prediction_rate``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Fail fast: with n == 0 the mean would be NaN and the rate a division by
    # zero, and both would surface only after the base map had been computed.
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")
    base_map = method_fn(model, x, target_class).detach().cpu()
    corrs = []
    same_prediction = 0
    with torch.no_grad():
        base_pred = model(x).argmax(dim=1).item()
    for _ in range(n):
        x_pert = perturb_image(x, noise_std=noise_std)
        with torch.no_grad():
            pert_pred = model(x_pert).argmax(dim=1).item()
        same_prediction += int(pert_pred == base_pred)
        # The target class stays fixed so only the input change affects the map.
        pert_map = method_fn(model, x_pert, target_class).detach().cpu()
        corrs.append(map_correlation(base_map, pert_map))
    return {
        "mean_corr": float(np.mean(corrs)),
        "std_corr": float(np.std(corrs)),
        "same_prediction_rate": same_prediction / n,
    }

# Stability of two methods on the baseline model. Integrated Gradients uses
# fewer steps (12) and fewer perturbed copies (5) than the gradient run.
# NOTE(review): the IG lambda does not pass baseline_mode, so the helper's
# default applies — confirm it matches the "gray" baseline used earlier.
stability_results = {
    "Gradient": explanation_stability(baseline_model, x_pneumonia, vanilla_gradient, pred_pneumonia, n=8),
    "Integrated Gradients": explanation_stability(
        baseline_model,
        x_pneumonia,
        lambda m, x, c: integrated_gradients(m, x, c, steps=12),
        pred_pneumonia,
        n=5,
    ),
}
print(stability_results)

# Bar chart of the mean correlation, with the standard deviation as error bars.
names = list(stability_results.keys())
means = [stability_results[n]["mean_corr"] for n in names]
stds = [stability_results[n]["std_corr"] for n in names]
plt.figure(figsize=(6, 3.5))
plt.bar(names, means, yerr=stds, capsize=4)
plt.ylim(0, 1.05)
plt.ylabel("Mean heatmap correlation")
plt.title("Explanation stability under small noise")
plt.xticks(rotation=20)
plt.show()

{'Gradient': {'mean_corr': 0.743223138153553, 'std_corr': 0.011179810885290594, 'same_prediction_rate': 1.0}, 'Integrated Gradients': {'mean_corr': 0.8565796971321106, 'std_corr': 0.012707924825512515, 'same_prediction_rate': 1.0}}

# One summary row per attribution method, in the order of heatmaps_baseline.
# A metric with no entry for a method falls back to NaN.
summary_rows = []
for name in heatmaps_baseline:
    summary_rows.append({
        "method": name,
        "baseline_deletion_auc": baseline_deletion_aucs.get(name, np.nan),
        "shortcut_deletion_auc": shortcut_deletion_aucs.get(name, np.nan),
        "shortcut_marker_fraction": fractions.get(name, np.nan),
    })
for row in summary_rows:
    print(row)

# Column-wise views of the rows for plotting.
methods = [row["method"] for row in summary_rows]
marker_vals = [row["shortcut_marker_fraction"] for row in summary_rows]
base_auc_vals = [row["baseline_deletion_auc"] for row in summary_rows]
short_auc_vals = [row["shortcut_deletion_auc"] for row in summary_rows]

# Figure 1: fraction of attribution inside the marker region, per method.
plt.figure(figsize=(7, 3.5))
plt.bar(methods, marker_vals)
plt.ylabel("Shortcut marker attribution fraction")
plt.title("Did the method reveal the shortcut?")
plt.xticks(rotation=30, ha="right")
plt.show()

# Figure 2: grouped bars comparing deletion AUC on the two models; each pair
# is centred on its tick by shifting the bars half a bar width left and right.
plt.figure(figsize=(7, 3.5))
xpos = np.arange(len(methods))
width = 0.35
plt.bar(xpos - width / 2, base_auc_vals, width, label="baseline")
plt.bar(xpos + width / 2, short_auc_vals, width, label="shortcut")
plt.ylabel("Deletion AUC")
plt.title("Deletion AUC comparison")
plt.xticks(xpos, methods, rotation=30, ha="right")
plt.legend()
plt.show()

{'method': 'Gradient', 'baseline_deletion_auc': 0.9978689438139554, 'shortcut_deletion_auc': 0.8972911030869, 'shortcut_marker_fraction': 0.2587019184277326}
{'method': 'SmoothGrad', 'baseline_deletion_auc': 0.9985082889324985, 'shortcut_deletion_auc': 0.8860569059252157, 'shortcut_marker_fraction': 0.2724421616220347}
{'method': 'Integrated Gradients', 'baseline_deletion_auc': 0.9898751714426908, 'shortcut_deletion_auc': 0.8641085574490717, 'shortcut_marker_fraction': 0.5697475097826117}
{'method': 'Occlusion', 'baseline_deletion_auc': 0.5917592504847562, 'shortcut_deletion_auc': 0.2962369700023828, 'shortcut_marker_fraction': 0.9999995758673301}
{'method': 'Grad-CAM', 'baseline_deletion_auc': 0.996942297759233, 'shortcut_deletion_auc': 0.7814376527858258, 'shortcut_marker_fraction': 0.0200277234358911}

When Explanations Lie: Stress-Testing Saliency Maps Before Clinical Deployment

Abstract

Who is this tutorial for?

Learning objectives

Clinical motivation: from pixels to patients

Setup and reproducibility

Dataset: PneumoniaMNIST

Training a baseline pneumonia classifier

Clinical metrics beyond accuracy

Saliency methods: intuition and implementation

Explanation vocabulary

Vanilla gradients

SmoothGrad

Integrated Gradients

Extension: compare Integrated Gradients with Captum

Occlusion sensitivity

Grad-CAM

Why plausible heatmaps can be misleading

Sanity check 1: model randomization

Sanity check 2: layer randomization

Sanity check 3: label-noise sanity check

Shortcut learning experiment

Add a synthetic non-clinical marker

Train a shortcut model

Compare clean vs marked test performance

Remove the marker and measure probability sensitivity

Quantify attribution to the shortcut region

Quantitative explanation evaluation

Map correlation

Deletion curves

Deletion AUC

What deletion AUC shows in this run

Stability under perturbations

Shortcut-region attribution fraction

Clinical interpretation and limitations

Learner exercises and expected observations

From pixels to patients: what this tutorial teaches

Checklist before trusting saliency maps in clinical AI

Key takeaways

AI assistance disclosure

References