Data leakage occurs when information from the test set inadvertently influences the training process, leading to overly optimistic performance estimates that fail to generalize. In medical machine learning, where patient safety depends on honest validation, detecting and preventing leakage is critical.
Data leakage occurs when information from the test set inadvertently influences the training process, leading to overly optimistic performance estimates that don't generalize to real-world data. In medical ML, this can mean the difference between a model that appears to work perfectly in validation but fails catastrophically in clinical practice. trustcv provides comprehensive tools to detect, report, and prevent all major forms of data leakage.
Four categories of leakage commonly encountered in medical machine learning
The same patient's data appears in both training and test sets. This is the most common form of leakage in healthcare ML, occurring when patients have multiple samples (repeated measurements, multiple images, multi-modal data like CT + MRI).
Using future information to predict past events. This happens when time series data is shuffled or when post-treatment data is used to predict pre-treatment outcomes.
Spatially correlated samples end up in both train and test sets. Adjacent tissue samples, neighboring geographic regions, or overlapping image patches share information that inflates apparent performance.
Related samples through hierarchical structure appear in both sets. Hospital-specific practices, scanner-specific artifacts, or site-specific protocols create hidden correlations.
Side-by-side comparison of wrong vs. correct approaches for each leakage type
from sklearn.model_selection import KFold
import numpy as np

# Multiple images per patient: 1000 images across 200 patients,
# so each patient contributes 5 images.
n_images = 1000
n_patients = 200
patient_ids = np.repeat(
    np.arange(n_patients),
    n_images // n_patients
)
# NOTE: 1000 x 224x224x3 float64 arrays is ~1.2 GB of RAM;
# shrink n_images or the image size for a quick local demo.
X = np.random.randn(n_images, 224, 224, 3)
y = np.random.randint(0, 2, n_images)

# WRONG: ignores patient grouping
standard_cv = KFold(n_splits=5)
for train_idx, test_idx in standard_cv.split(X):
    # Same patient can be in train AND test!
    pass

from trustcv import GroupKFoldMedical
from trustcv import DataLeakageChecker

checker = DataLeakageChecker()

# Verify the CV setup once, OUTSIDE the fold loop: the check covers
# all folds, so repeating the identical call per fold is redundant.
report = checker.check(
    X, y, groups=patient_ids, n_splits=5
)

# CORRECT: groups by patient ID
grouped_cv = GroupKFoldMedical(n_splits=5)
for train_idx, test_idx in grouped_cv.split(
    X, y, groups=patient_ids
):
    # No leakage: patients stay together
    pass
from sklearn.model_selection import KFold

# Daily measurements: 365 timepoints for each of 100 patients.
n_timepoints = 365
n_patients = 100
timestamps = np.repeat(
    np.arange(n_timepoints), n_patients
)
X = np.random.randn(
    n_timepoints * n_patients, 20
)
y = np.random.randint(
    0, 2, n_timepoints * n_patients
)

# WRONG: shuffles time series data!
cv = KFold(n_splits=5, shuffle=True)
for train_idx, test_idx in cv.split(X):
    # Future data leaks into training!
    pass

from trustcv import PurgedKFoldCV

# CORRECT: respects temporal order with gap
purged_cv = PurgedKFoldCV(
    n_splits=5,
    purge_gap=7  # 7-day gap
)
for train_idx, test_idx in purged_cv.split(
    X, y, groups=timestamps
):
    # Purged CV prevents temporal leakage:
    # no future data leaks into training.
    pass
from sklearn.model_selection import KFold

# Image patches with 2-D spatial positions.
n_patches = 500
coordinates = np.random.randn(
    n_patches, 2
) * 100  # x, y positions
X = np.random.randn(n_patches, 2048)
y = np.random.randint(0, 2, n_patches)

# WRONG: ignores spatial correlation
random_cv = KFold(n_splits=5, shuffle=True)
for train_idx, test_idx in random_cv.split(X):
    # Adjacent patches in train and test!
    pass

from trustcv import BufferedSpatialCV

# CORRECT: spatial blocking with buffer
spatial_cv = BufferedSpatialCV(
    n_splits=5,
    spatial_coordinates=coordinates,
    buffer_size=10  # 10-unit buffer
)
for train_idx, test_idx in spatial_cv.split(X):
    # Buffer zones prevent spatial leakage:
    # no adjacent patches in train and test.
    pass
from sklearn.model_selection import KFold

# Multi-center clinical trial data:
# countries -> hospitals -> patients -> samples.
# Synthetic example: 4 countries x 5 hospitals x 10 patients = 200 samples.
countries = np.repeat(np.arange(4), 50)
hospitals = np.repeat(np.arange(20), 10)
patients = np.arange(200)

# WRONG: ignores hierarchical structure
cv = KFold(n_splits=5)
for train_idx, test_idx in cv.split(
    range(len(patients))
):
    # Leakage at patient, hospital,
    # and country levels!
    pass

from trustcv import HierarchicalGroupKFold

# CORRECT: respects hierarchy levels
hierarchical_cv = HierarchicalGroupKFold(
    n_splits=5,
    hierarchy_levels=[
        'country', 'hospital', 'patient'
    ]
)
# Properly separates at the chosen level:
# entire hospitals kept together,
# no cross-contamination between groups.
Use DataLeakageChecker and LeakageDetectionCallback to automatically detect leakage during cross-validation
Standalone checker you can use to verify any train/test split for all types of leakage.
from trustcv import DataLeakageChecker

# verbose=True makes the checker print diagnostics as it runs.
checker = DataLeakageChecker(verbose=True)

# A single call screens for every leakage type: patient grouping,
# temporal ordering, and spatial correlation.
report = checker.check(
    X,
    y,
    groups=patient_ids,       # patient grouping
    timestamps=timestamps,    # temporal order
    coordinates=coordinates,  # spatial data
    n_splits=5,
    random_state=42,
)

# Inspect the findings.
print(report.summary)
print(f"Has leakage: {report.has_leakage}")
print(f"Severity: {report.severity}")
print(f"Types: {report.leakage_types}")
Integrate leakage detection directly into your UniversalCVRunner pipeline for automatic fold-by-fold checking.
from trustcv import UniversalCVRunner
from trustcv.core.callbacks import LeakageDetectionCallback

# Build the callback from the metadata that describes your samples.
leakage_cb = LeakageDetectionCallback(
    patient_ids=patient_ids,
    timestamps=timestamps,
    coordinates=spatial_coords,
)

# Attach it to the runner so every fold is screened automatically.
runner = UniversalCVRunner(cv_splitter=your_cv)
results = runner.run(
    model=your_model,
    data=(X, y),
    callbacks=[leakage_cb],
)

# The callback automatically:
# - checks each fold on_fold_start,
# - reports severity and violations,
# - prints a summary on_cv_end.
Quick reference for preventing each type of leakage
When you have multiple samples per patient, always group by patient ID to keep all of a patient's data on the same side of the split.
# Keep every one of a patient's samples on the same side of the split.
cv = GroupKFoldMedical(n_splits=5)
for fold_train, fold_test in cv.split(
    X, y, groups=patient_ids
):
    # Safe from patient leakage
    pass
For time series data, add a purge gap between training and test periods to prevent temporal information from leaking across folds.
# Time series: leave a purge gap between training and test windows.
cv = PurgedKFoldCV(
    purge_gap=30,  # 30-day gap
    n_splits=5,
)
For spatially correlated data, add buffer zones between train and test regions to eliminate spatial autocorrelation.
# Spatial data: buffer zones keep train and test regions apart.
cv = BufferedSpatialCV(
    buffer_size=100,  # 100m buffer
)
In multi-center studies, group by the highest hierarchical level (e.g., hospital) to prevent cross-contamination.
# Multi-center studies: split at the highest hierarchy level.
cv = GroupKFoldMedical(n_splits=5)
for fold_train, fold_test in cv.split(
    X, y, groups=hospital_ids
):
    # Entire hospitals kept together
    pass
Frequently encountered mistakes and how to fix them
| Pitfall | Consequence | Solution |
|---|---|---|
| Using patient's left and right eye images in different sets | Model learns patient-specific features | Group by patient ID |
| Training on 2023 data, testing on 2022 | Evaluates a scenario impossible in real-world deployment | Use PurgedKFoldCV (or TimeSeriesSplit) |
| Adjacent tissue samples in train/test | Spatial correlation inflates performance | Use BufferedSpatialCV |
| Same MRI scanner in all training data | Model learns scanner artifacts | Stratify by scanner |
| Data augmentation before splitting | Augmented versions in both sets | Augment after splitting |