Evidence-based guidelines for rigorous, reproducible, and regulation-ready cross-validation in clinical machine learning projects.
The most important do's and don'ts at a glance.
Follow this decision tree to choose the right cross-validation strategy.
Side-by-side comparison of wrong and correct approaches.
1. Data Leakage — Preprocessing Before Splitting
# --- WRONG: fit_transform on the full matrix lets test-set statistics
# (mean/std) influence the scaled training data.
# Preprocessing before splitting - LEAKAGE!
X_scaled = StandardScaler().fit_transform(X)
X_train, X_test = train_test_split(X_scaled)
# --- RIGHT: fit the scaler on the training split only, then apply the
# frozen transform to the test split.
# Split first, then preprocess
X_train, X_test = train_test_split(X)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) # Only transform!
2. Patient Data Mixing
# --- WRONG: plain KFold can place different samples from the same patient
# into both train and test folds, inflating performance estimates.
# Regular k-fold ignores patient grouping
kf = KFold(n_splits=5)
for train, test in kf.split(X):
# Same patient can be in both sets!
# --- RIGHT: group-aware splitting keeps every sample of a patient in a
# single fold. NOTE(review): presumably mirrors sklearn's GroupKFold —
# confirm against the trustcv docs.
from trustcv.splitters import GroupKFold
pgkf = GroupKFold(n_splits=5)
for train, test in pgkf.split(X, groups=patient_ids):
# Patient data stays together
3. Ignoring Class Imbalance
# --- WRONG (for unstratified splitters): a rare class can vanish from
# some folds, making per-fold metrics undefined or wildly unstable.
# NOTE(review): sklearn's cross_val_score already uses StratifiedKFold
# when cv is an int and the estimator is a classifier — the failure mode
# shown here applies to plain KFold / regression-style splitting; verify
# the intended framing.
# With 95% negative, 5% positive cases
cv_scores = cross_val_score(model, X, y, cv=5)
# Some folds might have NO positive cases!
# --- RIGHT: explicit stratification preserves the class ratio per fold.
skf = StratifiedKFold(n_splits=5)
cv_scores = cross_val_score(model, X, y, cv=skf)
# Each fold maintains 95/5 ratio
4. Temporal Leakage
# --- WRONG: a random split shuffles time, so the model trains on samples
# that occur after its test samples.
# Random splitting of time-series data
X_train, X_test = train_test_split(
temporal_data, random_state=42
)
# Future data leaks into training!
# --- RIGHT: forward-chaining splits always evaluate on data strictly
# later than the training window.
# NOTE(review): unlike sklearn's TimeSeriesSplit, this variant takes an
# explicit timestamps= argument — confirm in the trustcv docs.
from trustcv.splitters import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
for train, test in tscv.split(X, timestamps=dates):
# Always train on past, test on future
Key factors unique to clinical and biomedical machine learning.
Rule of thumb: before choosing the number of folds, check that each class retains a minimum number of samples per fold — the helper below warns when the minority class falls short:
def check_sample_size(y, n_splits=5):
    """Warn when the minority class is too small for ``n_splits``-fold CV.

    Parameters
    ----------
    y : array-like
        Class labels, one per sample.
    n_splits : int, optional
        Number of cross-validation folds (default 5).

    Returns
    -------
    int
        Approximate minority-class samples per fold
        (smallest class size // ``n_splits``).

    Raises
    ------
    ValueError
        If ``y`` is empty (explicit message instead of numpy's opaque
        "zero-size array to reduction operation" error).
    """
    unique, counts = np.unique(y, return_counts=True)
    if counts.size == 0:
        raise ValueError("y must contain at least one sample")
    min_class_size = int(counts.min())
    samples_per_fold = min_class_size // n_splits
    # Rule of thumb: <30 minority samples per fold gives unstable fold
    # metrics; 0 means some folds would contain no minority samples at all.
    if samples_per_fold < 30:
        warnings.warn(
            f"Only {samples_per_fold} samples/fold "
            "for minority class. Consider fewer "
            "splits or different method.",
            stacklevel=2,  # point the warning at the caller, not this helper
        )
    return samples_per_fold
When data comes from multiple hospitals or sites, ensure site effects do not bias results by splitting at the site level.
# Multi-site data: hold out whole sites so the model is evaluated on
# hospitals it never saw during training, not just unseen patients.
from trustcv.splitters import HierarchicalGroupKFold
# Ensure site effects don't bias results
hgkf = HierarchicalGroupKFold(
n_splits=5,
hierarchy_level='site' # Split by site
)
# Prevents overfitting to site-specific patterns
For repeated measurements over time, you must respect both patient grouping and temporal ordering.
# Longitudinal data: combine patient grouping with temporal ordering so a
# patient's later visits never train a model tested on earlier ones.
# NOTE(review): exact semantics of 'grouped_temporal' — confirm in the
# trustcv documentation.
from trustcv import TrustCVValidator
validator = TrustCVValidator(
method='grouped_temporal',
patient_grouping=True,
temporal_ordering=True
)
# Each patient's visits stay together
# and temporal order is preserved
For extremely imbalanced datasets (<1% positive), use fewer folds and apply oversampling only on training data.
# Severe imbalance: fewer folds keep enough positives per fold, and SMOTE
# is fit inside the loop on the training split only — oversampling before
# splitting would leak synthetic copies of test samples into training.
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3) # Fewer splits
for train_idx, test_idx in skf.split(X, y):
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]
# SMOTE on training data only
smote = SMOTE(random_state=42)
X_bal, y_bal = smote.fit_resample(
X_train, y_train
)
# NOTE(review): `model` should be (re)constructed per fold — reusing a
# model fitted in a previous fold can carry over learned state.
model.fit(X_bal, y_bal)
score = model.score(X_test, y_test)
A full best-practice pipeline combining leakage checks, preprocessing, medical-aware validation, and regulatory reporting.
# End-to-end example: leakage check -> leak-safe Pipeline -> medical-aware
# grouped/stratified CV -> compliance-oriented reporting.
from trustcv import TrustCVValidator
from trustcv.checkers import DataLeakageChecker
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# 1. Check for data leakage
# NOTE(review): X_train/X_test and the patient-id arrays are assumed to
# come from an earlier split that is not shown in this snippet.
checker = DataLeakageChecker()
leakage_report = checker.check_cv_splits(
X_train, X_test,
patient_ids_train, patient_ids_test
)
if leakage_report.has_leakage:
raise ValueError(f"Data leakage detected: {leakage_report}")
# 2. Create preprocessing pipeline
# Keeping the scaler inside the Pipeline means it is re-fit on each
# training fold only — no test-fold statistics can leak in.
pipeline = Pipeline([
('scaler', StandardScaler()),
('classifier', RandomForestClassifier(n_estimators=100))
])
# 3. Set up medical-aware validation
# NOTE(review): compliance='FDA' presumably tailors the report for
# regulatory submission — confirm supported values in the trustcv docs.
validator = TrustCVValidator(
method='stratified_group_kfold',
n_splits=5,
check_leakage=True,
check_balance=True,
compliance='FDA'
)
# 4. Perform validation
results = validator.validate(
model=pipeline, X=X, y=y,
groups=patient_ids
)
# 5. Get comprehensive results
print(results.summary())
# 6. Export results
print(results.to_dict())
Click each item to mark it as done. Verify all items before training any model.