Cross-Validation Techniques for Machine Learning: A Guide to Improve Model Performance

12 min read · Jan 27, 2023


Instead of relying on a single train-test split, cross-validation evaluates a model on several different splits of the data, giving a more reliable picture of how it will perform on unseen data. This guide walks through the most common schemes using scikit-learn and XGBoost.

Train-test split. Image by the author.
Diagram of k-fold cross-validation. Source: Wikipedia

Hold Out Method

The hold-out method is the simplest validation strategy: split the data once into a training set and a test set, fit the model on the training portion, and measure performance on the held-out portion.

import pandas as pd
from sklearn.datasets import load_boston

# Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# running this snippet requires scikit-learn < 1.2.
boston = load_boston()
df = pd.DataFrame(data=boston.data, columns=boston.feature_names)
df["Target"] = boston.target
df.head()
print(df.shape)  # (506, 14)
df.head(). Image by the author.
X = boston.data
y = boston.target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape) #(404, 13)
print(X_test.shape) #(102, 13)
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error
estimator = xgb.XGBRegressor(objective='reg:squarederror', seed=10)
estimator.fit(X_train, y_train)

# Predict on the test set
y_pred = estimator.predict(X_test)

# RMSE Evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: % f" %(rmse))
#RMSE: 2.561353
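
How trustworthy is this single number? A minimal sketch (not from the original article; the random_state values are arbitrary) shows that the hold-out estimate shifts with the particular split, which is the main motivation for cross-validation:

for rs in [0, 1, 42, 123]:
    # Each random_state produces a different train/test partition
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=rs)
    est = xgb.XGBRegressor(objective='reg:squarederror', seed=10)
    est.fit(X_tr, y_tr)
    rmse = np.sqrt(mean_squared_error(y_te, est.predict(X_te)))
    print(f"random_state={rs}: RMSE={rmse:.3f}")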

K-Fold

K-Fold cross-validation splits the data into k equal folds. Each fold serves as the test set exactly once while the remaining k-1 folds form the training set, and the k scores are averaged into a single estimate.

K-Fold. Source: Scikit-learn
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)

print(kf)
#KFold(n_splits=5, random_state=None, shuffle=False)
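
Note that shuffle=False means each fold is a contiguous block of rows. If the rows have any ordering, a shuffled variant (a small sketch, not part of the original code) is usually safer:

# Shuffle indices before splitting; random_state makes it reproducible
kf_shuffled = KFold(n_splits=5, shuffle=True, random_state=42)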

#Generate indices to split data into training and test set.
fold_generator = kf.split(X)
print(fold_generator)
#<generator object _BaseKFold.split at 0x7f96e85e2dd0>

for i, (train_index, test_index) in enumerate(fold_generator):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

"""
Fold 0:
Train: index=[102 103 104 105 106.....
Test: index=[ 0 1 2 3 4 5 ....
Fold 1:
Train: index=[ 0 1 2 3 4 5 6 ..
....
"""
import matplotlib.pyplot as plt

fig = plt.figure(figsize = (12, 5))
colors = ["orange","red","blue","gray","black"]
counter = 0
for i, (train_index, test_index) in enumerate(kf.split(X)):
    plt.scatter(train_index, np.full_like(train_index, counter + 1), c=colors[i])
    plt.scatter(test_index, np.full_like(test_index, counter), c=colors[i])
    counter += 2.5
plt.show()
Folds. Image by the author.
import xgboost as xgb
estimator = xgb.XGBRegressor(seed=10)

scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    estimator.fit(X_train, y_train)
    scores.append(estimator.score(X_test, y_test))  # returns R²

# scores
print(scores) #[0.7343818374985939, 0.8490298604014259, 0.8257969178459365, 0.5237461839046007, 0.29743000915088447]
print(np.mean(scores)) #0.6460769617602883
from sklearn.model_selection import cross_val_score
result = cross_val_score(estimator, X, y, cv=kf, scoring="r2")
print(result)
#[0.73438184 0.84902986 0.82579692 0.52374618 0.29743001]
from sklearn.model_selection import GridSearchCV

model = xgb.XGBRegressor()

# hyperparameter grid
param_grid = {'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.2, 0.3]}

grid_search = GridSearchCV(model, param_grid, cv=kf)
grid_search.fit(X, y)
print(grid_search.best_params_)
#{'learning_rate': 0.1, 'max_depth': 3}
import pandas as pd
df = pd.DataFrame(grid_search.cv_results_)
print(df)
CV Scores. Image by the author.
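
cv_results_ holds a column for every timing and score statistic; as a quick sketch (using the df built above), three columns usually answer the question of which configuration won:

# Show each candidate's parameters, mean CV score, and rank
print(df[["params", "mean_test_score", "rank_test_score"]].sort_values("rank_test_score"))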

Stratified K-Fold

Stratified K-Fold preserves the class distribution of the target in every fold, which is essential for imbalanced classification problems such as credit card fraud detection.

Stratified K-Fold. Image by the author.
import pandas as pd
data = pd.read_csv("creditcard.csv")
data.head()
Dataframe. Image by the author.
import matplotlib.pyplot as plt
data.dropna(inplace=True)
counts = data["Class"].value_counts()
plt.bar(counts.index, counts.values)
plt.xlabel("Target Value")
plt.ylabel("Count")
plt.xticks([0, 1])
plt.show()
print(counts)
"""
0 284315
1 492
Name: Class, dtype: int64
"""
Distribution in a world where the good win. Image by the author.
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

X = data.drop(['Class'], axis=1)
y = data['Class']

skf = StratifiedKFold(n_splits=5)
estimator = xgb.XGBClassifier(seed=10)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    print(f"Fold Distribution of Train: {np.bincount(y_train)} - Test: {np.bincount(y_test)}")

"""
Fold Distribution of Train: [227452 393] - Test: [56863 99]
Fold Distribution of Train: [227452 393] - Test: [56863 99]
Fold Distribution of Train: [227452 394] - Test: [56863 98]
Fold Distribution of Train: [227452 394] - Test: [56863 98]
Fold Distribution of Train: [227452 394] - Test: [56863 98]
"""
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report

X = data.drop(['Class'], axis=1)
y = data['Class']

skf = StratifiedKFold(n_splits=5)
estimator = xgb.XGBClassifier(seed=10)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    print(classification_report(y_test, y_pred))

"""
precision recall f1-score support

0 1.00 0.97 0.99 56863
1 0.06 0.96 0.11 99

accuracy 0.97 56962
macro avg 0.53 0.97 0.55 56962
weighted avg 1.00 0.97 0.98 56962

precision recall f1-score support

0 1.00 1.00 1.00 56863
1 0.96 0.76 0.85 99

accuracy 1.00 56962
macro avg 0.98 0.88 0.92 56962
weighted avg 1.00 1.00 1.00 56962

precision recall f1-score support
.
.
.
"""
from sklearn.model_selection import cross_val_score
estimator = xgb.XGBClassifier(seed=10)
result = cross_val_score(estimator, X, y, cv=skf, scoring="accuracy")
print(result)

#[0.97180577 0.999526 0.99899932 0.99963133 0.99943821]
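
With 99.8% of transactions legitimate, accuracy is high almost by construction. A minority-class metric is more informative here; for example (a sketch, not in the original article):

# F1 on the positive (fraud) class exposes what accuracy hides
result_f1 = cross_val_score(estimator, X, y, cv=skf, scoring="f1")
print(result_f1)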
from sklearn.model_selection import GridSearchCV
estimator = xgb.XGBClassifier(seed=10)
skf = StratifiedKFold(n_splits=5)
# hyperparameter grid
param_grid = {'max_depth': [3, 5], 'learning_rate': [0.1, 0.2]}
grid_search = GridSearchCV(estimator, param_grid, cv=skf)
grid_search.fit(X, y)
print(grid_search.best_params_)
#{'learning_rate': 0.2, 'max_depth': 5}

import pandas as pd
df = pd.DataFrame(grid_search.cv_results_)
print(df)
CV Scores. Image by the author.

Leave One Out

Leave One Out (LOO) is K-Fold pushed to the extreme: every test set contains exactly one sample, so the model is trained n times on n-1 samples each.

Leave one out. Image by the author.
# Let's go back to the Boston dataset
X = boston.data
y = boston.target

from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
fold_generator = loo.split(X)
for i, (train_index, test_index) in enumerate(fold_generator):
    print(f"Fold {i}:")
    print(f"  Train: size={len(train_index)}")
    print(f"  Test:  size={len(test_index)}")

"""
Fold 0:
  Train: size=505
  Test:  size=1
Fold 1:
  Train: size=505
  Test:  size=1
.
.
.
"""

Monte Carlo Cross Validation

Monte Carlo cross-validation (also known as repeated random sub-sampling, available in scikit-learn as ShuffleSplit) draws a fresh random train/test split on every iteration, so the test sets of different iterations may overlap.

Dubitzky, Werner, Martin Granzow, and Daniel P. Berrar, eds. Fundamentals of data mining in genomics and proteomics. Springer Science & Business Media, 2007.
from sklearn.model_selection import ShuffleSplit

shuffle = ShuffleSplit(train_size=0.8, test_size=0.2, n_splits=5)
for i, (train_index, test_index) in enumerate(shuffle.split(X)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

"""
Fold 0:
Train: index=[154 367 362 336 86 442 282 ...
Test: index=[251 14 238 107 484 ...
Fold 1:
Train: index=[250 300 11 104 437 80 ...
Test: index=[261 232 382 376 18 468 ...
.
.
.
"""

Bootstrapping

Bootstrapping samples the dataset with replacement, so the same observation can appear several times within one training sample.

import numpy as np
from sklearn.utils import resample

# the number of bootstrap samples to draw
n_iter = 100

# train test split percentage
percentage = 0.8
for i in range(n_iter):
    X_train, y_train = resample(X, y, replace=True, n_samples=int(X.shape[0] * percentage))
    X_test, y_test = resample(X, y, replace=True, n_samples=int(X.shape[0] * (1 - percentage)))
    print(f"Shapes: X_train: {X_train.shape} y_train: {y_train.shape} X_test: {X_test.shape} y_test: {y_test.shape}")

"""
Shapes: X_train: (404, 13) y_train: (404,) X_test: (101, 13) y_test: (101,)
Shapes: X_train: (404, 13) y_train: (404,) X_test: (101, 13) y_test: (101,)
Shapes: X_train: (404, 13) y_train: (404,) X_test: (101, 13) y_test: (101,)
Shapes: X_train: (404, 13) y_train: (404,) X_test: (101, 13) y_test: (101,)
Shapes: X_train: (404, 13) y_train: (404,) X_test: (101, 13) y_test: (101,)
Shapes: X_train: (404, 13) y_train: (404,) X_test: (101, 13) y_test: (101,)
"""

Conclusion

No single validation scheme fits every problem. A plain train-test split is fast but noisy; K-Fold is a solid default for most datasets; Stratified K-Fold is the right choice for imbalanced classification; Leave One Out extracts the most from small datasets at a high computational cost; and ShuffleSplit and bootstrapping trade exhaustive coverage for flexible, repeated random sampling.

