Note
Go to the end to download the full example code.
Porto Seguro: balancing samples in mini-batches with Keras#
This example compares two strategies to train a neural-network on the Porto Seguro Kaggle data set [1]. The data set is imbalanced and we show that balancing each mini-batch allows to improve performance and reduce the training time.
References#
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT
print(__doc__)
Data loading#
from collections import Counter
import numpy as np
import pandas as pd
First, you should download the Porto Seguro data set from Kaggle. See the link in the introduction.
training_data = pd.read_csv("./input/train.csv")
testing_data = pd.read_csv("./input/test.csv")
y_train = training_data[["id", "target"]].set_index("id")
X_train = training_data.drop(["target"], axis=1).set_index("id")
X_test = testing_data.set_index("id")
The data set is imbalanced and it will have an effect on the fitting.
print(f"The data set is imbalanced: {Counter(y_train['target'])}")
Define the pre-processing pipeline#
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
def convert_float64(X):
return X.astype(np.float64)
We want to standard scale the numerical features while we want to one-hot
encode the categorical features. In this regard, we make use of the
ColumnTransformer
.
numerical_columns = [
name for name in X_train.columns if "_calc_" in name and "_bin" not in name
]
numerical_pipeline = make_pipeline(
FunctionTransformer(func=convert_float64, validate=False), StandardScaler()
)
categorical_columns = [name for name in X_train.columns if "_cat" in name]
categorical_pipeline = make_pipeline(
SimpleImputer(missing_values=-1, strategy="most_frequent"),
OneHotEncoder(categories="auto"),
)
preprocessor = ColumnTransformer(
[
("numerical_preprocessing", numerical_pipeline, numerical_columns),
(
"categorical_preprocessing",
categorical_pipeline,
categorical_columns,
),
],
remainder="drop",
)
# Create an environment variable to avoid using the GPU. This can be changed.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
from tensorflow.keras.layers import Activation, BatchNormalization, Dense, Dropout
Create a neural-network#
from tensorflow.keras.models import Sequential
def make_model(n_features):
model = Sequential()
model.add(Dense(200, input_shape=(n_features,), kernel_initializer="glorot_normal"))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(Dropout(0.5))
model.add(Dense(100, kernel_initializer="glorot_normal", use_bias=False))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(Dropout(0.25))
model.add(Dense(50, kernel_initializer="glorot_normal", use_bias=False))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(Dropout(0.15))
model.add(Dense(25, kernel_initializer="glorot_normal", use_bias=False))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(Dropout(0.1))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
return model
We create a decorator to report the computation time
The first model will be trained using the fit
method and with imbalanced
mini-batches.
import tensorflow
from sklearn.metrics import roc_auc_score
from sklearn.utils.fixes import parse_version
tf_version = parse_version(tensorflow.__version__)
@timeit
def fit_predict_imbalanced_model(X_train, y_train, X_test, y_test):
model = make_model(X_train.shape[1])
model.fit(X_train, y_train, epochs=2, verbose=1, batch_size=1000)
if tf_version < parse_version("2.6"):
# predict_proba was removed in tensorflow 2.6
predict_method = "predict_proba"
else:
predict_method = "predict"
y_pred = getattr(model, predict_method)(X_test, batch_size=1000)
return roc_auc_score(y_test, y_pred)
In the contrary, we will use imbalanced-learn to create a generator of mini-batches which will yield balanced mini-batches.
from imblearn.keras import BalancedBatchGenerator
@timeit
def fit_predict_balanced_model(X_train, y_train, X_test, y_test):
model = make_model(X_train.shape[1])
training_generator = BalancedBatchGenerator(
X_train, y_train, batch_size=1000, random_state=42
)
model.fit(training_generator, epochs=5, verbose=1)
y_pred = model.predict(X_test, batch_size=1000)
return roc_auc_score(y_test, y_pred)
Classification loop#
We will perform a 10-fold cross-validation and train the neural-network with the two different strategies previously presented.
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10)
cv_results_imbalanced = []
cv_time_imbalanced = []
cv_results_balanced = []
cv_time_balanced = []
for train_idx, valid_idx in skf.split(X_train, y_train):
X_local_train = preprocessor.fit_transform(X_train.iloc[train_idx])
y_local_train = y_train.iloc[train_idx].values.ravel()
X_local_test = preprocessor.transform(X_train.iloc[valid_idx])
y_local_test = y_train.iloc[valid_idx].values.ravel()
elapsed_time, roc_auc = fit_predict_imbalanced_model(
X_local_train, y_local_train, X_local_test, y_local_test
)
cv_time_imbalanced.append(elapsed_time)
cv_results_imbalanced.append(roc_auc)
elapsed_time, roc_auc = fit_predict_balanced_model(
X_local_train, y_local_train, X_local_test, y_local_test
)
cv_time_balanced.append(elapsed_time)
cv_results_balanced.append(roc_auc)
Plot of the results and computation time#
df_results = pd.DataFrame(
{
"Balanced model": cv_results_balanced,
"Imbalanced model": cv_results_imbalanced,
}
)
df_results = df_results.unstack().reset_index()
df_time = pd.DataFrame(
{"Balanced model": cv_time_balanced, "Imbalanced model": cv_time_imbalanced}
)
df_time = df_time.unstack().reset_index()
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure()
sns.boxplot(y="level_0", x=0, data=df_time)
sns.despine(top=True, right=True, left=True)
plt.xlabel("time [s]")
plt.ylabel("")
plt.title("Computation time difference using a random under-sampling")
plt.figure()
sns.boxplot(y="level_0", x=0, data=df_results, whis=10.0)
sns.despine(top=True, right=True, left=True)
ax = plt.gca()
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, pos: "%i%%" % (100 * x)))
plt.xlabel("ROC-AUC")
plt.ylabel("")
plt.title("Difference in terms of ROC-AUC using a random under-sampling")
Estimated memory usage: 0 MB