Przy budowie modeli ML występuje problem dobrania najlepszych parametrów. Poniżej dowiemy się, jak dobierać optymalne hiperparametry dla modelu.
# --- Imports and data loading ---------------------------------------------
import numpy as np
import pandas as pd
#import xgboost as xgb
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import matplotlib.pylab as plt
from pylab import plot, show, subplot, specgram, imshow, savefig
from sklearn import preprocessing
#from sklearn import cross_validation, metrics
from sklearn.preprocessing import Normalizer
# FIX: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer from sklearn.impute is the drop-in replacement.
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plote  # NOTE(review): duplicate of plt (likely a typo); unused below
plt.style.use('ggplot')

# Load the bank-marketing dataset (hard-coded local path).
df = pd.read_csv('c:/1/bank.csv')
df.head()
Skalowanie standardowe tylko dla wartości dyskretnych
Wybieram kolumny tekstowe, dyskretne, do głębszej analizy. Lepszy byłby osobny podział na zmienne dyskretne i ciągłe.
# Integer-encode each categorical (text) column with a fresh LabelEncoder
# fitted per column, then preview the encoded values.
encoding_list = ['job', 'marital', 'education', 'default', 'housing', 'loan',
                 'contact', 'month', 'day_of_week', 'poutcome']
for column in encoding_list:
    df[column] = LabelEncoder().fit_transform(df[column])
df[encoding_list].head()
Tworzymy zestaw treningowy i zestaw testowy, budujemy model
# Target vector ('y' column) and feature matrix (all remaining columns).
y = df['y']
X = df.drop(columns=['y'])
Złoty podział zbioru na testowy i treningowy
# Stratified 67/33 train/test split with a fixed seed for reproducibility.
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=148)
wielkości zbiorów
# Report the shapes of the four split partitions.
for label, part in (('Zbiór X treningowy: ', Xtrain),
                    ('Zbiór X testowy: ', Xtest),
                    ('Zbiór y treningowy: ', ytrain),
                    ('Zbiór y testowy: ', ytest)):
    print(label, part.shape)
Dane dyskretne są zdigitalizowane
Xtrain.head(4)
Random Forest Classifier
# Baseline random forest with hand-picked hyperparameters (tuned below).
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

rf_params = {
    'random_state': 1,
    'n_estimators': 750,
    'max_depth': 15,
    'min_samples_split': 5,
    'min_samples_leaf': 1,
}
forestVC = RandomForestClassifier(**rf_params)
modelF = forestVC.fit(Xtrain, ytrain)
y_predF = modelF.predict(Xtest)
Blok oceny jakości modelu Random Forest Classifier
# --- Evaluate the baseline random forest ----------------------------------
# FIX: reuse the predictions already computed above instead of calling
# modelF.predict(Xtest) a second time — the model and inputs are unchanged,
# so the result is identical and the extra pass over the test set is waste.
ypred = y_predF
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

# Confusion matrix: rows = true classes, columns = predicted classes.
co_matrix = metrics.confusion_matrix(ytest, ypred)
co_matrix
print(classification_report(ytest, ypred))
print("Accuracy: ", np.round(metrics.accuracy_score(ytest, ypred), decimals=2))
print("Precision: ", np.round(metrics.precision_score(ytest, ypred), decimals=2))
print("Recall: ", np.round(metrics.recall_score(ytest, ypred), decimals=2))
print("F1 score: ", np.round(metrics.f1_score(ytest, ypred), decimals=2))
Wykresy doboru hiperparametrów za pomocą validation_curve
1. Wybór najlepszego hiperparametru n_estimators
n_estimators : parametr n_estimators określa liczbę drzew w lesie modelu. Domyślna wartość tego parametru to 10, co oznacza, że w losowym lesie zostanie zbudowanych 10 różnych drzew decyzyjnych.
Zostały wybrane następujące liczby dla param_range = [100, 200, 300, 400, 500, 600, 700, 800, 900].
Wykres pokazuje, że od n_estimators = 200 zaczyna się stała wartość.
from sklearn.model_selection import validation_curve


def plot_validation_curve(estimator, param_name, param_range, xlabel,
                          title="Validation Curve With Random Forest", cv=3):
    """Sweep one hyperparameter with validation_curve and plot the result.

    Plots mean train / cross-validation accuracy per parameter value,
    with +/- 1 standard-deviation bands over the CV folds.
    (Extracted because this ~25-line plotting sequence was repeated
    verbatim for every hyperparameter sweep in this notebook.)

    Parameters
    ----------
    estimator : fresh, unfitted classifier to evaluate.
    param_name : name of the hyperparameter to sweep (e.g. 'n_estimators').
    param_range : sequence of values to try.
    xlabel : x-axis caption.
    title : plot title.
    cv : number of cross-validation folds.

    Returns the raw (train_scores, test_scores) arrays.
    """
    # NOTE: uses the module-level Xtrain/ytrain split, as the inline
    # notebook code did.
    train_scores, test_scores = validation_curve(
        estimator, X=Xtrain, y=ytrain,
        param_name=param_name, param_range=param_range, cv=cv)

    # Mean and spread of the fold scores for each parameter value.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Mean accuracy curves plus +/- 1 std bands for train and CV sets.
    plt.plot(param_range, train_mean, label="Training score", color="black")
    plt.plot(param_range, test_mean, label="Cross-validation score", color="red")
    plt.fill_between(param_range, train_mean - train_std,
                     train_mean + train_std, color="gray")
    plt.fill_between(param_range, test_mean - test_std,
                     test_mean + test_std, color="gainsboro")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Accuracy Score")
    plt.tight_layout()
    plt.legend(loc="best")
    plt.show()
    return train_scores, test_scores


# Sweep n_estimators; the CV curve flattens out from roughly 200 trees.
param_range = [100, 200, 300, 400, 500, 600, 700, 800, 900]
train_scoreNum, test_scoreNum = plot_validation_curve(
    RandomForestClassifier(), 'n_estimators', param_range,
    xlabel="Number Of Trees")
2. Wybór najlepszego hiperparametru max_depth
max_depth : Parametr max_depth określa maksymalną głębokość każdego drzewa. Domyślna wartość parametru max_depth to None, co oznacza, że każde drzewo będzie się rozwijać, aż każdy liść będzie czysty. Czysty liść to taki, w którym wszystkie dane na liściu pochodzą z tej samej klasy.
Zostały wybrane następujące liczby dla param_range = [2, 5, 10, 15, 20, 30, 40, 50, 60]. Wykres pokazuje, że od max_depth = 10 jest wysoka zdolność predykcji.
# Sweep max_depth for the random forest and visualise the validation curve.
param_range = [2, 5, 10, 15, 20, 30, 40, 50, 60]
train_scoreNum, test_scoreNum = validation_curve(
    RandomForestClassifier(), X=Xtrain, y=ytrain,
    param_name='max_depth', param_range=param_range, cv=3)

# Per-parameter mean and spread of the cross-validation fold scores.
train_mean, train_std = np.mean(train_scoreNum, axis=1), np.std(train_scoreNum, axis=1)
test_mean, test_std = np.mean(test_scoreNum, axis=1), np.std(test_scoreNum, axis=1)

# Mean accuracy curves for the training and cross-validation sets ...
for curve, label, colour in ((train_mean, "Training score", "black"),
                             (test_mean, "Cross-validation score", "red")):
    plt.plot(param_range, curve, label=label, color=colour)
# ... with +/- 1 standard-deviation bands on top.
for centre, spread, colour in ((train_mean, train_std, "gray"),
                               (test_mean, test_std, "gainsboro")):
    plt.fill_between(param_range, centre - spread, centre + spread, color=colour)

plt.title("Validation Curve With Random Forest")
plt.xlabel("Number Of max_depth")
plt.ylabel("Accuracy Score")
plt.tight_layout()
plt.legend(loc="best")
plt.show()
3. Wybór najlepszego hiperparametru min_samples_split
min_samples_split : parametr min_samples_split określa minimalną liczbę próbek wymaganych do podziału wewnętrznego węzła liścia. Wartością domyślną tego parametru jest 2, co oznacza, że węzeł wewnętrzny musi mieć co najmniej dwie próbki, aby można go było podzielić, aby uzyskać bardziej szczegółową klasyfikację.
Wydaje mi się, że najlepszym parametrem będzie 2
# Sweep min_samples_split for the random forest and plot the curve.
param_range = [2, 3, 4, 5, 6, 7, 8, 9, 11, 15]
train_scoreNum, test_scoreNum = validation_curve(
    RandomForestClassifier(), X=Xtrain, y=ytrain,
    param_name='min_samples_split', param_range=param_range, cv=3)

# Per-parameter mean and spread of the cross-validation fold scores.
train_mean, train_std = np.mean(train_scoreNum, axis=1), np.std(train_scoreNum, axis=1)
test_mean, test_std = np.mean(test_scoreNum, axis=1), np.std(test_scoreNum, axis=1)

# Mean accuracy curves for the training and cross-validation sets ...
for curve, label, colour in ((train_mean, "Training score", "black"),
                             (test_mean, "Cross-validation score", "red")):
    plt.plot(param_range, curve, label=label, color=colour)
# ... with +/- 1 standard-deviation bands on top.
for centre, spread, colour in ((train_mean, train_std, "gray"),
                               (test_mean, test_std, "gainsboro")):
    plt.fill_between(param_range, centre - spread, centre + spread, color=colour)

plt.title("Validation Curve With Random Forest")
plt.xlabel("Number Of min_samples_split")
plt.ylabel("Accuracy Score")
plt.tight_layout()
plt.legend(loc="best")
plt.show()
4. Wybór najlepszego hiperparametru min_samples_leaf
min_samples_leaf: parametr min_samples_leaf określa minimalną liczbę próbek wymaganych w węźle liścia. Wartością domyślną tego parametru jest 1, co oznacza, że każdy liść musi mieć co najmniej 1 próbkę, którą klasyfikuje.
Według mnie najlepsze jest min_samples_leaf = 1
# Sweep min_samples_leaf for the random forest and plot the curve.
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 15]
train_scoreNum, test_scoreNum = validation_curve(
    RandomForestClassifier(), X=Xtrain, y=ytrain,
    param_name='min_samples_leaf', param_range=param_range, cv=3)

# Per-parameter mean and spread of the cross-validation fold scores.
train_mean, train_std = np.mean(train_scoreNum, axis=1), np.std(train_scoreNum, axis=1)
test_mean, test_std = np.mean(test_scoreNum, axis=1), np.std(test_scoreNum, axis=1)

# Mean accuracy curves for the training and cross-validation sets ...
for curve, label, colour in ((train_mean, "Training score", "black"),
                             (test_mean, "Cross-validation score", "red")):
    plt.plot(param_range, curve, label=label, color=colour)
# ... with +/- 1 standard-deviation bands on top.
for centre, spread, colour in ((train_mean, train_std, "gray"),
                               (test_mean, test_std, "gainsboro")):
    plt.fill_between(param_range, centre - spread, centre + spread, color=colour)

plt.title("Validation Curve With Random Forest")
plt.xlabel("Number Of min_samples_leaf")
plt.ylabel("Accuracy Score")
plt.tight_layout()
plt.legend(loc="best")
plt.show()
Model Random Forest Classifier z nowymi parametrami
random_state = 1, n_estimators = 200, max_depth = 10, min_samples_split = 2, min_samples_leaf = 1
# --- Random forest retrained with the tuned hyperparameters ---------------
forestVCG = RandomForestClassifier(random_state=1,
                                   n_estimators=200,
                                   max_depth=10,
                                   min_samples_split=2,
                                   min_samples_leaf=1)
modelF = forestVCG.fit(Xtrain, ytrain)
# FIX: predict the test set once. The original called predict(Xtest) twice
# (into y_predF and again into ypred); the model and inputs are identical,
# so the second call only duplicated work.
y_predF = modelF.predict(Xtest)
ypred = y_predF
# Confusion matrix and the standard classification metrics, rounded to 2 dp.
co_matrix = metrics.confusion_matrix(ytest, ypred)
co_matrix
print(classification_report(ytest, ypred))
print("Accuracy: ", np.round(metrics.accuracy_score(ytest, ypred), decimals=2))
print("Precision: ", np.round(metrics.precision_score(ytest, ypred), decimals=2))
print("Recall: ", np.round(metrics.recall_score(ytest, ypred), decimals=2))
print("F1 score: ", np.round(metrics.f1_score(ytest, ypred), decimals=2))
Model Random Forest Classifier z nowymi parametrami
- Accuracy: 0.91
- Precision: 0.7
- Recall: 0.39
- F1 score: 0.5
Stare parametry
- Accuracy: 0.91
- Precision: 0.66
- Recall: 0.48
- F1 score: 0.56
Logistic Regression
# Logistic regression: grid-search the regularisation strength C over six
# powers of ten (10^-3 .. 10^2), scored by ROC AUC with 2-fold CV.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = {'C': np.power(10.0, np.arange(-3, 3))}
LR = LogisticRegression(warm_start=True)
LR_Grid = GridSearchCV(LR, param_grid=param_grid, scoring='roc_auc',
                       n_jobs=5, cv=2)
LR_Grid.fit(Xtrain, ytrain)
# Predict the test set with the best estimator found by the grid search.
ypred = LR_Grid.predict(Xtest)

## confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

co_matrix = metrics.confusion_matrix(ytest, ypred)
co_matrix
print(classification_report(ytest, ypred))
print("Accuracy: ", np.round(metrics.accuracy_score(ytest, ypred), decimals=2))
print("Precision: ", np.round(metrics.precision_score(ytest, ypred), decimals=2))
print("Recall: ", np.round(metrics.recall_score(ytest, ypred), decimals=2))
print("F1 score: ", np.round(metrics.f1_score(ytest, ypred), decimals=2))
Wykresy doboru hiperparametrów dla Logistic Regression za pomocą validation_curve
# Validation curve for LogisticRegression over the regularisation strength C.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(LogisticRegression(), X=Xtrain, y=ytrain,
                                             param_name='C', param_range=param_range, cv=10)

# BUG FIX: the statistics below previously read train_scoreNum/test_scoreNum,
# stale arrays left over from the random-forest sweeps above — so the plot
# showed random-forest results, not the logistic-regression curve computed
# on the line above. Use the freshly computed train_scores/test_scores.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Mean accuracy curves plus +/- 1 standard-deviation bands.
plt.plot(param_range, train_mean, label="Training score", color="black")
plt.plot(param_range, test_mean, label="Cross-validation score", color="red")
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, color="gray")
plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, color="gainsboro")

# C spans five orders of magnitude, so a logarithmic x-axis keeps the small
# values readable instead of squashing them against the y-axis.
plt.xscale('log')
# BUG FIX: title and x-label were copy-pasted from the random-forest section.
plt.title("Validation Curve With Logistic Regression")
plt.xlabel("C (inverse regularization strength)")
plt.ylabel("Accuracy Score")
plt.tight_layout()
plt.legend(loc="best")
plt.show()