090520201852
Someone recently told me that I do not write enough, so I will write:
It is very nice when we have an AUC of 85%. It is especially nice when the test AUC and the train AUC are at the same level.
Unfortunately, this model belongs in the trash and needs some improvement. It was supposed to find who had a stroke, and it said that no one had a stroke. Because only 1-2% of people had a stroke, the model is right about 98% of the time. This is a trap: what you care about are the strokes, not the overall accuracy. So I did oversampling to balance the 0 and 1 populations. After oversampling, there were as many people with a stroke as without, and the break point (red) on the ROC curve changed. Unfortunately, something got lost when creating the meta variables on the second level of stacking. So I started experimenting with threshold settings. I invite you to follow my efforts.
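To see the trap in numbers, here is a minimal sketch (the 98/2 split below is my assumption, matching the proportions mentioned above): a classifier that always answers "no stroke" scores 98% accuracy and 0% recall.

import numpy as np
from sklearn.metrics import accuracy_score, recall_score
y_true = np.array([0] * 98 + [1] * 2)   # 2% positives, roughly as in the stroke data
y_pred = np.zeros_like(y_true)          # a "model" that always predicts "no stroke"
print(accuracy_score(y_true, y_pred))   # 0.98 - looks great
print(recall_score(y_true, y_pred))     # 0.0  - misses every single stroke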
I got so hooked that I started contributing to Stack Overflow!
# https://github.com/dawidkopczyk/blog/blob/master/stacking.py
# Classification Assessment
def Classification_Assessment(model ,Xtrain, ytrain, Xtest, ytest):
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report, accuracy_score
import scikitplot as skplt
from plot_metric.functions import BinaryClassification
from sklearn.metrics import precision_recall_curve
print("Recall Training data: ", np.round(recall_score(ytrain, model.predict(Xtrain)), decimals=4))
print("Precision Training data: ", np.round(precision_score(ytrain, model.predict(Xtrain)), decimals=4))
print("----------------------------------------------------------------------")
print("Recall Test data: ", np.round(recall_score(ytest, model.predict(Xtest)), decimals=4))
print("Precision Test data: ", np.round(precision_score(ytest, model.predict(Xtest)), decimals=4))
print("----------------------------------------------------------------------")
print("Confusion Matrix Test data")
print(confusion_matrix(ytest, model.predict(Xtest)))
print("----------------------------------------------------------------------")
print('Evaluation for test data only:')
print(classification_report(ytest, model.predict(Xtest)))
## ----------AUC-----------------------------------------
print('---------------------')
AUC_train_1 = metrics.roc_auc_score(ytrain,model.predict_proba(Xtrain)[:,1])
print('AUC_train: ', np.round(AUC_train_1, decimals=4))
AUC_test_1 = metrics.roc_auc_score(ytest,model.predict_proba(Xtest)[:,1])
print('AUC_test: ', np.round(AUC_test_1, decimals=4))
print('---------------------')
print("Accuracy Training data: ", np.round(accuracy_score(ytrain, model.predict(Xtrain)), decimals=4))
print("Accuracy Test data: ", np.round(accuracy_score(ytest, model.predict(Xtest)), decimals=4))
print("----------------------------------------------------------------------")
print('Evaluation for test data only:')
y_probas1 = model.predict_proba(Xtest)[:,1]
y_probas2 = model.predict_proba(Xtest)
### ---plot_roc_curve--------------------------------------------------------
plt.figure(figsize=(13,4))
plt.subplot(1, 2, 1)
bc = BinaryClassification(ytest, y_probas1, labels=["Class 1", "Class 2"])
bc.plot_roc_curve()
### --------precision_recall_curve------------------------------------------
plt.subplot(1, 2, 2)
precision, recall, thresholds = precision_recall_curve(ytest, y_probas1)
plt.plot(recall, precision, marker='.', label=model)
plt.title('Precision recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc=(-0.30, -0.6))
plt.show()
## ----------plot_roc-----------------------------------------
skplt.metrics.plot_roc(ytest, y_probas2)
# General
import numpy as np
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
# Utilities
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from copy import copy as make_copy
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from plot_metric.functions import BinaryClassification
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
import warnings
SEED = 2018
warnings.filterwarnings("ignore")
import pandas as pd
df = pd.read_csv('/home/wojciech/Pulpit/1/Stroke_Prediction.csv')
print(df.shape)
df.head(2)
import numpy as np
a,b = df.shape #<- how many columns we have
b
print('DISCRETE FUNCTIONS CODED')
print('------------------------')
for i in range(1,b):
i = df.columns[i]
f = df[i].dtypes
if f == object:
print(i,"---",f)
if f == object:
df[i] = pd.Categorical(df[i]).codes
continue
del df['ID']
df = df.dropna(how='any')
df.isnull().sum()
df.shape
y = df['Stroke']
X = df.drop('Stroke', axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123,stratify=y)
# If it throws an error, remove stratify=y.
y_train.value_counts(dropna = False, normalize=True).plot(kind='pie',title='Before oversampling')
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values
Define Base (level 0) and Stacking (level 1) estimators
base_clf = [LogisticRegression(), RandomForestClassifier(), ### which models I want to train
AdaBoostClassifier(), GaussianNB()]
stck_clf = LogisticRegression() ### stacking is done with LogisticRegression
#stck_clf = RandomForestClassifier()
Evaluate Base estimators separately
## Preliminary assessment of the base estimators (models)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
for t in base_clf:
# Set seed
if 'random_state' in t.get_params().keys(): # check the key hyperparameters of the models I want to train
t.set_params(random_state=SEED) ## set the seed; the remaining parameters stay at their BASIC, default, factory values!!
## this means that if I pass a special hyperparameter to the model, it will be taken into account
# Fit model
t.fit(X_train, y_train) # fit the next model from the loop
# Predict
y_pred = t.predict(X_test) # prediction of the next model from the loop
# Evaluation
acc = accuracy_score(y_test, y_pred)
#pre = precision_score(y_test, y_pred,average = 'macro')
#auc = roc_auc_score(y_test, y_pred)
print('{} accuracy: {:.2f}'.format(t.__class__.__name__, acc))
plt.figure(figsize=(7,3))
y_probas1 = t.predict_proba(X_test)[:,1]
bc= BinaryClassification(y_test, y_probas1, labels=[t.__class__.__name__]).plot_roc_curve()
plt.show()
AUC_train_1 = metrics.roc_auc_score(y_train,t.predict_proba(X_train)[:,1])
print('AUC_train: ', np.round(AUC_train_1, decimals=4))
AUC_test_1 = metrics.roc_auc_score(y_test,t.predict_proba(X_test)[:,1])
print('AUC_test: ', np.round(AUC_test_1, decimals=4))
print(classification_report(y_test, t.predict(X_test)))
print('===============================================================')
Create Hold Out predictions (meta-features)
def hold_out_predict(clf, X, y, cv):
"""Performing cross validation hold out predictions for stacking"""
# DETERMINE THE DIMENSIONS
n_classes = len(np.unique(y)) # checks what the classes are: len(np.unique(y)) = 2
meta_features = np.zeros((X.shape[0], n_classes)) ## BUILDS THE SKELETON OF THE META-FEATURE MATRIX
# builds a matrix with one row per sample and 2 COLUMNS,
# filled with zeros for now
n_splits = cv.get_n_splits(X, y) # returns the number of splitting iterations in the cross-validator
# Loop over folds
print("Starting hold out prediction with {} splits for {}.".format(n_splits, clf.__class__.__name__))
for train_idx, hold_out_idx in cv.split(X, y):
# Split data
X_train = X[train_idx] # overwrites X_train inside the loop
y_train = y[train_idx] # overwrites y_train inside the loop
X_hold_out = X[hold_out_idx]
# Fit estimator to K-1 parts and predict on hold out part
est = make_copy(clf)
est.fit(X_train, y_train)
y_hold_out_pred = est.predict_proba(X_hold_out)
# Fill in meta features
meta_features[hold_out_idx] = y_hold_out_pred
return meta_features # the meta-feature matrix: one row per sample, 2 columns of hold-out probabilities
Create meta-features for training data
# Define 6-fold CV ## any number of folds can be used
cv = KFold(n_splits=6, shuffle=True, random_state=SEED) ## set the number of cross-validation splits (shuffle is needed for random_state to take effect)
# Loop over classifier to produce meta features
meta_train = []
for clf in base_clf:
# Create hold out predictions for a classifier
meta_train_clf = hold_out_predict(clf, X_train, y_train, cv)
# Remove redundant column
meta_train_clf = np.delete(meta_train_clf, 0, axis=1).ravel()
# Gather meta training data
meta_train.append(meta_train_clf)
meta_train = np.array(meta_train).T
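A note on the "redundant column": for a binary problem predict_proba returns two complementary columns that sum to 1, so keeping only P(class = 1) loses nothing. A toy sketch (the numbers are made up):

import numpy as np
proba = np.array([[0.9, 0.1],
                  [0.3, 0.7]])                 # two rows of [P(class 0), P(class 1)]
print(np.allclose(proba.sum(axis=1), 1.0))     # True - the columns are complementary
meta_col = np.delete(proba, 0, axis=1).ravel() # keep only P(class 1)
print(meta_col)                                # [0.1 0.7]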
Create meta-features for testing data
meta_test = []
for i in base_clf:
# Fit on the full training data and predict on the test data
i.fit(X_train, y_train)
meta_test_clf = i.predict_proba(X_test)
# Remove redundant column
meta_test_clf = np.delete(meta_test_clf, 0, axis=1).ravel()
# Gather meta test data
meta_test.append(meta_test_clf)
meta_test = np.array(meta_test).T
Predict on Stacking Classifier
# Set seed
if 'random_state' in stck_clf.get_params().keys():
stck_clf.set_params(random_state=SEED)
# Optional (Add original features to meta)
original_flag = False
if original_flag:
meta_train = np.concatenate((meta_train, X_train), axis=1)
meta_test = np.concatenate((meta_test, X_test), axis=1)
# Fit model
stck_clf.fit(meta_train, y_train)
# Predict
y_pred = stck_clf.predict(meta_test)
# Calculate accuracy, precision and AUC
acc = accuracy_score(y_test, y_pred)
pre = precision_score(y_test, y_pred,average = 'macro')
auc = roc_auc_score(y_test, y_pred)
print('Stacking {} AUC: {:.4f}'.format(stck_clf.__class__.__name__, auc))
Classification_Assessment(stck_clf, meta_train, y_train, meta_test, y_test)
OVERSAMPLING
First, a rough, home-made definition of the oversampling function.
def oversampling(ytrain, Xtrain):
import matplotlib.pyplot as plt
global Xtrain_OV
global ytrain_OV
class1 = np.round((sum(ytrain == 1)/(sum(ytrain == 0)+sum(ytrain == 1))),decimals=2)*100
class0 = np.round((sum(ytrain == 0)/(sum(ytrain == 0)+sum(ytrain == 1))),decimals=2)*100
print("y = 0: ", sum(ytrain == 0),'-------',class0,'%')
print("y = 1: ", sum(ytrain == 1),'-------',class1,'%')
print('--------------------------------------------------------')
ytrain.value_counts(dropna = False, normalize=True).plot(kind='pie',title='Before oversampling')
plt.show()
print()
Proporcja = sum(ytrain == 0) / sum(ytrain == 1)
Proporcja = np.round(Proporcja, decimals=0)
Proporcja = Proporcja.astype(int)
ytrain_OV = pd.concat([ytrain[ytrain==1]] * Proporcja, axis = 0)
Xtrain_OV = pd.concat([Xtrain.loc[ytrain==1, :]] * Proporcja, axis = 0)
ytrain_OV = pd.concat([ytrain, ytrain_OV], axis = 0).reset_index(drop = True)
Xtrain_OV = pd.concat([Xtrain, Xtrain_OV], axis = 0).reset_index(drop = True)
Xtrain_OV = pd.DataFrame(Xtrain_OV)
ytrain_OV = pd.DataFrame(ytrain_OV)
print("Before oversampling Xtrain: ", Xtrain.shape)
print("Before oversampling ytrain: ", ytrain.shape)
print('--------------------------------------------------------')
print("After oversampling Xtrain_OV: ", Xtrain_OV.shape)
print("After oversampling ytrain_OV: ", ytrain_OV.shape)
print('--------------------------------------------------------')
ax = plt.subplot(1, 2, 1)
ytrain.value_counts(dropna = False, normalize=True).plot(kind='pie',title='Before oversampling')
plt.show()
kot = pd.concat([ytrain[ytrain==1]] * Proporcja, axis = 0)
kot = pd.concat([ytrain, kot], axis = 0).reset_index(drop = True)
ax = plt.subplot(1, 2, 2)
kot.value_counts(dropna = False, normalize=True).plot(kind='pie',title='After oversampling')
plt.show()
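For comparison, roughly the same balancing can be done with sklearn.utils.resample instead of the home-made concatenation loop. This is only a sketch under the same assumptions (pandas inputs, minority class labelled 1); the function name oversample_resample is mine:

from sklearn.utils import resample
import pandas as pd

def oversample_resample(Xtrain, ytrain, seed=SEED):
    # draw (with replacement) enough extra minority rows to match the majority class
    n_extra = int((ytrain == 0).sum() - (ytrain == 1).sum())
    X_extra, y_extra = resample(Xtrain.loc[ytrain == 1], ytrain[ytrain == 1],
                                replace=True, n_samples=n_extra, random_state=seed)
    X_bal = pd.concat([Xtrain, X_extra], axis=0).reset_index(drop=True)
    y_bal = pd.concat([ytrain, y_extra], axis=0).reset_index(drop=True)
    return X_bal, y_bal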
Read the data again.
y = df['Stroke']
X = df.drop('Stroke', axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123,stratify=y)
# If it throws an error, remove stratify=y.
Convert the data frames to numpy matrices.
oversampling(y_train, X_train)
X_train = Xtrain_OV.values
X_test = X_test.values
y_train = ytrain_OV.values
y_test = y_test.values
Define Base (level 0) and Stacking (level 1) estimators
base_clf = [LogisticRegression(), RandomForestClassifier(), ### which models I want to train
AdaBoostClassifier(), GaussianNB()]
stck_OV = LogisticRegression() ### stacking is done with LogisticRegression
#stck_clf = RandomForestClassifier()
Evaluate Base estimators separately
## Preliminary assessment of the base estimators (models)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
for t in base_clf:
# Set seed
if 'random_state' in t.get_params().keys(): # check the key hyperparameters of the models I want to train
t.set_params(random_state=SEED) ## set the seed; the remaining parameters stay at their BASIC, default, factory values!!
## this means that if I pass a special hyperparameter to the model, it will be taken into account
# Fit model
t.fit(X_train, y_train) # fit the next model from the loop
# Predict
y_pred = t.predict(X_test) # prediction of the next model from the loop
# Evaluation
acc = accuracy_score(y_test, y_pred)
#pre = precision_score(y_test, y_pred,average = 'macro')
#auc = roc_auc_score(y_test, y_pred)
print('{} accuracy: {:.2f}'.format(t.__class__.__name__, acc))
plt.figure(figsize=(7,3))
y_probas1 = t.predict_proba(X_test)[:,1]
bc= BinaryClassification(y_test, y_probas1, labels=[t.__class__.__name__]).plot_roc_curve()
plt.show()
AUC_train_1 = metrics.roc_auc_score(y_train,t.predict_proba(X_train)[:,1])
print('AUC_train: ', np.round(AUC_train_1, decimals=4))
AUC_test_1 = metrics.roc_auc_score(y_test,t.predict_proba(X_test)[:,1])
print('AUC_test: ', np.round(AUC_test_1, decimals=4))
print(classification_report(y_test, t.predict(X_test)))
print('===============================================================')
Create Hold Out predictions (meta-features)
def hold_out_predict(clf, X, y, cv):
"""Performing cross validation hold out predictions for stacking"""
# DETERMINE THE DIMENSIONS
n_classes = len(np.unique(y)) # checks what the classes are: len(np.unique(y)) = 2
meta_features = np.zeros((X.shape[0], n_classes)) ## BUILDS THE SKELETON OF THE META-FEATURE MATRIX
# builds a matrix with one row per sample and 2 COLUMNS,
# filled with zeros for now
n_splits = cv.get_n_splits(X, y) # returns the number of splitting iterations in the cross-validator
# Loop over folds
print("Starting hold out prediction with {} splits for {}.".format(n_splits, clf.__class__.__name__))
for train_idx, hold_out_idx in cv.split(X, y):
# Split data
X_train = X[train_idx] # overwrites X_train inside the loop
y_train = y[train_idx] # overwrites y_train inside the loop
X_hold_out = X[hold_out_idx]
# Fit estimator to K-1 parts and predict on hold out part
est = make_copy(clf)
est.fit(X_train, y_train)
y_hold_out_pred = est.predict_proba(X_hold_out)
# Fill in meta features
meta_features[hold_out_idx] = y_hold_out_pred
return meta_features # the meta-feature matrix: one row per sample, 2 columns of hold-out probabilities
Create meta-features for training data
# Define 6-fold CV ## any number of folds can be used
cv = KFold(n_splits=6, shuffle=True, random_state=SEED) ## set the number of cross-validation splits (shuffle is needed for random_state to take effect)
# Loop over classifier to produce meta features
meta_train = []
for clf in base_clf:
# Create hold out predictions for a classifier
meta_train_clf = hold_out_predict(clf, X_train, y_train, cv)
# Remove redundant column
meta_train_clf = np.delete(meta_train_clf, 0, axis=1).ravel()
# Gather meta training data
meta_train.append(meta_train_clf)
meta_train = np.array(meta_train).T
Create meta-features for testing data
meta_test = []
for i in base_clf:
# Fit on the full training data and predict on the test data
i.fit(X_train, y_train)
meta_test_clf = i.predict_proba(X_test)
# Remove redundant column
meta_test_clf = np.delete(meta_test_clf, 0, axis=1).ravel()
# Gather meta test data
meta_test.append(meta_test_clf)
meta_test = np.array(meta_test).T
Predict on Stacking Classifier
# Set seed
if 'random_state' in stck_OV.get_params().keys():
stck_OV.set_params(random_state=SEED)
# Optional (Add original features to meta)
original_flag = False
if original_flag:
meta_train = np.concatenate((meta_train, X_train), axis=1)
meta_test = np.concatenate((meta_test, X_test), axis=1)
# Fit model
stck_OV.fit(meta_train, y_train)
# Predict
y_pred = stck_OV.predict(meta_test)
# Calculate accuracy, precision and AUC
acc = accuracy_score(y_test, y_pred)
pre = precision_score(y_test, y_pred,average = 'macro')
auc = roc_auc_score(y_test, y_pred)
print('Stacking {} AUC: {:.4f}'.format(stck_OV.__class__.__name__, auc))
Classification_Assessment(stck_OV, meta_train, ytrain_OV, meta_test, y_test)
If we were processing the Titanic data, I would not expect a catastrophe like this. All in all, I cannot explain what is wrong, because it should work normally: in the base models the threshold point (the red point) sat at the top of the ROC curve. Unfortunately, when building the second-level stacking classifier, something got lost. This is not a one-off mistake, because I repeated the analysis several times. Something is wrong and I do not know what.
Now we will start playing with threshold sensitivity control so that the model finally begins classifying some observations as 1.
Based on the previous code, I wrote a program that adjusts the threshold. A thicket of numbers and names begins here, so I introduced colored printing.
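Before that, the idea itself fits in a few lines. A minimal sketch, assuming stck_OV, meta_test and y_test from the pipeline above (the threshold values are arbitrary examples and the helper predict_with_threshold is mine): instead of model.predict(), classify as 1 whenever P(class 1) exceeds a chosen threshold, and watch recall and precision trade off as the threshold drops.

from sklearn.metrics import recall_score, precision_score

def predict_with_threshold(model, X, threshold=0.5):
    # classify as 1 whenever the predicted probability of class 1 exceeds the threshold
    return (model.predict_proba(X)[:, 1] >= threshold).astype(int)

for thr in [0.5, 0.4, 0.3, 0.2, 0.1]:
    y_hat = predict_with_threshold(stck_OV, meta_test, thr)
    print('threshold={:.1f}  recall={:.3f}  precision={:.3f}'.format(
        thr, recall_score(y_test, y_hat), precision_score(y_test, y_hat)))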