
090520201852
Someone recently told me that I do not write enough, so here goes:
It is very nice when a model reaches an AUC of 85%.
Unfortunately, this model belongs in the trash and needs some improvement. It was supposed to find out who had a stroke, and it said that nobody had a stroke, because only about 1-2% of the cases were positive.
I got so hooked that I started to contribute to stackoverflow!
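To see how a model can look decent on headline metrics while never predicting a stroke, here is a minimal sketch on synthetic data (the 98/2 class split is an assumption for illustration, not the real dataset):

from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
# Synthetic data with a 98/2 class imbalance
X_demo, y_demo = make_classification(n_samples=10000, weights=[0.98, 0.02], random_state=2018)
# A classifier that always predicts the majority class (no strokes)
dummy = DummyClassifier(strategy='most_frequent').fit(X_demo, y_demo)
y_demo_pred = dummy.predict(X_demo)
print('Accuracy:', accuracy_score(y_demo, y_demo_pred))   # ~0.98, looks great
print('Recall  :', recall_score(y_demo, y_demo_pred))     # 0.0, useless for finding strokes
print(confusion_matrix(y_demo, y_demo_pred))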
# https://github.com/dawidkopczyk/blog/blob/master/stacking.py
# Classification Assessment
def Classification_Assessment(model, Xtrain, ytrain, Xtest, ytest):
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import log_loss, auc, roc_curve, roc_auc_score, recall_score
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score
from sklearn.metrics import accuracy_score, precision_recall_curve
import scikitplot as skplt
from plot_metric.functions import BinaryClassification
print("Recall Training data: ", np.round(recall_score(ytrain, model.predict(Xtrain)), decimals=4))
print("Precision Training data: ", np.round(precision_score(ytrain, model.predict(Xtrain)), decimals=4))
print("----------------------------------------------------------------------")
print("Recall Test data: ", np.round(recall_score(ytest, model.predict(Xtest)), decimals=4))
print("Precision Test data: ", np.round(precision_score(ytest, model.predict(Xtest)), decimals=4))
print("----------------------------------------------------------------------")
print("Confusion Matrix Test data")
print(confusion_matrix(ytest, model.predict(Xtest)))
print("----------------------------------------------------------------------")
print('Valuation for test data only:')
print(classification_report(ytest, model.predict(Xtest)))
## ----------AUC-----------------------------------------
print('---------------------')
AUC_train_1 = metrics.roc_auc_score(ytrain, model.predict_proba(Xtrain)[:,1])
print('AUC_train: ', np.round(AUC_train_1, decimals=4))
AUC_test_1 = metrics.roc_auc_score(ytest, model.predict_proba(Xtest)[:,1])
print('AUC_test: ', np.round(AUC_test_1, decimals=4))
print('---------------------')
print("Accuracy Training data: ", np.round(accuracy_score(ytrain, model.predict(Xtrain)), decimals=4))
print("Accuracy Test data: ", np.round(accuracy_score(ytest, model.predict(Xtest)), decimals=4))
print("----------------------------------------------------------------------")
print('Valuation for test data only:')
y_probas1 = model.predict_proba(Xtest)[:,1]
y_probas2 = model.predict_proba(Xtest)
### ---plot_roc_curve--------------------------------------------------------
plt.figure(figsize=(13,4))
plt.subplot(1, 2, 1)
bc = BinaryClassification(ytest, y_probas1, labels=["Class 1", "Class 2"])
bc.plot_roc_curve()
### --------precision_recall_curve------------------------------------------
plt.subplot(1, 2, 2)
precision, recall, thresholds = precision_recall_curve(ytest, y_probas1)
plt.plot(recall, precision, marker='.', label=model)
plt.title('Precision recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc=(-0.30, -0.6))
plt.show()
## ----------plot_roc-----------------------------------------
skplt.metrics.plot_roc(ytest, y_probas2)
# General
import numpy as np
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
# Utilities
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from copy import copy as make_copy
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from plot_metric.functions import BinaryClassification
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
import warnings
SEED = 2018
warnings.filterwarnings("ignore")
import pandas as pd
df = pd.read_csv('/home/wojciech/Pulpit/1/Stroke_Prediction.csv')
print(df.shape)
df.head(2)
import numpy as np
a, b = df.shape  # <- how many columns we have
b
print('DISCRETE FEATURES ENCODED')
print('-------------------------')
for i in range(1,b):
i = df.columns[i]
f = df[i].dtypes
if f == object:  # np.object is deprecated in newer NumPy; plain object behaves the same
print(i, "---", f)
df[i] = pd.Categorical(df[i]).codes
del df['ID']
df = df.dropna(how='any')
df.isnull().sum()
df.shape
y = df['Stroke']
X = df.drop('Stroke', axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123,stratify=y)
# If this throws an error, drop stratify=y.
y_train.value_counts(dropna = False, normalize=True).plot(kind='pie',title='Before oversampling')
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values
Define Base (level 0) and Stacking (level 1) estimators
base_clf = [LogisticRegression(), RandomForestClassifier(), ### which models I want to train
AdaBoostClassifier(), GaussianNB()]
stck_clf = LogisticRegression() ### the stacking is done with LogisticRegression
#stck_clf = RandomForestClassifier()
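As a side note, newer scikit-learn versions (0.22+) ship a built-in StackingClassifier that wraps the same level-0 / level-1 idea; a minimal sketch for comparison, not what is used in this post:

from sklearn.ensemble import StackingClassifier
# Same base (level 0) estimators, logistic regression as the final (level 1) estimator
stack_builtin = StackingClassifier(
    estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier()),
                ('ada', AdaBoostClassifier()), ('gnb', GaussianNB())],
    final_estimator=LogisticRegression(),
    cv=6, stack_method='predict_proba')
# stack_builtin.fit(X_train, y_train); stack_builtin.predict(X_test)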
Evaluate Base estimators separately
## Preliminary evaluation of the base estimators (models)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
for t in base_clf:
# Set seed
if 'random_state' in t.get_params().keys(): # check the key hyperparameters of the model I want to train
t.set_params(random_state=SEED) ## otherwise the model keeps its BASIC, default (factory) parameters!
## i.e. if I pass a specific hyperparameter to a model, it will be taken into account
# Fit model
t.fit(X_train, y_train) # fit the next model from the loop
# Predict
y_pred = t.predict(X_test) # prediction of the next model from the loop
# Valuation
acc = accuracy_score(y_test, y_pred)
#pre = precision_score(y_test, y_pred,average = 'macro')
#auc = roc_auc_score(y_test, y_pred)
print('{} accuracy: {:.2f}'.format(t.__class__.__name__, acc))
plt.figure(figsize=(7,3))
y_probas1 = t.predict_proba(X_test)[:,1]
bc= BinaryClassification(y_test, y_probas1, labels=[t.__class__.__name__]).plot_roc_curve()
plt.show()
AUC_train_1 = metrics.roc_auc_score(y_train, t.predict_proba(X_train)[:,1])
print('AUC_train: ', np.round(AUC_train_1, decimals=4))
AUC_test_1 = metrics.roc_auc_score(y_test, t.predict_proba(X_test)[:,1])
print('AUC_test: ', np.round(AUC_test_1, decimals=4))
print(classification_report(y_test, t.predict(X_test)))
print('===============================================================')
Create Hold Out predictions (meta-features)
def hold_out_predict(clf, X, y, cv):
"""Performing cross validation hold out predictions for stacking"""
# DETERMINE THE DIMENSIONS
n_classes = len(np.unique(y)) # checks which classes exist: len(np.unique(y)) = 2
meta_features = np.zeros((X.shape[0], n_classes)) ## BUILDS THE SKELETON OF THE META-FEATURE MATRIX
# a matrix with one row per observation and 2 COLUMNS,
# filled with zeros at first
n_splits = cv.get_n_splits(X, y) # returns the number of splitting iterations in the cross-validator
# Loop over folds
print("Starting hold out prediction with {} splits for {}.".format(n_splits, clf.__class__.__name__))
for train_idx, hold_out_idx in cv.split(X, y):
# Split data
X_train = X[train_idx] # overwrites X_train inside the loop
y_train = y[train_idx] # overwrites y_train inside the loop
X_hold_out = X[hold_out_idx]
# Fit estimator to K-1 parts and predict on hold out part
est = make_copy(clf)
est.fit(X_train, y_train)
y_hold_out_pred = est.predict_proba(X_hold_out)
# Fill in meta features
meta_features[hold_out_idx] = y_hold_out_pred
return meta_features # one row per observation, one out-of-fold probability per class
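A quick sanity check of the helper on synthetic data (a minimal sketch; make_classification is already imported above and the numbers are arbitrary):

X_syn, y_syn = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=SEED)
demo_cv = KFold(n_splits=4, shuffle=True, random_state=SEED)
demo_meta = hold_out_predict(LogisticRegression(), X_syn, y_syn, demo_cv)
print(demo_meta.shape)  # (1000, 2): one out-of-fold probability column per class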
Create meta-features for training data
# Define the CV splitter ## any number of folds can be used
cv = KFold(n_splits=6, shuffle=True, random_state=SEED) ## number of splits in cross-validation (shuffle is required when random_state is set)
# Loop over classifier to produce meta features
meta_train = []
for clf in base_clf:
# Create hold out predictions for a classifier
meta_train_clf = hold_out_predict(clf, X_train, y_train, cv)
# Remove redundant column
meta_train_clf = np.delete(meta_train_clf, 0, axis=1).ravel()
# Gather meta training data
meta_train.append(meta_train_clf)
meta_train = np.array(meta_train).T
Create meta-features for testing data
meta_test = []
for i in base_clf:
# Fit on the full training data and predict on the test data
i.fit(X_train, y_train)
meta_test_clf = i.predict_proba(X_test)
# Remove redundant column
meta_test_clf = np.delete(meta_test_clf, 0, axis=1).ravel()
# Gather meta testing data
meta_test.append(meta_test_clf)
meta_test = np.array(meta_test).T
Predict on Stacking Classifier
# Set seed
if 'random_state' in stck_clf.get_params().keys():
stck_clf.set_params(random_state=SEED)
# Optional (Add original features to meta)
original_flag = False
if original_flag:
meta_train = np.concatenate((meta_train, X_train), axis=1)
meta_test = np.concatenate((meta_test, X_test), axis=1)
# Fit model
stck_clf.fit(meta_train, y_train)
# Predict
y_pred = stck_clf.predict(meta_test)
# Calculate accuracy
acc = accuracy_score(y_test, y_pred)
pre = precision_score(y_test, y_pred,average = 'macro')
auc = roc_auc_score(y_test, y_pred)
print('Stacking {} AUC: {:.4f}'.format(stck_clf.__class__.__name__, auc))
Classification_Assessment(stck_clf ,meta_train, y_train, meta_test, y_test)
OVERSAMPLING
First, a rather bulky home-made definition of the oversampling function.
def oversampling(ytrain, Xtrain):
import matplotlib.pyplot as plt
global Xtrain_OV
global ytrain_OV
class1 = np.round((sum(ytrain == 1)/(sum(ytrain == 0)+sum(ytrain == 1))), decimals=2)*100
class0 = np.round((sum(ytrain == 0)/(sum(ytrain == 0)+sum(ytrain == 1))), decimals=2)*100
print("y = 0: ", sum(ytrain == 0), '-------', class0, '%')
print("y = 1: ", sum(ytrain == 1), '-------', class1, '%')
print('--------------------------------------------------------')
ytrain.value_counts(dropna = False, normalize=True).plot(kind='pie',title='Before oversampling')
plt.show()
print()
Proporcja = sum(ytrain == 0) / sum(ytrain == 1)
Proporcja = np.round(Proporcja, decimals=0)
Proporcja = Proporcja.astype(int)
ytrain_OV = pd.concat([ytrain[ytrain==1]] * Proporcja, axis = 0)
Xtrain_OV = pd.concat([Xtrain.loc[ytrain==1, :]] * Proporcja, axis = 0)
ytrain_OV = pd.concat([ytrain, ytrain_OV], axis = 0).reset_index(drop = True)
Xtrain_OV = pd.concat([Xtrain, Xtrain_OV], axis = 0).reset_index(drop = True)
Xtrain_OV = pd.DataFrame(Xtrain_OV)
ytrain_OV = pd.DataFrame(ytrain_OV)
print("Before oversampling Xtrain: ", Xtrain.shape)
print("Before oversampling ytrain: ", ytrain.shape)
print('--------------------------------------------------------')
print("After oversampling Xtrain_OV: ", Xtrain_OV.shape)
print("After oversampling ytrain_OV: ", ytrain_OV.shape)
print('--------------------------------------------------------')
ax = plt.subplot(1, 2, 1)
ytrain.value_counts(dropna=False, normalize=True).plot(kind='pie', title='Before oversampling')
kot = pd.concat([ytrain[ytrain==1]] * Proporcja, axis=0)
kot = pd.concat([ytrain, kot], axis=0).reset_index(drop=True)
ax = plt.subplot(1, 2, 2)
kot.value_counts(dropna=False, normalize=True).plot(kind='pie', title='After oversampling')
plt.show()
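The same effect can be obtained with the imbalanced-learn package, if it is installed; a minimal sketch (RandomOverSampler duplicates minority-class rows, much like the function above), assuming X_train and y_train hold the training split:

from collections import Counter
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=SEED)
X_res, y_res = ros.fit_resample(X_train, y_train)
print('Before:', Counter(y_train))
print('After :', Counter(y_res))  # the minority class is duplicated up to the majority count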
Re-create the train/test split from the original data.
y = df['Stroke']
X = df.drop('Stroke', axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123,stratify=y)
# If this throws an error, drop stratify=y.
Apply the oversampling and convert everything to matrices.
oversampling(y_train, X_train)
X_train = Xtrain_OV.values
X_test = X_test.values
y_train = ytrain_OV.values
y_test = y_test.values
Define Base (level 0) and Stacking (level 1) estimators
base_clf = [LogisticRegression(), RandomForestClassifier(), ### which models I want to train
AdaBoostClassifier(), GaussianNB()]
stck_OV = LogisticRegression() ### the stacking is done with LogisticRegression
#stck_clf = RandomForestClassifier()
Evaluate Base estimators separately
## Preliminary evaluation of the base estimators (models)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
for t in base_clf:
# Set seed
if 'random_state' in t.get_params().keys(): # check the key hyperparameters of the model I want to train
t.set_params(random_state=SEED) ## otherwise the model keeps its BASIC, default (factory) parameters!
## i.e. if I pass a specific hyperparameter to a model, it will be taken into account
# Fit model
t.fit(X_train, y_train) # fit the next model from the loop
# Predict
y_pred = t.predict(X_test) # prediction of the next model from the loop
# Valuation
acc = accuracy_score(y_test, y_pred)
#pre = precision_score(y_test, y_pred,average = 'macro')
#auc = roc_auc_score(y_test, y_pred)
print('{} accuracy: {:.2f}'.format(t.__class__.__name__, acc))
plt.figure(figsize=(7,3))
y_probas1 = t.predict_proba(X_test)[:,1]
bc= BinaryClassification(y_test, y_probas1, labels=[t.__class__.__name__]).plot_roc_curve()
plt.show()
AUC_train_1 = metrics.roc_auc_score(y_train, t.predict_proba(X_train)[:,1])
print('AUC_train: ', np.round(AUC_train_1, decimals=4))
AUC_test_1 = metrics.roc_auc_score(y_test, t.predict_proba(X_test)[:,1])
print('AUC_test: ', np.round(AUC_test_1, decimals=4))
print(classification_report(y_test, t.predict(X_test)))
print('===============================================================')
Create Hold Out predictions (meta-features)
def hold_out_predict(clf, X, y, cv):
"""Performing cross validation hold out predictions for stacking"""
# DETERMINE THE DIMENSIONS
n_classes = len(np.unique(y)) # checks which classes exist: len(np.unique(y)) = 2
meta_features = np.zeros((X.shape[0], n_classes)) ## BUILDS THE SKELETON OF THE META-FEATURE MATRIX
# a matrix with one row per observation and 2 COLUMNS,
# filled with zeros at first
n_splits = cv.get_n_splits(X, y) # returns the number of splitting iterations in the cross-validator
# Loop over folds
print("Starting hold out prediction with {} splits for {}.".format(n_splits, clf.__class__.__name__))
for train_idx, hold_out_idx in cv.split(X, y):
# Split data
X_train = X[train_idx] # overwrites X_train inside the loop
y_train = y[train_idx] # overwrites y_train inside the loop
X_hold_out = X[hold_out_idx]
# Fit estimator to K-1 parts and predict on hold out part
est = make_copy(clf)
est.fit(X_train, y_train)
y_hold_out_pred = est.predict_proba(X_hold_out)
# Fill in meta features
meta_features[hold_out_idx] = y_hold_out_pred
return meta_features # one row per observation, one out-of-fold probability per class
Create meta-features for training data
# Define the CV splitter ## any number of folds can be used
cv = KFold(n_splits=6, shuffle=True, random_state=SEED) ## number of splits in cross-validation (shuffle is required when random_state is set)
# Loop over classifier to produce meta features
meta_train = []
for clf in base_clf:
# Create hold out predictions for a classifier
meta_train_clf = hold_out_predict(clf, X_train, y_train, cv)
# Remove redundant column
meta_train_clf = np.delete(meta_train_clf, 0, axis=1).ravel()
# Gather meta training data
meta_train.append(meta_train_clf)
meta_train = np.array(meta_train).T
Create meta-features for testing data
meta_test = []
for i in base_clf:
# Fit on the full training data and predict on the test data
i.fit(X_train, y_train)
meta_test_clf = i.predict_proba(X_test)
# Remove redundant column
meta_test_clf = np.delete(meta_test_clf, 0, axis=1).ravel()
# Gather meta testing data
meta_test.append(meta_test_clf)
meta_test = np.array(meta_test).T
Predict on Stacking Classifier
# Set seed
if 'random_state' in stck_OV.get_params().keys():
stck_OV.set_params(random_state=SEED)
# Optional (Add original features to meta)
original_flag = False
if original_flag:
meta_train = np.concatenate((meta_train, X_train), axis=1)
meta_test = np.concatenate((meta_test, X_test), axis=1)
# Fit model
stck_OV.fit(meta_train, y_train)
# Predict
y_pred = stck_OV.predict(meta_test)
# Calculate accuracy
acc = accuracy_score(y_test, y_pred)
pre = precision_score(y_test, y_pred,average = 'macro')
auc = roc_auc_score(y_test, y_pred)
print('Stacking {} AUC: {:.4f}'.format(stck_OV.__class__.__name__, auc))
Classification_Assessment(stck_OV ,meta_train, ytrain_OV, meta_test, y_test)
If we were processing the Titanic data, I would expect less of a catastrophe than this. All in all, I cannot explain what is wrong, because it should work normally: in the base models the threshold point (the red point) sat at the top of the ROC curve. Unfortunately, when building the second-level stacking classifier, something got lost. It is not a one-off mistake, because I repeated this analysis several times. Something is wrong, and I have no idea what.
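One diagnostic I would try here (a hedged sketch, not part of the original analysis) is to look at how the stacker's predicted probabilities for class 1 are distributed; if they all sit below 0.5, the default threshold will never produce a positive prediction:

proba_pos = stck_OV.predict_proba(meta_test)[:, 1]
plt.hist(proba_pos, bins=50)
plt.axvline(0.5, color='red', linestyle='--', label='default threshold 0.5')
plt.xlabel('Predicted probability of class 1')
plt.ylabel('Count')
plt.legend()
plt.show()
print('Max predicted probability:', proba_pos.max())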
Now we will start playing with threshold control, so that the model finally starts assigning some observations to class 1.
Based on the previous code, I wrote a function that adjusts the threshold. A thicket of numbers and names begins here, so I added colours to the printouts.
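Instead of picking the threshold by eye, it can also be derived from the precision-recall curve, e.g. the value that maximises F2 (which weights recall higher, fitting a screening problem); a minimal sketch, assuming the stck_OV model and meta-features from above:

from sklearn.metrics import precision_recall_curve
proba_pos = stck_OV.predict_proba(meta_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, proba_pos)
# F-beta with beta=2: (1 + beta^2) * P * R / (beta^2 * P + R); the last P/R point has no threshold, so drop it
f2 = (1 + 2**2) * precision[:-1] * recall[:-1] / (2**2 * precision[:-1] + recall[:-1] + 1e-12)
best = np.argmax(f2)
print('Best threshold by F2:', thresholds[best], ' F2:', f2[best])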
Threshold
def Classification_Assessment_by_Threshold(model ,Xtrain, ytrain, Xtest, ytest, threshold):
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
from sklearn.metrics import accuracy_score
import scikitplot as skplt
from plot_metric.functions import BinaryClassification
from sklearn.metrics import precision_recall_curve
### --------color------------------
import colorama
from colorama import Fore, Style
### ---------------New Threshold----------------------------------------
PRED_Threshold = np.where((model.predict_proba(Xtest)[:, 1])>= threshold,1,0)
print("Recall Training data: ", np.round(recall_score(ytrain, model.predict(Xtrain)), decimals=4))
print("Precision Training data: ", np.round(precision_score(ytrain, model.predict(Xtrain)), decimals=4))
print("----------------------------------------------------------------------")
print("Recall Test data: ", np.round(recall_score(ytest, model.predict(Xtest)), decimals=4))
print("Precision Test data: ", np.round(precision_score(ytest, model.predict(Xtest)), decimals=4))
print("----------------------------------------------------------------------")
print(Fore.BLUE + "Recall Test data (new_threshold): ", np.round(recall_score(ytest, PRED_Threshold), decimals=4))
print("Precision Test data (new_threshold): ", np.round(precision_score(ytest,PRED_Threshold), decimals=4))
print("----------------------------------------------------------------------")
print(Style.RESET_ALL)
print("Confusion Matrix Test data")
print(confusion_matrix(ytest, model.predict(Xtest)))
print(Fore.BLUE +"Confusion Matrix Test data - new_threshold: ",threshold)
print(confusion_matrix(ytest, PRED_Threshold))
print(Style.RESET_ALL)
print("----------------------------------------------------------------------")
# https://stackoverflow.com/questions/39473297/how-do-i-print-colored-output-with-python-3
print('Valuation for test data only:')
print(classification_report(ytest, model.predict(Xtest)))
print("----------------------------------------------------------------------")
print(Fore.BLUE +'Valuation for test data only (new_threshold):',threshold)
print(classification_report(ytest, PRED_Threshold))
print(Style.RESET_ALL)
## ----------AUC-----------------------------------------
print('---------------------')
AUC_train_1 = metrics.roc_auc_score(ytrain, model.predict_proba(Xtrain)[:,1])
print('AUC_train: ', np.round(AUC_train_1, decimals=4))
AUC_test_1 = metrics.roc_auc_score(ytest, model.predict_proba(Xtest)[:,1])
print('AUC_test: ', np.round(AUC_test_1, decimals=4))
print(Fore.BLUE +'AUC_test with new_threshold:', threshold)
AUC_test_3 = metrics.roc_auc_score(ytest,PRED_Threshold)
print('AUC_test: ', np.round(AUC_test_3, decimals=4))
print('---------------------')
print(Style.RESET_ALL)
print("Accuracy Training data: ", np.round(accuracy_score(ytrain, model.predict(Xtrain)), decimals=4))
print("Accuracy Test data : ", np.round(accuracy_score(ytest, model.predict(Xtest)), decimals=4))
print(Fore.BLUE +"Accuracy Test data (new_threshold) : ", np.round(accuracy_score(ytest, PRED_Threshold), decimals=4))
print("----------------------------------------------------------------------")
print(Style.RESET_ALL)
print('Valuation for test data only:')
y_probas1 = PRED_Threshold
y_probas3 = model.predict_proba(Xtest)[:,1]
y_probas2 = model.predict_proba(Xtest)
### ---plot_roc_curve--------------------------------------------------------
plt.figure(figsize=(13,4))
plt.subplot(1, 2, 1)
bc = BinaryClassification(ytest, y_probas1, labels=["Class 1", "Class 2"])
bc2 = BinaryClassification(ytest, y_probas3, labels=["Class 1", "Class 2"])
bc.plot_roc_curve()
bc2.plot_roc_curve()
#plt.axvline(threshold, color = 'blue', linestyle = '--', label = 'new threshold')
# plt.axvline(0.5, color = '#00C251', linestyle = '--', label = 'threshold = 0.5')
### --------precision_recall_curve------------------------------------------
plt.subplot(1, 2, 2)
precision, recall, thresholds = precision_recall_curve(ytest, y_probas1)
plt.plot(recall, precision, marker='.', label=model)
plt.title('Precision recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
#plt.legend(loc=(-0.30, -0.7))
plt.show()
## ----------plot_roc-----------------------------------------
skplt.metrics.plot_roc(ytest, y_probas2)
threshold = 0.3
Classification_Assessment_by_Threshold(stck_clf ,meta_train, y_train, meta_test, y_test, threshold)