This is the first post in the third part of my research on improving models. The bagging method can be used with all types of classification models. I still have to work on bagging for neural networks; in the meantime, I invite you to read. As usual, the picture is meant to heighten the drama of the research.
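For readers who have not met bagging before, the snippet below is a minimal, self-contained sketch of the idea and is not part of the analysis that follows: several copies of one base model are trained on bootstrap samples of the rows (here also on random subsets of the columns) and their predictions are combined by voting. The toy dataset and parameter values are illustrative assumptions, and note that the base_estimator argument has been renamed to estimator in newer scikit-learn releases.

# Minimal bagging sketch on a synthetic dataset (illustrative only)
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)

# 10 trees, each fitted on 80% of the rows and 80% of the columns; their votes are combined
bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                        n_estimators=10, max_samples=0.8, max_features=0.8,
                        random_state=0)
bag.fit(X_tr, y_tr)
print('Bagged trees accuracy:', accuracy_score(y_te, bag.predict(X_te)))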
In [1]:
# Classification Assessment
def Classification_Assessment(model, Xtrain, ytrain, Xtest, ytest):
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import metrics
    from sklearn.metrics import (classification_report, confusion_matrix, log_loss, auc,
                                 roc_curve, roc_auc_score, recall_score, precision_recall_curve,
                                 make_scorer, precision_score, fbeta_score, f1_score, accuracy_score)
    import scikitplot as skplt
    from plot_metric.functions import BinaryClassification

    print("Recall Training data:    ", np.round(recall_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("Precision Training data: ", np.round(precision_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Recall Test data:        ", np.round(recall_score(ytest, model.predict(Xtest)), decimals=4))
    print("Precision Test data:     ", np.round(precision_score(ytest, model.predict(Xtest)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Confusion Matrix Test data")
    print(confusion_matrix(ytest, model.predict(Xtest)))
    print("----------------------------------------------------------------------")
    print('Valuation for test data only:')
    print(classification_report(ytest, model.predict(Xtest)))

    ## ---------- AUC -------------------------------------------------------
    print('---------------------')
    AUC_train_1 = metrics.roc_auc_score(ytrain, model.predict_proba(Xtrain)[:, 1])
    print('AUC_train: %.3f' % AUC_train_1)
    AUC_test_1 = metrics.roc_auc_score(ytest, model.predict_proba(Xtest)[:, 1])
    print('AUC_test:  %.3f' % AUC_test_1)
    print('---------------------')
    print("Accuracy Training data: ", np.round(accuracy_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("Accuracy Test data:     ", np.round(accuracy_score(ytest, model.predict(Xtest)), decimals=4))
    print("----------------------------------------------------------------------")
    print('Valuation for test data only:')

    y_probas1 = model.predict_proba(Xtest)[:, 1]
    y_probas2 = model.predict_proba(Xtest)

    ### --- plot_roc_curve ---------------------------------------------------
    plt.figure(figsize=(13, 4))
    plt.subplot(1, 2, 1)
    bc = BinaryClassification(ytest, y_probas1, labels=["Class 1", "Class 2"])
    bc.plot_roc_curve()

    ### --- precision_recall_curve -------------------------------------------
    plt.subplot(1, 2, 2)
    precision, recall, thresholds = precision_recall_curve(ytest, y_probas1)
    plt.plot(recall, precision, marker='.', label=model)
    plt.title('Precision recall curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc=(-0.30, -0.8))
    plt.show()

    ## ---------- plot_roc ----------------------------------------------------
    skplt.metrics.plot_roc(ytest, y_probas2)

    f1_score_macro = np.round(metrics.f1_score(ytest, model.predict(Xtest), average='macro'), decimals=3)
    print("f1 score macro ", f1_score_macro)
    f1_score_micro = np.round(metrics.f1_score(ytest, model.predict(Xtest), average='micro'), decimals=3)
    print("f1 score micro ", f1_score_micro)
    print('-----------------------------------------------------------------------------')
    if f1_score_macro > f1_score_micro:
        print("1 (minority) is better classified than 0 (majority) - macro > micro")
    else:
        print('0 (majority) is better classified than 1 (minority) - micro > macro')
    print('Same holds true for AUC')
    print('-----------------------------------------------------------------------------')
    cal_1 = np.round((sum(ytest == 1) / (sum(ytest == 0) + sum(ytest == 1))), decimals=2) * 100
    cal_0 = np.round((sum(ytest == 0) / (sum(ytest == 0) + sum(ytest == 1))), decimals=2) * 100
    print('1 proportion:', cal_1)
    print('0 proportion:', cal_0)
In [2]:
def oversampling(ytrain, Xtrain):
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    global Xtrain_OV
    global ytrain_OV

    class1 = np.round((sum(ytrain == 1) / (sum(ytrain == 0) + sum(ytrain == 1))), decimals=2) * 100
    class0 = np.round((sum(ytrain == 0) / (sum(ytrain == 0) + sum(ytrain == 1))), decimals=2) * 100
    print("y = 0: ", sum(ytrain == 0), '-------', class0, '%')
    print("y = 1: ", sum(ytrain == 1), '-------', class1, '%')
    print('--------------------------------------------------------')
    ytrain.value_counts(dropna=False, normalize=True).plot(kind='pie', title='Before oversampling')
    plt.show()
    print()

    # how many times the minority class has to be duplicated
    Proporcja = sum(ytrain == 0) / sum(ytrain == 1)
    Proporcja = np.round(Proporcja, decimals=0)
    Proporcja = Proporcja.astype(int)

    # duplicate the minority rows and append them to the original training set
    ytrain_OV = pd.concat([ytrain[ytrain == 1]] * Proporcja, axis=0)
    Xtrain_OV = pd.concat([Xtrain.loc[ytrain == 1, :]] * Proporcja, axis=0)
    ytrain_OV = pd.concat([ytrain, ytrain_OV], axis=0).reset_index(drop=True)
    Xtrain_OV = pd.concat([Xtrain, Xtrain_OV], axis=0).reset_index(drop=True)
    Xtrain_OV = pd.DataFrame(Xtrain_OV)
    ytrain_OV = pd.DataFrame(ytrain_OV)

    print("Before oversampling Xtrain: ", Xtrain.shape)
    print("Before oversampling ytrain: ", ytrain.shape)
    print('--------------------------------------------------------')
    print("After oversampling Xtrain_OV: ", Xtrain_OV.shape)
    print("After oversampling ytrain_OV: ", ytrain_OV.shape)
    print('--------------------------------------------------------')

    ax = plt.subplot(1, 2, 1)
    ytrain.value_counts(dropna=False, normalize=True).plot(kind='pie', title='Before oversampling')
    kot = pd.concat([ytrain[ytrain == 1]] * Proporcja, axis=0)
    kot = pd.concat([ytrain, kot], axis=0).reset_index(drop=True)
    ax = plt.subplot(1, 2, 2)
    kot.value_counts(dropna=False, normalize=True).plot(kind='pie', title='After oversampling')
    plt.show()
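As a side note, essentially the same row-duplication oversampling is available in the imbalanced-learn package. The sketch below is only an illustration, assumes imbalanced-learn is installed, and is meant to be run after the train/test split in cell In [7], where X_train and y_train are created; RandomOverSampler duplicates minority-class rows until both classes are equally represented, which is roughly what the home-made function above does.

# Illustrative alternative using imbalanced-learn (assumption: package installed,
# X_train/y_train already created by the train_test_split cell below)
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=123)
X_train_ROS, y_train_ROS = ros.fit_resample(X_train, y_train)
print(y_train_ROS.value_counts())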
In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.ensemble import BaggingClassifier
warnings.filterwarnings("ignore")
%matplotlib inline
df = pd.read_csv('/home/wojciech/Pulpit/1/Stroke_Prediction.csv')
print(df.shape)
print()
print(df.columns)
df.head(3)
Out[3]:
In [4]:
### A home-made tool for automatic encoding of discrete (categorical) variables
In [5]:
a, b = df.shape  # <- how many columns we have
print('DISCRETE FUNCTIONS CODED')
print('------------------------')
for i in range(1, b):
    i = df.columns[i]
    f = df[i].dtypes
    if f == object:
        print(i, "---", f)
        df[i] = pd.Categorical(df[i]).codes
In [6]:
df.fillna(7777, inplace=True)
In [7]:
X = df.drop('Stroke', axis=1)
y = df['Stroke']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123,stratify=y)
In [8]:
oversampling(y_train, X_train)
In [9]:
X_test = X_test.values
y_test = y_test.values
Xtrain_OV = Xtrain_OV.values
ytrain_OV = ytrain_OV.values
GradientBoostingClassifier in scikit-learn
In [10]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
GBC = GradientBoostingClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(GBC, Xtrain_OV, ytrain_OV, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
In [11]:
GBC = GradientBoostingClassifier()
bagging_GBC = BaggingClassifier(base_estimator=GBC, n_estimators=10, max_samples=0.8, max_features=0.8)

clf_list = [GBC, bagging_GBC]

from simple_colors import *

for k in clf_list:
    print()
    print(red('==================================================================='))
    print()
    print(red(k, 'bold'))
    print()
    k.fit(Xtrain_OV, ytrain_OV)
    Classification_Assessment(k, Xtrain_OV, ytrain_OV, X_test, y_test)
XGBoost – Extreme Gradient Boosting
In [12]:
from numpy import asarray
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
In [13]:
XGB = XGBClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#n_scores = cross_val_score(XGB, Xtrain_OV, ytrain_OV, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
In [14]:
bagging_XGB = BaggingClassifier(base_estimator=XGB, n_estimators=10, max_samples=0.8, max_features=0.8)

clf_list = [XGB, bagging_XGB]

from simple_colors import *

for k in clf_list:
    print()
    print(red('==================================================================='))
    print()
    print(red(k, 'bold'))
    print()
    k.fit(Xtrain_OV, ytrain_OV)
    Classification_Assessment(k, Xtrain_OV, ytrain_OV, X_test, y_test)
LightGBM – Light Gradient Boosted Machine
In [15]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
In [16]:
LGBM = LGBMClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#n_scores = cross_val_score(LGBM, Xtrain_OV, ytrain_OV, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
In [17]:
bagging_LGBM = BaggingClassifier(base_estimator=LGBM, n_estimators=10, max_samples=0.8, max_features=0.8)

clf_list = [LGBM, bagging_LGBM]

from simple_colors import *

for k in clf_list:
    print()
    print(red('==================================================================='))
    print()
    print(red(k, 'bold'))
    print()
    k.fit(Xtrain_OV, ytrain_OV)
    Classification_Assessment(k, Xtrain_OV, ytrain_OV, X_test, y_test)
CatBoost – CatBoost Classifier
In [18]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
In [19]:
CBC = CatBoostClassifier(verbose=0, n_estimators=100)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#n_scores = cross_val_score(CBC, Xtrain_OV, ytrain_OV, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
In [20]:
bagging_CBC = BaggingClassifier(base_estimator=CBC, n_estimators=10, max_samples=0.8, max_features=0.8)

clf_list = [CBC, bagging_CBC]

from simple_colors import *

for k in clf_list:
    print()
    print(red('==================================================================='))
    print()
    print(red(k, 'bold'))
    print()
    k.fit(Xtrain_OV, ytrain_OV)
    Classification_Assessment(k, Xtrain_OV, ytrain_OV, X_test, y_test)
