Review of models based on gradient boosting: XGBoost, LightGBM, CatBoost

240120202201

In [67]:
# Classification Assessment
def Classification_Assessment(model, Xtrain, ytrain, Xtest, ytest):
    """Print and plot an evaluation report for a fitted binary classifier.

    The report contains, in this order: recall and precision on the training
    and test data, the test confusion matrix, the test classification report,
    ROC AUC and accuracy for both sets, a ROC curve and a precision-recall
    curve for the test data, a scikit-plot ROC chart, macro/micro F1 on the
    test data, and the share of labels 1 and 0 in ``ytest``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    Xtrain, ytrain
        Training features and labels the model is scored on.
    Xtest, ytest
        Test features and labels the model is scored on.  Labels are compared
        against 0 and 1 when the class proportions are computed.

    Returns
    -------
    None
        Everything is written to stdout or drawn with matplotlib.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.metrics import (
        accuracy_score,
        classification_report,
        confusion_matrix,
        f1_score,
        precision_recall_curve,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    import scikitplot as skplt
    from plot_metric.functions import BinaryClassification

    # Predict once per data set and reuse the results for every metric.
    pred_train = model.predict(Xtrain)
    pred_test = model.predict(Xtest)
    proba_train = model.predict_proba(Xtrain)
    proba_test = model.predict_proba(Xtest)

    print("Recall Training data:     ", np.round(recall_score(ytrain, pred_train), decimals=4))
    print("Precision Training data:  ", np.round(precision_score(ytrain, pred_train), decimals=4))
    print("----------------------------------------------------------------------")
    print("Recall Test data:         ", np.round(recall_score(ytest, pred_test), decimals=4))
    print("Precision Test data:      ", np.round(precision_score(ytest, pred_test), decimals=4))
    print("----------------------------------------------------------------------")
    print("Confusion Matrix Test data")
    print(confusion_matrix(ytest, pred_test))
    print("----------------------------------------------------------------------")
    print('Valuation for test data only:')
    print(classification_report(ytest, pred_test))

    ## ----------AUC-----------------------------------------

    # Column 1 of predict_proba is used as the score of the positive class.
    print('---------------------')
    AUC_train_1 = roc_auc_score(ytrain, proba_train[:, 1])
    print('AUC_train: %.3f' % AUC_train_1)
    AUC_test_1 = roc_auc_score(ytest, proba_test[:, 1])
    print('AUC_test:  %.3f' % AUC_test_1)
    print('---------------------')

    print("Accuracy Training data:     ", np.round(accuracy_score(ytrain, pred_train), decimals=4))
    print("Accuracy Test data:         ", np.round(accuracy_score(ytest, pred_test), decimals=4))
    print("----------------------------------------------------------------------")
    print('Valuation for test data only:')

    y_probas1 = proba_test[:, 1]
    y_probas2 = proba_test

    ### ---plot_roc_curve--------------------------------------------------------
    plt.figure(figsize=(13, 4))

    plt.subplot(1, 2, 1)
    bc = BinaryClassification(ytest, y_probas1, labels=["Class 1", "Class 2"])
    bc.plot_roc_curve()

    ### --------precision_recall_curve------------------------------------------
    plt.subplot(1, 2, 2)
    precision, recall, _thresholds = precision_recall_curve(ytest, y_probas1)

    # The estimator itself is passed as the label, so the legend shows its repr.
    plt.plot(recall, precision, marker='.', label=model)
    plt.title('Precision recall curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc=(-0.30, -0.8))
    plt.show()

    ## ----------plot_roc-----------------------------------------
    skplt.metrics.plot_roc(ytest, y_probas2)

    f1_score_macro = np.round(f1_score(ytest, pred_test, average='macro'), decimals=3)
    print("f1 score macro           ", f1_score_macro)

    f1_score_micro = np.round(f1_score(ytest, pred_test, average='micro'), decimals=3)
    print("f1 score micro           ", f1_score_micro)

    print('-----------------------------------------------------------------------------')

    # The comparison is made on the values already rounded to 3 decimals.
    if f1_score_macro > f1_score_micro:
        print("1 (minority) is better classified than 0 (majority) - macro > micro")
    else:
        print('0 (majority) is better classified than 1 (minority)- micro > macro')

    print('Same holds true for AUC')
    print('-----------------------------------------------------------------------------')

    # Class shares are relative to the rows labelled 0 or 1 only.
    labels = np.ravel(ytest)
    n_pos = np.sum(labels == 1)
    n_neg = np.sum(labels == 0)
    cal_1 = np.round(n_pos / (n_neg + n_pos), decimals=2) * 100
    cal_0 = np.round(n_neg / (n_neg + n_pos), decimals=2) * 100
    print('1 proportion:', cal_1)
    print('0 proportion:', cal_0)
In [2]:
def oversampling(ytrain, Xtrain):
    """Oversample the rows labelled 1 by whole-copy duplication.

    The rows with ``ytrain == 1`` are appended ``round(n0 / n1)`` extra times
    to the original data, where ``n0`` and ``n1`` are the counts of labels 0
    and 1.  The results are published as module-level globals:

    * ``Xtrain_OV`` - DataFrame with the original rows followed by the copies,
    * ``ytrain_OV`` - one-column DataFrame with the matching labels.

    Class counts, shapes and before/after pie charts are printed/drawn.

    Parameters
    ----------
    ytrain : pandas.Series
        Labels; compared against 0 and 1.
    Xtrain : pandas.DataFrame
        Features, index-aligned with ``ytrain``.

    Raises
    ------
    ValueError
        If ``ytrain`` contains no rows labelled 1 (nothing to duplicate).
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd

    global Xtrain_OV
    global ytrain_OV

    is_pos = ytrain == 1
    n_neg = int((ytrain == 0).sum())
    n_pos = int(is_pos.sum())
    if n_pos == 0:
        raise ValueError("oversampling: ytrain contains no rows with label 1")

    calss1 = np.round(n_pos / (n_neg + n_pos), decimals=2) * 100
    calss0 = np.round(n_neg / (n_neg + n_pos), decimals=2) * 100

    print("y = 0: ", n_neg, '-------', calss0)
    print("y = 1: ", n_pos, '-------', calss1)
    print('--------------------------------------------------------')

    ytrain.value_counts(dropna = False, normalize=True).plot(kind='pie',title='Before oversampling')
    plt.show()
    print()

    # Number of extra copies of the label-1 rows (rounded class ratio).
    Proporcja = int(np.round(n_neg / n_pos, decimals=0))

    # Original rows first, then the duplicated label-1 rows; a ratio that
    # rounds to 0 simply adds no copies.
    y_balanced = pd.concat(
        [ytrain] + [ytrain[is_pos]] * Proporcja, axis=0
    ).reset_index(drop=True)
    X_balanced = pd.concat(
        [Xtrain] + [Xtrain.loc[is_pos, :]] * Proporcja, axis=0
    ).reset_index(drop=True)

    Xtrain_OV = pd.DataFrame(X_balanced)
    ytrain_OV = pd.DataFrame(y_balanced)

    print("Before oversampling Xtrain:     ", Xtrain.shape)
    print("Before oversampling ytrain:     ", ytrain.shape)
    print('--------------------------------------------------------')
    print("After oversampling Xtrain_OV:  ", Xtrain_OV.shape)
    print("After oversampling ytrain_OV:  ", ytrain_OV.shape)
    print('--------------------------------------------------------')

    # Side-by-side class shares before and after oversampling.
    ax = plt.subplot(1, 2, 1)
    ytrain.value_counts(dropna = False, normalize=True).plot(kind='pie',title='Before oversampling', ax=ax)

    ax = plt.subplot(1, 2, 2)
    y_balanced.value_counts(dropna = False, normalize=True).plot(kind='pie',title='After oversampling', ax=ax)
    plt.show()
In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings 

# Suppress every warning from here on.
warnings.filterwarnings("ignore")

# Read the stroke data set from a local CSV file.
df = pd.read_csv('/home/wojciech/Pulpit/1/Stroke_Prediction.csv')

# Print the shape and the column index, separated by one empty line.
print(df.shape, df.columns, sep="\n\n")

# Last expression of the cell: preview of the first three rows.
df.head(3)
(43400, 12)

Index(['ID', 'Gender', 'Age_In_Days', 'Hypertension', 'Heart_Disease',
       'Ever_Married', 'Type_Of_Work', 'Residence', 'Avg_Glucose', 'BMI',
       'Smoking_Status', 'Stroke'],
      dtype='object')
Out[3]:
ID Gender Age_In_Days Hypertension Heart_Disease Ever_Married Type_Of_Work Residence Avg_Glucose BMI Smoking_Status Stroke
0 31153 Male 1104.0 0 0 No children Rural 95.12 18.0 NaN 0
1 30650 Male 21204.0 1 0 Yes Private Urban 87.96 39.2 never smoked 0
2 17412 Female 2928.0 0 0 No Private Urban 110.89 17.6 NaN 0
In [4]:
### Tool for automatic encoding of discrete variables (home-made)
In [5]:
a, b = df.shape     # a = number of rows, b = number of columns

print('DISCRETE FUNCTIONS CODED')
print('------------------------')
# Replace every object-typed column by its integer category codes, in place.
# The first column is skipped, exactly as the original range(1, b) did.
for col in df.columns[1:]:
    col_dtype = df[col].dtypes
    # Compare with the builtin `object`: the `np.object` alias no longer
    # exists in current NumPy releases.
    if col_dtype == object:
        print(col, "---", col_dtype)
        df[col] = pd.Categorical(df[col]).codes
DISCRETE FUNCTIONS CODED
------------------------
Gender --- object
Ever_Married --- object
Type_Of_Work --- object
Residence --- object
Smoking_Status --- object
In [6]:
# Replace the missing values in df with the constant 7777, modifying df in place.
df.fillna(7777, inplace=True)
In [7]:
from sklearn.model_selection import train_test_split

# 'Stroke' is the target; every other column of df is used as a feature.
# NOTE(review): this keeps the 'ID' column among the features - confirm intended.
y = df['Stroke']
X = df.drop(columns='Stroke')

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=123,
    stratify=y,
)
In [8]:
# Oversample the training split only; the function publishes its results as
# the globals Xtrain_OV and ytrain_OV used by the model cells.
oversampling(y_train, X_train)
y = 0:  34094 ------- 98.0
y = 1:  626 ------- 2.0
--------------------------------------------------------

Before oversampling Xtrain:      (34720, 11)
Before oversampling ytrain:      (34720,)
--------------------------------------------------------
After oversampling Xtrain_OV:   (68524, 11)
After oversampling ytrain_OV:   (68524, 1)
--------------------------------------------------------

GradientBoostingClassifier in scikit-learn

In [9]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot

# Gradient boosting classifier with the library's default hyper-parameters.
GBC = GradientBoostingClassifier()
# Stratified 10-fold cross-validation repeated 3 times, with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
In [10]:
# Cross-validated accuracy on the oversampled training data, using all cores;
# a failing fold raises instead of being scored.
# NOTE(review): n_scores is not read anywhere in the visible cells.
# NOTE(review): the folds are drawn from the already oversampled data, so
# copies of one row can presumably land in both train and validation folds - confirm.
n_scores = cross_val_score(GBC, Xtrain_OV, ytrain_OV, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
In [11]:
# Fit on the oversampled training data, then predict labels for the test split.
GBC.fit(Xtrain_OV, ytrain_OV)
y_pred_GBC = GBC.predict(X_test)
In [68]:
# Report for GBC: oversampled data as the training set, original test split as the test set.
Classification_Assessment(GBC , Xtrain_OV, ytrain_OV, X_test, y_test)
Recall Training data:      0.8722
Precision Training data:   0.8
----------------------------------------------------------------------
Recall Test data:          0.7707
Precision Test data:       0.0597
----------------------------------------------------------------------
Confusion Matrix Test data
[[6618 1905]
 [  36  121]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.99      0.78      0.87      8523
           1       0.06      0.77      0.11       157

    accuracy                           0.78      8680
   macro avg       0.53      0.77      0.49      8680
weighted avg       0.98      0.78      0.86      8680

---------------------
AUC_train: 0.906
AUC_test:  0.858
---------------------
Accuracy Training data:      0.8262
Accuracy Test data:          0.7764
----------------------------------------------------------------------
Valuation for test data only:
f1 score macro            0.491
f1 score micro            0.776
-----------------------------------------------------------------------------
0 (majority) is better classified than 1 (minority)- micro > macro
Same holds true for AUC
-----------------------------------------------------------------------------
1 proportion: 2.0
0 proportion: 98.0

XGBoost – Extreme Gradient Boosting !

In [13]:
from numpy import asarray
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
In [14]:
# XGBoost classifier with the library's default hyper-parameters.
XGB = XGBClassifier()
# Stratified 10-fold cross-validation repeated 3 times, with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Cross-validated accuracy on the oversampled training data.
# NOTE(review): n_scores is not read anywhere in the visible cells.
n_scores = cross_val_score(XGB, Xtrain_OV, ytrain_OV, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
In [15]:
# Fit on the oversampled training data, then predict labels for the test split.
XGB.fit(Xtrain_OV, ytrain_OV)
# Stored under its own name so the GBC predictions in y_pred_GBC are not overwritten.
y_pred_XGB = XGB.predict(X_test)
In [62]:
# Report for XGB: oversampled data as the training set, original test split as the test set.
Classification_Assessment(XGB , Xtrain_OV, ytrain_OV, X_test, y_test)
Recall Training data:      1.0
Precision Training data:   0.968
----------------------------------------------------------------------
Recall Test data:          0.2548
Precision Test data:       0.0839
----------------------------------------------------------------------
Confusion Matrix Test data
[[8086  437]
 [ 117   40]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      8523
           1       0.08      0.25      0.13       157

    accuracy                           0.94      8680
   macro avg       0.53      0.60      0.55      8680
weighted avg       0.97      0.94      0.95      8680

---------------------
AUC_train: 0.999
AUC_test:  0.816
---------------------
Accuracy Training data:      0.9834
Accuracy Test data:          0.9362
----------------------------------------------------------------------
Valuation for test data only:
f1 score macro            0.547
f1 score micro            0.936
-----------------------------------------------
0 (majority) is better classified than 1 (minority)- micro > macro
Same holds true for AUC
-----------------------------------------------

LightGBM – Light Gradient Boosted Machine

In [18]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
In [19]:
# LightGBM classifier with the library's default hyper-parameters.
LGBM = LGBMClassifier()
# Stratified 10-fold cross-validation repeated 3 times, with a fixed seed.
# NOTE(review): cv is not passed to any call in the visible LightGBM cells.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
In [20]:
# Fit on the oversampled training data, then predict labels for the test split.
LGBM.fit(Xtrain_OV, ytrain_OV)
y_pred_LGBM = LGBM.predict(X_test)
In [63]:
# Report for LGBM: oversampled data as the training set, original test split as the test set.
Classification_Assessment(LGBM , Xtrain_OV, ytrain_OV, X_test, y_test)
Recall Training data:      0.9952
Precision Training data:   0.8964
----------------------------------------------------------------------
Recall Test data:          0.5605
Precision Test data:       0.0728
----------------------------------------------------------------------
Confusion Matrix Test data
[[7403 1120]
 [  69   88]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.99      0.87      0.93      8523
           1       0.07      0.56      0.13       157

    accuracy                           0.86      8680
   macro avg       0.53      0.71      0.53      8680
weighted avg       0.97      0.86      0.91      8680

---------------------
AUC_train: 0.988
AUC_test:  0.837
---------------------
Accuracy Training data:      0.9398
Accuracy Test data:          0.863
----------------------------------------------------------------------
Valuation for test data only:
f1 score macro            0.527
f1 score micro            0.863
-----------------------------------------------
0 (majority) is better classified than 1 (minority)- micro > macro
Same holds true for AUC
-----------------------------------------------

CatBoost – Cat Boost Classifier!

In [22]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
In [23]:
# CatBoost classifier created with verbose=0 and n_estimators=100.
CBC = CatBoostClassifier(verbose=0, n_estimators=100)
# Stratified 10-fold cross-validation repeated 3 times, with a fixed seed.
# NOTE(review): cv is not passed to any call in the visible CatBoost cells.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
In [24]:
# Fit on the oversampled training data, then predict labels for the test split.
CBC.fit(Xtrain_OV, ytrain_OV)
y_pred_CBC = CBC.predict(X_test)
In [69]:
# Report for CBC: oversampled data as the training set, original test split as the test set.
Classification_Assessment(CBC , Xtrain_OV, ytrain_OV, X_test, y_test)
Recall Training data:      0.9984
Precision Training data:   0.9188
----------------------------------------------------------------------
Recall Test data:          0.3631
Precision Test data:       0.062
----------------------------------------------------------------------
Confusion Matrix Test data
[[7660  863]
 [ 100   57]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.99      0.90      0.94      8523
           1       0.06      0.36      0.11       157

    accuracy                           0.89      8680
   macro avg       0.52      0.63      0.52      8680
weighted avg       0.97      0.89      0.93      8680

---------------------
AUC_train: 0.989
AUC_test:  0.814
---------------------
Accuracy Training data:      0.9549
Accuracy Test data:          0.8891
----------------------------------------------------------------------
Valuation for test data only:
f1 score macro            0.523
f1 score micro            0.889
-----------------------------------------------------------------------------
0 (majority) is better classified than 1 (minority)- micro > macro
Same holds true for AUC
-----------------------------------------------------------------------------
1 proportion: 2.0
0 proportion: 98.0

wskaźnik F1

Wynik F1 można interpretować jako średnią harmoniczną precision i recall, gdzie wynik F1 osiąga najlepszą wartość przy 1, a najgorszą przy 0. Względny wkład precision i recall do wyniku F1 jest równy. Wzór na wynik F1 to:

$$F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$$

Precision-Recall metric

Precision jest miarą trafności wyników, a Recall jest miarą liczby zwracanych naprawdę istotnych wyników.

  • Duży obszar pod krzywą reprezentuje zarówno wysoki Recall, jak i wysoką Precision.

F1 ‘micro’ – mikro waży każdą próbkę jednakowo. Przykład: klasa 1 stanowiła 40% danych, F1 dla tej klasy wynosi 0.8; klasa 2 stanowiła 60% danych, F1 dla tej klasy wynosi 0.2:
    0.8 x 40% + 0.2 x 60% = 0.44

Ponieważ każda próbka jest ważona równo, wynik odzwierciedla nierównowagę klas w danych.