
In [67]:
# Classification Assessment
def Classification_Assessment(model, Xtrain, ytrain, Xtest, ytest):
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import metrics
    from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                                 precision_recall_curve, precision_score, recall_score)
    import scikitplot as skplt
    from plot_metric.functions import BinaryClassification

    print("Recall Training data:    ", np.round(recall_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("Precision Training data: ", np.round(precision_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Recall Test data:        ", np.round(recall_score(ytest, model.predict(Xtest)), decimals=4))
    print("Precision Test data:     ", np.round(precision_score(ytest, model.predict(Xtest)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Confusion Matrix Test data")
    print(confusion_matrix(ytest, model.predict(Xtest)))
    print("----------------------------------------------------------------------")
    print("Evaluation on test data only:")
    print(classification_report(ytest, model.predict(Xtest)))

    ## ----------AUC-----------------------------------------
    print('---------------------')
    AUC_train_1 = metrics.roc_auc_score(ytrain, model.predict_proba(Xtrain)[:, 1])
    print('AUC_train: ', np.round(AUC_train_1, decimals=4))
    AUC_test_1 = metrics.roc_auc_score(ytest, model.predict_proba(Xtest)[:, 1])
    print('AUC_test:  ', np.round(AUC_test_1, decimals=4))
    print('---------------------')
    print("Accuracy Training data: ", np.round(accuracy_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("Accuracy Test data:     ", np.round(accuracy_score(ytest, model.predict(Xtest)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Evaluation on test data only:")
    y_probas1 = model.predict_proba(Xtest)[:, 1]   # probability of the positive class
    y_probas2 = model.predict_proba(Xtest)         # probabilities for both classes

    ### ---plot_roc_curve--------------------------------------------------------
    plt.figure(figsize=(13, 4))
    plt.subplot(1, 2, 1)
    bc = BinaryClassification(ytest, y_probas1, labels=["Class 1", "Class 2"])
    bc.plot_roc_curve()

    ### --------precision_recall_curve------------------------------------------
    plt.subplot(1, 2, 2)
    precision, recall, thresholds = precision_recall_curve(ytest, y_probas1)
    plt.plot(recall, precision, marker='.', label=model)   # legend shows the full model repr, placed below the axes
    plt.title('Precision recall curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc=(-0.30, -0.8))
    plt.show()

    ## ----------plot_roc-----------------------------------------
    skplt.metrics.plot_roc(ytest, y_probas2)

    f1_score_macro = np.round(metrics.f1_score(ytest, model.predict(Xtest), average='macro'), decimals=3)
    print("f1 score macro ", f1_score_macro)
    f1_score_micro = np.round(metrics.f1_score(ytest, model.predict(Xtest), average='micro'), decimals=3)
    print("f1 score micro ", f1_score_micro)
    print('-----------------------------------------------------------------------------')
    if f1_score_macro > f1_score_micro:
        print("1 (minority) is better classified than 0 (majority) - macro > micro")
    else:
        print("0 (majority) is better classified than 1 (minority) - micro > macro")
    print("The same holds true for AUC")
    print('-----------------------------------------------------------------------------')
    cal_1 = np.round(sum(ytest == 1) / (sum(ytest == 0) + sum(ytest == 1)), decimals=2) * 100
    cal_0 = np.round(sum(ytest == 0) / (sum(ytest == 0) + sum(ytest == 1)), decimals=2) * 100
    print('1 proportion:', cal_1)
    print('0 proportion:', cal_0)
In [2]:
def oversampling(ytrain, Xtrain):
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    global Xtrain_OV
    global ytrain_OV

    class1 = np.round(sum(ytrain == 1) / (sum(ytrain == 0) + sum(ytrain == 1)), decimals=2) * 100
    class0 = np.round(sum(ytrain == 0) / (sum(ytrain == 0) + sum(ytrain == 1)), decimals=2) * 100
    print("y = 0: ", sum(ytrain == 0), '-------', class0, '%')
    print("y = 1: ", sum(ytrain == 1), '-------', class1, '%')
    print('--------------------------------------------------------')
    ytrain.value_counts(dropna=False, normalize=True).plot(kind='pie', title='Before oversampling')
    plt.show()
    print()

    # how many copies of the minority class are needed to match the majority
    Proporcja = np.round(sum(ytrain == 0) / sum(ytrain == 1), decimals=0)
    Proporcja = Proporcja.astype(int)

    # replicate the minority-class rows and append them to the training set
    ytrain_OV = pd.concat([ytrain[ytrain == 1]] * Proporcja, axis=0)
    Xtrain_OV = pd.concat([Xtrain.loc[ytrain == 1, :]] * Proporcja, axis=0)
    ytrain_OV = pd.concat([ytrain, ytrain_OV], axis=0).reset_index(drop=True)
    Xtrain_OV = pd.concat([Xtrain, Xtrain_OV], axis=0).reset_index(drop=True)
    Xtrain_OV = pd.DataFrame(Xtrain_OV)
    ytrain_OV = pd.DataFrame(ytrain_OV)

    print("Before oversampling Xtrain: ", Xtrain.shape)
    print("Before oversampling ytrain: ", ytrain.shape)
    print('--------------------------------------------------------')
    print("After oversampling Xtrain_OV: ", Xtrain_OV.shape)
    print("After oversampling ytrain_OV: ", ytrain_OV.shape)
    print('--------------------------------------------------------')

    ax = plt.subplot(1, 2, 1)
    ytrain.value_counts(dropna=False, normalize=True).plot(kind='pie', title='Before oversampling')
    kot = pd.concat([ytrain[ytrain == 1]] * Proporcja, axis=0)
    kot = pd.concat([ytrain, kot], axis=0).reset_index(drop=True)
    ax = plt.subplot(1, 2, 2)
    kot.value_counts(dropna=False, normalize=True).plot(kind='pie', title='After oversampling')
    plt.show()
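For reference, the same naive random oversampling can be done with library helpers instead of hand-rolled pd.concat replication. A minimal sketch (oversample_resample is a hypothetical helper, not part of the original notebook) using sklearn.utils.resample:

import pandas as pd
from sklearn.utils import resample

def oversample_resample(Xtrain, ytrain, random_state=123):
    # draw minority-class rows with replacement until the classes are equal in size
    X_up, y_up = resample(Xtrain[ytrain == 1], ytrain[ytrain == 1],
                          replace=True, n_samples=sum(ytrain == 0),
                          random_state=random_state)
    X_bal = pd.concat([Xtrain[ytrain == 0], X_up]).reset_index(drop=True)
    y_bal = pd.concat([ytrain[ytrain == 0], y_up]).reset_index(drop=True)
    return X_bal, y_bal

The imbalanced-learn package's RandomOverSampler wraps the same idea in a single fit_resample call.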
In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv('/home/wojciech/Pulpit/1/Stroke_Prediction.csv')
print(df.shape)
print()
print(df.columns)
df.head(3)
Out[3]:
In [4]:
### A home-made tool for automatically encoding discrete (categorical) variables
In [5]:
a, b = df.shape   # <- b = number of columns
print('DISCRETE FEATURES ENCODED')
print('------------------------')
for i in range(1, b):          # starts at 1, leaving the first column out (as in the original)
    col = df.columns[i]
    f = df[col].dtype
    if f == object:            # np.object is deprecated; the plain built-in works
        print(col, "---", f)
        df[col] = pd.Categorical(df[col]).codes
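Note that pd.Categorical(...).codes simply maps each distinct string to an integer, and NaN to -1, so the fillna below only affects the numeric columns. A toy example (hypothetical data, not from the stroke set):

import pandas as pd
import numpy as np

s = pd.Series(['male', 'female', 'female', np.nan, 'male'])
print(pd.Categorical(s).codes)   # -> [ 1  0  0 -1  1]  (NaN becomes -1)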
In [6]:
df.fillna(7777, inplace=True)   # sentinel value for the remaining (numeric) missing entries
In [7]:
X = df.drop('Stroke', axis=1)
y = df['Stroke']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123, stratify=y)
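Because the classes are heavily imbalanced, stratify=y keeps the class proportions identical in both splits. A quick illustrative check (not in the original run):

print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))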
In [8]:
oversampling(y_train, X_train)
GradientBoostingClassifier in scikit-learn
In [9]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
GBC = GradientBoostingClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
In [10]:
# .values.ravel() flattens the one-column DataFrame to the 1-D array sklearn expects
n_scores = cross_val_score(GBC, Xtrain_OV, ytrain_OV.values.ravel(), scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
In [11]:
GBC.fit(Xtrain_OV, ytrain_OV.values.ravel())
y_pred_GBC = GBC.predict(X_test)
In [68]:
Classification_Assessment(GBC , Xtrain_OV, ytrain_OV, X_test, y_test)
XGBoost – Extreme Gradient Boosting
In [13]:
from numpy import asarray
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
In [14]:
XGB = XGBClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(XGB, Xtrain_OV, ytrain_OV.values.ravel(), scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
In [15]:
XGB.fit(Xtrain_OV, ytrain_OV.values.ravel())
y_pred_XGB = XGB.predict(X_test)
In [62]:
Classification_Assessment(XGB , Xtrain_OV, ytrain_OV, X_test, y_test)
LightGBM – Light Gradient Boosted Machine
In [18]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
In [19]:
LGBM = LGBMClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
In [20]:
LGBM.fit(Xtrain_OV, ytrain_OV.values.ravel())
y_pred_LGBM = LGBM.predict(X_test)
In [63]:
Classification_Assessment(LGBM , Xtrain_OV, ytrain_OV, X_test, y_test)
CatBoost – Cat Boost Classifier
In [22]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
In [23]:
CBC = CatBoostClassifier(verbose=0, n_estimators=100)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
In [24]:
CBC.fit(Xtrain_OV, ytrain_OV.values.ravel())
y_pred_CBC = CBC.predict(X_test)
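As an aside, CatBoost can consume string-valued categorical columns directly via cat_features, so the integer-coding step earlier in the notebook is not strictly required for it. A minimal sketch on made-up toy data (not the stroke set):

import pandas as pd
from catboost import CatBoostClassifier

X_toy = pd.DataFrame({'Gender': ['Male', 'Female', 'Female', 'Male'],
                      'Age':    [67, 54, 71, 49]})
y_toy = pd.Series([1, 0, 1, 0])

# cat_features lists the categorical columns by name (or index)
model = CatBoostClassifier(verbose=0, n_estimators=10, cat_features=['Gender'])
model.fit(X_toy, y_toy)
print(model.predict(X_toy))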
In [69]:
Classification_Assessment(CBC , Xtrain_OV, ytrain_OV, X_test, y_test)
The F1 score
The F1 score can be interpreted as a weighted average of precision and recall, where an F1 score reaches its best value at 1 and its worst at 0. The relative contributions of precision and recall to the F1 score are equal. The formula for the F1 score is:
F1 = 2 * (precision * recall) / (precision + recall)
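A quick sanity check of the formula against sklearn, on toy labels made up for illustration:

from sklearn.metrics import precision_score, recall_score, f1_score

y_true = [1, 1, 1, 0, 0, 0, 0, 1]
y_pred = [1, 0, 1, 0, 0, 1, 0, 1]

p = precision_score(y_true, y_pred)   # 3/4
r = recall_score(y_true, y_pred)      # 3/4
print(2 * p * r / (p + r))            # 0.75
print(f1_score(y_true, y_pred))       # 0.75 -- the same value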
Precision-Recall metric
Precision is a measure of result relevancy, while recall measures how many of the truly relevant results are returned.
- A large area under the curve represents both high recall and high precision.
- F1 'micro' weighs every sample equally. For example: if class 1 made up 40% of the data with a per-class F1 of 0.8, and class 2 made up 60% of the data with a per-class F1 of 0.2, then:
0.8 x 40% + 0.2 x 60% = 0.44
Because every sample is weighted equally, the result is representative of the class imbalance in the data.
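The averaging modes can be compared directly in sklearn; a small illustration on deliberately imbalanced toy labels (not the stroke data):

from sklearn.metrics import f1_score

y_true = [0]*6 + [1]*2            # majority class 0, minority class 1
y_pred = [0]*5 + [1] + [1, 0]     # most majority hits, minority half missed

print(f1_score(y_true, y_pred, average=None))        # per-class F1
print(f1_score(y_true, y_pred, average='macro'))     # unweighted mean over classes
print(f1_score(y_true, y_pred, average='micro'))     # pooled over all samples
print(f1_score(y_true, y_pred, average='weighted'))  # support-weighted mean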