
Procedura 2: RandomForestClassifier
In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
# Load the training data.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable location so the notebook runs on other machines.
df = pd.read_csv('c:/1/kaggletrain.csv')

# Drop only the rows that are incomplete in the columns the model actually uses
# (the three features and the target). Dropping on *any* missing value would
# also discard rows whose only gap is in a column that is never modelled.
df = df.dropna(subset=['Age', 'Pclass', 'Fare', 'Survived'])

# Preview the first two rows.
df.head(2)
Out[1]:
Podział na zmienne niezależne: 'Age', 'Pclass', 'Fare' i zmienną zależną: 'Survived'
Tworzymy zbiory treningowe i testowe
In [2]:
# Feature matrix (three predictors) and target vector.
feature_columns = ['Age', 'Pclass', 'Fare']
X = df[feature_columns]
y = df['Survived']

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
Tworzę model RandomForestClassifier – model Random Forest nr 1
In [3]:
# Base random-forest estimator: seeded for repeatability, with class weights
# set to "balanced".
clf = RandomForestClassifier(
    class_weight="balanced",
    random_state=0,
)
Tworzę selektor cech RFECV (rekurencyjna eliminacja cech z walidacją krzyżową), używając jako estymatora modelu Random Forest
In [4]:
from sklearn.model_selection import StratifiedKFold
# Recursive feature elimination with cross-validation around the random forest:
# one feature removed per step, scored by ROC AUC on 3 stratified folds.
rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=StratifiedKFold(n_splits=3),
    scoring='roc_auc',
)
Tworzę hiperparametry dla siatki grid
In [5]:
# Hyper-parameter grid for the search. The "estimator__" prefix routes each
# parameter to the estimator wrapped by the RFECV selector.
param_grid = {
    'estimator__n_estimators': [200, 500],
    # 'auto' is intentionally omitted: for classifiers it is an alias of 'sqrt'
    # (so it only doubled the work) and recent scikit-learn releases reject it.
    'estimator__max_features': ['sqrt', 'log2'],
    'estimator__max_depth': [4, 5, 6, 7, 8],
    'estimator__criterion': ['gini', 'entropy'],
}
Tworzę parametr k_fold
In [6]:
from sklearn.model_selection import StratifiedKFold
# Cross-validation splitter for the grid search: 3 stratified, shuffled folds
# with a fixed seed.
k_fold = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=0,
)
Tworzymy siatkę grid
In [7]:
# Grid search over param_grid, wrapping the RFECV selector and scoring each
# candidate by ROC AUC on the k_fold splits.
CV_rfc = GridSearchCV(
    estimator=rfecv,
    param_grid=param_grid,
    cv=k_fold,
    scoring='roc_auc',
)
In [8]:
import time
# Record the wall-clock start so a later cell can report the elapsed time.
start_time = time.time() ## timing: start of the measurement
print(time.ctime())
## Note: the estimation that follows takes a very long time to run!
In [9]:
# Run the full grid search on the training split.
CV_rfc.fit(X=X_train, y=y_train)
Out[9]:
In [10]:
# Report the seconds elapsed since start_time was recorded.
print('Pomiar czasu wykonania tego zadania')
elapsed_seconds = time.time() - start_time  # end of the measurement
print(elapsed_seconds)
Blok diagnostyczny
In [12]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

# Class predictions of the fitted search on the held-out split.
y_pred = CV_rfc.predict(X_test)

# Confusion matrix. It is printed explicitly: a bare expression is only
# displayed when it is the last line of a cell, so it was previously lost.
co_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion_matrix')
print(co_matrix)

# Per-class precision / recall / F1 table.
print(classification_report(y_test, y_pred))

# Headline metrics, rounded to two decimals.
print("Accuracy: ",np.round(metrics.accuracy_score(y_test, y_pred), decimals=2))
print("Precision: ",np.round(metrics.precision_score(y_test, y_pred), decimals=2))
print("Recall: ",np.round(metrics.recall_score(y_test, y_pred), decimals=2))
print("F1 score: ",np.round(metrics.f1_score(y_test, y_pred), decimals=2))
Ocena parametrów
In [14]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
# Report the winning hyper-parameters, their mean CV score and the refitted estimator.
for label, value in (
    ('best_params_: ', CV_rfc.best_params_),
    ('best_score_: ', CV_rfc.best_score_),
    ('best_estimator_:', CV_rfc.best_estimator_),
):
    print(label, value)
Wykres ROC
In [18]:
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
# A ROC curve needs continuous scores, not hard class labels: with labels the
# curve collapses to a single operating point and the AUC is understated.
# Column 1 is taken as the positive class -- assumes a binary 0/1 target; TODO confirm.
y_score = CV_rfc.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_score)

# NOTE(review): the name `auc` is kept for compatibility with the original cell,
# but it shadows the `auc` function imported from sklearn.metrics above.
auc = metrics.roc_auc_score(y_test, y_score)

# Model curve plus the chance diagonal for reference.
plt.plot(fpr, tpr, label='Random Forest (auc = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False Positive Rate', color='grey', fontsize=13)
plt.ylabel('True Positive Rate', color='grey', fontsize=13)
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
In [19]:
import numpy as np
from sklearn.metrics import roc_auc_score
# AUC from positive-class probabilities rather than hard labels; scoring the
# thresholded predictions discards the ranking information that AUC measures.
# Column 1 is taken as the positive class -- assumes a binary 0/1 target; TODO confirm.
roc_auc_score(y_test, CV_rfc.predict_proba(X_test)[:, 1])
Out[19]: