Procedure 1: RandomForestClassifier

In [1]:

from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import pandas as pd 

df = pd.read_csv('c:/1/kaggletrain.csv')
df = df.dropna(how='any')
df.head(2)
Out[1]:
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th… female 38.0 1 0 PC 17599 71.2833 C85 C
3 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
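A side note: dropna(how='any') discards every row that has any missing value, which on the Kaggle Titanic training file removes most of the rows, mainly because Cabin is rarely filled in. A minimal sketch of an alternative, assuming only 'Age', 'Pclass' and 'Fare' are needed downstream, is to keep all rows and impute the missing ages (the df_alt name is illustrative):

from sklearn.impute import SimpleImputer
import pandas as pd

# Hypothetical alternative to dropna(): keep every row and fill missing
# Age values with the median age; Pclass and Fare have no gaps in this file.
df_alt = pd.read_csv('c:/1/kaggletrain.csv')
imputer = SimpleImputer(strategy='median')
df_alt[['Age']] = imputer.fit_transform(df_alt[['Age']])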

Split into the independent variables 'Age', 'Pclass', 'Fare' and the dependent variable 'Survived'.

We create the training and test sets.

In [2]:
y = df['Survived']
X = df[['Age', 'Pclass', 'Fare']]
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, 
                                                    random_state=42)
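If we want the survival rate to be similar in both sets, the split can also be stratified on the target; a minimal sketch, where the stratify argument is the only change from the call above:

# Hypothetical stratified variant of the split above: the class ratio of
# 'Survived' is preserved in both the training and the test set.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)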

I create a RandomForestClassifier: Random Forest model no. 1.

In [3]:
from sklearn.pipeline import Pipeline

# this is the classifier used for feature selection
RF1 = RandomForestClassifier(n_estimators=30,
                             random_state=42,
                             class_weight="balanced")

I create a recursive feature elimination selector (RFECV) that uses the Random Forest model no. 1 as its estimator.

In [4]:
rfecv = RFECV(estimator=RF1,
              step=1,
              cv=5,
              scoring='roc_auc')

Again I create a RandomForestClassifier: Random Forest model no. 2.

In [5]:
RF2 = RandomForestClassifier(n_estimators=10, 
                             random_state=42,
                             class_weight="balanced") 

We create a parameter grid (GridSearchCV) for Random Forest model no. 2.

In [6]:
Grid_RF2 = GridSearchCV(RF2,
                        param_grid={'max_depth': [2, 3]},
                        cv=5, scoring='roc_auc')
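The grid above varies only max_depth. A minimal sketch of a broader, purely illustrative grid that also tunes the number of trees and the minimum leaf size (more combinations means a longer fit):

# Hypothetical wider grid; the parameter names are standard
# RandomForestClassifier arguments, the value ranges are only an example.
wider_grid = GridSearchCV(RF2,
                          param_grid={'max_depth': [2, 3, 5],
                                      'n_estimators': [10, 50, 100],
                                      'min_samples_leaf': [1, 3, 5]},
                          cv=5, scoring='roc_auc')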

I create a pipeline: feature selection followed by the grid search.

In [7]:
pipeline = Pipeline([('RFECV', rfecv),
                     ('Grid_RF2', Grid_RF2)])

pipeline.fit(X_train, y_train)
pipeline.predict(X_test)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
Out[7]:
array([1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0], dtype=int64)
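Because the Pipeline fits its step objects in place, the fitted selector and the inner grid search can be inspected afterwards through named_steps; a minimal sketch (support_, ranking_ and best_params_ are standard RFECV and GridSearchCV attributes):

# Which of 'Age', 'Pclass', 'Fare' survived the recursive elimination,
# and what the inner grid search picked on the reduced feature set.
fitted_rfecv = pipeline.named_steps['RFECV']
print('selected features:', list(X.columns[fitted_rfecv.support_]))
print('feature ranking:  ', fitted_rfecv.ranking_)
print('best params:      ', pipeline.named_steps['Grid_RF2'].best_params_)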

Diagnostics block

In [8]:
import numpy as np
y_pred = pipeline.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

co_matrix = metrics.confusion_matrix(y_test, y_pred)

print('Confusion_matrix')
print(co_matrix)

print(classification_report(y_test, y_pred)) 

print("Accuracy:   ",np.round(metrics.accuracy_score(y_test, y_pred), decimals=2))
print("Precision:  ",np.round(metrics.precision_score(y_test, y_pred), decimals=2))
print("Recall:     ",np.round(metrics.recall_score(y_test, y_pred), decimals=2))
print("F1 score:   ",np.round(metrics.f1_score(y_test, y_pred), decimals=2))
Confusion_matrix
              precision    recall  f1-score   support

           0       0.54      0.57      0.55        23
           1       0.73      0.71      0.72        38

    accuracy                           0.66        61
   macro avg       0.64      0.64      0.64        61
weighted avg       0.66      0.66      0.66        61

Accuracy:    0.66
Precision:   0.73
Recall:      0.71
F1 score:    0.72
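The confusion matrix itself is easier to read with labelled rows and columns; a minimal sketch using pandas (the row and column labels are my own naming, not part of the original output):

import pandas as pd

# Wrap the confusion matrix in a DataFrame so rows (actual class) and
# columns (predicted class) are labelled.
cm_df = pd.DataFrame(co_matrix,
                     index=['actual 0', 'actual 1'],
                     columns=['predicted 0', 'predicted 1'])
print(cm_df)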

Parameter evaluation

In [28]:

print('best_params_:  ', Grid_RF2.best_params_)

print('best_score_:   ',Grid_RF2.best_score_)
print('best_estimator_:',Grid_RF2.best_estimator_)
best_params_:   {'max_depth': 3}
best_score_:    0.5983649607383937
best_estimator_: RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)
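Besides the single best combination, cv_results_ stores the mean cross-validated score for every grid point; a minimal sketch of tabulating it (the column names are standard GridSearchCV result keys):

import pandas as pd

# Mean and spread of the cross-validated ROC AUC for each max_depth tried.
results = pd.DataFrame(Grid_RF2.cv_results_)
print(results[['param_max_depth', 'mean_test_score', 'std_test_score']])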

ROC curve

In [36]:
import matplotlib.pyplot as plt
from sklearn import metrics

fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
roc_auc = metrics.roc_auc_score(y_test, y_pred)

plt.plot(fpr, tpr, label='RandomForestClassifier (auc = %0.2f)' % roc_auc)
plt.xlabel('False Positive Rate', color='grey', fontsize=13)
plt.ylabel('True Positive Rate', color='grey', fontsize=13)
plt.title('Receiver operating characteristic')
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.show()
In [38]:
import numpy as np
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test,  y_pred)
Out[38]:
0.6378718535469108
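One caveat: the curve and the score above are computed from hard 0/1 predictions, so the ROC effectively has a single operating point. A minimal sketch of the usual probability-based version, assuming the pipeline's final estimator exposes predict_proba (the resulting score will generally differ from the value above):

# ROC AUC from the predicted probability of the positive class instead of
# hard class labels; this evaluates every possible threshold.
y_proba = pipeline.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_proba))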