
In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
# Load the training data and keep only the rows that are complete in the
# columns the model actually uses (target + three features).
# Calling dropna(how='any') without `subset` checks every column, so a row
# would be thrown away for a gap in a column that is never read downstream.
df = pd.read_csv('c:/1/kaggletrain.csv')
df = df.dropna(subset=['Survived', 'Age', 'Pclass', 'Fare'])
df.head(2)
Out[1]:
Podział na zmienne niezależne: 'Age', 'Pclass', 'Fare' i zmienną zależną: 'Survived'.
Tworzymy zbiory treningowy i testowy.
In [2]:
# Feature matrix and target, then a reproducible hold-out split
# (33% of the rows go to the test set, seed fixed at 42).
X = df[['Age', 'Pclass', 'Fare']]
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
Tworzę model RandomForestClassifier – model Random Forest nr 1
In [3]:
from sklearn.pipeline import Pipeline
# Classifier used for feature selection: a 30-tree random forest with
# balanced class weights and a fixed seed.
RF1 = RandomForestClassifier(
    n_estimators=30, random_state=42, class_weight="balanced"
)
Tworzę selektor cech RFECV (rekurencyjna eliminacja cech z walidacją krzyżową), używając jako estymatora modelu Random Forest nr 1
In [4]:
# Recursive feature elimination with 5-fold cross-validation, scored by
# ROC AUC, removing one feature per step and ranking features with RF1.
#
# The class is imported under a private alias so that binding the selector
# instance to the name `RFECV` (which the pipeline cell reads) no longer
# destroys the only reference to the class: the original
# `RFECV = RFECV(...)` raised TypeError when this cell was run a second time.
from sklearn.feature_selection import RFECV as _RFECVClass
RFECV = _RFECVClass(estimator=RF1, step=1, cv=5, scoring='roc_auc')
Ponownie tworzę model RandomForestClassifier – model Random Forest nr 2
In [5]:
# Second random forest (10 trees, balanced class weights, fixed seed);
# this is the estimator handed to the grid search.
RF2 = RandomForestClassifier(
    n_estimators=10, random_state=42, class_weight="balanced"
)
Tworzymy siatkę parametrów (GridSearchCV) dla modelu Random Forest nr 2
In [6]:
# Search max_depth in {2, 3} for RF2 with 5-fold CV, scored by ROC AUC.
Grid_RF2 = GridSearchCV(
    estimator=RF2,
    param_grid={'max_depth': [2, 3]},
    scoring='roc_auc',
    cv=5,
)
Tworzę pipeline
In [7]:
# Chain the feature selector into the grid-searched forest, fit the whole
# chain on the training split, then predict labels for the held-out rows.
pipeline = Pipeline(
    steps=[
        ('RFECV', RFECV),
        ('Grid_RF2', Grid_RF2),
    ]
)
pipeline.fit(X_train, y_train)
pipeline.predict(X_test)
Out[7]:
Blok diagnostyczny
In [8]:
import numpy as np
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# Predict the held-out rows once and reuse the labels for every metric below.
y_pred = pipeline.predict(X_test)

# Only the last expression of a cell is rendered, so the bare `co_matrix`
# in the middle of the cell was silently discarded and the header printed
# with nothing under it. Print the matrix explicitly instead.
co_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion_matrix')
print(co_matrix)

print(classification_report(y_test, y_pred))
print("Accuracy: ", np.round(metrics.accuracy_score(y_test, y_pred), decimals=2))
print("Precision: ", np.round(metrics.precision_score(y_test, y_pred), decimals=2))
print("Recall: ", np.round(metrics.recall_score(y_test, y_pred), decimals=2))
print("F1 score: ", np.round(metrics.f1_score(y_test, y_pred), decimals=2))
Ocena parametrów
In [28]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
# Report the grid search outcome: chosen parameters, their mean CV score,
# and the corresponding best estimator.
for attr_label, attr_value in (
    ('best_params_: ', Grid_RF2.best_params_),
    ('best_score_: ', Grid_RF2.best_score_),
    ('best_estimator_:', Grid_RF2.best_estimator_),
):
    print(attr_label, attr_value)
Wykres ROC
In [36]:
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
# Rank the test rows by predicted probability of the positive class instead
# of hard 0/1 labels: thresholded predictions collapse the ROC curve to a
# single operating point and understate the AUC.
# Assumes `Survived` is binary with 1 as the positive class - TODO confirm.
y_score = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
# NOTE: the name `auc` is kept for compatibility with the original cell; it
# rebinds the `auc` function imported from sklearn.metrics in this cell.
auc = metrics.roc_auc_score(y_test, y_score)

# The original plot call was fused with the xlabel call (a syntax error) and
# labelled the curve as logistic regression although the model is a forest.
plt.plot(fpr, tpr, label='Random Forest (auc = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
plt.xlabel('False Positive Rate', color='grey', fontsize=13)
plt.ylabel('True Positive Rate', color='grey', fontsize=13)
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")  # loc=4 is the same position; one call suffices
plt.show()
In [38]:
import numpy as np
from sklearn.metrics import roc_auc_score
# ROC AUC measures ranking quality, so it is computed from the predicted
# probability of the positive class rather than from thresholded labels.
# Assumes class 1 is the positive class (second predict_proba column) - TODO confirm.
roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])
Out[38]: