import pandas as pd
df = pd.read_csv('/home/wojciech/Pulpit/1/kaggletrain.csv')
df = df.dropna(how='any')
df.dtypes
Out[1]:
In [2]:
df.columns
Out[2]:
In [3]:
df.head(3)
Out[3]:
In [4]:
df.dtypes
Out[4]:
In [5]:
df['Sex'] = pd.Categorical(df.Sex).codes
df['Ticket'] = pd.Categorical(df.Ticket).codes
df['Cabin'] = pd.Categorical(df.Ticket).codes
df['Embarked'] = pd.Categorical(df.Embarked).codes
In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
X = df[[ 'Pclass', 'Sex', 'Age','SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']]
y = df['Survived']
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size = 0.3, random_state = 0)
Simple classification model: Logistic regression model¶
In [7]:
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
Parameteres = {'C': np.power(10.0, np.arange(-3, 3))}
LR = LogisticRegression(warm_start = True)
LR_Grid = GridSearchCV(LR, param_grid = Parameteres, scoring = 'roc_auc', n_jobs = -1, cv=2)
LR_Grid.fit(Xtrain, ytrain)
y_pred_LRC = LR_Grid.predict(Xtest)
In [8]:
# ocena modelu
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
print("Recall Training data: ", np.round(recall_score(ytrain, LR_Grid.predict(Xtrain)), decimals=4))
print("Precision Training data: ", np.round(precision_score(ytrain, LR_Grid.predict(Xtrain)), decimals=4))
print("----------------------------------------------------------------------")
print("Recall Test data: ", np.round(recall_score(ytest, LR_Grid.predict(Xtest)), decimals=4))
print("Precision Test data: ", np.round(precision_score(ytest, LR_Grid.predict(Xtest)), decimals=4))
print("----------------------------------------------------------------------")
print("Confusion Matrix Test data")
print(confusion_matrix(ytest, LR_Grid.predict(Xtest)))
print("----------------------------------------------------------------------")
print(classification_report(ytest, LR_Grid.predict(Xtest)))
y_pred_proba = LR_Grid.predict_proba(Xtest)[::,1]
fpr, tpr, _ = metrics.roc_curve(ytest, y_pred_LRC)
auc = metrics.roc_auc_score(ytest, y_pred_LRC)
plt.plot(fpr, tpr, label='Logistic Regression (auc = plt.xlabel('False Positive Rate',color='grey', fontsize = 13)
plt.ylabel('True Positive Rate',color='grey', fontsize = 13)
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.legend(loc=4)
plt.plot([0, 1], [0, 1],'r--')
print('auc',auc)
plt.show()
The model that uses all independent variables has the value AUC = 0.72
We check which independent variables are the best¶
In [9]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X, y)
Out[9]:
In [10]:
importance = rfc.feature_importances_
In [11]:
importance = np.round(importance, decimals=3)
importance
Out[11]:
We sort variables by importance¶
This is a very useful function when there are many variables.
In [12]:
KOT = dict(zip(X, importance))
KOT_sorted_keys = sorted(KOT, key=KOT.get, reverse=True)
for r in KOT_sorted_keys:
print (r, KOT[r])
KOT
Warning! The variables with the highest values have the highest importance.
We only use the most relevant variables: ‘Sex’, ‘Age’, ‘Fare’, ‘Ticket’¶
In [13]:
import numpy as np
from sklearn.model_selection import train_test_split
X = df[[ 'Age', 'Sex', 'Fare','Ticket']]
y = df['Survived']
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size = 0.3, random_state = 0)
Simple classification model: Logistic regression model¶
In [14]:
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
Parameteres = {'C': np.power(10.0, np.arange(-3, 3))}
LR = LogisticRegression(warm_start = True)
LR_Grid = GridSearchCV(LR, param_grid = Parameteres, scoring = 'roc_auc', n_jobs = -1, cv=2)
LR_Grid.fit(Xtrain, ytrain)
y_pred_LRC = LR_Grid.predict(Xtest)
In [15]:
# ocena modelu
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
print("Recall Training data: ", np.round(recall_score(ytrain, LR_Grid.predict(Xtrain)), decimals=4))
print("Precision Training data: ", np.round(precision_score(ytrain, LR_Grid.predict(Xtrain)), decimals=4))
print("----------------------------------------------------------------------")
print("Recall Test data: ", np.round(recall_score(ytest, LR_Grid.predict(Xtest)), decimals=4))
print("Precision Test data: ", np.round(precision_score(ytest, LR_Grid.predict(Xtest)), decimals=4))
print("----------------------------------------------------------------------")
print("Confusion Matrix Test data")
print(confusion_matrix(ytest, LR_Grid.predict(Xtest)))
print("----------------------------------------------------------------------")
print(classification_report(ytest, LR_Grid.predict(Xtest)))
y_pred_proba = LR_Grid.predict_proba(Xtest)[::,1]
fpr, tpr, _ = metrics.roc_curve(ytest, y_pred_LRC)
auc = metrics.roc_auc_score(ytest, y_pred_LRC)
plt.plot(fpr, tpr, label='Logistic Regression (auc = plt.xlabel('False Positive Rate',color='grey', fontsize = 13)
plt.ylabel('True Positive Rate',color='grey', fontsize = 13)
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.legend(loc=4)
plt.plot([0, 1], [0, 1],'r--')
plt.show()
print('auc',auc)
Including the ‘Pclass’ variable has compromised model properties!¶
It is widely known that the first class passengers had a better chance of being saved.
Feature_importances analysis showed that the variable ‘Pclass’ is of little importance in the classification process.
The addition of this variable to the classification model resulted in a deterioration of the AUC surface.
In [16]:
import numpy as np
from sklearn.model_selection import train_test_split
X = df[[ 'Age', 'Sex', 'Fare','Ticket','Pclass']]
y = df['Survived']
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size = 0.3, random_state = 0)
In [17]:
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
Parameteres = {'C': np.power(10.0, np.arange(-3, 3))}
LR = LogisticRegression(warm_start = True)
LR_Grid = GridSearchCV(LR, param_grid = Parameteres, scoring = 'roc_auc', n_jobs = -1, cv=2)
LR_Grid.fit(Xtrain, ytrain)
y_pred_LRC = LR_Grid.predict(Xtest)
In [18]:
# ocena modelu
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
print("Recall Training data: ", np.round(recall_score(ytrain, LR_Grid.predict(Xtrain)), decimals=4))
print("Precision Training data: ", np.round(precision_score(ytrain, LR_Grid.predict(Xtrain)), decimals=4))
print("----------------------------------------------------------------------")
print("Recall Test data: ", np.round(recall_score(ytest, LR_Grid.predict(Xtest)), decimals=4))
print("Precision Test data: ", np.round(precision_score(ytest, LR_Grid.predict(Xtest)), decimals=4))
print("----------------------------------------------------------------------")
print("Confusion Matrix Test data")
print(confusion_matrix(ytest, LR_Grid.predict(Xtest)))
print("----------------------------------------------------------------------")
print(classification_report(ytest, LR_Grid.predict(Xtest)))
y_pred_proba = LR_Grid.predict_proba(Xtest)[::,1]
fpr, tpr, _ = metrics.roc_curve(ytest, y_pred_LRC)
auc = metrics.roc_auc_score(ytest, y_pred_LRC)
plt.plot(fpr, tpr, label='Logistic Regression (auc = plt.xlabel('False Positive Rate',color='grey', fontsize = 13)
plt.ylabel('True Positive Rate',color='grey', fontsize = 13)
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.legend(loc=4)
plt.plot([0, 1], [0, 1],'r--')
plt.show()
print('auc',auc)