In this case PCA did not improve the model. For other models, however, PCA can be an important way of improving model performance.
We load the data from the Titanic dataset.
In [1]:
import pandas as pd
df = pd.read_csv('/home/wojciech/Pulpit/1/kaggletrain.csv')
df = df.dropna(how='any')
df.dtypes
Out[1]:
In [2]:
df.columns
Out[2]:
In [3]:
df.head(3)
Out[3]:
Encoding categorical columns as numeric codes¶
In [4]:
df['Sex'] = pd.Categorical(df.Sex).codes
df['Ticket'] = pd.Categorical(df.Ticket).codes
df['Cabin'] = pd.Categorical(df.Cabin).codes
df['Embarked'] = pd.Categorical(df.Embarked).codes
Selecting variables and splitting them into training and test sets¶
In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
X = df[[ 'Pclass', 'Sex', 'Age','SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']]
y = df['Survived']
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size = 0.3, random_state = 0)
Data normalization (standardization)¶
PCA works best with a standardized feature set, so we standardize the features with StandardScaler.
In [6]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
Xtrain = sc.fit_transform(Xtrain)
Xtest = sc.transform(Xtest)
Principal component analysis (PCA)¶
In [7]:
from sklearn.decomposition import PCA
pca = PCA()
Xtrain = pca.fit_transform(Xtrain)
Xtest = pca.transform(Xtest)
We did not specify the number of components in the constructor, so all 9 components will be returned for both the training and the test set.
The PCA class exposes the explained_variance_ratio_ attribute, which returns the fraction of variance explained by each principal component.
In [8]:
explained_variance = pca.explained_variance_ratio_
In [9]:
SOK = np.round(explained_variance, decimals=2)
SOK
Out[9]:
In [10]:
# note: the ratios describe PCA components (ordered by explained variance);
# they are only paired with the column names here for easy printing
KOT = dict(zip(X, SOK))
KOT_sorted_keys = sorted(KOT, key=KOT.get, reverse=True)
for r in KOT_sorted_keys:
    print(r, KOT[r])
KOT
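A common way to decide how many components to keep (a sketch not present in the original notebook; it assumes the full 9-component pca fitted above is still in scope) is to look at the cumulative explained variance:
# cumulative share of variance explained by the first k components
cumulative = np.cumsum(explained_variance)
for k, total in enumerate(cumulative, start=1):
    print(k, 'components explain', round(float(total) * 100, 1), '% of the variance')
PCA can also pick the number of components for a target variance level, e.g. PCA(n_components=0.95) keeps as many components as are needed to explain 95% of the variance.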
We’re looking for the single best independent variable in the model¶
In [11]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
Xtrain = pca.fit_transform(Xtrain)
Xtest = pca.transform(Xtest)
In [12]:
from sklearn.ensemble import RandomForestClassifier
RF4 = RandomForestClassifier(max_depth=2, random_state=0)
RF4.fit(Xtrain, ytrain)
# Predicting the Test set results
y_pred1 = RF4.predict(Xtest)
In [13]:
# model assessment
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
print("Recall Training data: ", np.round(recall_score(ytrain, RF4.predict(Xtrain)), decimals=4))
print("Precision Training data: ", np.round(precision_score(ytrain, RF4.predict(Xtrain)), decimals=4))
print("----------------------------------------------------------------------")
print("Recall Test data: ", np.round(recall_score(ytest, RF4.predict(Xtest)), decimals=4))
print("Precision Test data: ", np.round(precision_score(ytest, RF4.predict(Xtest)), decimals=4))
print("----------------------------------------------------------------------")
print("Confusion Matrix Test data")
print(confusion_matrix(ytest, RF4.predict(Xtest)))
print("----------------------------------------------------------------------")
print(classification_report(ytest, RF4.predict(Xtest)))
# use predicted probabilities (not hard class labels) for the ROC curve
y_pred_proba = RF4.predict_proba(Xtest)[:, 1]
fpr, tpr, _ = metrics.roc_curve(ytest, y_pred_proba)
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr, tpr, label='Random Forest (auc = %0.2f)' % auc)
plt.xlabel('False Positive Rate', color='grey', fontsize=13)
plt.ylabel('True Positive Rate', color='grey', fontsize=13)
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.plot([0, 1], [0, 1],'r--')
plt.show()
print('auc',auc)
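The same evaluation block is repeated below for the two- and three-component models; a small helper (a sketch, not part of the original notebook, with the made-up name evaluate_model) could remove that duplication:
def evaluate_model(model, Xtrain, ytrain, Xtest, ytest):
    # print recall and precision for both splits, then the test confusion matrix and report
    for name, X_, y_ in [('Training', Xtrain, ytrain), ('Test', Xtest, ytest)]:
        pred = model.predict(X_)
        print('Recall', name, 'data:   ', np.round(recall_score(y_, pred), decimals=4))
        print('Precision', name, 'data:', np.round(precision_score(y_, pred), decimals=4))
    print(confusion_matrix(ytest, model.predict(Xtest)))
    print(classification_report(ytest, model.predict(Xtest)))
# usage: evaluate_model(RF4, Xtrain, ytrain, Xtest, ytest)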
We’re looking for the two best independent variables in the model¶
In [14]:
import numpy as np
from sklearn.model_selection import train_test_split
X = df[[ 'Pclass', 'Sex', 'Age','SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']]
y = df['Survived']
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size = 0.3, random_state = 0)
In [15]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
Xtrain = sc.fit_transform(Xtrain)
Xtest = sc.transform(Xtest)
PCA algorithm¶
In [16]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
Xtrain = pca.fit_transform(Xtrain)
Xtest = pca.transform(Xtest)
In [17]:
from sklearn.ensemble import RandomForestClassifier
RF2 = RandomForestClassifier(max_depth=2, random_state=0)
RF2.fit(Xtrain, ytrain)
# Predicting the Test set results
y_pred2 = RF2.predict(Xtest)
In [18]:
# model assessment
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
print("Recall Training data: ", np.round(recall_score(ytrain, RF2.predict(Xtrain)), decimals=4))
print("Precision Training data: ", np.round(precision_score(ytrain, RF2.predict(Xtrain)), decimals=4))
print("----------------------------------------------------------------------")
print("Recall Test data: ", np.round(recall_score(ytest, RF2.predict(Xtest)), decimals=4))
print("Precision Test data: ", np.round(precision_score(ytest, RF2.predict(Xtest)), decimals=4))
print("----------------------------------------------------------------------")
print("Confusion Matrix Test data")
print(confusion_matrix(ytest, RF2.predict(Xtest)))
print("----------------------------------------------------------------------")
print(classification_report(ytest, RF2.predict(Xtest)))
# use predicted probabilities (not hard class labels) for the ROC curve
y_pred_proba = RF2.predict_proba(Xtest)[:, 1]
fpr, tpr, _ = metrics.roc_curve(ytest, y_pred_proba)
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr, tpr, label='Random Forest (auc = %0.2f)' % auc)
plt.xlabel('False Positive Rate', color='grey', fontsize=13)
plt.ylabel('True Positive Rate', color='grey', fontsize=13)
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.plot([0, 1], [0, 1],'r--')
plt.show()
print('auc',auc)
We are looking for the three best independent variables in the model¶
In [19]:
import numpy as np
from sklearn.model_selection import train_test_split
X = df[[ 'Pclass', 'Sex', 'Age','SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']]
y = df['Survived']
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size = 0.3, random_state = 0)
In [20]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
Xtrain = sc.fit_transform(Xtrain)
Xtest = sc.transform(Xtest)
In [21]:
#### PCA algorithm
In [22]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
Xtrain = pca.fit_transform(Xtrain)
Xtest = pca.transform(Xtest)
In [23]:
from sklearn.ensemble import RandomForestClassifier
RF3 = RandomForestClassifier(max_depth=2, random_state=0)
RF3.fit(Xtrain, ytrain)
# Predicting the Test set results
y_pred = RF3.predict(Xtest)
In [24]:
# model assessment
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
print("Recall Training data: ", np.round(recall_score(ytrain, RF3.predict(Xtrain)), decimals=4))
print("Precision Training data: ", np.round(precision_score(ytrain, RF3.predict(Xtrain)), decimals=4))
print("----------------------------------------------------------------------")
print("Recall Test data: ", np.round(recall_score(ytest, RF3.predict(Xtest)), decimals=4))
print("Precision Test data: ", np.round(precision_score(ytest, RF3.predict(Xtest)), decimals=4))
print("----------------------------------------------------------------------")
print("Confusion Matrix Test data")
print(confusion_matrix(ytest, RF3.predict(Xtest)))
print("----------------------------------------------------------------------")
print(classification_report(ytest, RF3.predict(Xtest)))
# use predicted probabilities (not hard class labels) for the ROC curve
y_pred_proba = RF3.predict_proba(Xtest)[:, 1]
fpr, tpr, _ = metrics.roc_curve(ytest, y_pred_proba)
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr, tpr, label='Random Forest (auc = %0.2f)' % auc)
plt.xlabel('False Positive Rate', color='grey', fontsize=13)
plt.ylabel('True Positive Rate', color='grey', fontsize=13)
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.plot([0, 1], [0, 1],'r--')
plt.show()
print('auc',auc)
In [25]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(Xtrain)
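The two-component projection in reduced_data lends itself to a quick scatter plot (a sketch, not in the original; it assumes this cell runs right after the split, scaling and PCA steps above, so ytrain still lines up row for row with Xtrain):
# scatter of the first two principal components, coloured by survival
plt.figure(figsize=(7, 5))
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=ytrain, cmap='coolwarm', alpha=0.6)
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.title('Training data projected onto two principal components')
plt.colorbar(label='Survived')
plt.show()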
In [26]:
X.columns
Out[26]:
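To make the conclusion from the introduction (PCA did not help this particular model) easy to reproduce, a short loop can compare the variants side by side. This is a sketch under the same assumptions as above (df already cleaned and encoded, random_state=0); the exact AUC values will depend on that preprocessing:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']]
y = df['Survived']
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=0)

sc = StandardScaler()
Xtr_s = sc.fit_transform(Xtr)
Xte_s = sc.transform(Xte)

for n in [None, 1, 2, 3]:  # None keeps all 9 components
    if n is None:
        Xtr_p, Xte_p = Xtr_s, Xte_s
    else:
        pca = PCA(n_components=n)
        Xtr_p = pca.fit_transform(Xtr_s)
        Xte_p = pca.transform(Xte_s)
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(Xtr_p, ytr)
    proba = clf.predict_proba(Xte_p)[:, 1]
    print(n, 'components -> test AUC:', np.round(roc_auc_score(yte, proba), decimals=4))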
