
230320200907
Principal component analysis (PCA)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Read the stroke-prediction dataset from its CSV file and take a first look.
csv_path = '/home/wojciech/Pulpit/1/Stroke_Prediction_NUM.csv'
df = pd.read_csv(csv_path)
print(df.shape)  # (rows, columns)
df.head(n=5)  # preview; only rendered when run as a notebook cell
Analysis of the result variable’s balance level ¶
# Remove the 'Unnamed: 0' column (presumably a saved row index - confirm).
df.drop(columns='Unnamed: 0', inplace=True)
# Relative frequency of each target class, missing values included.
stroke_share = df['Stroke'].value_counts(normalize=True, dropna=False)
stroke_share
df.columns
Split into training and test sets
# Columns kept for modelling: 'Stroke' is the target, the rest are predictors.
model_columns = [
    'Hypertension', 'Heart_Disease', 'Avg_Glucose', 'BMI', 'Stroke',
    'Age_years', 'Gender_C', 'Ever_Married_C', 'Type_Of_Work_C',
    'Residence_C', 'Smoking_Status_C', 'Age_years_10_C',
]
df2 = df[model_columns]
y = df2['Stroke']
X = df2.drop(columns='Stroke')
from sklearn.model_selection import train_test_split
# Stratified split (33% held out for testing) with a fixed seed.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=148)
# Report the shape of every part of the split.
for caption, part in (
        ('Zbiór X treningowy: ', Xtrain),
        ('Zbiór X testowy: ', Xtest),
        ('Zbiór y treningowy: ', ytrain),
        ('Zbiór y testowy: ', ytest)):
    print(caption, part.shape)
# Count both classes in the training target.
n_negative = (ytrain == 0).sum()
n_positive = (ytrain == 1).sum()
print("ytrain = 0: ", n_negative)
print("ytrain = 1: ", n_positive)
# Rounded number of class-0 rows per class-1 row, as an integer repeat count.
Proporcja = np.round(n_negative / n_positive, decimals=0).astype(int)
print('Ilość 0 Stroke na 1 Stroke: ', Proporcja)
# Stack `Proporcja` copies of the positive labels.
# NOTE(review): these copies are later appended to the full `ytrain`, so the
# positive class ends up with Proporcja + 1 copies - confirm this is intended.
ytrain_OVSA = pd.concat([ytrain[ytrain == 1] for _ in range(Proporcja)], axis=0)
ytrain_OVSA.count()
We have replicated the rows whose target value is 1. Next, we replicate the corresponding rows of independent variables in the same way and add both sets of copies to the training set, so that the target and the independent variables still have the same number of rows.
# Replicate the feature rows of the positive class the same number of times
# as their labels were replicated.
minority_rows = Xtrain.loc[ytrain == 1, :]
Xtrain_OVSA = pd.concat([minority_rows for _ in range(Proporcja)], axis=0)
ytrain_OVSA.count()
# Append the copies to the original training data and renumber the rows.
ytrain_OVSA = pd.concat([ytrain, ytrain_OVSA], axis=0, ignore_index=True)
Xtrain_OVSA = pd.concat([Xtrain, Xtrain_OVSA], axis=0, ignore_index=True)
# Row counts before and after oversampling.
for caption, n_rows in (
        ("ilość elementów w zbiorze Xtrain: ", Xtrain['BMI'].count()),
        ("ilość elementów w zbiorze Xtrain_OVSA: ", Xtrain_OVSA['BMI'].count()),
        ("ilość elementów w zbiorze ytrain: ", ytrain.count()),
        ("ilość elementów w zbiorze ytrain_OVSA: ", ytrain_OVSA.count())):
    print(caption, n_rows)
Result set balance level:
# Pie chart of the class shares in the oversampled training target.
class_share = ytrain_OVSA.value_counts(normalize=True, dropna=False)
class_share.plot.pie()
Logistic regression model ¶
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Candidate values for C: powers of ten from 10^-3 to 10^2.
Parameteres = {'C': 10.0 ** np.arange(-3, 3)}
LR = LogisticRegression(warm_start=True)
# Pick C by 2-fold cross-validation scored on ROC AUC, using all CPU cores.
LR_Grid = GridSearchCV(
    estimator=LR,
    param_grid=Parameteres,
    scoring='roc_auc',
    cv=2,
    n_jobs=-1,
)
# Fit on the oversampled training data, then label the untouched test set.
LR_Grid.fit(Xtrain_OVSA, ytrain_OVSA)
y_pred_LRC = LR_Grid.predict(Xtest)
Model assessment:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
# Evaluate the baseline logistic regression on the training and test sets.
# Predictions are computed once and reused for every metric.
train_pred_LRC = LR_Grid.predict(Xtrain_OVSA)
print("Recall Training data: ", np.round(recall_score(ytrain_OVSA, train_pred_LRC), decimals=4))
print("Precision Training data: ", np.round(precision_score(ytrain_OVSA, train_pred_LRC), decimals=4))
print("----------------------------------------------------------------------")
print("Recall Test data: ", np.round(recall_score(ytest, y_pred_LRC), decimals=4))
print("Precision Test data: ", np.round(precision_score(ytest, y_pred_LRC), decimals=4))
print("----------------------------------------------------------------------")
print("Confusion Matrix Test data")
print(confusion_matrix(ytest, y_pred_LRC))
print("----------------------------------------------------------------------")
print(classification_report(ytest, y_pred_LRC))

# ROC curve and AUC must be computed from scores (the predicted probability of
# class 1), not from hard 0/1 labels; hard labels collapse the curve to a
# single operating point and distort the AUC.
y_pred_proba = LR_Grid.predict_proba(Xtest)[:, 1]
fpr, tpr, _ = metrics.roc_curve(ytest, y_pred_proba)
# NOTE: this rebinds the name `auc` imported from sklearn.metrics.
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr, tpr, label='Logistic Regression (auc = %0.2f)' % auc)
plt.xlabel('False Positive Rate', color='grey', fontsize=13)
plt.ylabel('True Positive Rate', color='grey', fontsize=13)
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
plt.show()
print('auc', auc)
Principal component analysis (PCA)¶
Standardization of Xtrain_OVSA and Xtest variables
from sklearn.preprocessing import StandardScaler
# Learn the scaling parameters from the oversampled training data only and
# apply the same transformation to both the training and the test features.
sc = StandardScaler()
sc.fit(Xtrain_OVSA)
X_train_PCA = sc.transform(Xtrain_OVSA)
X_test_PCA = sc.transform(Xtest)
for scaled in (X_train_PCA, X_test_PCA):
    print(scaled.shape)
PCA transformation to two principal components
from sklearn.decomposition import PCA
# Project the standardized features onto the first two principal components.
pca = PCA(n_components=2)
X_train_PCA2 = pca.fit_transform(X_train_PCA)
X_test_PCA2 = pca.transform(X_test_PCA)
# Capture the diagnostics now, while `pca` still describes the original
# feature space. After the refit below the ratio would always sum to 1 and
# the components would no longer be loadings on the original features.
explained_variance = pca.explained_variance_ratio_
pca_loadings = pca.components_
# Refit on the 2-D projection: the arrow plot further down draws
# pca.mean_ / pca.components_ in principal-component space, which requires a
# model whose attributes are two-dimensional.
pca.fit(X_train_PCA2)
explained_variance
pca_loadings
def draw_vector(v0, v1, ax=None):
    """Draw a red arrow from point *v0* to point *v1*.

    Parameters
    ----------
    v0 : start point of the arrow (x, y).
    v1 : end point of the arrow (x, y); the arrow head is drawn here.
    ax : object providing ``annotate`` (e.g. a matplotlib Axes). When it is
        ``None`` the current axes returned by ``plt.gca()`` are used.
    """
    if ax is None:
        ax = plt.gca()
    arrowprops = dict(arrowstyle='->',
                      linewidth=3,
                      color='red',
                      shrinkA=0, shrinkB=0)
    # An annotation with empty text draws only the arrow between the points.
    ax.annotate('', v1, v0, arrowprops=arrowprops)
# Scatter the test observations in principal-component space and overlay one
# arrow per component of `pca`.
plt.scatter(X_test_PCA2[:, 0], X_test_PCA2[:, 1], alpha=0.3)
for length, vector in zip(pca.explained_variance_, pca.components_):
    # Arrow length: three standard deviations along the component.
    v = vector * 3 * np.sqrt(length)
    draw_vector(pca.mean_, pca.mean_ + v)
plt.axis('equal')
We fit the logistic regression model again, this time on the two principal components
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Same search as for the baseline model, run on the two principal components.
# Candidate values for C: powers of ten from 10^-3 to 10^2.
Parameteres = {'C': 10.0 ** np.arange(-3, 3)}
LR = LogisticRegression(warm_start=True)
LR_Grid2 = GridSearchCV(
    estimator=LR,
    param_grid=Parameteres,
    scoring='roc_auc',
    cv=2,
    n_jobs=-1,
)
LR_Grid2.fit(X_train_PCA2, ytrain_OVSA)
y_pred_LRC2 = LR_Grid2.predict(X_test_PCA2)
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
# Evaluate the 2-component model on the training and test sets.
# Predictions are computed once and reused for every metric.
train_pred_LRC2 = LR_Grid2.predict(X_train_PCA2)
print("Recall Training data: ", np.round(recall_score(ytrain_OVSA, train_pred_LRC2), decimals=4))
print("Precision Training data: ", np.round(precision_score(ytrain_OVSA, train_pred_LRC2), decimals=4))
print("----------------------------------------------------------------------")
print("Recall Test data: ", np.round(recall_score(ytest, y_pred_LRC2), decimals=4))
print("Precision Test data: ", np.round(precision_score(ytest, y_pred_LRC2), decimals=4))
print("----------------------------------------------------------------------")
print("Confusion Matrix Test data")
print(confusion_matrix(ytest, y_pred_LRC2))
print("----------------------------------------------------------------------")
print(classification_report(ytest, y_pred_LRC2))

# ROC curve and AUC must be computed from scores (the predicted probability of
# class 1), not from hard 0/1 labels; hard labels collapse the curve to a
# single operating point and distort the AUC.
y_pred_proba = LR_Grid2.predict_proba(X_test_PCA2)[:, 1]
fpr, tpr, _ = metrics.roc_curve(ytest, y_pred_proba)
# NOTE: this rebinds the name `auc` imported from sklearn.metrics.
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr, tpr, label='Logistic Regression (auc = %0.2f)' % auc)
plt.xlabel('False Positive Rate', color='grey', fontsize=13)
plt.ylabel('True Positive Rate', color='grey', fontsize=13)
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
plt.show()
print('auc', auc)
# Sanity check: the features and the labels the model was trained on must
# have the same number of rows, so compare against the oversampled target.
print(X_train_PCA2.shape)
print(ytrain_OVSA.shape)
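After PCA the reported AUC changed only marginally, from 0.750 to 0.753. A difference this small does not show a real improvement; the practical gain is that the model now uses 2 variables instead of 11.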
Cluster visualisation¶
A plot of this kind is possible only because two variables remain after the PCA transformation (there were 11 before). The AUC before and after the PCA transformation is similar, about 0.75. Now we can see what the separation of the classes looks like.
For the training set¶
# Decision regions of the 2-component model with the training observations
# drawn on top as a scatter plot.
from matplotlib.colors import ListedColormap
X_set, y_set = X_train_PCA2, ytrain_OVSA
# Dense grid covering the data range with a one-unit margin on every side.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
# Classify every grid point and colour the plane by the predicted class.
grid_points = np.array([X1.ravel(), X2.ravel()]).T
plt.contourf(X1, X2, LR_Grid2.predict(grid_points).reshape(X1.shape),
             alpha=0.75,
             cmap=ListedColormap(('pink', 'white', 'lightgreen')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
point_colors = ListedColormap(('red', 'green', 'blue'))
for i, j in enumerate(np.unique(y_set)):
    # Plain boolean array so the mask does not depend on the pandas index.
    mask = np.asarray(y_set == j)
    # `color=` is used because a single RGBA tuple passed through `c=` is
    # ambiguous to matplotlib.
    plt.scatter(X_set[mask, 0], X_set[mask, 1],
                color=point_colors(i), label=j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()
For the test set¶
# Decision regions of the 2-component model with the test observations
# drawn on top as a scatter plot.
from matplotlib.colors import ListedColormap
X_set, y_set = X_test_PCA2, ytest
# Dense grid covering the data range with a one-unit margin on every side.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
# Classify every grid point and colour the plane by the predicted class.
grid_points = np.array([X1.ravel(), X2.ravel()]).T
plt.contourf(X1, X2, LR_Grid2.predict(grid_points).reshape(X1.shape),
             alpha=0.75,
             cmap=ListedColormap(('pink', 'white', 'lightgreen')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
point_colors = ListedColormap(('red', 'green', 'blue'))
for i, j in enumerate(np.unique(y_set)):
    # Plain boolean array so the mask does not depend on the pandas index.
    mask = np.asarray(y_set == j)
    # `color=` is used because a single RGBA tuple passed through `c=` is
    # ambiguous to matplotlib.
    plt.scatter(X_set[mask, 0], X_set[mask, 1],
                color=point_colors(i), label=j)
# This figure shows the test set, so the title must say so.
plt.title('Logistic Regression (Test set)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()
PCA transformation to three principal components
from sklearn.decomposition import PCA
# Project the standardized features onto the first three principal
# components. fit_transform already fits `pca3` on the training data, so no
# separate fit call on the same data is needed afterwards.
pca3 = PCA(n_components=3)
X_train_PCA3 = pca3.fit_transform(X_train_PCA)
X_test_PCA3 = pca3.transform(X_test_PCA)
Explained variance of the first 3 principal components
The higher the share of explained variance, the more of the information in the original variables is retained.
# Share of the total variance captured by each of the three components.
explained_variance = pca3.explained_variance_ratio_
explained_variance
# Loadings of the 3-component model (`pca3`), not of the 2-component `pca`.
pca3.components_
Again, we fit the logistic regression model, this time on the three principal components
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Same search as before, run on the three principal components.
# Candidate values for C: powers of ten from 10^-3 to 10^2.
Parameteres = {'C': 10.0 ** np.arange(-3, 3)}
LR = LogisticRegression(warm_start=True)
LR_Grid3 = GridSearchCV(
    estimator=LR,
    param_grid=Parameteres,
    scoring='roc_auc',
    cv=2,
    n_jobs=-1,
)
LR_Grid3.fit(X_train_PCA3, ytrain_OVSA)
y_pred_LRC3 = LR_Grid3.predict(X_test_PCA3)
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
# Evaluate the 3-component model on the training and test sets.
# Predictions are computed once and reused for every metric.
train_pred_LRC3 = LR_Grid3.predict(X_train_PCA3)
print("Recall Training data: ", np.round(recall_score(ytrain_OVSA, train_pred_LRC3), decimals=4))
print("Precision Training data: ", np.round(precision_score(ytrain_OVSA, train_pred_LRC3), decimals=4))
print("----------------------------------------------------------------------")
print("Recall Test data: ", np.round(recall_score(ytest, y_pred_LRC3), decimals=4))
print("Precision Test data: ", np.round(precision_score(ytest, y_pred_LRC3), decimals=4))
print("----------------------------------------------------------------------")
print("Confusion Matrix Test data")
print(confusion_matrix(ytest, y_pred_LRC3))
print("----------------------------------------------------------------------")
print(classification_report(ytest, y_pred_LRC3))

# ROC curve and AUC must be computed from scores (the predicted probability of
# class 1), not from hard 0/1 labels; hard labels collapse the curve to a
# single operating point and distort the AUC.
y_pred_proba = LR_Grid3.predict_proba(X_test_PCA3)[:, 1]
fpr, tpr, _ = metrics.roc_curve(ytest, y_pred_proba)
# NOTE: this rebinds the name `auc` imported from sklearn.metrics.
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr, tpr, label='Logistic Regression (auc = %0.2f)' % auc)
plt.xlabel('False Positive Rate', color='grey', fontsize=13)
plt.ylabel('True Positive Rate', color='grey', fontsize=13)
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
plt.show()
print('auc', auc)
# Sanity check: the features and the labels the model was trained on must
# have the same number of rows, so compare against the oversampled target.
print(X_train_PCA3.shape)
print(ytrain_OVSA.shape)