In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)
In [13]:
## colorful prints
def black(text):
    print('\033[30m', text, '\033[0m', sep='')
def red(text):
    print('\033[31m', text, '\033[0m', sep='')
def green(text):
    print('\033[32m', text, '\033[0m', sep='')
def yellow(text):
    print('\033[33m', text, '\033[0m', sep='')
def blue(text):
    print('\033[34m', text, '\033[0m', sep='')
def magenta(text):
    print('\033[35m', text, '\033[0m', sep='')
def cyan(text):
    print('\033[36m', text, '\033[0m', sep='')
def gray(text):
    print('\033[90m', text, '\033[0m', sep='')
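A quick check of the helpers (assumes the notebook frontend renders ANSI escape codes):

In [ ]:
green('color helpers ready')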
In [14]:
df = pd.read_csv('/home/wojciech/Pulpit/6/qsar_oral_toxicity.csv', sep=';')
green(df.shape)
df.head(3)
Out[14]:
I’m looking for empty cells¶
In [15]:
import seaborn as sns
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')
Out[15]:
In [16]:
null_value = df.isnull().sum(axis=0)
null_value[null_value != 0]
Out[16]:
Mark empty cells as -999¶
In [50]:
df.fillna(-999, inplace=True)
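The heatmap and null counts above showed no gaps, so the fill is only a safety net; a minimal check that nothing slipped through:

In [ ]:
# The frame should contain no NaNs after the fill.
assert df.isnull().sum().sum() == 0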
In [18]:
df.shape
Out[18]:
Delete duplicates¶
There were no duplicates (the shape is unchanged after the drop).
In [19]:
green(df.shape)
df.drop_duplicates(keep='first', inplace=True)
blue(df.shape)
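An explicit count (a minimal check) backs up the claim that nothing was dropped:

In [ ]:
# Number of fully duplicated rows remaining (expected: 0).
df.duplicated().sum()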
In [20]:
blue(df.dtypes)
In [21]:
df.columns
Out[21]:
Encode the outcome variable¶
In [25]:
df['negative'] = pd.Categorical(df['negative']).codes
df['negative'].value_counts()
Out[25]:
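The raw counts above suggest a strong class imbalance; normalized shares (a minimal check) make it explicit:

In [ ]:
# Share of each class in the target; a large skew argues for stratified splits.
df['negative'].value_counts(normalize=True)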
In [27]:
df.rename(columns={'negative':'ident'}, inplace=True)
df['ident'].head(2)
Out[27]:
Step Forward Selection¶
In [28]:
X = df.drop('ident', axis=1)
y = df['ident']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)
# If the split raises an error, remove stratify=y.
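Because the classes are imbalanced, it is worth checking that the split preserves their proportions; a minimal sketch, with stratify=y as the usual remedy when it does not:

In [ ]:
# Class shares in train vs. test; large gaps suggest re-splitting with stratify=y.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))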
I specify how many of the best variables the selector should return:
In [29]:
k_features = 15
In [30]:
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
LR = LogisticRegression()
# Note: scoring='r2' is unusual for a classifier; 'roc_auc' or 'accuracy' would be more typical.
sfs1 = sfs(LR, k_features=k_features, forward=True, floating=False, scoring='r2', verbose=2, cv=5)
sfs1 = sfs1.fit(X_train,y_train)
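mlxtend records the cross-validated score at every step of the search; a minimal sketch for visualising it (assumes mlxtend's plotting module is available):

In [ ]:
# Plot the CV score as each feature is added, with a standard-deviation band.
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
plot_sfs(sfs1.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection (CV score per step)')
plt.grid(True)
plt.show()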
In [32]:
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)
In [33]:
# Map each selected column name to its positional index and list them.
selected = dict(zip(df.columns[feat_cols], feat_cols))
for name in sorted(selected, key=selected.get, reverse=True):
    print(name, selected[name])
In [37]:
new_cols = df.columns[feat_cols]
new_cols
Out[37]:
Create a dataset with reduced columns¶
In [39]:
df2 = df[new_cols].copy()
df2['ident'] = df['ident']
df2.head(3)
Out[39]:
In [40]:
# Classification Assessment
def Classification_Assessment(model, Xtrain, ytrain, Xtest, ytest, y_pred):
    import matplotlib.pyplot as plt
    from sklearn import metrics
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.metrics import recall_score, precision_score

    print("Recall Training data:     ", np.round(recall_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("Precision Training data:  ", np.round(precision_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Recall Test data:         ", np.round(recall_score(ytest, model.predict(Xtest)), decimals=4))
    print("Precision Test data:      ", np.round(precision_score(ytest, model.predict(Xtest)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Confusion Matrix Test data")
    print(confusion_matrix(ytest, model.predict(Xtest)))
    print("----------------------------------------------------------------------")
    print(classification_report(ytest, model.predict(Xtest)))

    # Build the ROC curve from predicted probabilities, not hard labels.
    y_pred_proba = model.predict_proba(Xtest)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(ytest, y_pred_proba)
    roc_auc = metrics.roc_auc_score(ytest, y_pred_proba)
    plt.plot(fpr, tpr, label='Logistic Regression (auc = %0.4f)' % roc_auc)
    plt.xlabel('False Positive Rate', color='grey', fontsize=13)
    plt.ylabel('True Positive Rate', color='grey', fontsize=13)
    plt.title('Receiver operating characteristic')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.legend(loc='lower right')
    plt.show()
    print('auc', roc_auc)
Logistic regression model for variables before reduction¶
In [41]:
blue(df.shape)
In [42]:
X1 = df.drop('ident', axis=1)
y1 = df['ident']
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.20, random_state=123,stratify=y1)
In [43]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X1_train,y1_train)
y1_pred = logmodel.predict(X1_test)
In [44]:
Classification_Assessment(logmodel, X1_train, y1_train, X1_test, y1_test, y1_pred)
Logistic regression model for variables after reduction¶
In [45]:
blue(df2.shape)
In [47]:
X2 = df2.drop('ident', axis=1)
y2 = df2['ident']
from sklearn.model_selection import train_test_split
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.20, random_state=123,stratify=y2)
In [48]:
from sklearn.linear_model import LogisticRegression
logmodel2 = LogisticRegression()
logmodel2.fit(X2_train,y2_train)
y2_pred = logmodel2.predict(X2_test)
In [49]:
Classification_Assessment(logmodel2, X2_train, y2_train, X2_test, y2_test, y2_pred)
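A compact side-by-side of the two fitted models (a minimal sketch reusing the objects above):

In [ ]:
from sklearn.metrics import roc_auc_score
# Compare test-set discrimination before and after feature reduction.
auc_full = roc_auc_score(y1_test, logmodel.predict_proba(X1_test)[:, 1])
auc_reduced = roc_auc_score(y2_test, logmodel2.predict_proba(X2_test)[:, 1])
print('ROC AUC, all features:     ', round(auc_full, 4))
print('ROC AUC, selected features:', round(auc_reduced, 4))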