
Recursive Feature Elimination (RFE) is a greedy optimization algorithm that aims to find the best-performing feature subset. It repeatedly builds a model and sets aside the best- or worst-performing feature at each iteration, then constructs the next model from the remaining features until all features are exhausted. Finally, it ranks the features according to the order of their elimination.
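As a rough illustration of that loop, here is a minimal sketch (not scikit-learn's implementation; rfe_sketch and n_features_to_keep are illustrative names) that repeatedly fits a linear model and drops the feature with the smallest absolute coefficient:
import numpy as np
from sklearn.linear_model import LinearRegression

def rfe_sketch(X, y, n_features_to_keep):
    # X: pandas DataFrame of features, y: target vector
    remaining = list(X.columns)
    while len(remaining) > n_features_to_keep:
        model = LinearRegression().fit(X[remaining], y)
        # the feature with the smallest absolute coefficient is the weakest
        weakest = remaining[int(np.argmin(np.abs(model.coef_)))]
        remaining.remove(weakest)
    return remaining
scikit-learn's RFE additionally records the order in which features are eliminated, which becomes the ranking_ attribute used below.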
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)
In [2]:
## colorful prints (ANSI escape codes)
def black(text):
    print('\033[30m', text, '\033[0m', sep='')
def red(text):
    print('\033[31m', text, '\033[0m', sep='')
def green(text):
    print('\033[32m', text, '\033[0m', sep='')
def yellow(text):
    print('\033[33m', text, '\033[0m', sep='')
def blue(text):
    print('\033[34m', text, '\033[0m', sep='')
def magenta(text):
    print('\033[35m', text, '\033[0m', sep='')
def cyan(text):
    print('\033[36m', text, '\033[0m', sep='')
def gray(text):
    print('\033[90m', text, '\033[0m', sep='')
In [3]:
df = pd.read_csv('/home/wojciech/Pulpit/6/Breast_Cancer_Wisconsin.csv')
green(df.shape)
df.head(3)
Out[3]:
Deleting unneeded columns
In [4]:
# Rename the columns that contain spaces, so the old names do not linger as duplicates
df = df.rename(columns={'concave points_worst': 'concave_points_worst',
                        'concave points_se': 'concave_points_se',
                        'concave points_mean': 'concave_points_mean'})
del df['Unnamed: 32']
del df['diagnosis']
del df['id']
In [5]:
df.isnull().sum()
Out[5]:
In [6]:
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')
Out[6]:
Deleting duplicates
There were no duplicates.
In [7]:
green(df.shape)
df.drop_duplicates(keep='first', inplace=True)
blue(df.shape)
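A direct way to confirm the claim (a small supplementary check, not in the original notebook) is to count fully duplicated rows; run before drop_duplicates, a result of 0 means there is nothing to remove:
print('duplicated rows:', df.duplicated().sum())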
In [8]:
blue(df.dtypes)
In [9]:
df.columns
Out[9]:
We choose the continuous variable: compactness_mean
In [10]:
print('max:', df['compactness_mean'].max())
print('min:', df['compactness_mean'].min())
sns.distplot(df['compactness_mean'])
Out[10]:
Recursive Feature Elimination
In [11]:
X = df.drop('compactness_mean', axis=1)
y = df['compactness_mean']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)
# If it throws an error, remove stratify=y (stratification requires a categorical target).
I set the number of variables that will remain in the model
In [12]:
Num_v = 15
In [13]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
model = LinearRegression()
rfe = RFE(model, n_features_to_select=Num_v)
# Fit RFE and reduce X to the selected features
X_rfe = rfe.fit_transform(X, y)
model.fit(X_rfe, y)
print('Number of selected functions: ',rfe.n_features_)
print()
print('The mask of selected features: ',rfe.support_)
print()
print('The feature ranking:',rfe.ranking_)
print()
print('The external estimator:',rfe.estimator_)
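The choice Num_v = 15 is arbitrary. If you would rather let cross-validation pick the number of features, scikit-learn provides RFECV; a minimal sketch (the cv and scoring settings here are illustrative assumptions, not values from this notebook):
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression

# illustrative settings: 5-fold CV, R2 as the selection score
rfecv = RFECV(LinearRegression(), step=1, cv=5, scoring='r2')
rfecv.fit(X, y)
print('optimal number of features:', rfecv.n_features_)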
Using zip to display the feature ranking
In [14]:
PPS = rfe.ranking_
# zip with the columns of X, not df: df still contains the target compactness_mean,
# so zip(df, PPS) would misalign the names and ranks
KOT_MIC = dict(zip(X.columns, PPS))
KOT_sorted_keys_MIC = sorted(KOT_MIC, key=KOT_MIC.get, reverse=True)
for r in KOT_sorted_keys_MIC:
    print(r, KOT_MIC[r])
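An equivalent and tidier way to display the same ranking is a pandas Series indexed by the feature names (rank 1 marks the selected features):
ranking = pd.Series(rfe.ranking_, index=X.columns).sort_values()
print(ranking)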
In [15]:
new_cols = X.columns[rfe.support_]
In [16]:
df2 = df[new_cols].copy()   # .copy() avoids a SettingWithCopyWarning below
df2.head(3)
Out[16]:
We’re adding a result variable
In [17]:
df2['compactness_mean'] = df['compactness_mean']
df2.head(3)
Out[17]:
The Backward Elimination algorithm indicated that reducing the number of variables does not improve the model; therefore, the number of variables was left unchanged.
OLS linear regression model for variables before reduction
In [18]:
blue(df.shape)
In [19]:
X1 = df.drop('compactness_mean', axis=1)
y1 = df['compactness_mean']
In [20]:
import statsmodels.api as sm
model = sm.OLS(y1, sm.add_constant(X1))
model_fit = model.fit()
print('R2: %.6f' % model_fit.rsquared)
#blue(model_fit.summary())
OLS linear regression model for variables after reduction
In [21]:
blue(df2.shape)
In [22]:
X2 = df2.drop('compactness_mean', axis=1)
y2 = df2['compactness_mean']
In [23]:
model = sm.OLS(y2, sm.add_constant(X2))
model_fit = model.fit()
print('R2: %.6f' % model_fit.rsquared)
#blue(model_fit.summary())
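Note that plain R² can only stay the same or fall when regressors are removed, so it favours the larger model by construction. As a supplementary check (not in the original notebook), statsmodels' rsquared_adj penalises the number of regressors and gives a fairer comparison:
# refit both models and compare adjusted R2
fit_all = sm.OLS(y1, sm.add_constant(X1)).fit()
fit_reduced = sm.OLS(y2, sm.add_constant(X2)).fit()
print('adjusted R2, all features:     %.6f' % fit_all.rsquared_adj)
print('adjusted R2, reduced features: %.6f' % fit_reduced.rsquared_adj)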
red("The reduction of dimensions caused the deterioration of the model's properties")