
Forward selection is an iterative method in which we start with no features in the model. In each iteration we add the feature that best improves the model, and we stop when adding a new variable no longer improves the model's performance (or when a fixed feature budget is reached).
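As a minimal sketch of that greedy loop (illustrative pseudocode, not the library implementation; score() stands for any cross-validated metric such as R²):
# Greedy forward selection: grow the feature set one best feature at a time.
# score(features) is assumed to return a cross-validated metric such as R^2.
def forward_selection(candidates, score, k):
    selected = []
    candidates = list(candidates)
    while len(selected) < k and candidates:
        # Evaluate each remaining feature added to the current set.
        trials = {f: score(selected + [f]) for f in candidates}
        best = max(trials, key=trials.get)
        selected.append(best)
        candidates.remove(best)
        # (A stricter variant stops early when no candidate improves the score.)
    return selected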
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)
In [2]:
## colorful prints
def black(text):
    print('\033[30m', text, '\033[0m', sep='')
def red(text):
    print('\033[31m', text, '\033[0m', sep='')
def green(text):
    print('\033[32m', text, '\033[0m', sep='')
def yellow(text):
    print('\033[33m', text, '\033[0m', sep='')
def blue(text):
    print('\033[34m', text, '\033[0m', sep='')
def magenta(text):
    print('\033[35m', text, '\033[0m', sep='')
def cyan(text):
    print('\033[36m', text, '\033[0m', sep='')
def gray(text):
    print('\033[90m', text, '\033[0m', sep='')
In [3]:
df = pd.read_csv('/home/wojciech/Pulpit/6/Breast_Cancer_Wisconsin.csv')
green(df.shape)
df.head(3)
Out[3]:
Deleting unneeded columns
In [4]:
# Replace spaces in column names with underscores.
df = df.rename(columns={'concave points_worst': 'concave_points_worst',
                        'concave points_se': 'concave_points_se',
                        'concave points_mean': 'concave_points_mean'})
del df['Unnamed: 32']
del df['diagnosis']
del df['id']
In [5]:
df.isnull().sum()
Out[5]:
In [6]:
import seaborn as sns
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Out[6]:
Deleting duplicates
There were no duplicates in this dataset.
In [7]:
green(df.shape)
df.drop_duplicates(keep='first', inplace=True)
blue(df.shape)
In [8]:
blue(df.dtypes)
In [9]:
df.columns
Out[9]:
We choose compactness_mean as the continuous target variable
In [10]:
print('max:',df['compactness_mean'].max())
print('min:',df['compactness_mean'].min())
sns.distplot(np.array(df['compactness_mean']))
Out[10]:
Step Forward Selection
In [11]:
X = df.drop('compactness_mean', axis=1)
y = df['compactness_mean']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)
# If stratify=y raises an error, remove it.
I specify how many features the program should select:
In [12]:
k_features = 16
In [13]:
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
LR = LinearRegression()
sfs1 = sfs(LR, k_features=k_features, forward=True, floating=False, scoring='r2', verbose=2, cv=5)
sfs1 = sfs1.fit(X_train, y_train)
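As an aside, k_features need not be fixed in advance: mlxtend's SequentialFeatureSelector also accepts a (min, max) tuple or the string 'best', letting cross-validation pick the subset size. A sketch:
# Let cross-validation choose the subset size within a range.
sfs_auto = sfs(LR, k_features=(1, 16), forward=True, floating=False, scoring='r2', cv=5)
sfs_auto = sfs_auto.fit(X_train, y_train)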
In [14]:
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)
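Besides the raw indices, the fitted selector exposes the selected column names and the cross-validated score directly (standard mlxtend attributes):
print(sfs1.k_feature_names_)  # names of the selected columns
print(sfs1.k_score_)          # mean cross-validated R^2 of the final subset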
In [15]:
X.columns
Out[15]:
In [16]:
# feat_cols indexes X.columns (df without the target), so index X, not df,
# to avoid an off-by-one shift for columns after the dropped target.
new_cols = X.columns[feat_cols]
new_cols
Out[16]:
I create a dataset with reduced columns.
In [17]:
df2 = df[list(new_cols) + ['compactness_mean']]  # keep the target for the OLS step below
df2.head(3)
Out[17]:
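An equivalent route is to let the fitted selector slice the columns itself; transform (part of mlxtend's SequentialFeatureSelector) keeps only the chosen features:
# Equivalent: keep only the selected columns via the fitted selector.
X_train_sfs = sfs1.transform(X_train)
X_test_sfs = sfs1.transform(X_test)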
OLS linear regression model for variables before reduction
In [18]:
blue(df.shape)
In [19]:
X1 = df.drop('compactness_mean', axis=1)
y1 = df['compactness_mean']
In [20]:
from statsmodels.formula.api import ols
import statsmodels.api as sm
model = sm.OLS(y1, sm.add_constant(X1))
model_fit = model.fit()
print('R2:', round(model_fit.rsquared, 3))
blue(model_fit.summary())
OLS linear regression model for variables after reduction
In [21]:
X2 = df2.drop('compactness_mean', axis=1)
y2 = df2['compactness_mean']
In [22]:
from statsmodels.formula.api import ols
import statsmodels.api as sm
model = sm.OLS(y2, sm.add_constant(X2))
model_fit = model.fit()
print('R2:', round(model_fit.rsquared, 3))
blue(model_fit.summary())
red("The reduction of dimensions caused a deterioration of the model's properties")
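One caveat when comparing the two summaries: for nested OLS models, plain R² can never increase when predictors are dropped, so adjusted R², which penalizes the number of predictors, is the fairer yardstick. It is available on the fitted statsmodels results object:
# Adjusted R^2 penalizes model size, so it can favour the reduced model
# even when raw R^2 falls slightly.
print('adjusted R2:', round(model_fit.rsquared_adj, 3))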