In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)
In [2]:
## colorful prints (ANSI escape codes)
def black(text):
    print('\033[30m', text, '\033[0m', sep='')
def red(text):
    print('\033[31m', text, '\033[0m', sep='')
def green(text):
    print('\033[32m', text, '\033[0m', sep='')
def yellow(text):
    print('\033[33m', text, '\033[0m', sep='')
def blue(text):
    print('\033[34m', text, '\033[0m', sep='')
def magenta(text):
    print('\033[35m', text, '\033[0m', sep='')
def cyan(text):
    print('\033[36m', text, '\033[0m', sep='')
def gray(text):
    print('\033[90m', text, '\033[0m', sep='')
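A quick check that the helpers work; the messages below are placeholders:

green('setup OK')         # renders in green in a terminal or Jupyter output
red('something to flag')  # renders in red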
In [3]:
df = pd.read_csv('/home/wojciech/Pulpit/6/Breast_Cancer_Wisconsin.csv')
green(df.shape)
df.head(3)
Out[3]:
Deleting unneeded columns¶
In [4]:
del df['Unnamed: 32']
del df['diagnosis']
del df['id']
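Equivalently, the three columns can be dropped in a single call (a minimal sketch using the same column names):

df = df.drop(columns=['Unnamed: 32', 'diagnosis', 'id'])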
In [5]:
df.isnull().sum()
Out[5]:
In [6]:
import seaborn as sns
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Out[6]:
Deleting duplicates¶
There were no duplicates.
In [7]:
green(df.shape)
df.drop_duplicates(keep='first', inplace=True)
blue(df.shape)
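Since the shapes printed above are identical, a direct count confirms there is nothing to drop (a one-line sketch):

print('duplicate rows:', df.duplicated().sum())  # 0 for this dataset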
In [8]:
blue(df.dtypes)
In [9]:
df.columns
Out[9]:
We choose the continuous variable – compactness_mean¶
In [10]:
print('max:',df['compactness_mean'].max())
print('min:',df['compactness_mean'].min())
sns.distplot(np.array(df['compactness_mean']))
Out[10]:
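Note that sns.distplot is deprecated in seaborn 0.11+; a rough equivalent under the newer API would be (a sketch, assuming seaborn >= 0.11):

sns.histplot(df['compactness_mean'], kde=True)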
Pearson correlation¶
In [11]:
def matrix_plot(df, title):
    sns.set(style="ticks")
    corr = df.corr()
    corr = np.round(corr, decimals=2)
    mask = np.zeros_like(corr, dtype=bool)   # np.bool is deprecated; use the builtin
    mask[np.triu_indices_from(mask)] = True  # hide the redundant upper triangle
    f, ax = plt.subplots(figsize=(20, 20))
    # cmap = sns.diverging_palette(580, 10, as_cmap=True)
    cmap = sns.diverging_palette(180, 90, as_cmap=True)  # alternative color palette
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=0.3, center=0.03, annot=True,
                square=True, linewidths=.9, cbar_kws={"shrink": 0.8})
    plt.xticks(rotation=90)
    plt.title(title, fontsize=32, color='#0c343d', alpha=0.5)
    plt.show()
In [12]:
matrix_plot(df,'Pearson correlation')
Correlation to the result variable¶
In [13]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10,6))
CORREL = df.corr().sort_values('compactness_mean')
CORREL['compactness_mean'].plot(kind='barh',color='#0c343d',alpha=0.5)
plt.title('Correlation to the result variable', fontsize=20)
plt.xlabel('Correlation level')
plt.ylabel('Continuous independent variables')
Out[13]:
Finding variables that are highly correlated with the result variable¶
In [14]:
kot = abs(CORREL['compactness_mean'])
FAT = kot[kot>=0.7]
FAT
Out[14]:
Comparing variables in pairs¶
In [15]:
plt.barh(*zip(*FAT.items()),color='#0c343d',alpha=0.5)
plt.xticks(rotation=90)
Out[15]:
High autocorrelation chart¶
In [16]:
CORR = df.corr()
kot = CORR[CORR>=.9]
plt.figure(figsize=(6,4))
sns.heatmap(kot, cmap="Greens")
Out[16]:
Deleting correlated independent variables¶
The code below compares the correlation between each pair of variables and removes one of any two features whose correlation is 0.9 or higher. A vectorized alternative is sketched after the loop.
In [17]:
corr = df.corr()
kot = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i + 1, corr.shape[0]):
        if corr.iloc[i, j] >= 0.9:
            if kot[j]:
                kot[j] = False
selected_columns = df.columns[kot]
df2 = df[selected_columns]
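The same filter can be written without explicit loops. A minimal pandas/numpy sketch that mirrors the loop's raw (non-absolute) 0.9 threshold; df2_alt is a hypothetical name:

corr = df.corr()
# keep only the upper triangle (j > i); everything else becomes NaN
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
# drop column j if any earlier variable i correlates with it at >= 0.9
to_drop = [col for col in upper.columns if (upper[col] >= 0.9).any()]
df2_alt = df.drop(columns=to_drop)  # should match df2 from the loop above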
In [18]:
kot  # <== the loop produced a True/False mask with one entry per column
Out[18]:
Dimensions have been reduced¶
In [19]:
blue(df.shape)
green(df2.shape)
OLS linear regression model for variables before reduction¶
In [21]:
X1 = df.drop('compactness_mean', axis=1)
y1 = df['compactness_mean']
In [22]:
import statsmodels.api as sm
model = sm.OLS(y1, sm.add_constant(X1))
model_fit = model.fit()
print('R2:', model_fit.rsquared)
blue(model_fit.summary())
OLS linear regression model for variables after reduction¶
In [23]:
X2 = df2.drop('compactness_mean', axis=1)
y2 = df2['compactness_mean']
In [24]:
import statsmodels.api as sm
model = sm.OLS(y2, sm.add_constant(X2))
model_fit = model.fit()
print('R2:', model_fit.rsquared)
blue(model_fit.summary())
red("The reduction of dimensions caused a deterioration of the model's properties")
Eliminating variables previously selected in the FAT procedure¶
In [25]:
FAT
Out[25]:
In [26]:
df3 = df.drop(columns=['compactness_se', 'concave points_worst', 'concavity_worst',
                       'concave points_mean', 'compactness_worst', 'concavity_mean'])
In [27]:
X3 = df3.drop('compactness_mean', axis=1)
y3 = df3['compactness_mean']
In [28]:
import statsmodels.api as sm
model = sm.OLS(y3, sm.add_constant(X3))
model_fit = model.fit()
print('R2:', model_fit.rsquared)
blue(model_fit.summary())
red("The reduction of dimensions caused a deterioration of the model's properties")
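To put numbers behind these conclusions, the three specifications can be refit and compared side by side (a minimal sketch; fit_ols is a hypothetical helper, and X1/y1, X2/y2, X3/y3 are the pairs defined above):

import statsmodels.api as sm

def fit_ols(X, y):
    # add an intercept and fit ordinary least squares
    return sm.OLS(y, sm.add_constant(X)).fit()

for name, (X, y) in {'full (df)': (X1, y1),
                     '0.9-filter (df2)': (X2, y2),
                     'FAT-filter (df3)': (X3, y3)}.items():
    fit = fit_ols(X, y)
    print('%-16s R2: %.3f   adj. R2: %.3f' % (name, fit.rsquared, fit.rsquared_adj))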