
090420201150
In [1]:
import pandas as pd

# Load the Titanic training set; the string "-1" is treated as a missing value.
# NOTE(review): absolute local path — consider a configurable DATA_DIR.
csv_path = '/home/wojciech/Pulpit/1/tit_train.csv'
df = pd.read_csv(csv_path, na_values="-1")
df.head(2)
Out[1]:
I started with a nested loop that combines the variables in pairs¶
In [2]:
## how many variables there are
a,b = df.shape #<- how many columns we have (b is reused by the loops below)
b
Out[2]:
In [3]:
# Print every ordered pair of column names (columns 1..b-1; column 0 is skipped).
for left in range(1, b):
    left = df.columns[left]
    for right in range(1, b):
        right = df.columns[right]
        print(left, right)
Using a loop I find the columns with gaps (missing values), then fill them with an out-of-range value¶
In [4]:
print('NUMBER OF EMPTY RECORDS vs. FULL RECORDS')
print('----------------------------------------')
# For each column (skipping column 0) report missing vs. non-missing counts,
# but only for columns that actually contain gaps.
for pos in range(1, b):
    name = df.columns[pos]
    n_empty = df[name].isnull().sum()
    n_full = df[name].count()
    if n_empty > 0:
        print(name, "--------", n_empty, "--------", n_full)
In [5]:
# Replace every missing value with the sentinel -777 (far outside any real
# feature range).  Reassigning the result is equivalent to inplace=True and
# avoids the in-place-mutation anti-pattern.
df = df.fillna(-777)
In [6]:
# Drop any row that still contains a missing value, then verify the per-column
# null counts (these should all be zero after the -777 fill).
df = df.dropna(axis=0, how='any')
df.isnull().sum()
Out[6]:
In [7]:
df.shape  # (rows, columns) after the missing-value handling above
Out[7]:
Encodes discrete (categorical) variables¶
In [8]:
import numpy as np

a, b = df.shape  # <- how many columns we have
print('DISCRETE FUNCTIONS CODED')
print('------------------------')
# Integer-encode every object-dtype (categorical/string) column in place.
# Column 0 is skipped, matching the other loops in this notebook.
for pos in range(1, b):
    col = df.columns[pos]
    dtype = df[col].dtypes
    # `np.object` was deprecated in NumPy 1.20 and removed in 1.24
    # (AttributeError); the builtin `object` compares identically.
    # The original tested the same condition twice — merged into one branch.
    if dtype == object:
        print(col, "---", dtype)
        df[col] = pd.Categorical(df[col]).codes
I set up the LinearRegression() model
In [9]:
# Target / features split: 'Survived' is the label, everything else is input.
y = df['Survived']
X = df.drop(columns='Survived')
In [10]:
from sklearn.linear_model import LinearRegression
regr = LinearRegression()  # ordinary least squares; reused by the search loops below
I create loops for two variables based on LinearRegression ()¶
In [11]:
c,b = df.shape #<- how many columns we have
print('b: ',b)
a = list(range(1,b))  # candidate column positions 1..b-1 (column 0 skipped)
print('a :', a)
In [12]:
from sklearn import metrics

b = b - 2  # shrink the loop bound (original heuristic, kept as-is)
# The target and the full feature frame are loop-invariant — the original
# rebuilt both with df.drop() on every innermost iteration (a full frame
# copy each time).  Compute them once.
y = df['Survived']
features = df.drop('Survived', axis=1)
for i in range(1, b):
    i = a[i]
    for f in range(1, b):
        f = a[f]
        col = features.columns[[i, f]]  # <- column names for this pair
        X = features[col]               # <- the actual variant of the X set
        regr.fit(X, y)
        y_pred = regr.predict(X)
        R = regr.score(X, y)  # R^2 on the training data
        R2 = np.sqrt(metrics.mean_squared_error(y, y_pred))  # RMSE
        # NOTE(review): RR2 sums RMSE and R^2 — an unusual ranking score;
        # kept because the threshold below was tuned against it.
        RR2 = R2 + R
        if RR2 > 0.72:
            print(' RR2: %.3f' % RR2, col)
I create loops for three variables based on LinearRegression ()¶
In [13]:
from sklearn import metrics

# Loop-invariant target/features hoisted out of the triple loop — the
# original rebuilt them with df.drop() on every innermost iteration.
y = df['Survived']
features = df.drop('Survived', axis=1)
for i in range(1, b):
    i = a[i]
    for f in range(1, b):
        f = a[f]
        for g in range(1, b):
            g = a[g]
            col = features.columns[[i, f, g]]  # <- column names
            X = features[col]                  # <- the actual variant of X
            regr.fit(X, y)
            y_pred = regr.predict(X)
            R = regr.score(X, y)
            R2 = np.sqrt(metrics.mean_squared_error(y, y_pred))  # RMSE
            RR2 = R2 + R  # RMSE + R^2 (author's custom ranking score)
            if RR2 >= 0.757:
                print(' RR2: %.3f' % RR2, col)
I create loops for four variables based on LinearRegression ()¶
In [14]:
from sklearn import metrics

# Loop-invariant target/features hoisted out of the quadruple loop — the
# original rebuilt them with df.drop() on every innermost iteration.
y = df['Survived']
features = df.drop('Survived', axis=1)
for i in range(1, b):
    i = a[i]
    for f in range(1, b):
        f = a[f]
        for g in range(1, b):
            g = a[g]
            for r in range(1, b):
                r = a[r]
                col = features.columns[[i, f, g, r]]  # <- column names
                X = features[col]                     # <- the actual variant of X
                regr.fit(X, y)
                y_pred = regr.predict(X)
                R = regr.score(X, y)
                R2 = np.sqrt(metrics.mean_squared_error(y, y_pred))  # RMSE
                RR2 = R2 + R  # RMSE + R^2 (author's custom ranking score)
                if RR2 >= 0.761:
                    print(' RR2: %.3f' % RR2, col)
I am starting the RandomForestRegressor model
In [15]:
# Rebuild the full target/feature split and sanity-check the dimensions.
y = df['Survived']
X = df.drop(columns='Survived')
print(X.shape)
print(y.shape)
In [16]:
from sklearn.ensemble import RandomForestRegressor

# .fit() returns the estimator itself, so fitting on a separate line binds
# exactly the same object as the original chained call.
# NOTE(review): no random_state is set, so results vary between runs — confirm
# whether reproducibility matters here.
model_RFC1 = RandomForestRegressor()
model_RFC1.fit(X, y)
In [17]:
from sklearn import metrics

# Loop-invariant target/features hoisted out of the triple loop — the
# original rebuilt them with df.drop() on every innermost iteration.
y = df['Survived']
features = df.drop('Survived', axis=1)
for i in range(1, b):
    i = a[i]
    for f in range(1, b):
        f = a[f]
        for g in range(1, b):
            g = a[g]
            col = features.columns[[i, f, g]]  # <- column names
            X = features[col]                  # <- the actual variant of X
            model_RFC1.fit(X, y)
            y_pred2 = model_RFC1.predict(X)
            R = model_RFC1.score(X, y)
            R2 = np.sqrt(metrics.mean_squared_error(y, y_pred2))  # RMSE
            RR2 = R2 + R  # RMSE + R^2 (author's custom ranking score)
            if RR2 >= 1.05:
                print(' RR2: %.3f' % RR2, col)
My own brute-force "tractor" reaches similar conclusions to the other tools in the Feature Selection Techniques series.
It is just probably faster in its calculations…