In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
In [2]:
import pandas as pd
df = pd.read_csv('/home/wojciech/Pulpit/1/kaggletrain.csv')
df = df.dropna(how='any')
print(df.columns)
print(df.shape)
df.dtypes
Out[2]:
In [3]:
del df['Unnamed: 0']
df.columns
Out[3]:
In [4]:
df.head(3)
Out[4]:
Converting categorical data to numeric codes¶
In [5]:
df['Sex'] = pd.Categorical(df.Sex).codes
df['Ticket'] = pd.Categorical(df.Ticket).codes
df['Cabin'] = pd.Categorical(df.Cabin).codes
df['Embarked'] = pd.Categorical(df.Embarked).codes
df.dtypes
Out[5]:
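It is worth checking which integer went to which category: pd.Categorical sorts the categories alphabetically by default, so 'female' should become 0 and 'male' 1. A minimal sketch (not from the original notebook) to verify the mapping:
cats = pd.Categorical(['male', 'female', 'female', 'male'])
# dict of code -> category; alphabetical order gives {0: 'female', 1: 'male'}
print(dict(enumerate(cats.categories)))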
In [6]:
df['Sex'] = df['Sex'].astype('int64')
df['Age'] = df['Age'].astype('int64')
df.dtypes
Out[6]:
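One caveat about the cast above: the Kaggle Titanic data records fractional ages for infants (for example 0.42 or 0.92), and astype('int64') truncates toward zero instead of rounding. A small illustration (an added sketch, not part of the original analysis):
# astype('int64') truncates toward zero; round() first for the nearest year
print(pd.Series([0.92, 29.5]).astype('int64').tolist())          # [0, 29]
print(pd.Series([0.92, 29.5]).round().astype('int64').tolist())  # [1, 30]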
Selecting variables and splitting into training and test sets¶
In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
df2 = df[['Sex','Age','Pclass','Survived']]
X = df2[['Sex','Age']]
y = df2['Survived']
print('X :',X.shape)
print('y :',y.shape)
# Left commented out: the models below are fitted on the full X, y.
# Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)
Converting the DataFrame to NumPy arrays¶
In [8]:
import numpy as np
y = np.asarray(y)
X = np.asarray(X)
In [9]:
print('X:',X.shape)
print('y:',y.shape)
Data normalization (standardization)¶
from sklearn.preprocessing import StandardScaler
# Note: this step requires the train/test split above to be uncommented;
# the models below are fitted on the full, unscaled X and y instead.
sc = StandardScaler()
Xtrain = sc.fit_transform(Xtrain)
Xtest = sc.transform(Xtest)
How a decision tree (the building block of a Random Forest) classifies depending on its depth¶
In [10]:
from helpers_05_08 import visualize_tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_blobs
fig, ax = plt.subplots(1, 4, figsize=(16, 3))
fig.subplots_adjust(left=0.02, right=0.98, wspace=0.1)
#X, y = make_blobs(n_samples=300, centers=4,
# random_state=0, cluster_std=1.0)
for axi, depth in zip(ax, range(1, 5)):
    model = DecisionTreeClassifier(max_depth=depth)
    visualize_tree(model, X, y, ax=axi)
    axi.set_title('depth = {0}'.format(depth))
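As a numeric complement to the plots above, one can check how training accuracy grows with depth; this is a sketch reusing the same X and y, and higher training accuracy at larger depths hints at memorization rather than generalization:
for depth in range(1, 5):
    tree = DecisionTreeClassifier(max_depth=depth).fit(X, y)
    print('depth = {0}: training accuracy = {1:.3f}'.format(depth, tree.score(X, y)))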
Random Forest model, depth 4¶
In [11]:
## MODEL
from sklearn.ensemble import RandomForestClassifier
RF4 = RandomForestClassifier(max_depth=4, random_state=0)
RF4.fit(X, y)
# Predict on the training data (no train/test split was applied)
y_pred4 = RF4.predict(X)

from matplotlib.colors import ListedColormap
X_set, y_set = X, y
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1,
                               stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1,
                               stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2,
             RF4.predict(np.array([X1.ravel(),
                                   X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('pink', 'white', 'grey')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red', 'green', 'blue'))(i), label=j)
plt.title('Random Forest, depth 4 (training set)')
plt.xlabel('Sex')
plt.ylabel('Age')
plt.legend()
plt.show()
The model classifies as survivors of the Titanic disaster mainly women (apart from infants), boys up to about 20 years old, and men between 20 and 30. This is how the model classifies passengers based on just two variables: sex and age.
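The same boundaries can be read off numerically by querying the fitted model for a few hypothetical passengers. This sketch assumes the alphabetical encoding above (Sex: 0 = female, 1 = male); the sample rows are illustrative, not taken from the data:
# columns are [Sex, Age]; Sex: 0 = female, 1 = male (assumed encoding)
samples = np.array([[0, 25], [1, 10], [1, 25], [1, 50]])
for row, pred in zip(samples, RF4.predict(samples)):
    print('Sex={0}, Age={1} -> Survived={2}'.format(row[0], row[1], pred))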
Visualization of the Random Forest classification using trees of depth 6¶
In [12]:
def visualize_classifier(model, X, y, ax=None, cmap='Reds'):
    ax = ax or plt.gca()
    # Plot the training points
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap,
               clim=(y.min(), y.max()), zorder=3)
    ax.axis('tight')
    ax.axis('off')
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    # Fit the estimator
    model.fit(X, y)
    xx, yy = np.meshgrid(np.linspace(*xlim, num=200),
                         np.linspace(*ylim, num=200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    # Create a color plot with the results
    n_classes = len(np.unique(y))
    contours = ax.contourf(xx, yy, Z, alpha=0.3,
                           levels=np.arange(n_classes + 1) - 0.5,
                           cmap=cmap, clim=(y.min(), y.max()),
                           zorder=1)
    ax.set(xlim=xlim, ylim=ylim)
## MODEL
from sklearn.ensemble import RandomForestClassifier
RF6 = RandomForestClassifier(max_depth=6, random_state=0)
RF6.fit(X, y)
# Predict on the training data (no train/test split was applied)
y_pred6 = RF6.predict(X)
visualize_classifier(RF6, X, y)
In [13]:
visualize_classifier(DecisionTreeClassifier(), X, y)
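The unconstrained tree carves far more ragged regions than the depth-6 forest. A quick, rough comparison on the training data (a sketch; visualize_classifier already refits whatever estimator it is given):
tree = DecisionTreeClassifier().fit(X, y)
print('single unpruned tree, training accuracy :', tree.score(X, y))
print('random forest depth 6, training accuracy:', RF6.score(X, y))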
We run a forest of 240 trees, each of depth 6¶
In [14]:
## MODEL
from sklearn.ensemble import RandomForestClassifier
RF6 = RandomForestClassifier(n_estimators=240, max_depth=6, random_state=0)
RF6.fit(X, y)
# Predict on the training data (no train/test split was applied)
y_pred6 = RF6.predict(X)

from matplotlib.colors import ListedColormap
X_set, y_set = X, y
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1,
                               stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1,
                               stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2,
             RF6.predict(np.array([X1.ravel(),
                                   X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('pink', 'white', 'grey')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red', 'green', 'blue'))(i), label=j)
plt.title('Random Forest, 240 trees of depth 6 (training set)')
plt.xlabel('Sex')
plt.ylabel('Age')
plt.legend()
plt.show()
Increasing the number of trees beyond about 100 has no effect for the variables 'Sex' and 'Age'.¶
In [16]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import validation_curve
## source: https://www.dezyre.com/recipes/plot-validation-curve-in-python
## Convert the data frame into a matrix
X = np.asarray(X)
y = np.asarray(y)
# Plot Validation Curve
# Create range of values for parameter
param_range = np.arange(1, 275, 2)
# Calculate accuracy on training and test set using range of parameter values
train_scores, test_scores = validation_curve(
    RandomForestClassifier(max_depth=6),
    X, y, param_name="n_estimators", param_range=param_range,
    cv=4, scoring="accuracy", n_jobs=-1)
# Calculate mean and standard deviation for training set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
# Calculate mean and standard deviation for test set scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
# Plot mean accuracy scores for training and test sets
plt.subplots(1, figsize=(17,5))
plt.plot(param_range, train_mean, label="Training score", color="black")
plt.plot(param_range, test_mean, label="Cross-validation score", color="dimgrey")
# Plot accuracy bands for training and test sets
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, color="gray")
plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, color="gainsboro")
# Create plot
plt.title("Validation Curve With Random Forest")
plt.xlabel("Number Of Trees")
plt.ylabel("Accuracy Score")
plt.tight_layout()
plt.legend(loc="best")
plt.show()
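As a rough numeric check of the plateau, the cross-validation scores can be read off at a few tree counts (a sketch reusing param_range and test_mean from the cell above):
for n in (11, 101, 241):
    idx = np.argmin(np.abs(param_range - n))
    print('n_estimators = {0}: mean CV accuracy = {1:.3f}'.format(param_range[idx], test_mean[idx]))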
