
## colorful prints
def black(text):
    print('\033[30m', text, '\033[0m', sep='')
def red(text):
    print('\033[31m', text, '\033[0m', sep='')
def green(text):
    print('\033[32m', text, '\033[0m', sep='')
def yellow(text):
    print('\033[33m', text, '\033[0m', sep='')
def blue(text):
    print('\033[34m', text, '\033[0m', sep='')
def magenta(text):
    print('\033[35m', text, '\033[0m', sep='')
def cyan(text):
    print('\033[36m', text, '\033[0m', sep='')
def gray(text):
    print('\033[90m', text, '\033[0m', sep='')
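A quick check that the helpers work (any terminal or Jupyter frontend that understands ANSI escape codes will render the colors):
green('this line prints in green')
red('this line prints in red')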
I use the popular Titanic dataset.
import pandas as pd
df = pd.read_csv('/home/wojciech/Pulpit/1/tit_train.csv')
df.head(3)
Quick cleaning and removal of columns¶
df.drop(['PassengerId','Unnamed: 0', 'Cabin','Name','Ticket'], axis=1, inplace=True)
Select all discrete (object) columns and encode them numerically in bulk¶
blue(df.dtypes)
df = df.dropna(how='any')
gray(df.isnull().sum())
green(df.shape)
categorical_vars = df.describe(include=["object"]).columns
categorical_vars
from sklearn.preprocessing import LabelEncoder
df[categorical_vars] = df[categorical_vars].apply(LabelEncoder().fit_transform)
blue(df['Sex'].value_counts())
green(df['Embarked'].value_counts())
X = df.drop('Survived', axis=1)
y = df['Survived']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123, stratify=y)
# If train_test_split raises an error here, drop stratify=y.
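A quick sanity check that the stratified split kept the class balance in both sets:
blue(y_train.value_counts(normalize=True))
green(y_test.value_counts(normalize=True))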
CatBoostClassifier – First model¶
from catboost import CatBoostClassifier, Pool
# Initialize CatBoostClassifier
model_CB1 = CatBoostClassifier(
    custom_loss=['Accuracy'],
    random_seed=42,
    logging_level='Silent'
)
Model parameters:
- custom_loss – an additional metric computed during training; here ['Accuracy'] https://catboost.ai/docs/search/?query=%27Accuracy%27
- random_seed = 42 – the random seed used for training, so the random values are the same on every run.
- logging_level = 'Silent' – how much logging goes to standard output. 'Silent' – send no logging information at all. 'Verbose' – send per-iteration progress, so model.fit shows the whole learning path. 'Info' or 'Debug' – display additional information and the number of trees. A sketch of the 'Verbose' variant follows this list.
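For comparison, a minimal sketch of the same model with visible training progress (the configuration is otherwise identical to model_CB1):
# Same settings, but fit will print per-iteration learn metrics
model_verbose = CatBoostClassifier(
    custom_loss=['Accuracy'],
    random_seed=42,
    logging_level='Verbose'
)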
Two things are new with the CatBoostClassifier model. First, the model needs to be told which variables are categorical and which are numeric. This is declared via the cat_features parameter, which takes the indices of the columns holding categorical variables; the model requires this information.
Another novelty: categorical variables, e.g. raw text, do not need to be pre-encoded, because the algorithm encodes them into a numeric format internally.
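A minimal sketch of that second point, on a made-up toy frame with an unencoded string column (the names here are purely illustrative):
import pandas as pd
from catboost import CatBoostClassifier

toy = pd.DataFrame({'Sex': ['male', 'female', 'female', 'male'],
                    'Fare': [7.25, 71.28, 8.05, 53.10],
                    'Survived': [0, 1, 1, 0]})
toy_model = CatBoostClassifier(iterations=10, logging_level='Silent')
# cat_features points at the raw string column; no LabelEncoder needed
toy_model.fit(toy[['Sex', 'Fare']], toy['Survived'], cat_features=[0])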
A rule that assigns variables to the categorical set based on the number of unique values¶
import numpy as np
categorical_fuX = np.where(X_train.nunique() < 8)[0]
categorical_fuX
A rule that assigns variables to the categorical set based on the column dtype (non-float columns)¶
import numpy as np
# np.float was removed from recent NumPy releases; np.float64 is the safe spelling
categorical_ff = np.where(X_train.dtypes != np.float64)[0]
categorical_ff
X_train.columns[4]
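To double-check what both index arrays point at, they can be mapped back to column names with the color helpers from the top of the post:
blue(X_train.columns[categorical_fuX])
green(X_train.columns[categorical_ff])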
Model fit¶
cat_features = categorical_fuX
# Fit model
model_CB1.fit(X_train, y_train, cat_features)
# Get predicted classes
pred_y = model_CB1.predict(X_test)
blue(pred_y)
# Get predicted probabilities for each class
preds_proba = model_CB1.predict_proba(X_test)
blue(preds_proba[:6])
Since the goal is binary classification (0, 1), the classifier returns a probability matrix of shape (N, 2). The first column is the probability that the observation belongs to class 0, and the second column is the probability that it belongs to class 1.
The two values in each row must sum to 1.
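A quick check of that property on the matrix computed above:
import numpy as np
# every row of the probability matrix should sum to 1 (up to float rounding)
print(np.allclose(preds_proba.sum(axis=1), 1.0))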
# Get predicted RawFormulaVal
pred_raw = model_CB1.predict(X_test, prediction_type='RawFormulaVal')
blue(pred_raw[:25])
The required prediction type:
– prediction_type = 'Probability' – a probability for class 0 and for class 1, which together sum to 1 (e.g. [0.54761429 0.45238571]).
– prediction_type = 'Class' – hard class labels, 0 or 1.
– prediction_type = 'RawFormulaVal' – raw continuous scores. When classifying, passing continuous scores to the ROC assessment gives better results than hard labels, because AUC needs a ranking of the observations, and 0/1 labels collapse that ranking to a single threshold.
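The raw scores and the probabilities are directly linked: for the default binary Logloss objective, the class-1 probability is the sigmoid of the raw score (an assumption worth verifying on your own output):
import numpy as np
# sigmoid(raw score) should reproduce the class-1 probability column
p1_from_raw = 1.0 / (1.0 + np.exp(-pred_raw))
print(np.allclose(p1_from_raw, preds_proba[:, 1]))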
# Classification Assessment
def Classification_Assessment(model, Xtrain, ytrain, Xtest, ytest, y_pred):
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import metrics
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.metrics import recall_score, precision_score

    print("Recall Training data:    ", np.round(recall_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("Precision Training data: ", np.round(precision_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Recall Test data:        ", np.round(recall_score(ytest, model.predict(Xtest)), decimals=4))
    print("Precision Test data:     ", np.round(precision_score(ytest, model.predict(Xtest)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Confusion Matrix Test data")
    print(confusion_matrix(ytest, model.predict(Xtest)))
    print("----------------------------------------------------------------------")
    print(classification_report(ytest, model.predict(Xtest)))

    # y_pred carries the continuous scores (RawFormulaVal), which the ROC curve needs
    fpr, tpr, _ = metrics.roc_curve(ytest, y_pred)
    auc = metrics.roc_auc_score(ytest, y_pred)
    plt.plot(fpr, tpr, label='ROC curve (auc = %0.2f)' % auc)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlabel('False Positive Rate', color='grey', fontsize=13)
    plt.ylabel('True Positive Rate', color='grey', fontsize=13)
    plt.title('Receiver operating characteristic')
    plt.legend(loc='lower right')
    plt.show()
    print('auc', auc)
Classification_Assessment(model_CB1, X_train, y_train, X_test, y_test, pred_raw)
CatBoostClassifier – Second model¶
# CatBoostClassifier is already imported above
model_CB2 = CatBoostClassifier(iterations=2000,
                               learning_rate=1,
                               logging_level='Silent',
                               depth=2)
Iterations and learning rate
– By default, CatBoost builds 1000 trees (iterations = 1000). The number of iterations can be reduced to speed up training; when the number of iterations decreases, the learning rate should be increased.
– Reduce the learning rate if you observe overfitting. An early-stopping sketch follows this list.
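Instead of hand-picking the iteration count, CatBoost can also stop once a validation metric stalls. A minimal sketch, reusing the existing test split as the validation set (the 50-round patience is just an illustrative value):
# Hedged sketch: let CatBoost choose the best iteration via early stopping
model_es = CatBoostClassifier(iterations=2000,
                              learning_rate=1,
                              depth=2,
                              logging_level='Silent')
model_es.fit(X_train, y_train,
             cat_features=categorical_fuX,
             eval_set=(X_test, y_test),    # validation data for the stopping rule
             early_stopping_rounds=50,     # stop after 50 rounds with no improvement
             use_best_model=True)
blue(model_es.tree_count_)                 # number of trees actually kept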
cat_features = categorical_fuX
# Fit model
model_CB2.fit(X_train, y_train, cat_features)
# Get predicted RawFormulaVal
pred_raw2 = model_CB2.predict(X_test, prediction_type='RawFormulaVal')
blue(pred_raw2[:25])
Classification_Assessment(model_CB2 ,X_train, y_train, X_test, y_test, pred_raw2)
Comparison to the RandomForestClassifier model¶
This model is on steroids from the grid search, so its execution time is long.
First, however, the discrete variables stored as str had to be encoded into numeric values.
Which variables have non-numeric values?¶
import numpy as np
# np.object was removed from recent NumPy releases; plain object works the same here
categorical_X_train = np.where(X_train.dtypes == object)[0]
categorical_X_train
categorical_X_test = np.where(X_test.dtypes == object)[0]
categorical_X_test
kot = X_train.columns[categorical_X_train]
blue(kot)
kot2 = X_test.columns[categorical_X_test]
green(kot2)
from sklearn.preprocessing import LabelEncoder
X_train[kot] = X_train[kot].apply(LabelEncoder().fit_transform)
X_test[kot2] = X_test[kot2].apply(LabelEncoder().fit_transform)
X_train.head()
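One caveat about the two apply calls above: fitting LabelEncoder independently on the train and test frames can map the same category to different integers in the two sets. A safer sketch fits one encoder per column on the training data only and reuses it on the test data (written as an alternative, not a continuation of the cell above):
from sklearn.preprocessing import LabelEncoder

# one encoder per categorical column, fitted on train, reused on test
encoders = {col: LabelEncoder().fit(X_train[col]) for col in kot}
for col in kot:
    X_train[col] = encoders[col].transform(X_train[col])
    X_test[col] = encoders[col].transform(X_test[col])  # raises on unseen categories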
I start the RandomForestClassifier model with very strong grid-search optimization¶
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
import time

start_time = time.time()  ## timing: start of the measurement
print(time.ctime())
print('Measuring the execution time of this task')
rfc = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy']
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)
y_pred_RFC = CV_rfc.predict(X_test)
CZAS = (time.time() - start_time) / 60  ## timing: end of the measurement
r, C = df.shape
print('Time (min): ', CZAS)
print('Time per record (min): ', CZAS / r)
Classification_Assessment(CV_rfc, X_train, y_train, X_test, y_test, y_pred_RFC)
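After the grid search finishes, the winning configuration is available on the fitted object via the standard GridSearchCV attributes:
blue(CV_rfc.best_params_)   # parameter combination that won the 5-fold CV
green(CV_rfc.best_score_)   # its mean cross-validated accuracy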
Without a doubt, the new CatBoostClassifier model works better (faster and more accurately) than the highly optimized RandomForestClassifier.