https://github.com/catboost/tutorials/blob/master/python_tutorial.ipynb

CatBoostClassifier sam koduje sobie zmienne tekstowe kategoryczne na zmienne kategoryczne wyrażone numerycznie. Jeżeli sami przeprowadzimy codowanie i zakodujemy zmienne kategoryczne na format cyfrowy, wyniki naszych modeli będą takie same (przynajmniej takie jest moje doświadczenie). Aby przeprowadzić eksperyment i przetestować model CatBoostClassifier bez wskazania na zmienne kategoryczne (cat_features) oraz ze wskazaniem na zmienne musimy sami zakodoać tekstowe zmienne kategoryczne na format cyfrowy. W przeciwnym razie gdy będziemy mieli zmienne tekstowe a nie wskarzemy CatBoostClassifier że to zmienne kategoryczne, wyskoczy nam błąd.

##  colorful prints
def black(text):
     print('33[30m', text, '33[0m', sep='')  
def red(text):
     print('33[31m', text, '33[0m', sep='')  
def green(text):
     print('33[32m', text, '33[0m', sep='')  
def yellow(text):     
     print('33[33m', text, '33[0m', sep='')  
def blue(text):
     print('33[34m', text, '33[0m', sep='') 
def magenta(text):
     print('33[35m', text, '33[0m', sep='')  
def cyan(text):
     print('33[36m', text, '33[0m', sep='')  
def gray(text):
     print('33[90m', text, '33[0m', sep='')

1.2 Załadowanie danych¶

inny sposób na załadowanie tych samych danych o Tytaniku.

from catboost.datasets import titanic
import numpy as np
import pandas as pd

train_df, test_df = titanic()

train_df.head()

Sprawdzam kompletność zbioru¶

metoda pokazuje tylko te zmienna, w których brakuje danych.

null_value_stats = train_df.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

Age         177
Cabin       687
Embarked      2
dtype: int64

W miejcu gdzie były puste rekordy wstawiana jest wartość -777¶

train_df.fillna(-777, inplace=True)
train_df.fillna(-777, inplace=True)

Dzielimy na zmienne opisujące i wynikowe¶

X = train_df.drop('Survived', axis=1)
y = train_df.Survived

Szukamy zmiennych kategorycznych¶

Zostały wybrane takie kolumny jako kolumny zmiennych kategorycznych.

print(X.dtypes)

categorical_features_indices = np.where(X.dtypes != np.float)[0]

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

array([ 0,  1,  2,  3,  5,  6,  7,  9, 10])

Ticket 10
Parch 9
SibSp 7
Age 6
Sex 5
Name 3
Pclass 2
Survived 1
PassengerId 0

array([ 1,  2,  4,  6,  7, 11])

<matplotlib.axes._subplots.AxesSubplot at 0x7f84617d24d0>

yhatA = model.predict(X_validation)
print(yhatA[:12])

[0 0 0 1 1 1 1 0 1 1 0 0]

y_train[12]

0

# Classification Assessment
def Classification_Assessment(model ,Xtrain, ytrain, Xtest, ytest, y_pred):
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import metrics
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
    from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
    def green(text):
        print('33[32m', text, '33[0m', sep='')  
    def blue(text):
        print('33[34m', text, '33[0m', sep='')         
    
    print("Recall Training data:     ", np.round(recall_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("Precision Training data:  ", np.round(precision_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Recall Test data:         ", np.round(recall_score(ytest, model.predict(Xtest)), decimals=4)) 
    print("Precision Test data:      ", np.round(precision_score(ytest, model.predict(Xtest)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Confusion Matrix Test data")
    print(confusion_matrix(ytest, model.predict(Xtest)))
    print("----------------------------------------------------------------------")
    green('Valuation for test data only:')
    print(classification_report(ytest, model.predict(Xtest)))
      
    green('Valuation for test data only:')
    y_pred_proba = model.predict_proba(Xtest)[::,1]
    fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred)
    auc = metrics.roc_auc_score(ytest, y_pred)
    plt.plot(fpr, tpr, label='ROC (roc_auc = %0.2f)' % auc)
    plt.xlabel('False Positive Rate',color='grey', fontsize = 13)
    plt.ylabel('True Positive Rate',color='grey', fontsize = 13)
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.legend(loc=4)
    plt.plot([0, 1], [0, 1],'r--')
    plt.show()
    print('roc_auc %.3f' % auc)
    
   
    blue('---------------------') 
    AUC_train_1 = metrics.roc_auc_score(ytrain,model.predict_proba(Xtrain)[:,1])
    blue('AUC_train: %.3f' % AUC_train_1)
    AUC_test_1 = metrics.roc_auc_score(ytest,model.predict_proba(Xtest)[:,1])
    blue('AUC_test:  %.3f' % AUC_test_1)
    blue('---------------------')    

      
    print("Accuracy Training data:     ", np.round(accuracy_score(ytrain, model.predict(Xtrain)), decimals=4))
    green("----------------------------------------------------------------------")
    print("Accuracy Test data:         ", np.round(accuracy_score(ytest, model.predict(Xtest)), decimals=4)) 
    green("----------------------------------------------------------------------")

categorical_features_indices

array([ 0,  1,  2,  3,  5,  6,  7,  9, 10])

Wyświetlamy co to za kolumny¶

PPS = categorical_features_indices

KOT_MIC = dict(zip(train_df, PPS))
KOT_sorted_keys_MIC = sorted(KOT_MIC, key=KOT_MIC.get, reverse=True)

for r in KOT_sorted_keys_MIC:
    print (r, KOT_MIC[r])

Ticket 10
Parch 9
SibSp 7
Age 6
Sex 5
Name 3
Pclass 2
Survived 1
PassengerId 0

array([ 1,  2,  4,  6,  7, 11])

<matplotlib.axes._subplots.AxesSubplot at 0x7f84617d24d0>

yhatA = model.predict(X_validation)
print(yhatA[:12])

[0 0 0 1 1 1 1 0 1 1 0 0]

y_train[12]

0

# Classification Assessment
def Classification_Assessment(model ,Xtrain, ytrain, Xtest, ytest, y_pred):
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import metrics
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
    from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
    def green(text):
        print('33[32m', text, '33[0m', sep='')  
    def blue(text):
        print('33[34m', text, '33[0m', sep='')         
    
    print("Recall Training data:     ", np.round(recall_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("Precision Training data:  ", np.round(precision_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Recall Test data:         ", np.round(recall_score(ytest, model.predict(Xtest)), decimals=4)) 
    print("Precision Test data:      ", np.round(precision_score(ytest, model.predict(Xtest)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Confusion Matrix Test data")
    print(confusion_matrix(ytest, model.predict(Xtest)))
    print("----------------------------------------------------------------------")
    green('Valuation for test data only:')
    print(classification_report(ytest, model.predict(Xtest)))
      
    green('Valuation for test data only:')
    y_pred_proba = model.predict_proba(Xtest)[::,1]
    fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred)
    auc = metrics.roc_auc_score(ytest, y_pred)
    plt.plot(fpr, tpr, label='ROC (roc_auc = %0.2f)' % auc)
    plt.xlabel('False Positive Rate',color='grey', fontsize = 13)
    plt.ylabel('True Positive Rate',color='grey', fontsize = 13)
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.legend(loc=4)
    plt.plot([0, 1], [0, 1],'r--')
    plt.show()
    print('roc_auc %.3f' % auc)
    
   
    blue('---------------------') 
    AUC_train_1 = metrics.roc_auc_score(ytrain,model.predict_proba(Xtrain)[:,1])
    blue('AUC_train: %.3f' % AUC_train_1)
    AUC_test_1 = metrics.roc_auc_score(ytest,model.predict_proba(Xtest)[:,1])
    blue('AUC_test:  %.3f' % AUC_test_1)
    blue('---------------------')    

      
    print("Accuracy Training data:     ", np.round(accuracy_score(ytrain, model.predict(Xtrain)), decimals=4))
    green("----------------------------------------------------------------------")
    print("Accuracy Test data:         ", np.round(accuracy_score(ytest, model.predict(Xtest)), decimals=4)) 
    green("----------------------------------------------------------------------")

##  colorful prints
def black(text):
     print('33[30m', text, '33[0m', sep='')  
def red(text):
     print('33[31m', text, '33[0m', sep='')  
def green(text):
     print('33[32m', text, '33[0m', sep='')  
def yellow(text):     
    print('33[33m', text, '33[0m', sep='')  
def blue(text):
     print('33[34m', text, '33[0m', sep='') 
def magenta(text):
     print('33[35m', text, '33[0m', sep='')  
def cyan(text):
     print('33[36m', text, '33[0m', sep='')  
def gray(text):
     print('33[90m', text, '33[0m', sep='')

blue(X_train.shape)
green(y_train.shape)
blue(X_validation.shape)
green(y_validation.shape)

Można też użyć mojego sposobu na identyfikację zmiennych kategorycznych. Tutaj mamy nazwiska i kabiny więc ten sposób idetyfikacji zmiennych kategorycznych nie będzie właściwy.

import numpy as np

categorical_fuX = np.where(train_df.nunique() <8) [0]
categorical_fuX

array([ 1,  2,  4,  6,  7, 11])

Dzielimy zbiór na zbiory treningowe i testowe¶

from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=42)

X_train.head(3)

Poziom zbilansowania zbioru wynikowego¶

y_train.value_counts(dropna = False, normalize=True).plot(kind='pie')

<matplotlib.axes._subplots.AxesSubplot at 0x7f84617d24d0>

2.1 Szkolenie modelowe¶

Teraz stwórzmy sam model: poszlibyśmy tutaj z parametrami domyślnymi (ponieważ zapewniają one naprawdę dobrą linię bazową prawie przez cały czas), jedyną rzeczą, którą chcielibyśmy tutaj określić, jest parametr custom_loss, ponieważ dałoby to nam możliwość zobaczenia co się dzieje pod względem tego wskaźnika konkurencji – dokładności, a także możliwości obserwowania utraty logów, ponieważ byłoby to bardziej płynne w przypadku zestawu danych o takim rozmiarze.

custom_loss metryka użyta podczas szkolenia, wybrane: [„accuracy”] https://catboost.ai/docs/search/?query=%27Accuracy%27
random_seed = 42 Losowe nasiona użyte do treningu. Te losowe wartości są za każdym razem takie same.
logging_level = ‘Silent’ Poziom logowania, aby przejść do standardowego wyjścia. „Cichy” – nie wysyłaj żadnych danych logowania na standardowe wyjście. „Verbose” – wyślij następujące dane na standardowe wyjście, a następnie pokaże w modelu. Dopasuj całą ścieżkę uczenia się. „Informacje” lub „Debugowanie” – wyświetlanie dodatkowych informacji i liczby drzew.

from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score

Zdefiniowanie modelu bez deklarowania zmiennych kategorycznych ¶

Optymalizacja pod kontem powierzchni AUC.

model = CatBoostClassifier(
    custom_loss=['Accuracy'],
    random_seed=42,
    logging_level='Silent'
)

model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
#     logging_level='Verbose',  # you can uncomment this for text output
    plot=True
);

yhatA = model.predict(X_validation)
print(yhatA[:12])

[0 0 0 1 1 1 1 0 1 1 0 0]

y_train[12]

0

# Classification Assessment
def Classification_Assessment(model ,Xtrain, ytrain, Xtest, ytest, y_pred):
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import metrics
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
    from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
    def green(text):
        print('33[32m', text, '33[0m', sep='')  
    def blue(text):
        print('33[34m', text, '33[0m', sep='')         
    
    print("Recall Training data:     ", np.round(recall_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("Precision Training data:  ", np.round(precision_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Recall Test data:         ", np.round(recall_score(ytest, model.predict(Xtest)), decimals=4)) 
    print("Precision Test data:      ", np.round(precision_score(ytest, model.predict(Xtest)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Confusion Matrix Test data")
    print(confusion_matrix(ytest, model.predict(Xtest)))
    print("----------------------------------------------------------------------")
    green('Valuation for test data only:')
    print(classification_report(ytest, model.predict(Xtest)))
      
    green('Valuation for test data only:')
    y_pred_proba = model.predict_proba(Xtest)[::,1]
    fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred)
    auc = metrics.roc_auc_score(ytest, y_pred)
    plt.plot(fpr, tpr, label='ROC (roc_auc = %0.2f)' % auc)
    plt.xlabel('False Positive Rate',color='grey', fontsize = 13)
    plt.ylabel('True Positive Rate',color='grey', fontsize = 13)
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.legend(loc=4)
    plt.plot([0, 1], [0, 1],'r--')
    plt.show()
    print('roc_auc %.3f' % auc)
    
   
    blue('---------------------') 
    AUC_train_1 = metrics.roc_auc_score(ytrain,model.predict_proba(Xtrain)[:,1])
    blue('AUC_train: %.3f' % AUC_train_1)
    AUC_test_1 = metrics.roc_auc_score(ytest,model.predict_proba(Xtest)[:,1])
    blue('AUC_test:  %.3f' % AUC_test_1)
    blue('---------------------')    

      
    print("Accuracy Training data:     ", np.round(accuracy_score(ytrain, model.predict(Xtrain)), decimals=4))
    green("----------------------------------------------------------------------")
    print("Accuracy Test data:         ", np.round(accuracy_score(ytest, model.predict(Xtest)), decimals=4)) 
    green("----------------------------------------------------------------------")

##  colorful prints
def black(text):
     print('33[30m', text, '33[0m', sep='')  
def red(text):
     print('33[31m', text, '33[0m', sep='')  
def green(text):
     print('33[32m', text, '33[0m', sep='')  
def yellow(text):     
    print('33[33m', text, '33[0m', sep='')  
def blue(text):
     print('33[34m', text, '33[0m', sep='') 
def magenta(text):
     print('33[35m', text, '33[0m', sep='')  
def cyan(text):
     print('33[36m', text, '33[0m', sep='')  
def gray(text):
     print('33[90m', text, '33[0m', sep='')

blue(X_train.shape)
green(y_train.shape)
blue(X_validation.shape)
green(y_validation.shape)

(668, 11)
(668,)
(223, 11)
(223,)

Classification_Assessment(model,X_train, y_train, X_validation, y_validation, yhatA)

Recall Training data:      0.7391
Precision Training data:   0.9791
----------------------------------------------------------------------
Recall Test data:          0.6629
Precision Test data:       0.8429
----------------------------------------------------------------------
Confusion Matrix Test data
[[123  11]
 [ 30  59]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.80      0.92      0.86       134
           1       0.84      0.66      0.74        89

    accuracy                           0.82       223
   macro avg       0.82      0.79      0.80       223
weighted avg       0.82      0.82      0.81       223

Valuation for test data only:

Jak widać, można zobaczyć, jak nasz model uczy się na podstawie pełnych wyników lub ładnych wykresów (osobiście zdecydowanie wybrałbym drugą opcję – po prostu sprawdź te wykresy: możesz na przykład powiększyć obszary zainteresowania!)

Dzięki temu możemy zobaczyć, że najlepsza wartość dokładności 0,8340 (na zestawie walidacyjnym) została osiągnięta na 157 etapie wzmocnienia.

Żeby to zobaczyć trzeba kliknąć na Accuracy i stanąć myszą na linii ciągłej (oznaczającej zmienne testowe) nie linii przerywanej(dane treningowe)Wartość accurace wysokości 0.834 osiąga u mnie przy 451 petli. To miejsce gdzie jest kropka!

Co to jest loglost?¶

Jeśli tylko przewidujesz prawdopodobieństwo dla klasy dodatniej, to funkcję utraty logarytmicznej można obliczyć dla jednej prognozy klasyfikacji binarnej ( yhat ) w porównaniu do oczekiwanego prawdopodobieństwa ( y ) w następujący sposób:

LogLoss = – ((1 – y) log (1 – yhat) + y log (yhat))¶

Obliczamy predykcję modelu¶

yhatA = model.predict(X_validation)
print(yhatA[:12])

yhatA = model.predict(X_validation)
print(yhatA[:12])

[0 0 0 1 1 1 1 0 1 1 0 0]

y_train[12]

0

# Classification Assessment
def Classification_Assessment(model ,Xtrain, ytrain, Xtest, ytest, y_pred):
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import metrics
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
    from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
    def green(text):
        print('33[32m', text, '33[0m', sep='')  
    def blue(text):
        print('33[34m', text, '33[0m', sep='')         
    
    print("Recall Training data:     ", np.round(recall_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("Precision Training data:  ", np.round(precision_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Recall Test data:         ", np.round(recall_score(ytest, model.predict(Xtest)), decimals=4)) 
    print("Precision Test data:      ", np.round(precision_score(ytest, model.predict(Xtest)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Confusion Matrix Test data")
    print(confusion_matrix(ytest, model.predict(Xtest)))
    print("----------------------------------------------------------------------")
    green('Valuation for test data only:')
    print(classification_report(ytest, model.predict(Xtest)))
      
    green('Valuation for test data only:')
    y_pred_proba = model.predict_proba(Xtest)[::,1]
    fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred)
    auc = metrics.roc_auc_score(ytest, y_pred)
    plt.plot(fpr, tpr, label='ROC (roc_auc = %0.2f)' % auc)
    plt.xlabel('False Positive Rate',color='grey', fontsize = 13)
    plt.ylabel('True Positive Rate',color='grey', fontsize = 13)
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.legend(loc=4)
    plt.plot([0, 1], [0, 1],'r--')
    plt.show()
    print('roc_auc %.3f' % auc)
    
   
    blue('---------------------') 
    AUC_train_1 = metrics.roc_auc_score(ytrain,model.predict_proba(Xtrain)[:,1])
    blue('AUC_train: %.3f' % AUC_train_1)
    AUC_test_1 = metrics.roc_auc_score(ytest,model.predict_proba(Xtest)[:,1])
    blue('AUC_test:  %.3f' % AUC_test_1)
    blue('---------------------')    

      
    print("Accuracy Training data:     ", np.round(accuracy_score(ytrain, model.predict(Xtrain)), decimals=4))
    green("----------------------------------------------------------------------")
    print("Accuracy Test data:         ", np.round(accuracy_score(ytest, model.predict(Xtest)), decimals=4)) 
    green("----------------------------------------------------------------------")

##  colorful prints
def black(text):
     print('33[30m', text, '33[0m', sep='')  
def red(text):
     print('33[31m', text, '33[0m', sep='')  
def green(text):
     print('33[32m', text, '33[0m', sep='')  
def yellow(text):     
    print('33[33m', text, '33[0m', sep='')  
def blue(text):
     print('33[34m', text, '33[0m', sep='') 
def magenta(text):
     print('33[35m', text, '33[0m', sep='')  
def cyan(text):
     print('33[36m', text, '33[0m', sep='')  
def gray(text):
     print('33[90m', text, '33[0m', sep='')

blue(X_train.shape)
green(y_train.shape)
blue(X_validation.shape)
green(y_validation.shape)

(668, 11)
(668,)
(223, 11)
(223,)

Classification_Assessment(model,X_train, y_train, X_validation, y_validation, yhatA)

Recall Training data:      0.7391
Precision Training data:   0.9791
----------------------------------------------------------------------
Recall Test data:          0.6629
Precision Test data:       0.8429
----------------------------------------------------------------------
Confusion Matrix Test data
[[123  11]
 [ 30  59]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.80      0.92      0.86       134
           1       0.84      0.66      0.74        89

    accuracy                           0.82       223
   macro avg       0.82      0.79      0.80       223
weighted avg       0.82      0.82      0.81       223

Valuation for test data only:

y_train[12]

y_train[12]

0

Sprawdzenie tego modelu klasyfikacji¶

# Classification Assessment
def Classification_Assessment(model ,Xtrain, ytrain, Xtest, ytest, y_pred):
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import metrics
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
    from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
    def green(text):
        print('33[32m', text, '33[0m', sep='')  
    def blue(text):
        print('33[34m', text, '33[0m', sep='')         
    
    print("Recall Training data:     ", np.round(recall_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("Precision Training data:  ", np.round(precision_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Recall Test data:         ", np.round(recall_score(ytest, model.predict(Xtest)), decimals=4)) 
    print("Precision Test data:      ", np.round(precision_score(ytest, model.predict(Xtest)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Confusion Matrix Test data")
    print(confusion_matrix(ytest, model.predict(Xtest)))
    print("----------------------------------------------------------------------")
    green('Valuation for test data only:')
    print(classification_report(ytest, model.predict(Xtest)))
      
    green('Valuation for test data only:')
    y_pred_proba = model.predict_proba(Xtest)[::,1]
    fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred)
    auc = metrics.roc_auc_score(ytest, y_pred)
    plt.plot(fpr, tpr, label='ROC (roc_auc = %0.2f)' % auc)
    plt.xlabel('False Positive Rate',color='grey', fontsize = 13)
    plt.ylabel('True Positive Rate',color='grey', fontsize = 13)
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.legend(loc=4)
    plt.plot([0, 1], [0, 1],'r--')
    plt.show()
    print('roc_auc %.3f' % auc)
    
   
    blue('---------------------') 
    AUC_train_1 = metrics.roc_auc_score(ytrain,model.predict_proba(Xtrain)[:,1])
    blue('AUC_train: %.3f' % AUC_train_1)
    AUC_test_1 = metrics.roc_auc_score(ytest,model.predict_proba(Xtest)[:,1])
    blue('AUC_test:  %.3f' % AUC_test_1)
    blue('---------------------')    

      
    print("Accuracy Training data:     ", np.round(accuracy_score(ytrain, model.predict(Xtrain)), decimals=4))
    green("----------------------------------------------------------------------")
    print("Accuracy Test data:         ", np.round(accuracy_score(ytest, model.predict(Xtest)), decimals=4)) 
    green("----------------------------------------------------------------------")

# Classification Assessment
def Classification_Assessment(model ,Xtrain, ytrain, Xtest, ytest, y_pred):
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import metrics
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
    from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
    def green(text):
        print('33[32m', text, '33[0m', sep='')  
    def blue(text):
        print('33[34m', text, '33[0m', sep='')         
    
    print("Recall Training data:     ", np.round(recall_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("Precision Training data:  ", np.round(precision_score(ytrain, model.predict(Xtrain)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Recall Test data:         ", np.round(recall_score(ytest, model.predict(Xtest)), decimals=4)) 
    print("Precision Test data:      ", np.round(precision_score(ytest, model.predict(Xtest)), decimals=4))
    print("----------------------------------------------------------------------")
    print("Confusion Matrix Test data")
    print(confusion_matrix(ytest, model.predict(Xtest)))
    print("----------------------------------------------------------------------")
    green('Valuation for test data only:')
    print(classification_report(ytest, model.predict(Xtest)))
      
    green('Valuation for test data only:')
    y_pred_proba = model.predict_proba(Xtest)[::,1]
    fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred)
    auc = metrics.roc_auc_score(ytest, y_pred)
    plt.plot(fpr, tpr, label='ROC (roc_auc = %0.2f)' % auc)
    plt.xlabel('False Positive Rate',color='grey', fontsize = 13)
    plt.ylabel('True Positive Rate',color='grey', fontsize = 13)
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.legend(loc=4)
    plt.plot([0, 1], [0, 1],'r--')
    plt.show()
    print('roc_auc %.3f' % auc)
    
   
    blue('---------------------') 
    AUC_train_1 = metrics.roc_auc_score(ytrain,model.predict_proba(Xtrain)[:,1])
    blue('AUC_train: %.3f' % AUC_train_1)
    AUC_test_1 = metrics.roc_auc_score(ytest,model.predict_proba(Xtest)[:,1])
    blue('AUC_test:  %.3f' % AUC_test_1)
    blue('---------------------')    

      
    print("Accuracy Training data:     ", np.round(accuracy_score(ytrain, model.predict(Xtrain)), decimals=4))
    green("----------------------------------------------------------------------")
    print("Accuracy Test data:         ", np.round(accuracy_score(ytest, model.predict(Xtest)), decimals=4)) 
    green("----------------------------------------------------------------------")

##  colorful prints
def black(text):
     print('33[30m', text, '33[0m', sep='')  
def red(text):
     print('33[31m', text, '33[0m', sep='')  
def green(text):
     print('33[32m', text, '33[0m', sep='')  
def yellow(text):     
    print('33[33m', text, '33[0m', sep='')  
def blue(text):
     print('33[34m', text, '33[0m', sep='') 
def magenta(text):
     print('33[35m', text, '33[0m', sep='')  
def cyan(text):
     print('33[36m', text, '33[0m', sep='')  
def gray(text):
     print('33[90m', text, '33[0m', sep='')

##  colorful prints
def black(text):
     print('33[30m', text, '33[0m', sep='')  
def red(text):
     print('33[31m', text, '33[0m', sep='')  
def green(text):
     print('33[32m', text, '33[0m', sep='')  
def yellow(text):     
    print('33[33m', text, '33[0m', sep='')  
def blue(text):
     print('33[34m', text, '33[0m', sep='') 
def magenta(text):
     print('33[35m', text, '33[0m', sep='')  
def cyan(text):
     print('33[36m', text, '33[0m', sep='')  
def gray(text):
     print('33[90m', text, '33[0m', sep='')

blue(X_train.shape)
green(y_train.shape)
blue(X_validation.shape)
green(y_validation.shape)

blue(X_train.shape)
green(y_train.shape)
blue(X_validation.shape)
green(y_validation.shape)

(668, 11)
(668,)
(223, 11)
(223,)

Classification_Assessment(model,X_train, y_train, X_validation, y_validation, yhatA)

Recall Training data:      0.7391
Precision Training data:   0.9791
----------------------------------------------------------------------
Recall Test data:          0.6629
Precision Test data:       0.8429
----------------------------------------------------------------------
Confusion Matrix Test data
[[123  11]
 [ 30  59]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.80      0.92      0.86       134
           1       0.84      0.66      0.74        89

    accuracy                           0.82       223
   macro avg       0.82      0.79      0.80       223
weighted avg       0.82      0.82      0.81       223

Valuation for test data only:

roc_auc 0.790
---------------------
AUC_train: 0.968
AUC_test:  0.905
---------------------
Accuracy Training data:      0.8952
----------------------------------------------------------------------
Accuracy Test data:          0.8161
----------------------------------------------------------------------

cv_params = model.get_params()
cv_params

{'random_seed': 42, 'logging_level': 'Silent', 'custom_loss': ['Accuracy']}

cv_params.update({'loss_function': 'Logloss'})

cv_data = cv(
    Pool(X, y, cat_features=categorical_features_indices),
    cv_params,
    plot=True)

cv_data.head(2)

Classification_Assessment(model,X_train, y_train, X_validation, y_validation, yhatA)

Classification_Assessment(model,X_train, y_train, X_validation, y_validation, yhatA)

Recall Training data:      0.7391
Precision Training data:   0.9791
----------------------------------------------------------------------
Recall Test data:          0.6629
Precision Test data:       0.8429
----------------------------------------------------------------------
Confusion Matrix Test data
[[123  11]
 [ 30  59]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.80      0.92      0.86       134
           1       0.84      0.66      0.74        89

    accuracy                           0.82       223
   macro avg       0.82      0.79      0.80       223
weighted avg       0.82      0.82      0.81       223

Valuation for test data only:

roc_auc 0.790
---------------------
AUC_train: 0.968
AUC_test:  0.905
---------------------
Accuracy Training data:      0.8952
----------------------------------------------------------------------
Accuracy Test data:          0.8161
----------------------------------------------------------------------

cv_params = model.get_params()
cv_params

{'random_seed': 42, 'logging_level': 'Silent', 'custom_loss': ['Accuracy']}

cv_params.update({'loss_function': 'Logloss'})

cv_data = cv(
    Pool(X, y, cat_features=categorical_features_indices),
    cv_params,
    plot=True)

cv_data.head(2)

print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))

Best validation accuracy score: 0.83±0.03 on step 527

2.2 Walidacja krzyżowa modelu¶

Dobrze jest zweryfikować swój model, ale zweryfikować go – nawet lepiej. A także z działkami! Bez słów:

Pokazuje parametry modelu ¶

cv_params = model.get_params()
cv_params

cv_params = model.get_params()
cv_params

{'random_seed': 42, 'logging_level': 'Silent', 'custom_loss': ['Accuracy']}

Dodaje jeszcze jeden parametr do moedlu ¶

cv_params.update({'loss_function': 'Logloss'})

cv_params.update({'loss_function': 'Logloss'})

Nie wiem co to jest ¶

cv_data = cv(
    Pool(X, y, cat_features=categorical_features_indices),
    cv_params,
    plot=True)

cv_data = cv(
    Pool(X, y, cat_features=categorical_features_indices),
    cv_params,
    plot=True)

cv_data.head(2)

print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))

Best validation accuracy score: 0.83±0.03 on step 527

/home/wojciech/anaconda3/lib/python3.7/site-packages/numpy/core/fromnumeric.py:61: FutureWarning: 
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return bound(*args, **kwds)

 np.max(cv_data['test-Accuracy-mean'])

0.8294051627384961

 np.argmax(cv_data['test-Accuracy-mean'])

527

print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

to nie działa¶

yhatCV = cv_data.predict(X_validation)

Teraz mamy wartości naszych funkcji strat na każdym etapie wzmocnienia uśrednione 3-krotnie, co powinno zapewnić nam dokładniejsze oszacowanie wydajności naszego modelu:

cv_data.head(2)

cv_data.head(2)

print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))

print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))

Best validation accuracy score: 0.83±0.03 on step 527

/home/wojciech/anaconda3/lib/python3.7/site-packages/numpy/core/fromnumeric.py:61: FutureWarning: 
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return bound(*args, **kwds)

 np.max(cv_data['test-Accuracy-mean'])

0.8294051627384961

 np.argmax(cv_data['test-Accuracy-mean'])

527

print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.8294051627384961

predictions = model.predict(X_validation)
predictions[:15]

 np.max(cv_data['test-Accuracy-mean'])

 np.max(cv_data['test-Accuracy-mean'])

0.8294051627384961

 np.argmax(cv_data['test-Accuracy-mean'])

 np.argmax(cv_data['test-Accuracy-mean'])

527

print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.8294051627384961

predictions = model.predict(X_validation)
predictions[:15]

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0])

predictions_probs = model.predict_proba(X_validation)
predictions_probs[:5]

array([[0.70226591, 0.29773409],
       [0.87482065, 0.12517935],
       [0.87590587, 0.12409413],
       [0.03421529, 0.96578471],
       [0.34333994, 0.65666006]])

model_without_seed = CatBoostClassifier(iterations=10, logging_level='Silent')
model_without_seed.fit(X, y, cat_features=categorical_features_indices)

print('Random seed assigned for this model: {}'.format(model_without_seed.random_seed_))

Random seed assigned for this model: 0

params = {
    'iterations': 500,
    'learning_rate': 0.1,
    'eval_metric': 'Accuracy',
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': False
}

train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
train_pool

Jak widzimy, nasze wstępne oszacowanie wydajności przy pojedynczym foldowaniu sprawdzania poprawności było zbyt optymistyczne – dlatego tak ważna jest krzyżowa weryfikacja!

odpuszczam – nie rozumiem tej sekcji¶

2.3 Stosowanie modelu¶

Wszystko, co musisz zrobić, aby uzyskać prognozy, to

predictions = model.predict(X_validation)
predictions[:15]

predictions = model.predict(X_validation)
predictions[:15]

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0])

predictions_probs = model.predict_proba(X_validation)
predictions_probs[:5]

predictions_probs = model.predict_proba(X_validation)
predictions_probs[:5]

array([[0.70226591, 0.29773409],
       [0.87482065, 0.12517935],
       [0.87590587, 0.12409413],
       [0.03421529, 0.96578471],
       [0.34333994, 0.65666006]])

Ale spróbujmy uzyskać lepsze prognozy, a funkcje Catboost nam w tym pomogą.

Być może zauważyłeś, że na etapie tworzenia modelu podałem nie tylko parametr custom_loss, ale także parametr random_seed. Zostało to zrobione, aby ten notatnik był odtwarzalny – domyślnie catboost wybiera losową wartość dla seed:

model_without_seed = CatBoostClassifier(iterations=10, logging_level='Silent')
model_without_seed.fit(X, y, cat_features=categorical_features_indices)

print('Random seed assigned for this model: {}'.format(model_without_seed.random_seed_))

model_without_seed = CatBoostClassifier(iterations=10, logging_level='Silent')
model_without_seed.fit(X, y, cat_features=categorical_features_indices)

print('Random seed assigned for this model: {}'.format(model_without_seed.random_seed_))

Random seed assigned for this model: 0

params = {
    'iterations': 500,
    'learning_rate': 0.1,
    'eval_metric': 'Accuracy',
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': False
}

train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
train_pool

<catboost.core.Pool at 0x7f8460ed4ec0>

validate_pool = Pool(X_validation, y_validation, cat_features=categorical_features_indices)
validate_pool

<catboost.core.Pool at 0x7f8460ee5280>

## -------linijka jak wywołać najlepsze parametry modelu ---------------------

model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

best_model_params = params.copy()
best_model_params.update({                  ## <- tutaj model wkłada 'use_best_model'
    'use_best_model': True                  ## to nie są lepsze parametry tylko ten jeden nowy parametr   
})

### ----------------------------------------------------------------------------

best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool);

print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Best model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, best_model.predict(X_validation))
))

Simple model validation accuracy: 0.8072

Best model validation accuracy: 0.8296

params

Zdefiniujmy niektóre parametry i utwórz Pool dla większej wygody. Pool Przechowuje wszystkie informacje o zbiorze danych (cechy, etykiety, wskaźniki cech jakościowych, wagi i wiele innych).

To taki zbiornik z parametrami modelu

params = {
    'iterations': 500,
    'learning_rate': 0.1,
    'eval_metric': 'Accuracy',
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': False
}

params = {
    'iterations': 500,
    'learning_rate': 0.1,
    'eval_metric': 'Accuracy',
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': False
}

Pool dla zmiennych treningowych – to nie model, to taki zbiornik

train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
train_pool

train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
train_pool

<catboost.core.Pool at 0x7f8460ed4ec0>

Pool dla zmiennych testowych – to nie model, to taki zbiornik

validate_pool = Pool(X_validation, y_validation, cat_features=categorical_features_indices)
validate_pool

validate_pool = Pool(X_validation, y_validation, cat_features=categorical_features_indices)
validate_pool

<catboost.core.Pool at 0x7f8460ee5280>

3.1 Korzystanie z najlepszego modelu¶

Jeśli zasadniczo masz zestaw sprawdzania poprawności, zawsze lepiej jest używać parametru use_best_model podczas treningu. Domyślnie ten parametr jest włączony. Jeśli jest włączony, wynikowy zestaw drzew zmniejsza się do najlepszej iteracji.

## -------linijka jak wywołać najlepsze parametry modelu ---------------------

model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

best_model_params = params.copy()
best_model_params.update({                  ## <- tutaj model wkłada 'use_best_model'
    'use_best_model': True                  ## to nie są lepsze parametry tylko ten jeden nowy parametr   
})

### ----------------------------------------------------------------------------

best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool);

print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Best model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, best_model.predict(X_validation))
))

## -------linijka jak wywołać najlepsze parametry modelu ---------------------

model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

best_model_params = params.copy()
best_model_params.update({                  ## <- tutaj model wkłada 'use_best_model'
    'use_best_model': True                  ## to nie są lepsze parametry tylko ten jeden nowy parametr   
})

### ----------------------------------------------------------------------------

best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool);

print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Best model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, best_model.predict(X_validation))
))

Simple model validation accuracy: 0.8072

Best model validation accuracy: 0.8296

params

{'iterations': 500,
 'learning_rate': 0.1,
 'eval_metric': 'Accuracy',
 'random_seed': 42,
 'logging_level': 'Silent',
 'use_best_model': False}

params

{'iterations': 500,
 'learning_rate': 0.1,
 'eval_metric': 'Accuracy',
 'random_seed': 42,
 'logging_level': 'Silent',
 'use_best_model': False}

best_model_params

{'iterations': 500,
 'learning_rate': 0.1,
 'eval_metric': 'Accuracy',
 'random_seed': 42,
 'logging_level': 'Silent',
 'use_best_model': True}

best_model = CatBoostClassifier(**best_model_params)

best_model.fit(train_pool, eval_set=validate_pool,plot=True )

Rozbieram powyższe na czynniki pierwsze¶

params

params

{'iterations': 500,
 'learning_rate': 0.1,
 'eval_metric': 'Accuracy',
 'random_seed': 42,
 'logging_level': 'Silent',
 'use_best_model': False}

wyświetlam stare parametry modelu

params

params

{'iterations': 500,
 'learning_rate': 0.1,
 'eval_metric': 'Accuracy',
 'random_seed': 42,
 'logging_level': 'Silent',
 'use_best_model': False}

wyświetlam nowe parametry modelu

best_model_params

best_model_params

{'iterations': 500,
 'learning_rate': 0.1,
 'eval_metric': 'Accuracy',
 'random_seed': 42,
 'logging_level': 'Silent',
 'use_best_model': True}

best_model = CatBoostClassifier(**best_model_params)

best_model = CatBoostClassifier(**best_model_params)

best_model.fit(train_pool, eval_set=validate_pool,plot=True )

best_model.fit(train_pool, eval_set=validate_pool,plot=True )

<catboost.core.CatBoostClassifier at 0x7f8460ee3590>

print('ZWYKŁY MODEL    - Na zbiorze testowym accurace: ', accuracy_score(y_validation, model.predict(X_validation)))
print('NAJLEPSZY MODEL - Na zbiorze testowym accurace: ', accuracy_score(y_validation, best_model.predict(X_validation)))

ZWYKŁY MODEL    - Na zbiorze testowym accurace:  0.8071748878923767
NAJLEPSZY MODEL - Na zbiorze testowym accurace:  0.8295964125560538

y_bestPred = best_model.predict(X_validation)

Classification_Assessment(best_model,X_train, y_train, X_validation, y_validation, y_bestPred)

Recall Training data:      0.7391
Precision Training data:   0.974
----------------------------------------------------------------------
Recall Test data:          0.6854
Precision Test data:       0.8592
----------------------------------------------------------------------
Confusion Matrix Test data
[[124  10]
 [ 28  61]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.82      0.93      0.87       134
           1       0.86      0.69      0.76        89

    accuracy                           0.83       223
   macro avg       0.84      0.81      0.81       223
weighted avg       0.83      0.83      0.83       223

Valuation for test data only:

roc_auc 0.805
---------------------
AUC_train: 0.966
AUC_test:  0.906
---------------------
Accuracy Training data:      0.8937
----------------------------------------------------------------------
Accuracy Test data:          0.8296
----------------------------------------------------------------------

%%time
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

CPU times: user 12.3 s, sys: 2.15 s, total: 14.4 s
Wall time: 3.37 s

print('ZWYKŁY MODEL    - Na zbiorze testowym accurace: ', accuracy_score(y_validation, model.predict(X_validation)))
print('NAJLEPSZY MODEL - Na zbiorze testowym accurace: ', accuracy_score(y_validation, best_model.predict(X_validation)))

print('ZWYKŁY MODEL    - Na zbiorze testowym accurace: ', accuracy_score(y_validation, model.predict(X_validation)))
print('NAJLEPSZY MODEL - Na zbiorze testowym accurace: ', accuracy_score(y_validation, best_model.predict(X_validation)))

ZWYKŁY MODEL    - Na zbiorze testowym accurace:  0.8071748878923767
NAJLEPSZY MODEL - Na zbiorze testowym accurace:  0.8295964125560538

y_bestPred = best_model.predict(X_validation)

Classification_Assessment(best_model,X_train, y_train, X_validation, y_validation, y_bestPred)

Recall Training data:      0.7391
Precision Training data:   0.974
----------------------------------------------------------------------
Recall Test data:          0.6854
Precision Test data:       0.8592
----------------------------------------------------------------------
Confusion Matrix Test data
[[124  10]
 [ 28  61]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.82      0.93      0.87       134
           1       0.86      0.69      0.76        89

    accuracy                           0.83       223
   macro avg       0.84      0.81      0.81       223
weighted avg       0.83      0.83      0.83       223

Valuation for test data only:

roc_auc 0.805
---------------------
AUC_train: 0.966
AUC_test:  0.906
---------------------
Accuracy Training data:      0.8937
----------------------------------------------------------------------
Accuracy Test data:          0.8296
----------------------------------------------------------------------

%%time
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

CPU times: user 12.3 s, sys: 2.15 s, total: 14.4 s
Wall time: 3.37 s

<catboost.core.CatBoostClassifier at 0x7f846038d0d0>

%%time
earlystop_params = params.copy()   #<-- tradycyjne dodawanie parametrów
earlystop_params.update({
    'od_type': 'Iter',
    'od_wait': 40
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool)

SPRAWDZAM jaki jest ten najlepszy model najlepszy¶

y_bestPred = best_model.predict(X_validation)

y_bestPred = best_model.predict(X_validation)

Classification_Assessment(best_model,X_train, y_train, X_validation, y_validation, y_bestPred)

Classification_Assessment(best_model,X_train, y_train, X_validation, y_validation, y_bestPred)

Recall Training data:      0.7391
Precision Training data:   0.974
----------------------------------------------------------------------
Recall Test data:          0.6854
Precision Test data:       0.8592
----------------------------------------------------------------------
Confusion Matrix Test data
[[124  10]
 [ 28  61]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.82      0.93      0.87       134
           1       0.86      0.69      0.76        89

    accuracy                           0.83       223
   macro avg       0.84      0.81      0.81       223
weighted avg       0.83      0.83      0.83       223

Valuation for test data only:

roc_auc 0.805
---------------------
AUC_train: 0.966
AUC_test:  0.906
---------------------
Accuracy Training data:      0.8937
----------------------------------------------------------------------
Accuracy Test data:          0.8296
----------------------------------------------------------------------

%%time
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

CPU times: user 12.3 s, sys: 2.15 s, total: 14.4 s
Wall time: 3.37 s

<catboost.core.CatBoostClassifier at 0x7f846038d0d0>

%%time
earlystop_params = params.copy()   #<-- tradycyjne dodawanie parametrów
earlystop_params.update({
    'od_type': 'Iter',
    'od_wait': 40
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool)

CPU times: user 1.25 s, sys: 212 ms, total: 1.46 s
Wall time: 364 ms

<catboost.core.CatBoostClassifier at 0x7f846038a410>

earlystop_params

3.2 Wczesne zatrzymanie¶

Jeśli zasadniczo masz zestaw sprawdzania poprawności, zawsze łatwiej i lepiej jest skorzystać z wczesnego zatrzymania. Ta funkcja jest podobna do poprzedniej, ale oprócz poprawy jakości wciąż oszczędza czas.

Czas robienia modelu bez ‘earlystop’

%%time
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

%%time
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

CPU times: user 12.3 s, sys: 2.15 s, total: 14.4 s
Wall time: 3.37 s

<catboost.core.CatBoostClassifier at 0x7f846038d0d0>

%%time
earlystop_params = params.copy()   #<-- tradycyjne dodawanie parametrów
earlystop_params.update({
    'od_type': 'Iter',
    'od_wait': 40
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool)

CPU times: user 1.25 s, sys: 212 ms, total: 1.46 s
Wall time: 364 ms

<catboost.core.CatBoostClassifier at 0x7f846038a410>

earlystop_params

{'iterations': 500,
 'learning_rate': 0.1,
 'eval_metric': 'Accuracy',
 'random_seed': 42,
 'logging_level': 'Silent',
 'use_best_model': False,
 'od_type': 'Iter',
 'od_wait': 40}

print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, earlystop_model.predict(X_validation))
))

Simple model tree count: 500
Simple model validation accuracy: 0.8072

Early-stopped model tree count: 57
Early-stopped model validation accuracy: 0.8161

Czas robienia modelu z ‘earlystop’

%%time
earlystop_params = params.copy()   #<-- tradycyjne dodawanie parametrów
earlystop_params.update({
    'od_type': 'Iter',
    'od_wait': 40
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool)

%%time
earlystop_params = params.copy()   #<-- tradycyjne dodawanie parametrów
earlystop_params.update({
    'od_type': 'Iter',
    'od_wait': 40
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool)

CPU times: user 1.25 s, sys: 212 ms, total: 1.46 s
Wall time: 364 ms

<catboost.core.CatBoostClassifier at 0x7f846038a410>

earlystop_params

{'iterations': 500,
 'learning_rate': 0.1,
 'eval_metric': 'Accuracy',
 'random_seed': 42,
 'logging_level': 'Silent',
 'use_best_model': False,
 'od_type': 'Iter',
 'od_wait': 40}

print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, earlystop_model.predict(X_validation))
))

Simple model tree count: 500
Simple model validation accuracy: 0.8072

Early-stopped model tree count: 57
Early-stopped model validation accuracy: 0.8161

earlystop_model.fit(train_pool, eval_set=validate_pool, plot=True)

<catboost.core.CatBoostClassifier at 0x7f846038a410>

y_earlystop = earlystop_model.predict(X_validation)

Nowe parametry ‘earlystop’:

earlystop_params

earlystop_params

{'iterations': 500,
 'learning_rate': 0.1,
 'eval_metric': 'Accuracy',
 'random_seed': 42,
 'logging_level': 'Silent',
 'use_best_model': False,
 'od_type': 'Iter',
 'od_wait': 40}

print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, earlystop_model.predict(X_validation))
))

print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, earlystop_model.predict(X_validation))
))

Simple model tree count: 500
Simple model validation accuracy: 0.8072

Early-stopped model tree count: 57
Early-stopped model validation accuracy: 0.8161

earlystop_model.fit(train_pool, eval_set=validate_pool, plot=True)

<catboost.core.CatBoostClassifier at 0x7f846038a410>

y_earlystop = earlystop_model.predict(X_validation)

Classification_Assessment(earlystop_model,X_train, y_train, X_validation, y_validation, y_earlystop)

Recall Training data:      0.7628
Precision Training data:   0.9747
----------------------------------------------------------------------
Recall Test data:          0.6629
Precision Test data:       0.8429
----------------------------------------------------------------------
Confusion Matrix Test data
[[123  11]
 [ 30  59]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.80      0.92      0.86       134
           1       0.84      0.66      0.74        89

    accuracy                           0.82       223
   macro avg       0.82      0.79      0.80       223
weighted avg       0.82      0.82      0.81       223

Valuation for test data only:

roc_auc 0.790
---------------------
AUC_train: 0.961
AUC_test:  0.901
---------------------
Accuracy Training data:      0.9027
----------------------------------------------------------------------
Accuracy Test data:          0.8161
----------------------------------------------------------------------

current_params = params.copy()
current_params.update({
    'iterations': 10
})

model = CatBoostClassifier(**current_params).fit(X_train, y_train, categorical_features_indices)
# Get baseline (only with prediction_type='RawFormulaVal')
baseline = model.predict(X_train, prediction_type='RawFormulaVal')
# Fit new model
model.fit(X_train, y_train, categorical_features_indices, baseline=baseline)

Dzięki temu uzyskujemy lepszą jakość w krótszym czasie.

Chociaż, jak pokazano wcześniej, prosty schemat sprawdzania poprawności nie opisuje dokładnie wyniku poza zmiennymi treningowymi (może być tendencyjny z powodu podziału zestawu danych), nadal dobrze jest śledzić dynamikę ulepszeń modelu – a zatem, jak widać z tego przykładu, jest to naprawdę dobrze jest wcześniej zatrzymać proces wzmacniania (zanim rozpocznie się nadmierne dopasowanie)

Rozkładam to na czynniki pierwsze¶

Czyli model kończy się na najlepszym uzyskanym wyniku ‘accuracy’

earlystop_model.fit(train_pool, eval_set=validate_pool, plot=True)

earlystop_model.fit(train_pool, eval_set=validate_pool, plot=True)

<catboost.core.CatBoostClassifier at 0x7f846038a410>

y_earlystop = earlystop_model.predict(X_validation)

Classification_Assessment(earlystop_model,X_train, y_train, X_validation, y_validation, y_earlystop)

Recall Training data:      0.7628
Precision Training data:   0.9747
----------------------------------------------------------------------
Recall Test data:          0.6629
Precision Test data:       0.8429
----------------------------------------------------------------------
Confusion Matrix Test data
[[123  11]
 [ 30  59]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.80      0.92      0.86       134
           1       0.84      0.66      0.74        89

    accuracy                           0.82       223
   macro avg       0.82      0.79      0.80       223
weighted avg       0.82      0.82      0.81       223

Valuation for test data only:

roc_auc 0.790
---------------------
AUC_train: 0.961
AUC_test:  0.901
---------------------
Accuracy Training data:      0.9027
----------------------------------------------------------------------
Accuracy Test data:          0.8161
----------------------------------------------------------------------

current_params = params.copy()
current_params.update({
    'iterations': 10
})

model = CatBoostClassifier(**current_params).fit(X_train, y_train, categorical_features_indices)
# Get baseline (only with prediction_type='RawFormulaVal')
baseline = model.predict(X_train, prediction_type='RawFormulaVal')
# Fit new model
model.fit(X_train, y_train, categorical_features_indices, baseline=baseline)

<catboost.core.CatBoostClassifier at 0x7f8465fec1d0>

model_cp = CatBoostClassifier(**current_params)
model_cp = model_cp.fit(X_train, y_train, categorical_features_indices)

y_earlystop = earlystop_model.predict(X_validation)

y_earlystop = earlystop_model.predict(X_validation)

Classification_Assessment(earlystop_model,X_train, y_train, X_validation, y_validation, y_earlystop)

Classification_Assessment(earlystop_model,X_train, y_train, X_validation, y_validation, y_earlystop)

Recall Training data:      0.7628
Precision Training data:   0.9747
----------------------------------------------------------------------
Recall Test data:          0.6629
Precision Test data:       0.8429
----------------------------------------------------------------------
Confusion Matrix Test data
[[123  11]
 [ 30  59]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.80      0.92      0.86       134
           1       0.84      0.66      0.74        89

    accuracy                           0.82       223
   macro avg       0.82      0.79      0.80       223
weighted avg       0.82      0.82      0.81       223

Valuation for test data only:

roc_auc 0.790
---------------------
AUC_train: 0.961
AUC_test:  0.901
---------------------
Accuracy Training data:      0.9027
----------------------------------------------------------------------
Accuracy Test data:          0.8161
----------------------------------------------------------------------

current_params = params.copy()
current_params.update({
    'iterations': 10
})

model = CatBoostClassifier(**current_params).fit(X_train, y_train, categorical_features_indices)
# Get baseline (only with prediction_type='RawFormulaVal')
baseline = model.predict(X_train, prediction_type='RawFormulaVal')
# Fit new model
model.fit(X_train, y_train, categorical_features_indices, baseline=baseline)

<catboost.core.CatBoostClassifier at 0x7f8465fec1d0>

model_cp = CatBoostClassifier(**current_params)
model_cp = model_cp.fit(X_train, y_train, categorical_features_indices)

baseline = model_cp.predict(X_train, prediction_type='RawFormulaVal')

model_cp.fit(X_train, y_train, categorical_features_indices, baseline=baseline)

<catboost.core.CatBoostClassifier at 0x7f8460468cd0>

3.3 Korzystanie z linii bazowej¶

Możliwe jest wykorzystanie wyników przedtreningowych (wyjściowych) do treningu.

Nie wiem po co to jest – daje słabe wyniki itd

current_params = params.copy()
current_params.update({
    'iterations': 10
})

current_params = params.copy()
current_params.update({
    'iterations': 10
})

model = CatBoostClassifier(**current_params).fit(X_train, y_train, categorical_features_indices)
# Get baseline (only with prediction_type='RawFormulaVal')
baseline = model.predict(X_train, prediction_type='RawFormulaVal')
# Fit new model
model.fit(X_train, y_train, categorical_features_indices, baseline=baseline)

model = CatBoostClassifier(**current_params).fit(X_train, y_train, categorical_features_indices)
# Get baseline (only with prediction_type='RawFormulaVal')
baseline = model.predict(X_train, prediction_type='RawFormulaVal')
# Fit new model
model.fit(X_train, y_train, categorical_features_indices, baseline=baseline)

<catboost.core.CatBoostClassifier at 0x7f8465fec1d0>

Znowu zmieniam parapetry dodaje jakiś parametr:’iterations’: 10

model_cp = CatBoostClassifier(**current_params)
model_cp = model_cp.fit(X_train, y_train, categorical_features_indices)

model_cp = CatBoostClassifier(**current_params)
model_cp = model_cp.fit(X_train, y_train, categorical_features_indices)

Uzyskaj linię bazową (tylko z prediction_type = ‘RawFormulaVal’)

baseline = model_cp.predict(X_train, prediction_type='RawFormulaVal')

baseline = model_cp.predict(X_train, prediction_type='RawFormulaVal')

Fit new model

model_cp.fit(X_train, y_train, categorical_features_indices, baseline=baseline)

model_cp.fit(X_train, y_train, categorical_features_indices, baseline=baseline)

<catboost.core.CatBoostClassifier at 0x7f8460468cd0>

y_pred_cp = model_cp.predict(X_validation)

y_pred_cp = model_cp.predict(X_validation)

Classification_Assessment(model_cp,X_train, y_train, X_validation, y_validation, y_pred_cp)

Classification_Assessment(model_cp,X_train, y_train, X_validation, y_validation, y_pred_cp)

Recall Training data:      0.6601
Precision Training data:   0.9227
----------------------------------------------------------------------
Recall Test data:          0.6966
Precision Test data:       0.7848
----------------------------------------------------------------------
Confusion Matrix Test data
[[117  17]
 [ 27  62]]
----------------------------------------------------------------------
Valuation for test data only:
              precision    recall  f1-score   support

           0       0.81      0.87      0.84       134
           1       0.78      0.70      0.74        89

    accuracy                           0.80       223
   macro avg       0.80      0.78      0.79       223
weighted avg       0.80      0.80      0.80       223

Valuation for test data only:

roc_auc 0.785
---------------------
AUC_train: 0.930
AUC_test:  0.879
---------------------
Accuracy Training data:      0.8503
----------------------------------------------------------------------
Accuracy Test data:          0.8027
----------------------------------------------------------------------

params_with_snapshot = params.copy()    #<-- tradycyjnie dodajemy nowe parametry
params_with_snapshot.update({
    'iterations': 5,
    'learning_rate': 0.5,
    'logging_level': 'Verbose'
})

params_with_snapshot.update({      #<-- zmieniamy ustawienia migawki
    'iterations': 10,
    'learning_rate': 0.1,
})

class LoglossObjective(object):
    def calc_ders_range(self, approxes, targets, weights):
        # approxes, targets, weights are indexed containers of floats
        # (containers which have only __len__ and __getitem__ defined).
        # weights parameter can be None.
        #
        # To understand what these parameters mean, assume that there is
        # a subset of your dataset that is currently being processed.
        # approxes contains current predictions for this subset,
        # targets contains target values you provided with the dataset.
        #
        # This function should return a list of pairs (der1, der2), where
        # der1 is the first derivative of the loss function with respect
        # to the predicted value, and der2 is the second derivative.
        #
        # In our case, logloss is defined by the following formula:
        # target * log(sigmoid(approx)) + (1 - target) * (1 - sigmoid(approx))
        # where sigmoid(x) = 1 / (1 + e^(-x)).
        
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)
        
        result = []
        for index in range(len(targets)):
            e = np.exp(approxes[index])
            p = e / (1 + e)
            der1 = (1 - p) if targets[index] > 0.0 else -p
            der2 = -p * (1 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result

model = CatBoostClassifier(
    iterations=10,
    random_seed=42, 
    loss_function=LoglossObjective(),   ##<-- dodajemy własnoręcznie wymyśloną funkcję
    eval_metric="Logloss"
)

# Fit model
model.fit(train_pool)
# Only prediction_type='RawFormulaVal' is allowed with custom `loss_function`
preds_raw = model.predict(X_validation, prediction_type='RawFormulaVal')

0:	learn: 0.6827074	total: 21.7ms	remaining: 195ms
1:	learn: 0.6722947	total: 41.5ms	remaining: 166ms
2:	learn: 0.6624914	total: 58.2ms	remaining: 136ms
3:	learn: 0.6528402	total: 77.3ms	remaining: 116ms
4:	learn: 0.6436863	total: 96.3ms	remaining: 96.3ms
5:	learn: 0.6346627	total: 114ms	remaining: 76.2ms
6:	learn: 0.6279562	total: 133ms	remaining: 56.8ms
7:	learn: 0.6201005	total: 154ms	remaining: 38.5ms
8:	learn: 0.6127656	total: 171ms	remaining: 19ms
9:	learn: 0.6053589	total: 189ms	remaining: 0us

class LoglossMetric(object):
    def get_final_error(self, error, weight):
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        # approxes is a list of indexed containers
        # (containers with only __len__ and __getitem__ defined),
        # one container per approx dimension.
        # Each container contains floats.
        # weight is a one dimensional indexed container.
        # target is float.
        
        # weight parameter can be None.
        # Returns pair (error, weights sum)
        
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            error_sum += -w * (target[i] * approx[i] - np.log(1 + np.exp(approx[i])))

        return error_sum, weight_sum

3.4 Obsługa migawek¶

Catboost obsługuje migawki. Możesz go użyć do odzyskania treningu po przerwie lub do rozpoczęcia treningu z wcześniejszymi wynikami.

params_with_snapshot = params.copy()    #<-- tradycyjnie dodajemy nowe parametry
params_with_snapshot.update({
    'iterations': 5,
    'learning_rate': 0.5,
    'logging_level': 'Verbose'
})

params_with_snapshot = params.copy()    #<-- tradycyjnie dodajemy nowe parametry
params_with_snapshot.update({
    'iterations': 5,
    'learning_rate': 0.5,
    'logging_level': 'Verbose'
})

params_with_snapshot.update({      #<-- zmieniamy ustawienia migawki
    'iterations': 10,
    'learning_rate': 0.1,
})

params_with_snapshot.update({      #<-- zmieniamy ustawienia migawki
    'iterations': 10,
    'learning_rate': 0.1,
})

3.5 Funkcja celu zdefiniowana przez użytkownika¶

Możliwe jest stworzenie własnej funkcji celu. Utwórzmy funkcję celu logloss.

przybliżenia, cele, wagi są indeksowanymi pojemnikami pływaków
(pojemniki, które mają zdefiniowane tylko len i getitem).
parametrem wag może być Brak.

Aby zrozumieć, co oznaczają te parametry, załóż, że istnieje podzbiór zestawu danych, który jest obecnie przetwarzany. Program przybliża zawiera bieżące prognozy dla tego podzbioru, cele zawierają wartości docelowe podane w zestawie danych.

Ta funkcja powinna zwrócić listę par (der1, der2), gdzie
der1 jest pierwszą pochodną funkcji straty w odniesieniu do
do przewidywanej wartości, a der2 jest drugą pochodną.

W naszym przypadku logloss jest definiowany za pomocą następującej formuły:
cel log (sigmoid (w przybliżeniu)) + (1 – cel) (1 – sigmoid (w przybliżeniu))
gdzie sigmoid (x) = 1 / (1 + e ^ (- x)).

class LoglossObjective(object):
    def calc_ders_range(self, approxes, targets, weights):
        # approxes, targets, weights are indexed containers of floats
        # (containers which have only __len__ and __getitem__ defined).
        # weights parameter can be None.
        #
        # To understand what these parameters mean, assume that there is
        # a subset of your dataset that is currently being processed.
        # approxes contains current predictions for this subset,
        # targets contains target values you provided with the dataset.
        #
        # This function should return a list of pairs (der1, der2), where
        # der1 is the first derivative of the loss function with respect
        # to the predicted value, and der2 is the second derivative.
        #
        # In our case, logloss is defined by the following formula:
        # target * log(sigmoid(approx)) + (1 - target) * (1 - sigmoid(approx))
        # where sigmoid(x) = 1 / (1 + e^(-x)).
        
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)
        
        result = []
        for index in range(len(targets)):
            e = np.exp(approxes[index])
            p = e / (1 + e)
            der1 = (1 - p) if targets[index] > 0.0 else -p
            der2 = -p * (1 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result

class LoglossObjective(object):
    def calc_ders_range(self, approxes, targets, weights):
        # approxes, targets, weights are indexed containers of floats
        # (containers which have only __len__ and __getitem__ defined).
        # weights parameter can be None.
        #
        # To understand what these parameters mean, assume that there is
        # a subset of your dataset that is currently being processed.
        # approxes contains current predictions for this subset,
        # targets contains target values you provided with the dataset.
        #
        # This function should return a list of pairs (der1, der2), where
        # der1 is the first derivative of the loss function with respect
        # to the predicted value, and der2 is the second derivative.
        #
        # In our case, logloss is defined by the following formula:
        # target * log(sigmoid(approx)) + (1 - target) * (1 - sigmoid(approx))
        # where sigmoid(x) = 1 / (1 + e^(-x)).
        
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)
        
        result = []
        for index in range(len(targets)):
            e = np.exp(approxes[index])
            p = e / (1 + e)
            der1 = (1 - p) if targets[index] > 0.0 else -p
            der2 = -p * (1 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result

model = CatBoostClassifier(
    iterations=10,
    random_seed=42, 
    loss_function=LoglossObjective(),   ##<-- dodajemy własnoręcznie wymyśloną funkcję
    eval_metric="Logloss"
)

model = CatBoostClassifier(
    iterations=10,
    random_seed=42, 
    loss_function=LoglossObjective(),   ##<-- dodajemy własnoręcznie wymyśloną funkcję
    eval_metric="Logloss"
)

# Fit model
model.fit(train_pool)
# Only prediction_type='RawFormulaVal' is allowed with custom `loss_function`
preds_raw = model.predict(X_validation, prediction_type='RawFormulaVal')

# Fit model
model.fit(train_pool)
# Only prediction_type='RawFormulaVal' is allowed with custom `loss_function`
preds_raw = model.predict(X_validation, prediction_type='RawFormulaVal')

0:	learn: 0.6827074	total: 21.7ms	remaining: 195ms
1:	learn: 0.6722947	total: 41.5ms	remaining: 166ms
2:	learn: 0.6624914	total: 58.2ms	remaining: 136ms
3:	learn: 0.6528402	total: 77.3ms	remaining: 116ms
4:	learn: 0.6436863	total: 96.3ms	remaining: 96.3ms
5:	learn: 0.6346627	total: 114ms	remaining: 76.2ms
6:	learn: 0.6279562	total: 133ms	remaining: 56.8ms
7:	learn: 0.6201005	total: 154ms	remaining: 38.5ms
8:	learn: 0.6127656	total: 171ms	remaining: 19ms
9:	learn: 0.6053589	total: 189ms	remaining: 0us

class LoglossMetric(object):
    def get_final_error(self, error, weight):
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        # approxes is a list of indexed containers
        # (containers with only __len__ and __getitem__ defined),
        # one container per approx dimension.
        # Each container contains floats.
        # weight is a one dimensional indexed container.
        # target is float.
        
        # weight parameter can be None.
        # Returns pair (error, weights sum)
        
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            error_sum += -w * (target[i] * approx[i] - np.log(1 + np.exp(approx[i])))

        return error_sum, weight_sum

model = CatBoostClassifier(
    iterations=10,
    random_seed=42, 
    loss_function="Logloss",
    eval_metric=LoglossMetric()
)
# Fit model
model.fit(train_pool)
# Only prediction_type='RawFormulaVal' is allowed with custom `loss_function`
preds_raw = model.predict(X_validation, prediction_type='RawFormulaVal')

Learning rate set to 0.5
0:	learn: 0.5521578	total: 7.79ms	remaining: 70.1ms
1:	learn: 0.4885686	total: 16ms	remaining: 64.2ms
2:	learn: 0.4646498	total: 22.5ms	remaining: 52.5ms
3:	learn: 0.4433198	total: 29.7ms	remaining: 44.6ms
4:	learn: 0.4348036	total: 36.5ms	remaining: 36.5ms
5:	learn: 0.4304872	total: 43.6ms	remaining: 29.1ms
6:	learn: 0.4169664	total: 49.9ms	remaining: 21.4ms
7:	learn: 0.4067507	total: 56.6ms	remaining: 14.1ms
8:	learn: 0.4019576	total: 62.8ms	remaining: 6.98ms
9:	learn: 0.3970545	total: 69.8ms	remaining: 0us

model_kot = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model_kot

<catboost.core.CatBoostClassifier at 0x7f84603e9490>

ntree_start, ntree_end, eval_period = 3, 9, 2

predictions_iterator = model.staged_predict(validate_pool, 'Probability', ntree_start, ntree_end, eval_period)

for preds, tree_count in zip(predictions_iterator, range(ntree_start, ntree_end, eval_period)):
    print('First class probabilities using the first {} trees: {}'.format(tree_count, preds[:5, 1]))

3.6 Funkcja metryczna zdefiniowana przez użytkownika¶

Możliwe jest również utworzenie własnej funkcji metrycznej. Utwórzmy funkcję metryczną logloss.

class LoglossMetric(object):
    def get_final_error(self, error, weight):
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        # approxes is a list of indexed containers
        # (containers with only __len__ and __getitem__ defined),
        # one container per approx dimension.
        # Each container contains floats.
        # weight is a one dimensional indexed container.
        # target is float.
        
        # weight parameter can be None.
        # Returns pair (error, weights sum)
        
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            error_sum += -w * (target[i] * approx[i] - np.log(1 + np.exp(approx[i])))

        return error_sum, weight_sum

class LoglossMetric(object):
    def get_final_error(self, error, weight):
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        # approxes is a list of indexed containers
        # (containers with only __len__ and __getitem__ defined),
        # one container per approx dimension.
        # Each container contains floats.
        # weight is a one dimensional indexed container.
        # target is float.
        
        # weight parameter can be None.
        # Returns pair (error, weights sum)
        
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            error_sum += -w * (target[i] * approx[i] - np.log(1 + np.exp(approx[i])))

        return error_sum, weight_sum

model = CatBoostClassifier(
    iterations=10,
    random_seed=42, 
    loss_function="Logloss",
    eval_metric=LoglossMetric()
)
# Fit model
model.fit(train_pool)
# Only prediction_type='RawFormulaVal' is allowed with custom `loss_function`
preds_raw = model.predict(X_validation, prediction_type='RawFormulaVal')

model = CatBoostClassifier(
    iterations=10,
    random_seed=42, 
    loss_function="Logloss",
    eval_metric=LoglossMetric()
)
# Fit model
model.fit(train_pool)
# Only prediction_type='RawFormulaVal' is allowed with custom `loss_function`
preds_raw = model.predict(X_validation, prediction_type='RawFormulaVal')

Learning rate set to 0.5
0:	learn: 0.5521578	total: 7.79ms	remaining: 70.1ms
1:	learn: 0.4885686	total: 16ms	remaining: 64.2ms
2:	learn: 0.4646498	total: 22.5ms	remaining: 52.5ms
3:	learn: 0.4433198	total: 29.7ms	remaining: 44.6ms
4:	learn: 0.4348036	total: 36.5ms	remaining: 36.5ms
5:	learn: 0.4304872	total: 43.6ms	remaining: 29.1ms
6:	learn: 0.4169664	total: 49.9ms	remaining: 21.4ms
7:	learn: 0.4067507	total: 56.6ms	remaining: 14.1ms
8:	learn: 0.4019576	total: 62.8ms	remaining: 6.98ms
9:	learn: 0.3970545	total: 69.8ms	remaining: 0us

model_kot = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model_kot

<catboost.core.CatBoostClassifier at 0x7f84603e9490>

ntree_start, ntree_end, eval_period = 3, 9, 2

predictions_iterator = model.staged_predict(validate_pool, 'Probability', ntree_start, ntree_end, eval_period)

for preds, tree_count in zip(predictions_iterator, range(ntree_start, ntree_end, eval_period)):
    print('First class probabilities using the first {} trees: {}'.format(tree_count, preds[:5, 1]))

First class probabilities using the first 3 trees: [0.42990422 0.42665956 0.4192657  0.56176543 0.4763258 ]
First class probabilities using the first 5 trees: [0.40394604 0.35310234 0.38666939 0.57518619 0.49553116]
First class probabilities using the first 7 trees: [0.39987636 0.34035878 0.3468137  0.53325091 0.54678221]

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)

feature_importances = model.get_feature_importance(train_pool)

3.7 Przewidywane etapy¶

Model CatBoost ma metodę staged_predict. Pozwala iteracyjnie uzyskać prognozy dla danego zakresu drzew.

model_kot = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model_kot

model_kot = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model_kot

<catboost.core.CatBoostClassifier at 0x7f84603e9490>

określami ilość drzew

ntree_start, ntree_end, eval_period = 3, 9, 2

ntree_start, ntree_end, eval_period = 3, 9, 2

predictions_iterator

predictions_iterator = model.staged_predict(validate_pool, 'Probability', ntree_start, ntree_end, eval_period)

predictions_iterator = model.staged_predict(validate_pool, 'Probability', ntree_start, ntree_end, eval_period)

nie wiem co on teraz robi

for preds, tree_count in zip(predictions_iterator, range(ntree_start, ntree_end, eval_period)):
    print('First class probabilities using the first {} trees: {}'.format(tree_count, preds[:5, 1]))

for preds, tree_count in zip(predictions_iterator, range(ntree_start, ntree_end, eval_period)):
    print('First class probabilities using the first {} trees: {}'.format(tree_count, preds[:5, 1]))

First class probabilities using the first 3 trees: [0.42990422 0.42665956 0.4192657  0.56176543 0.4763258 ]
First class probabilities using the first 5 trees: [0.40394604 0.35310234 0.38666939 0.57518619 0.49553116]
First class probabilities using the first 7 trees: [0.39987636 0.34035878 0.3468137  0.53325091 0.54678221]

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)

feature_importances = model.get_feature_importance(train_pool)

feature_names = X_train.columns
feature_names

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

Sex: 56.4409024798132
Pclass: 16.831468536670158
Ticket: 6.3123775952096715
Parch: 4.157791677223602
Cabin: 3.6700917688447063
Embarked: 3.595172924440488
Age: 3.532435299190085
Fare: 3.002584491481529
SibSp: 2.457175227126605
PassengerId: 0.0
Name: 0.0

X_train['Ticket']

298              19988
884    SOTON/OQ 392076
247             250649
478             350060
305             113781
            ...       
106             343120
270             113798
860             350026
435             113760
102              35281
Name: Ticket, Length: 668, dtype: object

3.8 Najważniejsze cechy¶

Czasami bardzo ważne jest, aby zrozumieć, która funkcja miała największy wpływ na końcowy wynik. Aby to zrobić, model CatBoost ma metodę get_feature_importance.

Tworzy się taki model szkieletowy jak w poprzedniej metodzie

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)

Mówi się modelowi aby: ‘get_feature_importance’

feature_importances = model.get_feature_importance(train_pool)

feature_importances = model.get_feature_importance(train_pool)

feature_names = X_train.columns
feature_names

feature_names = X_train.columns
feature_names

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

Sex: 56.4409024798132
Pclass: 16.831468536670158
Ticket: 6.3123775952096715
Parch: 4.157791677223602
Cabin: 3.6700917688447063
Embarked: 3.595172924440488
Age: 3.532435299190085
Fare: 3.002584491481529
SibSp: 2.457175227126605
PassengerId: 0.0
Name: 0.0

X_train['Ticket']

298              19988
884    SOTON/OQ 392076
247             250649
478             350060
305             113781
            ...       
106             343120
270             113798
860             350026
435             113760
102              35281
Name: Ticket, Length: 668, dtype: object

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['AUC'], plot=True)

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['Recall'], plot=True)

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['Accuracy'], plot=True)

print(eval_metrics['Accuracy'][:16])

[0.7937219730941704, 0.8026905829596412, 0.8071748878923767, 0.8071748878923767, 0.8071748878923767, 0.8026905829596412, 0.8071748878923767, 0.8071748878923767, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8071748878923767, 0.8116591928251121, 0.8161434977578476]

model1 = CatBoostClassifier(iterations=1000, depth=1, train_dir='model_depth_1/', logging_level='Silent')
model1.fit(train_pool, eval_set=validate_pool)
model2 = CatBoostClassifier(iterations=1000, depth=5, train_dir='model_depth_5/', logging_level='Silent')
model2.fit(train_pool, eval_set=validate_pool);

To pokazuje, że funkcje Sex i Pclass miały największy wpływ na wynik.

co ciekawe wcale zadeklarowałem w ‘train_pool’, w które zmienne są dyskretne a model sam je sobie zcyfryzował

X_train['Ticket']

X_train['Ticket']

298              19988
884    SOTON/OQ 392076
247             250649
478             350060
305             113781
            ...       
106             343120
270             113798
860             350026
435             113760
102              35281
Name: Ticket, Length: 668, dtype: object

3.9 Wskaźniki oceny¶¶

CatBoost ma metodę eval_metrics, która pozwala obliczyć dane metryki dla danego zestawu danych. I oczywiście je narysować 🙂

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['AUC'], plot=True)

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['AUC'], plot=True)

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['Recall'], plot=True)

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['Accuracy'], plot=True)

print(eval_metrics['Accuracy'][:16])

[0.7937219730941704, 0.8026905829596412, 0.8071748878923767, 0.8071748878923767, 0.8071748878923767, 0.8026905829596412, 0.8071748878923767, 0.8071748878923767, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8071748878923767, 0.8116591928251121, 0.8161434977578476]

model1 = CatBoostClassifier(iterations=1000, depth=1, train_dir='model_depth_1/', logging_level='Silent')
model1.fit(train_pool, eval_set=validate_pool)
model2 = CatBoostClassifier(iterations=1000, depth=5, train_dir='model_depth_5/', logging_level='Silent')
model2.fit(train_pool, eval_set=validate_pool);

from catboost import MetricVisualizer
widget = MetricVisualizer(['model_depth_1', 'model_depth_5'])
widget.start()

model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model.save_model('catboost_model.dump')
model = CatBoostClassifier()
model.load_model('catboost_model.dump');

import hyperopt

def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric='Accuracy',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )
    
    cv_data = cv(
        Pool(X, y, cat_features=categorical_features_indices),
        model.get_params()
    )
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    
    return 1 - best_accuracy # as hyperopt minimises

from numpy.random import RandomState

params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)

Można testować więcej wskaźników: https://catboost.ai/docs/search/?query=%27Accuracy%27

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['Recall'], plot=True)

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['Recall'], plot=True)

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['Accuracy'], plot=True)

print(eval_metrics['Accuracy'][:16])

[0.7937219730941704, 0.8026905829596412, 0.8071748878923767, 0.8071748878923767, 0.8071748878923767, 0.8026905829596412, 0.8071748878923767, 0.8071748878923767, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8071748878923767, 0.8116591928251121, 0.8161434977578476]

model1 = CatBoostClassifier(iterations=1000, depth=1, train_dir='model_depth_1/', logging_level='Silent')
model1.fit(train_pool, eval_set=validate_pool)
model2 = CatBoostClassifier(iterations=1000, depth=5, train_dir='model_depth_5/', logging_level='Silent')
model2.fit(train_pool, eval_set=validate_pool);

from catboost import MetricVisualizer
widget = MetricVisualizer(['model_depth_1', 'model_depth_5'])
widget.start()

model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model.save_model('catboost_model.dump')
model = CatBoostClassifier()
model.load_model('catboost_model.dump');

import hyperopt

def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric='Accuracy',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )
    
    cv_data = cv(
        Pool(X, y, cat_features=categorical_features_indices),
        model.get_params()
    )
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    
    return 1 - best_accuracy # as hyperopt minimises

from numpy.random import RandomState

params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-84-89204c19c526> in <module>
      2 
      3 params_space = {
----> 4     'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
      5     'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
      6 }

NameError: name 'hyperopt' is not defined

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['Accuracy'], plot=True)

model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['Accuracy'], plot=True)

print(eval_metrics['Accuracy'][:16])

[0.7937219730941704, 0.8026905829596412, 0.8071748878923767, 0.8071748878923767, 0.8071748878923767, 0.8026905829596412, 0.8071748878923767, 0.8071748878923767, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8071748878923767, 0.8116591928251121, 0.8161434977578476]

model1 = CatBoostClassifier(iterations=1000, depth=1, train_dir='model_depth_1/', logging_level='Silent')
model1.fit(train_pool, eval_set=validate_pool)
model2 = CatBoostClassifier(iterations=1000, depth=5, train_dir='model_depth_5/', logging_level='Silent')
model2.fit(train_pool, eval_set=validate_pool);

from catboost import MetricVisualizer
widget = MetricVisualizer(['model_depth_1', 'model_depth_5'])
widget.start()

model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model.save_model('catboost_model.dump')
model = CatBoostClassifier()
model.load_model('catboost_model.dump');

import hyperopt

def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric='Accuracy',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )
    
    cv_data = cv(
        Pool(X, y, cat_features=categorical_features_indices),
        model.get_params()
    )
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    
    return 1 - best_accuracy # as hyperopt minimises

from numpy.random import RandomState

params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-84-89204c19c526> in <module>
      2 
      3 params_space = {
----> 4     'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
      5     'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
      6 }

NameError: name 'hyperopt' is not defined

model = CatBoostClassifier(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=500,
    eval_metric='Accuracy',
    random_seed=42,
    verbose=False,
    loss_function='Logloss',
)
cv_data = cv(Pool(X, y, cat_features=categorical_features_indices), model.get_params())

print(eval_metrics['Accuracy'][:16])

print(eval_metrics['Accuracy'][:16])

[0.7937219730941704, 0.8026905829596412, 0.8071748878923767, 0.8071748878923767, 0.8071748878923767, 0.8026905829596412, 0.8071748878923767, 0.8071748878923767, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8116591928251121, 0.8071748878923767, 0.8116591928251121, 0.8161434977578476]

model1 = CatBoostClassifier(iterations=1000, depth=1, train_dir='model_depth_1/', logging_level='Silent')
model1.fit(train_pool, eval_set=validate_pool)
model2 = CatBoostClassifier(iterations=1000, depth=5, train_dir='model_depth_5/', logging_level='Silent')
model2.fit(train_pool, eval_set=validate_pool);

from catboost import MetricVisualizer
widget = MetricVisualizer(['model_depth_1', 'model_depth_5'])
widget.start()

model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model.save_model('catboost_model.dump')
model = CatBoostClassifier()
model.load_model('catboost_model.dump');

import hyperopt

def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric='Accuracy',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )
    
    cv_data = cv(
        Pool(X, y, cat_features=categorical_features_indices),
        model.get_params()
    )
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    
    return 1 - best_accuracy # as hyperopt minimises

from numpy.random import RandomState

params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-84-89204c19c526> in <module>
      2 
      3 params_space = {
----> 4     'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
      5     'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
      6 }

NameError: name 'hyperopt' is not defined

model = CatBoostClassifier(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=500,
    eval_metric='Accuracy',
    random_seed=42,
    verbose=False,
    loss_function='Logloss',
)
cv_data = cv(Pool(X, y, cat_features=categorical_features_indices), model.get_params())

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-85-8a762d5f7dc8> in <module>
      1 model = CatBoostClassifier(
----> 2     l2_leaf_reg=int(best['l2_leaf_reg']),
      3     learning_rate=best['learning_rate'],
      4     iterations=500,
      5     eval_metric='Accuracy',

NameError: name 'best' is not defined

3.10 Porównanie procesów uczenia się¶

Możesz także porównać proces uczenia się różnych modeli na jednym wykresie.

model1 = CatBoostClassifier(iterations=1000, depth=1, train_dir='model_depth_1/', logging_level='Silent')
model1.fit(train_pool, eval_set=validate_pool)
model2 = CatBoostClassifier(iterations=1000, depth=5, train_dir='model_depth_5/', logging_level='Silent')
model2.fit(train_pool, eval_set=validate_pool);

model1 = CatBoostClassifier(iterations=1000, depth=1, train_dir='model_depth_1/', logging_level='Silent')
model1.fit(train_pool, eval_set=validate_pool)
model2 = CatBoostClassifier(iterations=1000, depth=5, train_dir='model_depth_5/', logging_level='Silent')
model2.fit(train_pool, eval_set=validate_pool);

from catboost import MetricVisualizer
widget = MetricVisualizer(['model_depth_1', 'model_depth_5'])
widget.start()

from catboost import MetricVisualizer
widget = MetricVisualizer(['model_depth_1', 'model_depth_5'])
widget.start()

model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model.save_model('catboost_model.dump')
model = CatBoostClassifier()
model.load_model('catboost_model.dump');

import hyperopt

def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric='Accuracy',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )
    
    cv_data = cv(
        Pool(X, y, cat_features=categorical_features_indices),
        model.get_params()
    )
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    
    return 1 - best_accuracy # as hyperopt minimises

from numpy.random import RandomState

params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-84-89204c19c526> in <module>
      2 
      3 params_space = {
----> 4     'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
      5     'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
      6 }

NameError: name 'hyperopt' is not defined

model = CatBoostClassifier(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=500,
    eval_metric='Accuracy',
    random_seed=42,
    verbose=False,
    loss_function='Logloss',
)
cv_data = cv(Pool(X, y, cat_features=categorical_features_indices), model.get_params())

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-85-8a762d5f7dc8> in <module>
      1 model = CatBoostClassifier(
----> 2     l2_leaf_reg=int(best['l2_leaf_reg']),
      3     learning_rate=best['learning_rate'],
      4     iterations=500,
      5     eval_metric='Accuracy',

NameError: name 'best' is not defined

print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.8294051627384961

model_VV.fit(X, y, cat_features=categorical_features_indices)

3.11 Zapisywanie modelu¶

Zawsze bardzo przydatne jest zrzucenie modelu na dysk (szczególnie jeśli szkolenie zajęło trochę czasu).

model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model.save_model('catboost_model.dump')
model = CatBoostClassifier()
model.load_model('catboost_model.dump');

model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model.save_model('catboost_model.dump')
model = CatBoostClassifier()
model.load_model('catboost_model.dump');

hyperopt¶

Chociaż zawsze można wybrać optymalną liczbę iteracji (etapy przyspieszające) poprzez walidację krzyżową i wykresy krzywej uczenia się, ważne jest również, aby bawić się niektórymi parametrami modelu, i chcielibyśmy zwrócić szczególną uwagę na l2_leaf_reg i learning_rate.

W tej sekcji wybieramy te parametry za pomocą pakietu hyperopt.

Instalujemy to!

!pip install hyperopt¶

import hyperopt

def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric='Accuracy',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )
    
    cv_data = cv(
        Pool(X, y, cat_features=categorical_features_indices),
        model.get_params()
    )
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    
    return 1 - best_accuracy # as hyperopt minimises

import hyperopt

def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric='Accuracy',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )
    
    cv_data = cv(
        Pool(X, y, cat_features=categorical_features_indices),
        model.get_params()
    )
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    
    return 1 - best_accuracy # as hyperopt minimises

from numpy.random import RandomState

params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)

from numpy.random import RandomState

params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-84-89204c19c526> in <module>
      2 
      3 params_space = {
----> 4     'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
      5     'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
      6 }

NameError: name 'hyperopt' is not defined

model = CatBoostClassifier(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=500,
    eval_metric='Accuracy',
    random_seed=42,
    verbose=False,
    loss_function='Logloss',
)
cv_data = cv(Pool(X, y, cat_features=categorical_features_indices), model.get_params())

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-85-8a762d5f7dc8> in <module>
      1 model = CatBoostClassifier(
----> 2     l2_leaf_reg=int(best['l2_leaf_reg']),
      3     learning_rate=best['learning_rate'],
      4     iterations=500,
      5     eval_metric='Accuracy',

NameError: name 'best' is not defined

print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.8294051627384961

model_VV.fit(X, y, cat_features=categorical_features_indices)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-87-73c5b8ae99aa> in <module>
----> 1 model_VV.fit(X, y, cat_features=categorical_features_indices)

NameError: name 'model_VV' is not defined

import pandas as pd
submisstion = pd.DataFrame()
#submisstion['PassengerId'] = X_train['PassengerId']
#submisstion['Survived'] = model.predict(X_train)

submisstion.to_csv('submission.csv', index=False)

‘l2_leaf_reg’ Współczynnik na poziomie regularyzacji L2 funkcji kosztu. Każda wartość dodatnia jest dozwolona.

Iteracje i szybkość uczenia się (Iterations and learning rate)¶

Domyślnie CatBoost buduje 1000 drzew. Liczbę iteracji można zmniejszyć, aby przyspieszyć trening.

Gdy liczba iteracji maleje, należy zwiększyć szybkość uczenia się. Domyślnie wartość szybkości uczenia się jest definiowana automatycznie w zależności od liczby iteracji i wejściowego zestawu danych. Zmiana liczby iteracji na mniejszą wartość jest dobrym punktem wyjścia do optymalizacji.

Teraz (po znalezieniu optymalnych parametrów: ‘l2_leaf_reg’ i ‘learning_rate’, zdobądźmy wszystkie dane CV z najlepszymi parametrami:¶

model = CatBoostClassifier(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=500,
    eval_metric='Accuracy',
    random_seed=42,
    verbose=False,
    loss_function='Logloss',
)
cv_data = cv(Pool(X, y, cat_features=categorical_features_indices), model.get_params())

model = CatBoostClassifier(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=500,
    eval_metric='Accuracy',
    random_seed=42,
    verbose=False,
    loss_function='Logloss',
)
cv_data = cv(Pool(X, y, cat_features=categorical_features_indices), model.get_params())

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-85-8a762d5f7dc8> in <module>
      1 model = CatBoostClassifier(
----> 2     l2_leaf_reg=int(best['l2_leaf_reg']),
      3     learning_rate=best['learning_rate'],
      4     iterations=500,
      5     eval_metric='Accuracy',

NameError: name 'best' is not defined

print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.8294051627384961

model_VV.fit(X, y, cat_features=categorical_features_indices)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-87-73c5b8ae99aa> in <module>
----> 1 model_VV.fit(X, y, cat_features=categorical_features_indices)

NameError: name 'model_VV' is not defined

import pandas as pd
submisstion = pd.DataFrame()
#submisstion['PassengerId'] = X_train['PassengerId']
#submisstion['Survived'] = model.predict(X_train)

submisstion.to_csv('submission.csv', index=False)

print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.8294051627384961

model_VV.fit(X, y, cat_features=categorical_features_indices)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-87-73c5b8ae99aa> in <module>
----> 1 model_VV.fit(X, y, cat_features=categorical_features_indices)

NameError: name 'model_VV' is not defined

import pandas as pd
submisstion = pd.DataFrame()
#submisstion['PassengerId'] = X_train['PassengerId']
#submisstion['Survived'] = model.predict(X_train)

submisstion.to_csv('submission.csv', index=False)

Przypomnijmy, że przy domyślnych parametrach wynik cv wyniósł 0,8283, a zatem mamy (prawdopodobnie nieistotną statystycznie) pewną poprawę.

Prześlij na konkurs¶

Teraz zmienilibyśmy nasz dostrojony model na wszystkich danych treningowych, które mamy

model_VV.fit(X, y, cat_features=categorical_features_indices)

model_VV.fit(X, y, cat_features=categorical_features_indices)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-87-73c5b8ae99aa> in <module>
----> 1 model_VV.fit(X, y, cat_features=categorical_features_indices)

NameError: name 'model_VV' is not defined

import pandas as pd
submisstion = pd.DataFrame()
#submisstion['PassengerId'] = X_train['PassengerId']
#submisstion['Survived'] = model.predict(X_train)

submisstion.to_csv('submission.csv', index=False)

Na koniec przygotujmy plik zgłoszenia:

import pandas as pd
submisstion = pd.DataFrame()
#submisstion['PassengerId'] = X_train['PassengerId']
#submisstion['Survived'] = model.predict(X_train)

import pandas as pd
submisstion = pd.DataFrame()
#submisstion['PassengerId'] = X_train['PassengerId']
#submisstion['Survived'] = model.predict(X_train)

submisstion.to_csv('submission.csv', index=False)

submisstion.to_csv('submission.csv', index=False)

Wreszcie możesz złożyć zgłoszenie w konkursie Titanic Kaggle.

Otóż to! Teraz możesz grać z CatBoost i wygrywać niektóre konkursy! 🙂

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th…	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	iterations	test-Logloss-mean	test-Logloss-std	train-Logloss-mean	train-Logloss-std	test-Accuracy-mean	test-Accuracy-std	train-Accuracy-mean	train-Accuracy-std
0	0	0.675761	0.001280	0.675172	0.002037	0.789001	0.018544	0.808642	0.007956
1	1	0.658254	0.002262	0.656563	0.003342	0.796857	0.023888	0.817621	0.014119

	PassengerId	Pclass	Name	Sex	Age	Parch	Ticket	Fare	Cabin	Embarked
298	299	1	Saalfeld, Mr. Adolphe	male	-777.0	0	19988	30.50	C106	S
884	885	3	Sutehall, Mr. Henry Jr	male	25.0	0	SOTON/OQ 392076	7.05	-777	S
247	248	2	Hamalainen, Mrs. William (Anna)	female	24.0	2	250649	14.50	-777	S