Multioutput Stacking Regressor


In [1]:
import pandas as pd

df = pd.read_csv('/home/wojciech/Pulpit/3/BikeSharing.csv')
print(df.shape)
df.head(3)
(17379, 17)
Out[1]:
instant dteday season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
0 1 2011-01-01 1 0 1 0 0 6 0 1 0.24 0.2879 0.81 0.0 3 13 16
1 2 2011-01-01 1 0 1 1 0 6 0 1 0.22 0.2727 0.80 0.0 8 32 40
2 3 2011-01-01 1 0 1 2 0 6 0 1 0.22 0.2727 0.80 0.0 5 27 32

cnt: count of total rental bikes including both casual and registered

I fill any holes with out-of-range values (although, as the code below shows, I ultimately drop those rows instead).
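A minimal sketch of that sentinel idea (the -777 value mirrors the commented-out fillna line further down; it assumes no real feature ever takes that value):

import pandas as pd

# Toy frame with a missing humidity value; -777 lies outside the real
# [0, 1] range, so "missing" becomes its own recognizable level.
toy = pd.DataFrame({'hum': [0.81, None, 0.80]})
toy['hum'] = toy['hum'].fillna(-777)
print(toy)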

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10,6))
CORREL = df.corr()
sns.heatmap(CORREL, annot=True, cbar=False, cmap="coolwarm")
plt.title('Correlation matrix with the outcome variable y', fontsize=20)
Out[2]:
Text(0.5, 1, 'Correlation matrix with the outcome variable y')
In [3]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10,6))
CORREL['cnt'].plot(kind='barh', color='red')
plt.title('Correlation with the outcome variable', fontsize=20)
plt.xlabel('Correlation level')
plt.ylabel('Continuous independent variables')
Out[3]:
Text(0, 0.5, 'Continuous independent variables')

The variables 'registered' and 'casual' are themselves just the outcome presented differently, so they must be removed from the data.

In [4]:
a, b = df.shape     # <- how many columns we have

print('NUMBER OF EMPTY RECORDS vs. FULL RECORDS')
print('----------------------------------------')
for col in df.columns[1:]:
    r = df[col].isnull().sum()    # number of empty records
    h = df[col].count()           # number of full records
    pr = (r / h) * 100            # percentage of empty records

    if r > 0:
        print(col, "--------", r, "--------", h, "--------", pr)
NUMBER OF EMPTY RECORDS vs. FULL RECORDS
----------------------------------------
In [5]:
import seaborn as sns

sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3c72a00bd0>
In [6]:
#del df['Unnamed: 15']
#del df['Unnamed: 16']

df = df.dropna(how='any') # in the end I simply drop those holes

# df.fillna(-777, inplace=True)
df.isnull().sum()
Out[6]:
instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64
In [7]:
print(df.dtypes)
df.head(3)
instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object
Out[7]:
instant dteday season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
0 1 2011-01-01 1 0 1 0 0 6 0 1 0.24 0.2879 0.81 0.0 3 13 16
1 2 2011-01-01 1 0 1 1 0 6 0 1 0.22 0.2727 0.80 0.0 8 32 40
2 3 2011-01-01 1 0 1 2 0 6 0 1 0.22 0.2727 0.80 0.0 5 27 32

to_datetime

In [8]:
df['dteday'] = pd.to_datetime(df['dteday'])
df['weekday'] = df.dteday.dt.weekday
df['month'] = df.dteday.dt.month
df['weekofyear'] = df.dteday.dt.weekofyear
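
Compatibility note: Series.dt.weekofyear was deprecated in later pandas releases and removed in pandas 2.0. A sketch of the modern equivalent, assuming a recent pandas:

# isocalendar() returns ISO year/week/day; keep the week number as int
df['weekofyear'] = df['dteday'].dt.isocalendar().week.astype(int)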
In [9]:
del df['dteday']
In [10]:
print(df.dtypes)
df.head(3)
instant         int64
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
month           int64
weekofyear      int64
dtype: object
Out[10]:
instant season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt month weekofyear
0 1 1 0 1 0 0 5 0 1 0.24 0.2879 0.81 0.0 3 13 16 1 52
1 2 1 0 1 1 0 5 0 1 0.22 0.2727 0.80 0.0 8 32 40 1 52
2 3 1 0 1 2 0 5 0 1 0.22 0.2727 0.80 0.0 5 27 32 1 52

I encode the text values.

In [11]:
import numpy as np

a, b = df.shape     # <- how many columns we have

print('DISCRETE FUNCTIONS CODED')
print('------------------------')
for col in df.columns[1:]:
    f = df[col].dtypes
    if f == object:               # np.object is deprecated; plain object suffices
        print(col, "---", f)
        df[col] = pd.Categorical(df[col]).codes
DISCRETE FUNCTIONS CODED
------------------------

No columns are listed because the only text column, dteday, was already removed. The same idiom would encode any remaining text column, e.g. a hypothetical 'Time' column:

df['Time'] = pd.Categorical(df['Time']).codes
df['Time'] = df['Time'].astype(int)

In [12]:
df.dtypes
Out[12]:
instant         int64
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
month           int64
weekofyear      int64
dtype: object
In [13]:
df.columns
Out[13]:
Index(['instant', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt', 'month', 'weekofyear'],
      dtype='object')

I specify what is X and what is y.

In [14]:
X = df.drop(['cnt', 'registered', 'casual'], axis=1)
y = df['cnt']
In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

The plot_regression_results function plots the predicted versus the true targets.

In [16]:
import matplotlib.pyplot as plt


def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
    """Scatter plot of the predicted vs true targets."""
    ax.plot([y_true.min(), y_true.max()],
            [y_true.min(), y_true.max()],
            '--r', linewidth=2)
    ax.scatter(y_true, y_pred, alpha=0.2)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    ax.set_xlim([y_true.min(), y_true.max()])
    ax.set_ylim([y_true.min(), y_true.max()])
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False,
                          edgecolor='none', linewidth=0)
    ax.legend([extra], [scores], loc='upper left')
    title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time)
    ax.set_title(title)

Stacking performance is usually close to that of the best single model, and it can sometimes exceed the predictive performance of each individual model.

In [17]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

estimators = [
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Lasso', LassoCV()),
    ('Gradient Boosting', HistGradientBoostingRegressor(random_state=0))
]
stacking_regressor = StackingRegressor(
    estimators=estimators, final_estimator=RidgeCV()
)
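
As a quick sanity check (not in the original notebook), the stack can be fitted directly on the bike-sharing split from In [15]; a hedged usage sketch:

# Fit on the training split, then report R^2 on the held-out test split.
stacking_regressor.fit(X_train, y_train)
print(stacking_regressor.score(X_test, y_test))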
In [18]:
import time
import numpy as np
from sklearn.model_selection import cross_validate, cross_val_predict

# The sklearn demo this cell is based on loads load_boston here; that line is
# dropped so X_train/y_train remain the bike-sharing split from In [15].

fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)

for ax, (name, est) in zip(axs, estimators + [('Stacking Regressor',
                                               stacking_regressor)]):
    start_time = time.time()
    score = cross_validate(est, X_train, y_train,
                           scoring=['r2', 'neg_mean_absolute_error'],
                           n_jobs=-1, verbose=0)
    elapsed_time = time.time() - start_time

    y_pred = cross_val_predict(est, X_test, y_test, n_jobs=-1, verbose=0)
    plot_regression_results(
        ax, y_test, y_pred,
        name,
        (r'$R^2={:.2f} \pm {:.2f}$' + '\n' + r'$MAE={:.2f} \pm {:.2f}$')
        .format(np.mean(score['test_r2']),
                np.std(score['test_r2']),
                -np.mean(score['test_neg_mean_absolute_error']),
                np.std(score['test_neg_mean_absolute_error'])),
        elapsed_time)

plt.suptitle('Single predictors versus stacked predictors')
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()
In [19]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import RidgeCV


estimators = [
    ('knn',KNeighborsRegressor(n_neighbors=5)),
    ('omp', OrthogonalMatchingPursuit()),
    ('svr', SVR(kernel='rbf', C=1e3, gamma=0.1))
]
stacking_regressor = StackingRegressor(
    estimators=estimators, final_estimator=RidgeCV()
)
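
The notebook's title promises multioutput regression, and MultiOutputRegressor is imported above but never applied. A minimal sketch, assuming we predict the two component targets 'casual' and 'registered' jointly (my assumption; the original only models 'cnt'):

from sklearn.model_selection import train_test_split

# Hypothetical multioutput variant: a 2-column target frame.
Y = df[['casual', 'registered']]
X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.20, random_state=123)

# One stacked model is fitted per target column (slow with the SVR base learner).
multi_stack = MultiOutputRegressor(stacking_regressor)
multi_stack.fit(X_tr, Y_tr)
print(multi_stack.score(X_te, Y_te))  # R^2 averaged over the two outputs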
In [20]:
import time
import numpy as np
from sklearn.model_selection import cross_validate, cross_val_predict

fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)

for ax, (name, est) in zip(axs, estimators + [('Stacking Regressor',
                                               stacking_regressor)]):
    start_time = time.time()
    score = cross_validate(est, X_train, y_train,
                           scoring=['r2', 'neg_mean_absolute_error'],
                           n_jobs=-1, verbose=0)
    elapsed_time = time.time() - start_time

    y_pred = cross_val_predict(est, X_test, y_test, n_jobs=-1, verbose=0)
    plot_regression_results(
        ax, y_test, y_pred,
        name,
        (r'$R^2={:.2f} \pm {:.2f}$' + '\n' + r'$MAE={:.2f} \pm {:.2f}$')
        .format(np.mean(score['test_r2']),
                np.std(score['test_r2']),
                -np.mean(score['test_neg_mean_absolute_error']),
                np.std(score['test_neg_mean_absolute_error'])),
        elapsed_time)

plt.suptitle('Single predictors versus stacked predictors')
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()

I don't run this code any further because the SVR takes forever to compute.

from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV   # was missing from the original imports
from sklearn.svm import SVR

estimators = [
    ('SVR', SVR(kernel='linear')),
    ('Ridge', Ridge(random_state=1)),
    ('svr_rbf', SVR(kernel='rbf'))
]
stacking_regressor = StackingRegressor(
    estimators=estimators, final_estimator=RidgeCV()
)

import time
import numpy as np
from sklearn.model_selection import cross_validate, cross_val_predict

fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)

for ax, (name, est) in zip(axs, estimators + [('Stacking Regressor',
                                               stacking_regressor)]):
    start_time = time.time()
    score = cross_validate(est, X_train, y_train,
                           scoring=['r2', 'neg_mean_absolute_error'],
                           n_jobs=-1, verbose=0)
    elapsed_time = time.time() - start_time

    y_pred = cross_val_predict(est, X_test, y_test, n_jobs=-1, verbose=0)
    plot_regression_results(
        ax, y_test, y_pred,
        name,
        (r'$R^2={:.2f} \pm {:.2f}$' + '\n' + r'$MAE={:.2f} \pm {:.2f}$')
        .format(np.mean(score['test_r2']),
                np.std(score['test_r2']),
                -np.mean(score['test_neg_mean_absolute_error']),
                np.std(score['test_neg_mean_absolute_error'])),
        elapsed_time)

plt.suptitle('Single predictors versus stacked predictors')
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()
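
If you still want SVR-like behavior without the wait, one possible workaround (my suggestion, not in the original notebook) is to swap in LinearSVR, which scales far better with the number of samples:

from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# LinearSVR trains in roughly linear time, unlike kernel SVR;
# the C and max_iter values here are illustrative assumptions.
fast_svr = make_pipeline(StandardScaler(), LinearSVR(C=1.0, max_iter=10000))
fast_svr.fit(X_train, y_train)
print(fast_svr.score(X_test, y_test))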
