import pandas as pd
df = pd.read_csv('/home/wojciech/Pulpit/3/BikeSharing.csv')
print(df.shape)
df.head(3)
cnt: count of total rental bikes including both casual and registered
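A quick sanity check (a sketch, assuming the standard bike-sharing column names) confirms that cnt is simply the sum of the two components:
# every complete row should satisfy casual + registered == cnt
chk = df[['casual', 'registered', 'cnt']].dropna()
assert (chk['casual'] + chk['registered'] == chk['cnt']).all()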
I fill all the holes with out-of-range values.
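A minimal sketch of that idea: fill the gaps with a sentinel far outside the real value range, so imputed cells stay identifiable (the value -777 is an assumption; any value that cannot occur naturally would do):
# sentinel imputation: -777 never occurs in this data, so filled cells are easy to spot
df_filled = df.fillna(-777)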
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10,6))
CORREL = df.corr()
sns.heatmap(CORREL, annot=True, cbar=False, cmap="coolwarm")
plt.title('Correlation matrix with the outcome variable y', fontsize=20)
import matplotlib.pyplot as plt
plt.figure(figsize=(10,6))
CORREL['cnt'].plot(kind='barh', color='red')
plt.title('Correlation with the outcome variable', fontsize=20)
plt.xlabel('Correlation level')
plt.ylabel('Continuous independent variables')
The variables 'registered' and 'casual' are just the outcome shown in a different form, so they must be removed from the data.
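Their overlap with the target is easy to verify (a sketch):
# both columns are components of cnt, so these correlations are inflated by construction
print(df[['casual', 'registered']].corrwith(df['cnt']))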
a, b = df.shape  # <- how many columns we have
b
print('NUMBER OF EMPTY RECORDS vs. FULL RECORDS')
print('----------------------------------------')
for i in range(1, b):
    col = df.columns[i]
    r = df[col].isnull().sum()   # number of missing values
    h = df[col].count()          # number of non-missing values
    pr = (r/h)*100
    if r > 0:
        print(col, "--------", r, "--------", h, "--------", pr)
import seaborn as sns
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
#del df['Unnamed: 15']
#del df['Unnamed: 16']
df = df.dropna(how='any')  # in the end I simply drop the rows with holes
# df.fillna(-777, inplace=True)
df.isnull().sum()
print(df.dtypes)
df.head(3)
to_datetime
df['dteday'] = pd.to_datetime(df['dteday'])
df['weekday'] = df.dteday.dt.weekday
df['month'] = df.dteday.dt.month
# Series.dt.weekofyear is deprecated in newer pandas; isocalendar().week is the modern equivalent
df['weekofyear'] = df.dteday.dt.isocalendar().week
del df['dteday']
print(df.dtypes)
df.head(3)
Encode text values
import numpy as np
a, b = df.shape  # <- how many columns we have
b
print('DISCRETE FEATURES ENCODED')
print('-------------------------')
for i in range(1, b):
    col = df.columns[i]
    f = df[col].dtypes
    if f == object:  # np.object is deprecated in recent numpy; plain object is equivalent
        print(col, "---", f)
        df[col] = pd.Categorical(df[col]).codes

df['Time'] = pd.Categorical(df['Time']).codes
df['Time'] = df['Time'].astype(int)
df.dtypes
df.columns
I specify what is X and what is y
X = df.drop(['cnt','registered','casual'], axis=1)
y = df['cnt']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)
The plot_regression_results function is used to plot the predicted versus the true targets.
import matplotlib.pyplot as plt

def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
    """Scatter plot of the predicted vs true targets."""
    # reference line: perfect predictions lie on the diagonal
    ax.plot([y_true.min(), y_true.max()],
            [y_true.min(), y_true.max()],
            '--r', linewidth=2)
    ax.scatter(y_true, y_pred, alpha=0.2)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    ax.set_xlim([y_true.min(), y_true.max()])
    ax.set_ylim([y_true.min(), y_true.max()])
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    # empty rectangle used as a legend handle so the score text can be placed on the axes
    extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False,
                          edgecolor='none', linewidth=0)
    ax.legend([extra], [scores], loc='upper left')
    title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time)
    ax.set_title(title)
The performance of a stacked ensemble is usually close to that of the best single model, and it can sometimes outperform the predictive performance of every individual model.
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
estimators = [
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Lasso', LassoCV()),
    ('Gradient Boosting', HistGradientBoostingRegressor(random_state=0))
]
stacking_regressor = StackingRegressor(
    estimators=estimators, final_estimator=RidgeCV()
)
import time
import numpy as np
from sklearn.model_selection import cross_validate, cross_val_predict

# evaluate on the bike-sharing train/test split created earlier
fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)
for ax, (name, est) in zip(axs, estimators + [('Stacking Regressor',
                                               stacking_regressor)]):
    start_time = time.time()
    score = cross_validate(est, X_train, y_train,
                           scoring=['r2', 'neg_mean_absolute_error'],
                           n_jobs=-1, verbose=0)
    elapsed_time = time.time() - start_time

    y_pred = cross_val_predict(est, X_test, y_test, n_jobs=-1, verbose=0)
    plot_regression_results(
        ax, y_test, y_pred,
        name,
        (r'$R^2={:.2f} \pm {:.2f}$' + '\n' + r'$MAE={:.2f} \pm {:.2f}$')
        .format(np.mean(score['test_r2']),
                np.std(score['test_r2']),
                -np.mean(score['test_neg_mean_absolute_error']),
                np.std(score['test_neg_mean_absolute_error'])),
        elapsed_time)

plt.suptitle('Single predictors versus stacked predictors')
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()
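To see how the meta-learner combines the base models, one can also fit the stack once and read off the RidgeCV weights (a sketch; final_estimator_ is the fitted meta-model that scikit-learn exposes after fit):
# one coefficient per base estimator, in the order they were listed above
stacking_regressor.fit(X_train, y_train)
print(stacking_regressor.final_estimator_.coef_)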
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import RidgeCV
estimators = [
    ('knn', KNeighborsRegressor(n_neighbors=5)),
    ('omp', OrthogonalMatchingPursuit()),
    ('svr', SVR(kernel='rbf', C=1e3, gamma=0.1))
]
stacking_regressor = StackingRegressor(
    estimators=estimators, final_estimator=RidgeCV()
)
import time
import numpy as np
from sklearn.model_selection import cross_validate, cross_val_predict

# evaluate on the bike-sharing train/test split created earlier
fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)
for ax, (name, est) in zip(axs, estimators + [('Stacking Regressor',
                                               stacking_regressor)]):
    start_time = time.time()
    score = cross_validate(est, X_train, y_train,
                           scoring=['r2', 'neg_mean_absolute_error'],
                           n_jobs=-1, verbose=0)
    elapsed_time = time.time() - start_time

    y_pred = cross_val_predict(est, X_test, y_test, n_jobs=-1, verbose=0)
    plot_regression_results(
        ax, y_test, y_pred,
        name,
        (r'$R^2={:.2f} \pm {:.2f}$' + '\n' + r'$MAE={:.2f} \pm {:.2f}$')
        .format(np.mean(score['test_r2']),
                np.std(score['test_r2']),
                -np.mean(score['test_neg_mean_absolute_error']),
                np.std(score['test_neg_mean_absolute_error'])),
        elapsed_time)

plt.suptitle('Single predictors versus stacked predictors')
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()
I don't run the code below to completion, because the SVR takes forever to compute.
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.svm import SVR

estimators = [
    ('SVR', SVR(kernel='linear')),
    ('Ridge', Ridge(random_state=1)),
    ('svr_rbf', SVR(kernel='rbf'))
]
stacking_regressor = StackingRegressor(
    estimators=estimators, final_estimator=RidgeCV()
)

import time
import numpy as np
from sklearn.model_selection import cross_validate, cross_val_predict

# evaluate on the bike-sharing train/test split created earlier
fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)
for ax, (name, est) in zip(axs, estimators + [('Stacking Regressor',
                                               stacking_regressor)]):
    start_time = time.time()
    score = cross_validate(est, X_train, y_train,
                           scoring=['r2', 'neg_mean_absolute_error'],
                           n_jobs=-1, verbose=0)
    elapsed_time = time.time() - start_time

    y_pred = cross_val_predict(est, X_test, y_test, n_jobs=-1, verbose=0)
    plot_regression_results(
        ax, y_test, y_pred,
        name,
        (r'$R^2={:.2f} \pm {:.2f}$' + '\n' + r'$MAE={:.2f} \pm {:.2f}$')
        .format(np.mean(score['test_r2']),
                np.std(score['test_r2']),
                -np.mean(score['test_neg_mean_absolute_error']),
                np.std(score['test_neg_mean_absolute_error'])),
        elapsed_time)

plt.suptitle('Single predictors versus stacked predictors')
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()
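Since the SVR models are the bottleneck here, one possible workaround (a sketch, not something run in this notebook) is LinearSVR, which uses the liblinear solver and scales far better with the number of samples:
from sklearn.svm import LinearSVR
# rough stand-in for SVR(kernel='linear'); the loss and regularization details differ slightly
fast_svr = LinearSVR(C=1.0, max_iter=10000)
fast_svr.fit(X_train, y_train)
print(fast_svr.score(X_test, y_test))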