
Feel free to read the code on GitHub
data source: https://archive.ics.uci.edu/ml/datasets/Air+Quality
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)
In [2]:
# Load the semicolon-separated Air Quality export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — the notebook only
# runs where this exact file exists; consider a path relative to the notebook.
csv_path = '/home/wojciech/Pulpit/1/AirQualityUCI.csv'
df = pd.read_csv(csv_path, sep=';')
df.head(3)
Out[2]:
Replace the -200 values, which mark a data error, with NaN¶
In [3]:
# -200 marks a data error and is turned into NaN so that the affected rows can
# be dropped later.
# The columns that are later processed with `.str.replace(',', '.')` are still
# text at this point, so the marker has to be matched both as the number -200
# and as the string '-200'. Matching only the number leaves the marker in the
# text columns, where it later becomes an ordinary -200.0 reading.
# NOTE(review): assumes the text marker is spelled exactly '-200' in the CSV — confirm.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0.
sensor_columns = [
    'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)',
    'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
    'T', 'RH', 'AH',
]
df[sensor_columns] = df[sensor_columns].replace([-200, '-200'], np.nan)
Remove unneeded columns and records with missing values¶
In [4]:
# Remove the NMHC(GT) column and the two unnamed columns in a single step.
df = df.drop(columns=['NMHC(GT)', 'Unnamed: 15', 'Unnamed: 16'])
print(df.shape)
# Missing values per column BEFORE the rows are dropped. A bare expression in
# the middle of a cell is not displayed, so it has to be printed explicitly.
print(df.isnull().sum())
# Keep only rows in which every remaining column has a value.
df = df.dropna(how='any')
print(df.shape)
# Sanity check: every count should now be zero.
print(df.isnull().sum())
Convert the variables to numeric values¶
In [5]:
# Show the dtype of every column before the conversion to numeric values below.
print(df.dtypes)
In [6]:
# These five columns are handled as text and use a decimal comma; swap it for a
# decimal point so that they can be cast to float afterwards.
# One loop replaces five copy-pasted cells. `regex=False` states explicitly
# that ',' is a literal, because the default of `regex` differs between pandas
# versions.
for column in ['CO(GT)', 'C6H6(GT)', 'T', 'RH', 'AH']:
    df[column] = df[column].str.replace(',', '.', regex=False)
In [11]:
# Cast all measurement columns to float in a single assignment.
numeric_columns = [
    'CO(GT)', 'PT08.S1(CO)', 'C6H6(GT)', 'PT08.S2(NMHC)',
    'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
    'T', 'RH', 'AH',
]
df[numeric_columns] = df[numeric_columns].astype(float)
We choose the subset of variables to analyse¶
In [12]:
# Working frame: four explanatory columns plus C6H6(GT), which becomes the target.
# `.copy()` gives df2 its own data; a later cell assigns to df2['C6H6(GT)'], and
# without the copy that assignment is made on a slice of df
# (SettingWithCopyWarning, currently hidden by the global warning filter).
df2 = df[['PT08.S4(NO2)','PT08.S3(NOx)','PT08.S2(NMHC)','AH','C6H6(GT)']].copy()
Encode the target variable C6H6(GT) as a binary category¶
In [13]:
# Value range of C6H6(GT), followed by a plot of its distribution.
c6h6 = df2['C6H6(GT)']
print('max:', c6h6.max())
print('min:', c6h6.min())
sns.distplot(df['C6H6(GT)'].values, color='#999999')
Out[13]:
In [14]:
# Binarise the target: 1 where C6H6(GT) is greater than 10, otherwise 0.
df2['C6H6(GT)'] = df['C6H6(GT)'].gt(10).astype('int64')
# Pie chart of the share of each class.
class_share = df2['C6H6(GT)'].value_counts(dropna=False, normalize=True)
class_share.plot(kind='pie', colors=['#b7b7b7','#ea9999'])
Out[14]:
In [15]:
# Separate the target column from the remaining (explanatory) columns.
target_column = 'C6H6(GT)'
y = df2[target_column]
X = df2.drop(columns=[target_column])
Data for the chart¶
In [16]:
# The plotting cells index the target as y['C6H6(GT)'], so y has to be a
# one-column DataFrame. Building it from df2 instead of `y = y.to_frame()` keeps
# the cell re-runnable: a second `.to_frame()` call on the resulting DataFrame
# raises AttributeError.
y = df2[['C6H6(GT)']]
y.head(5)
Out[16]:
In [17]:
# Preview the first three rows of the working frame.
df2.head(3)
Out[17]:
Classification chart¶
In [18]:
# For every column of df2, overlay the distribution of the rows labelled 0
# and the rows labelled 1, one subplot per column on a 6x4 grid.
fig = plt.figure(figsize=(20, 25))
for position, column in enumerate(df2.columns, start=1):
    plt.subplot(6, 4, position)
    for class_label, colour in ((0, '#999999'), (1, '#ff0000')):
        sns.distplot(
            df2[column][y['C6H6(GT)'] == class_label],
            color=colour,
            label=str(class_label),
        )
    plt.legend(loc='best', fontsize=10)

fig.suptitle('Air pollution with C6H6 substance', fontsize=34, color='#ff0000', alpha=0.3)
fig.tight_layout()
# Leave room at the top so the title does not overlap the first row.
fig.subplots_adjust(top=0.95)
plt.show()
Definition of the plotting function¶
In [19]:
def scientist_plot(data, y, AAA, Title):
    """Plot, for each column of ``data``, the distributions of the two classes.

    One subplot is drawn per column of ``data``; each subplot overlays the
    values of the rows labelled 0 ("acceptable norm") and the rows labelled 1
    ("norm exceeded").

    Parameters
    ----------
    data : DataFrame
        Columns to plot. The columns are taken from this argument; previously
        the function iterated over the global ``df2`` and ignored which frame
        was passed in.
    y : DataFrame
        Holds the class labels; must be index-aligned with ``data``.
    AAA : str
        Name of the column in ``y`` that contains the 0/1 labels.
    Title : str
        Title placed above the whole figure.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    columns = list(data.columns)
    grid_cols = 4
    # Keep the original 6-row layout, but add rows when ``data`` has more than
    # 24 columns (a fixed 6x4 grid would make plt.subplot raise).
    grid_rows = max(6, -(-len(columns) // grid_cols))

    # The class masks do not depend on the column, so compute them once.
    within_norm = y[AAA] == 0
    norm_exceeded = y[AAA] == 1

    fig = plt.figure(figsize=(20, 25))
    for position, column in enumerate(columns, start=1):
        plt.subplot(grid_rows, grid_cols, position)
        sns.distplot(data[column][within_norm], color='#ffff00', label='acceptable norm')
        sns.distplot(data[column][norm_exceeded], color='#4a86e8', label='norm exceeded')
        plt.legend(loc='best', fontsize=10)

    fig.suptitle(Title, fontsize=34, color='#4a86e8', alpha=0.5)
    fig.tight_layout()
    # Leave room at the top so the title does not overlap the first row.
    fig.subplots_adjust(top=0.95)
    plt.show()
In [20]:
# Draw the per-column class distributions for df2, using the 'C6H6(GT)' column of y as the class label.
scientist_plot(df2, y, 'C6H6(GT)','Statistical characteristics of exogenous variables')
Scatter-plot matrix for classification¶
In [21]:
# sns.pairplot creates its own figure, so the title and margins have to be set
# on the figure of the grid it returns. Opening a separate plt.figure() first
# only produced an extra empty figure, which received the title while the pair
# plot itself stayed untitled.
kot = ['#999999','#ff0000']
grid = sns.pairplot(
    data=df2[['PT08.S4(NO2)','PT08.S3(NOx)','PT08.S2(NMHC)','AH','C6H6(GT)']],
    hue='C6H6(GT)',
    dropna=True,
    height=2,
    palette=kot,
)
fig = grid.fig
fig.suptitle('Air pollution with C6H6 substance', fontsize=34, color='#ff0000', alpha=0.3)
# Leave room at the top so the title does not overlap the first row of panels.
fig.subplots_adjust(top=0.95)
plt.show()
In [ ]:
In [ ]:
In [ ]: