In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

Autos

Source of data: https://datahub.io/machine-learning/autos

df2= pd.read_csv('c:/1/autos.csv')
df2.head()

fig = plt.figure(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    

plt.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
plt.title("Bubble Plot of Autos Arean(color: 'price & size: 'city_mpg')", fontsize=16)
plt.xlabel('horsepower', fontsize=18)
plt.ylabel('engine_size', fontsize=18)
plt.colorbar()

plt.show()

C:ProgramDataAnaconda3libsite-packagesmatplotlibcollections.py:995: RuntimeWarning: invalid value encountered in greater_equal
  cond = ((label_values >= func(arr).min()) &
C:ProgramDataAnaconda3libsite-packagesmatplotlibcollections.py:996: RuntimeWarning: invalid value encountered in less_equal
  (label_values <= func(arr).max()))

Unnamed: 0                       0
Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
Year                             0
Population2017                   0
dtype: int64

fig, ax = plt.subplots(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    

ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
ax.set_title("Bubble Plot of Autos Arean(color: 'price & size: 'engine_size')", fontsize=16)
ax.set_xlabel('horsepower', fontsize=18)
ax.set_ylabel('engine_size', fontsize=18)


## Sztuczka żeby mieć colorbar
AA = ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
plt.colorbar(AA)


### DRUGI SPOSÓB
#im = ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
#fig.colorbar(im, ax=ax)

handles, labels = AA.legend_elements(prop="sizes", alpha=0.6)
legend2 = ax.legend(handles, labels, loc="upper left", title="Sizes")

## sztuczka żeby mieć podpisy na kólkach
for i, txt in enumerate(df2['make']):
    ax.annotate(txt, (df2['horsepower'][i],df2['engine_size'] [i]))

plt.show()

C:ProgramDataAnaconda3libsite-packagesmatplotlibcollections.py:995: RuntimeWarning: invalid value encountered in greater_equal
  cond = ((label_values >= func(arr).min()) &
C:ProgramDataAnaconda3libsite-packagesmatplotlibcollections.py:996: RuntimeWarning: invalid value encountered in less_equal
  (label_values <= func(arr).max()))

Unnamed: 0                       0
Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
Year                             0
Population2017                   0
dtype: int64

Midwest

df = pd.read_csv('c:/2/midwest_filter.csv')
df.head()

# Plot
fig = plt.figure(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    
plt.scatter('area', 'poptotal', data=df, s='dot_size', c='popdensity', cmap='Reds', edgecolors='blue', linewidths=0.8)
plt.title("Bubble Plot of PopTotal vs Arean(color: 'popdensity' & size: 'dot_size' - both are numeric columns in midwest)", fontsize=16)
plt.xlabel('Area', fontsize=18)
plt.ylabel('Poptotal', fontsize=18)
plt.colorbar()
plt.show()

C:ProgramDataAnaconda3libsite-packagesmatplotlibcollections.py:995: RuntimeWarning: invalid value encountered in greater_equal
  cond = ((label_values >= func(arr).min()) &
C:ProgramDataAnaconda3libsite-packagesmatplotlibcollections.py:996: RuntimeWarning: invalid value encountered in less_equal
  (label_values <= func(arr).max()))

Unnamed: 0                       0
Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
Year                             0
Population2017                   0
dtype: int64

fig, ax = plt.subplots(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    

ax.scatter('area', 'poptotal', data=df, s='dot_size', c='popdensity', cmap='YlGn', edgecolors='blue', linewidths=0.8)
ax.set_title("Bubble Plot of PopTotal vs Arean color: 'popdensity' & size: 'dot_size'", fontsize=16)
ax.set_xlabel('Area', fontsize=18)
ax.set_ylabel('Poptotal', fontsize=18)
   


## Sztuczka żeby mieć colorbar
BB = ax.scatter('area', 'poptotal', data=df, s='dot_size', c='popdensity', cmap='YlGn', edgecolors='blue', linewidths=0.8)
plt.colorbar(BB)


### DRUGI SPOSÓB
#im = ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
#fig.colorbar(im, ax=ax)

### legenda do wielkości kółek
handles, labels = BB.legend_elements(prop="sizes", alpha=0.6)
legend = ax.legend(handles, labels, loc="lower right", title="Sizes")

## sztuczka żeby mieć podpisy na kólkach
for i, txt in enumerate(df['county']):
    ax.annotate(txt, (df['area'][i],df['poptotal'] [i]))

plt.show()

C:ProgramDataAnaconda3libsite-packagesmatplotlibcollections.py:995: RuntimeWarning: invalid value encountered in greater_equal
  cond = ((label_values >= func(arr).min()) &
C:ProgramDataAnaconda3libsite-packagesmatplotlibcollections.py:996: RuntimeWarning: invalid value encountered in less_equal
  (label_values <= func(arr).max()))

Unnamed: 0                       0
Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
Year                             0
Population2017                   0
dtype: int64

WorldHappinessReport

Source of data: https://worldhappiness.report/download/

The best plots appear when we combine various data!

df3= pd.read_csv('c:/1/WorldHappinessReport.csv')
df3 = df3[df3['Year']==2017]
df3.tail(2)

df4 = pd.read_csv('c:/1/WorldPopulation.csv')
df4.head(2)

Only Africa and only 2017.

D3 = df4.set_index('Country Name')['2017'].to_dict()
#D3

df3['Population2017'] = df3['Country'].map(D3) 
df3['Population2017'] = df3['Population2017']/100000

df3.isnull().sum()
df3 = df3.dropna(how='any')
df3.isnull().sum()

Unnamed: 0                       0
Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
Year                             0
Population2017                   0
dtype: int64

kot = ['Sub-Saharan Africa','Middle East and Northern Africa']
AFR = df3[df3['Region'].isin(kot)]
AFR.head(2)

AFR.to_csv('c:/8/AfricaHappinessReport2017.csv')
df10 = pd.read_csv('c:/8/AfricaHappinessReport2017.csv')
df10.head(2)

fig, ax = plt.subplots(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    

ax.scatter('Happiness Score', 'Freedom', data=df10, s='Population2017', c='Freedom', cmap='RdYlGn', edgecolors='grey', linewidths=0.8)
ax.set_title("AFRICA 2017 Happiness & Freedomn(color: 'Economy (GDP per Capita)' & size: 'Population2017')", fontsize=16)
ax.set_xlabel('Happiness Score', fontsize=18)
ax.set_ylabel('Freedom', fontsize=18)


## Sztuczka żeby mieć colorbar
CC = ax.scatter('Happiness Score', 'Freedom', data=df10, s='Population2017', c='Freedom', cmap='RdYlGn', edgecolors='grey', linewidths=0.8)
plt.colorbar(CC)


### DRUGI SPOSÓB
#im = ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
#fig.colorbar(im, ax=ax)

### Sztuczka, żeby mieć legende do size - nie działa dla danych ciągłych (musi byc tylko kilka klas)
handles, labels = CC.legend_elements(prop="sizes", alpha=0.1)
legend2 = ax.legend(handles, labels, loc="upper left", title="Sizes")

## sztuczka żeby mieć podpisy na kólkach
for i, txt in enumerate(df10['Country']):
    ax.annotate(txt, (df10['Happiness Score'][i],df10['Freedom'] [i]))

plt.show()

Diabetes

df2= pd.read_csv('c:/1/diabetes.csv')
df2.head(2)

Adds BMI indicator amplifier

df2['BMI_class'] = ((pd.qcut(df2['BMI'],5, labels=False).astype(int))+1)*70

fig = plt.figure(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    
plt.scatter('Age', 'Glucose', data=df2, s='BMI_class', c='BloodPressure', cmap='YlOrBr', edgecolors='blue', linewidths=0.8)
plt.title("Bubble Plot of Diabetesn color: BloodPressure & size: BMI", fontsize=16)
plt.xlabel('Age', fontsize=18)
plt.ylabel('Glucose', fontsize=18)
plt.colorbar()
plt.show()

In [1]:

import scipy.cluster.hierarchy as shc
import pandas as pd
import matplotlib.pyplot as plt

# Import Data
df = pd.read_csv('c:/1/USArrests.csv')

USArrests

Source of data: https://www.kaggle.com/deepakg/usarrests

df.rename(columns = {'Unnamed: 0': 'State'}, inplace=True)
df.head(4)

# Plot
plt.figure(figsize=(17, 4), dpi= 280)  
plt.title("USArrests Dendograms", fontsize=22)  
dend = shc.dendrogram(shc.linkage(df[['Murder', 'Assault', 'UrbanPop', 'Rape']], method='ward'), labels=df.State.values, color_threshold=100)  
plt.xticks(fontsize=12)
plt.show()

array([[ 15,  39],
       [ 15,  81],
       [ 16,   6],
       [ 16,  77],
       [ 17,  40],
       [ 17,  76],
       [ 18,   6],
       [ 18,  94],
       [ 19,   3],
       [ 19,  72],
       [ 19,  14],
       [ 19,  99],
       [ 20,  15],
       [ 20,  77],
       [ 20,  13],
       [ 20,  79],
       [ 21,  35],

array([4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3,
       4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 1,
       4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 0, 2, 0, 2,
       1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2], dtype=int64)

Text(0, 0.5, 'Spending')

Earning and expenses

Source of example: https://stackabuse.com/hierarchical-clustering-with-python-and-scikit-learn/

df3 = pd.read_csv('c:/1/hierarchical-clustering-with-python-and-scikit-learn-shopping-data.csv')
df3.head()

We have a table that shows gender, age, annual income and expenditure. We take a vector of two coordinates from the DataFrame table: annual income in k $ – a tendency to spend on a scale of 1 to 100.

data = df3.iloc[:, 3:5].values
data

array([[ 15,  39],
       [ 15,  81],
       [ 16,   6],
       [ 16,  77],
       [ 17,  40],
       [ 17,  76],
       [ 18,   6],
       [ 18,  94],
       [ 19,   3],
       [ 19,  72],
       [ 19,  14],
       [ 19,  99],
       [ 20,  15],
       [ 20,  77],
       [ 20,  13],
       [ 20,  79],
       [ 21,  35],

plt.figure(figsize=(10, 3))
plt.title("Customer Dendograms")
dend = shc.dendrogram(shc.linkage(data, method='ward'))

array([4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3,
       4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 1,
       4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 0, 2, 0, 2,
       1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2], dtype=int64)

Text(0, 0.5, 'Spending')

The dendrogram showed that there are 5 clusters (5 branches) of the bank’s clients. We create a clustering matrix. Since we had five clusters, we have five labels at the output, i.e. 0 to 4.

from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
cluster.fit_predict(data)

array([4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3,
       4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 1,
       4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 0, 2, 0, 2,
       1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2], dtype=int64)

plt.figure(figsize=(10, 7))
plt.scatter(data[:,0], data[:,1], c=cluster.labels_, cmap='rainbow')
plt.title('CUSTOMERS CLUSTERINGS')
plt.xlabel('Annual earnings')
plt.ylabel('Spending')

Text(0, 0.5, 'Spending')

Purple cluster – (in the lower right corner) a cluster of clients with high earnings but low expenses. Customers in the middle (blue data points) are those with average income and average salary. The largest number of customers belongs to this category.

df3 = pd.read_csv('c:/1/diabetes.csv')
df3.head()

Text(0.5, 0, 'BMI')

array([3, 0, 4, 0, 1, 4, 0, 4, 3, 2, 4, 4, 2, 3, 3, 4, 1, 4, 1, 1, 1, 2,
       2, 1, 3, 3, 2, 0, 3, 4, 3, 1, 0, 4, 3, 1, 4, 3, 1, 3, 0, 4, 3, 3,
       4, 1, 4, 0, 0, 0, 0, 0, 0, 3, 1, 0, 1, 1, 2, 1, 0, 4, 4, 0, 2, 0,
       3, 2, 0, 0, 0, 1, 2, 0, 0, 0, 2, 0, 4, 0, 0, 0, 3, 0, 4, 0, 1, 0,
       3, 0, 4, 0, 1, 2, 0, 3, 0, 0, 0, 1, 4, 4, 4, 0, 4, 0, 4, 3, 0, 0,
       0, 3, 0, 4, 1, 2, 4, 4, 0, 0, 1, 1, 0, 2, 4, 1, 0, 1, 3, 2, 0, 4,
       1, 3, 0, 0, 0, 0, 4, 0, 2, 3, 0, 2, 0, 0, 1, 1, 2, 0, 1, 4, 3, 1,
       2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 4, 3, 0, 4, 4, 0, 4, 0, 0, 1, 0, 1,
       2, 1, 2, 4, 4, 0, 0, 4, 4, 3, 3, 1, 1, 0, 4, 1, 4, 4, 3, 1, 4, 0,
       1, 0, 0, 4, 0, 0, 3, 0, 3, 2, 0, 3, 0, 1, 3, 0, 1, 1, 1, 0, 0, 2,
       0, 2, 4, 3, 0, 0, 4, 1, 1, 0, 4, 1, 0, 4, 0, 4, 3, 0, 0, 4, 0, 0,
       4, 0, 0, 3, 2, 1, 1, 0, 2, 4, 0, 0, 0, 1, 1, 0, 0, 3, 0, 4, 0, 3,
       4, 3, 4, 1, 4, 4, 1, 0, 4, 1, 2, 1, 0, 0, 2, 0, 4, 3, 0, 2, 2, 3,
       1, 1, 0, 1, 0, 0, 1, 1, 2, 0, 1, 0, 3, 2, 4, 0, 1, 4, 4, 0, 3, 0,
       0, 0, 1, 1, 0, 0, 3, 0, 0, 4, 1, 2, 0, 0, 0, 1, 0, 0, 0, 4, 1, 1,
       3, 0, 2, 4, 0, 1, 2, 2, 1, 2, 0, 0, 0, 4, 2, 3, 0, 4, 0, 3, 4, 4,
       3, 0, 4, 2, 1, 3, 3, 1, 1, 2, 3, 2, 0, 0, 4, 0, 0, 3, 1, 0, 0, 1,
       1, 3, 0, 1, 4, 1, 0, 0, 0, 0, 0, 0, 3, 1, 3, 0, 3, 4, 0, 0, 4, 0,
       1, 1, 4, 0, 4, 2, 1, 3, 2, 0, 2, 4, 4, 1, 1, 0, 0, 0, 1, 0, 0, 3,
       4, 0, 1, 0, 1, 0, 3, 1, 0, 3, 1, 3, 4, 0, 0, 4, 0, 4, 3, 4, 0, 4,
       3, 0, 0, 4, 0, 1, 0, 1, 1, 0, 0, 4, 0, 2, 0, 3, 2, 0, 3, 3, 3, 4,
       1, 1, 4, 0, 0, 1, 4, 1, 1, 1, 0, 2, 4, 3, 1, 0, 1, 3, 1, 1, 0, 0,
       4, 1, 1, 3, 0, 2, 0, 3, 1, 3, 0, 2, 4, 0, 3, 1, 0, 0, 1, 3, 1, 4,
       3, 0, 0, 2, 3, 0, 2, 4, 0, 0, 3, 2, 2, 2, 0, 0, 0, 2, 4, 0, 0, 0,
       0, 4, 0, 4, 1, 4, 0, 4, 2, 2, 1, 1, 1, 0, 3, 0, 0, 1, 3, 0, 3, 1,
       0, 0, 2, 0, 0, 1, 1, 2, 1, 4, 2, 0, 1, 0, 4, 0, 0, 3, 3, 1, 4, 4,
       0, 0, 0, 1, 0, 4, 4, 3, 1, 0, 3, 2, 3, 0, 2, 4, 3, 4, 1, 1, 2, 0,
       1, 0, 2, 0, 4, 0, 0, 4, 1, 3, 4, 0, 1, 0, 1, 0, 0, 3, 1, 0, 3, 4,
       4, 0, 3, 4, 1, 0, 2, 0, 4, 1, 4, 4, 2, 0, 4, 1, 4, 0, 4, 4, 2, 0,
       0, 0, 0, 4, 2, 4, 0, 0, 0, 1, 1, 0, 0, 0, 1, 4, 0, 0, 0, 1, 2, 0,
       2, 1, 1, 1, 1, 1, 3, 1, 3, 3, 3, 0, 3, 1, 2, 4, 2, 4, 4, 0, 0, 1,
       1, 4, 2, 0, 4, 0, 0, 1, 4, 2, 0, 1, 4, 3, 0, 4, 0, 4, 0, 3, 3, 2,
       0, 0, 0, 0, 2, 0, 0, 3, 1, 0, 4, 1, 1, 3, 1, 3, 0, 1, 1, 3, 2, 1,
       0, 0, 4, 4, 0, 4, 1, 0, 2, 0, 0, 3, 0, 2, 1, 0, 0, 2, 1, 3, 1, 1,
       3, 2, 4, 1, 0, 1, 3, 1, 1, 2, 4, 2, 0, 3, 4, 3, 0, 0, 2, 0],
      dtype=int64)

array([[25.10138249, 21.01382488, 27.84147465],
       [45.82014388, 32.33093525, 33.90359712],
       [28.86486486,  0.33108108, 29.1527027 ],
       [52.08695652,  1.26086957, 31.24782609],
       [27.02906977, 38.09883721, 38.52732558]])

Text(0.5, 0, 'BMI')

df3 = pd.read_csv('c:/1/diabetes.csv')
df3.head()

Text(0.5, 0, 'BMI')

array([3, 0, 4, 0, 1, 4, 0, 4, 3, 2, 4, 4, 2, 3, 3, 4, 1, 4, 1, 1, 1, 2,
       2, 1, 3, 3, 2, 0, 3, 4, 3, 1, 0, 4, 3, 1, 4, 3, 1, 3, 0, 4, 3, 3,
       4, 1, 4, 0, 0, 0, 0, 0, 0, 3, 1, 0, 1, 1, 2, 1, 0, 4, 4, 0, 2, 0,
       3, 2, 0, 0, 0, 1, 2, 0, 0, 0, 2, 0, 4, 0, 0, 0, 3, 0, 4, 0, 1, 0,
       3, 0, 4, 0, 1, 2, 0, 3, 0, 0, 0, 1, 4, 4, 4, 0, 4, 0, 4, 3, 0, 0,
       0, 3, 0, 4, 1, 2, 4, 4, 0, 0, 1, 1, 0, 2, 4, 1, 0, 1, 3, 2, 0, 4,
       1, 3, 0, 0, 0, 0, 4, 0, 2, 3, 0, 2, 0, 0, 1, 1, 2, 0, 1, 4, 3, 1,
       2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 4, 3, 0, 4, 4, 0, 4, 0, 0, 1, 0, 1,
       2, 1, 2, 4, 4, 0, 0, 4, 4, 3, 3, 1, 1, 0, 4, 1, 4, 4, 3, 1, 4, 0,
       1, 0, 0, 4, 0, 0, 3, 0, 3, 2, 0, 3, 0, 1, 3, 0, 1, 1, 1, 0, 0, 2,
       0, 2, 4, 3, 0, 0, 4, 1, 1, 0, 4, 1, 0, 4, 0, 4, 3, 0, 0, 4, 0, 0,
       4, 0, 0, 3, 2, 1, 1, 0, 2, 4, 0, 0, 0, 1, 1, 0, 0, 3, 0, 4, 0, 3,
       4, 3, 4, 1, 4, 4, 1, 0, 4, 1, 2, 1, 0, 0, 2, 0, 4, 3, 0, 2, 2, 3,
       1, 1, 0, 1, 0, 0, 1, 1, 2, 0, 1, 0, 3, 2, 4, 0, 1, 4, 4, 0, 3, 0,
       0, 0, 1, 1, 0, 0, 3, 0, 0, 4, 1, 2, 0, 0, 0, 1, 0, 0, 0, 4, 1, 1,
       3, 0, 2, 4, 0, 1, 2, 2, 1, 2, 0, 0, 0, 4, 2, 3, 0, 4, 0, 3, 4, 4,
       3, 0, 4, 2, 1, 3, 3, 1, 1, 2, 3, 2, 0, 0, 4, 0, 0, 3, 1, 0, 0, 1,
       1, 3, 0, 1, 4, 1, 0, 0, 0, 0, 0, 0, 3, 1, 3, 0, 3, 4, 0, 0, 4, 0,
       1, 1, 4, 0, 4, 2, 1, 3, 2, 0, 2, 4, 4, 1, 1, 0, 0, 0, 1, 0, 0, 3,
       4, 0, 1, 0, 1, 0, 3, 1, 0, 3, 1, 3, 4, 0, 0, 4, 0, 4, 3, 4, 0, 4,
       3, 0, 0, 4, 0, 1, 0, 1, 1, 0, 0, 4, 0, 2, 0, 3, 2, 0, 3, 3, 3, 4,
       1, 1, 4, 0, 0, 1, 4, 1, 1, 1, 0, 2, 4, 3, 1, 0, 1, 3, 1, 1, 0, 0,
       4, 1, 1, 3, 0, 2, 0, 3, 1, 3, 0, 2, 4, 0, 3, 1, 0, 0, 1, 3, 1, 4,
       3, 0, 0, 2, 3, 0, 2, 4, 0, 0, 3, 2, 2, 2, 0, 0, 0, 2, 4, 0, 0, 0,
       0, 4, 0, 4, 1, 4, 0, 4, 2, 2, 1, 1, 1, 0, 3, 0, 0, 1, 3, 0, 3, 1,
       0, 0, 2, 0, 0, 1, 1, 2, 1, 4, 2, 0, 1, 0, 4, 0, 0, 3, 3, 1, 4, 4,
       0, 0, 0, 1, 0, 4, 4, 3, 1, 0, 3, 2, 3, 0, 2, 4, 3, 4, 1, 1, 2, 0,
       1, 0, 2, 0, 4, 0, 0, 4, 1, 3, 4, 0, 1, 0, 1, 0, 0, 3, 1, 0, 3, 4,
       4, 0, 3, 4, 1, 0, 2, 0, 4, 1, 4, 4, 2, 0, 4, 1, 4, 0, 4, 4, 2, 0,
       0, 0, 0, 4, 2, 4, 0, 0, 0, 1, 1, 0, 0, 0, 1, 4, 0, 0, 0, 1, 2, 0,
       2, 1, 1, 1, 1, 1, 3, 1, 3, 3, 3, 0, 3, 1, 2, 4, 2, 4, 4, 0, 0, 1,
       1, 4, 2, 0, 4, 0, 0, 1, 4, 2, 0, 1, 4, 3, 0, 4, 0, 4, 0, 3, 3, 2,
       0, 0, 0, 0, 2, 0, 0, 3, 1, 0, 4, 1, 1, 3, 1, 3, 0, 1, 1, 3, 2, 1,
       0, 0, 4, 4, 0, 4, 1, 0, 2, 0, 0, 3, 0, 2, 1, 0, 0, 2, 1, 3, 1, 1,
       3, 2, 4, 1, 0, 1, 3, 1, 1, 2, 4, 2, 0, 3, 4, 3, 0, 0, 2, 0],
      dtype=int64)

array([[25.10138249, 21.01382488, 27.84147465],
       [45.82014388, 32.33093525, 33.90359712],
       [28.86486486,  0.33108108, 29.1527027 ],
       [52.08695652,  1.26086957, 31.24782609],
       [27.02906977, 38.09883721, 38.52732558]])

Text(0.5, 0, 'BMI')

df3 = pd.read_csv('c:/1/diabetes.csv')
df3.head()

Text(0.5, 0, 'BMI')

array([3, 0, 4, 0, 1, 4, 0, 4, 3, 2, 4, 4, 2, 3, 3, 4, 1, 4, 1, 1, 1, 2,
       2, 1, 3, 3, 2, 0, 3, 4, 3, 1, 0, 4, 3, 1, 4, 3, 1, 3, 0, 4, 3, 3,
       4, 1, 4, 0, 0, 0, 0, 0, 0, 3, 1, 0, 1, 1, 2, 1, 0, 4, 4, 0, 2, 0,
       3, 2, 0, 0, 0, 1, 2, 0, 0, 0, 2, 0, 4, 0, 0, 0, 3, 0, 4, 0, 1, 0,
       3, 0, 4, 0, 1, 2, 0, 3, 0, 0, 0, 1, 4, 4, 4, 0, 4, 0, 4, 3, 0, 0,
       0, 3, 0, 4, 1, 2, 4, 4, 0, 0, 1, 1, 0, 2, 4, 1, 0, 1, 3, 2, 0, 4,
       1, 3, 0, 0, 0, 0, 4, 0, 2, 3, 0, 2, 0, 0, 1, 1, 2, 0, 1, 4, 3, 1,
       2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 4, 3, 0, 4, 4, 0, 4, 0, 0, 1, 0, 1,
       2, 1, 2, 4, 4, 0, 0, 4, 4, 3, 3, 1, 1, 0, 4, 1, 4, 4, 3, 1, 4, 0,
       1, 0, 0, 4, 0, 0, 3, 0, 3, 2, 0, 3, 0, 1, 3, 0, 1, 1, 1, 0, 0, 2,
       0, 2, 4, 3, 0, 0, 4, 1, 1, 0, 4, 1, 0, 4, 0, 4, 3, 0, 0, 4, 0, 0,
       4, 0, 0, 3, 2, 1, 1, 0, 2, 4, 0, 0, 0, 1, 1, 0, 0, 3, 0, 4, 0, 3,
       4, 3, 4, 1, 4, 4, 1, 0, 4, 1, 2, 1, 0, 0, 2, 0, 4, 3, 0, 2, 2, 3,
       1, 1, 0, 1, 0, 0, 1, 1, 2, 0, 1, 0, 3, 2, 4, 0, 1, 4, 4, 0, 3, 0,
       0, 0, 1, 1, 0, 0, 3, 0, 0, 4, 1, 2, 0, 0, 0, 1, 0, 0, 0, 4, 1, 1,
       3, 0, 2, 4, 0, 1, 2, 2, 1, 2, 0, 0, 0, 4, 2, 3, 0, 4, 0, 3, 4, 4,
       3, 0, 4, 2, 1, 3, 3, 1, 1, 2, 3, 2, 0, 0, 4, 0, 0, 3, 1, 0, 0, 1,
       1, 3, 0, 1, 4, 1, 0, 0, 0, 0, 0, 0, 3, 1, 3, 0, 3, 4, 0, 0, 4, 0,
       1, 1, 4, 0, 4, 2, 1, 3, 2, 0, 2, 4, 4, 1, 1, 0, 0, 0, 1, 0, 0, 3,
       4, 0, 1, 0, 1, 0, 3, 1, 0, 3, 1, 3, 4, 0, 0, 4, 0, 4, 3, 4, 0, 4,
       3, 0, 0, 4, 0, 1, 0, 1, 1, 0, 0, 4, 0, 2, 0, 3, 2, 0, 3, 3, 3, 4,
       1, 1, 4, 0, 0, 1, 4, 1, 1, 1, 0, 2, 4, 3, 1, 0, 1, 3, 1, 1, 0, 0,
       4, 1, 1, 3, 0, 2, 0, 3, 1, 3, 0, 2, 4, 0, 3, 1, 0, 0, 1, 3, 1, 4,
       3, 0, 0, 2, 3, 0, 2, 4, 0, 0, 3, 2, 2, 2, 0, 0, 0, 2, 4, 0, 0, 0,
       0, 4, 0, 4, 1, 4, 0, 4, 2, 2, 1, 1, 1, 0, 3, 0, 0, 1, 3, 0, 3, 1,
       0, 0, 2, 0, 0, 1, 1, 2, 1, 4, 2, 0, 1, 0, 4, 0, 0, 3, 3, 1, 4, 4,
       0, 0, 0, 1, 0, 4, 4, 3, 1, 0, 3, 2, 3, 0, 2, 4, 3, 4, 1, 1, 2, 0,
       1, 0, 2, 0, 4, 0, 0, 4, 1, 3, 4, 0, 1, 0, 1, 0, 0, 3, 1, 0, 3, 4,
       4, 0, 3, 4, 1, 0, 2, 0, 4, 1, 4, 4, 2, 0, 4, 1, 4, 0, 4, 4, 2, 0,
       0, 0, 0, 4, 2, 4, 0, 0, 0, 1, 1, 0, 0, 0, 1, 4, 0, 0, 0, 1, 2, 0,
       2, 1, 1, 1, 1, 1, 3, 1, 3, 3, 3, 0, 3, 1, 2, 4, 2, 4, 4, 0, 0, 1,
       1, 4, 2, 0, 4, 0, 0, 1, 4, 2, 0, 1, 4, 3, 0, 4, 0, 4, 0, 3, 3, 2,
       0, 0, 0, 0, 2, 0, 0, 3, 1, 0, 4, 1, 1, 3, 1, 3, 0, 1, 1, 3, 2, 1,
       0, 0, 4, 4, 0, 4, 1, 0, 2, 0, 0, 3, 0, 2, 1, 0, 0, 2, 1, 3, 1, 1,
       3, 2, 4, 1, 0, 1, 3, 1, 1, 2, 4, 2, 0, 3, 4, 3, 0, 0, 2, 0],
      dtype=int64)

array([[25.10138249, 21.01382488, 27.84147465],
       [45.82014388, 32.33093525, 33.90359712],
       [28.86486486,  0.33108108, 29.1527027 ],
       [52.08695652,  1.26086957, 31.24782609],
       [27.02906977, 38.09883721, 38.52732558]])

Text(0.5, 0, 'BMI')

Clinical tests

Source of data: https://www.kaggle.com/saurabh00007/diabetescsv

df3 = pd.read_csv('c:/1/diabetes.csv')
df3.head()

PKP = df3[['Age','SkinThickness','BMI']]

PKP.head()

The dendroid chart will tell you how many clusters you want

plt.figure(figsize=(17, 4), dpi= 280)  
plt.title("Customer Dendograms")
dend = shc.dendrogram(shc.linkage(PKP, method='ward'))

Text(0.5, 0, 'BMI')

array([3, 0, 4, 0, 1, 4, 0, 4, 3, 2, 4, 4, 2, 3, 3, 4, 1, 4, 1, 1, 1, 2,
       2, 1, 3, 3, 2, 0, 3, 4, 3, 1, 0, 4, 3, 1, 4, 3, 1, 3, 0, 4, 3, 3,
       4, 1, 4, 0, 0, 0, 0, 0, 0, 3, 1, 0, 1, 1, 2, 1, 0, 4, 4, 0, 2, 0,
       3, 2, 0, 0, 0, 1, 2, 0, 0, 0, 2, 0, 4, 0, 0, 0, 3, 0, 4, 0, 1, 0,
       3, 0, 4, 0, 1, 2, 0, 3, 0, 0, 0, 1, 4, 4, 4, 0, 4, 0, 4, 3, 0, 0,
       0, 3, 0, 4, 1, 2, 4, 4, 0, 0, 1, 1, 0, 2, 4, 1, 0, 1, 3, 2, 0, 4,
       1, 3, 0, 0, 0, 0, 4, 0, 2, 3, 0, 2, 0, 0, 1, 1, 2, 0, 1, 4, 3, 1,
       2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 4, 3, 0, 4, 4, 0, 4, 0, 0, 1, 0, 1,
       2, 1, 2, 4, 4, 0, 0, 4, 4, 3, 3, 1, 1, 0, 4, 1, 4, 4, 3, 1, 4, 0,
       1, 0, 0, 4, 0, 0, 3, 0, 3, 2, 0, 3, 0, 1, 3, 0, 1, 1, 1, 0, 0, 2,
       0, 2, 4, 3, 0, 0, 4, 1, 1, 0, 4, 1, 0, 4, 0, 4, 3, 0, 0, 4, 0, 0,
       4, 0, 0, 3, 2, 1, 1, 0, 2, 4, 0, 0, 0, 1, 1, 0, 0, 3, 0, 4, 0, 3,
       4, 3, 4, 1, 4, 4, 1, 0, 4, 1, 2, 1, 0, 0, 2, 0, 4, 3, 0, 2, 2, 3,
       1, 1, 0, 1, 0, 0, 1, 1, 2, 0, 1, 0, 3, 2, 4, 0, 1, 4, 4, 0, 3, 0,
       0, 0, 1, 1, 0, 0, 3, 0, 0, 4, 1, 2, 0, 0, 0, 1, 0, 0, 0, 4, 1, 1,
       3, 0, 2, 4, 0, 1, 2, 2, 1, 2, 0, 0, 0, 4, 2, 3, 0, 4, 0, 3, 4, 4,
       3, 0, 4, 2, 1, 3, 3, 1, 1, 2, 3, 2, 0, 0, 4, 0, 0, 3, 1, 0, 0, 1,
       1, 3, 0, 1, 4, 1, 0, 0, 0, 0, 0, 0, 3, 1, 3, 0, 3, 4, 0, 0, 4, 0,
       1, 1, 4, 0, 4, 2, 1, 3, 2, 0, 2, 4, 4, 1, 1, 0, 0, 0, 1, 0, 0, 3,
       4, 0, 1, 0, 1, 0, 3, 1, 0, 3, 1, 3, 4, 0, 0, 4, 0, 4, 3, 4, 0, 4,
       3, 0, 0, 4, 0, 1, 0, 1, 1, 0, 0, 4, 0, 2, 0, 3, 2, 0, 3, 3, 3, 4,
       1, 1, 4, 0, 0, 1, 4, 1, 1, 1, 0, 2, 4, 3, 1, 0, 1, 3, 1, 1, 0, 0,
       4, 1, 1, 3, 0, 2, 0, 3, 1, 3, 0, 2, 4, 0, 3, 1, 0, 0, 1, 3, 1, 4,
       3, 0, 0, 2, 3, 0, 2, 4, 0, 0, 3, 2, 2, 2, 0, 0, 0, 2, 4, 0, 0, 0,
       0, 4, 0, 4, 1, 4, 0, 4, 2, 2, 1, 1, 1, 0, 3, 0, 0, 1, 3, 0, 3, 1,
       0, 0, 2, 0, 0, 1, 1, 2, 1, 4, 2, 0, 1, 0, 4, 0, 0, 3, 3, 1, 4, 4,
       0, 0, 0, 1, 0, 4, 4, 3, 1, 0, 3, 2, 3, 0, 2, 4, 3, 4, 1, 1, 2, 0,
       1, 0, 2, 0, 4, 0, 0, 4, 1, 3, 4, 0, 1, 0, 1, 0, 0, 3, 1, 0, 3, 4,
       4, 0, 3, 4, 1, 0, 2, 0, 4, 1, 4, 4, 2, 0, 4, 1, 4, 0, 4, 4, 2, 0,
       0, 0, 0, 4, 2, 4, 0, 0, 0, 1, 1, 0, 0, 0, 1, 4, 0, 0, 0, 1, 2, 0,
       2, 1, 1, 1, 1, 1, 3, 1, 3, 3, 3, 0, 3, 1, 2, 4, 2, 4, 4, 0, 0, 1,
       1, 4, 2, 0, 4, 0, 0, 1, 4, 2, 0, 1, 4, 3, 0, 4, 0, 4, 0, 3, 3, 2,
       0, 0, 0, 0, 2, 0, 0, 3, 1, 0, 4, 1, 1, 3, 1, 3, 0, 1, 1, 3, 2, 1,
       0, 0, 4, 4, 0, 4, 1, 0, 2, 0, 0, 3, 0, 2, 1, 0, 0, 2, 1, 3, 1, 1,
       3, 2, 4, 1, 0, 1, 3, 1, 1, 2, 4, 2, 0, 3, 4, 3, 0, 0, 2, 0],
      dtype=int64)

array([[25.10138249, 21.01382488, 27.84147465],
       [45.82014388, 32.33093525, 33.90359712],
       [28.86486486,  0.33108108, 29.1527027 ],
       [52.08695652,  1.26086957, 31.24782609],
       [27.02906977, 38.09883721, 38.52732558]])

Text(0.5, 0, 'BMI')

It seems 5 clusters

fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(PKP['Age'], PKP['SkinThickness'], PKP['BMI'], color='black',marker='o')

ax.set_title('Clusters', fontsize= 30, alpha=0.6)
ax.set_xlabel('Age', fontsize= 20, alpha=0.6)
ax.set_ylabel('SkinThickness', fontsize= 20, alpha=0.6)
ax.set_zlabel('BMI', fontsize= 20, alpha=0.6)

Text(0.5, 0, 'BMI')

array([3, 0, 4, 0, 1, 4, 0, 4, 3, 2, 4, 4, 2, 3, 3, 4, 1, 4, 1, 1, 1, 2,
       2, 1, 3, 3, 2, 0, 3, 4, 3, 1, 0, 4, 3, 1, 4, 3, 1, 3, 0, 4, 3, 3,
       4, 1, 4, 0, 0, 0, 0, 0, 0, 3, 1, 0, 1, 1, 2, 1, 0, 4, 4, 0, 2, 0,
       3, 2, 0, 0, 0, 1, 2, 0, 0, 0, 2, 0, 4, 0, 0, 0, 3, 0, 4, 0, 1, 0,
       3, 0, 4, 0, 1, 2, 0, 3, 0, 0, 0, 1, 4, 4, 4, 0, 4, 0, 4, 3, 0, 0,
       0, 3, 0, 4, 1, 2, 4, 4, 0, 0, 1, 1, 0, 2, 4, 1, 0, 1, 3, 2, 0, 4,
       1, 3, 0, 0, 0, 0, 4, 0, 2, 3, 0, 2, 0, 0, 1, 1, 2, 0, 1, 4, 3, 1,
       2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 4, 3, 0, 4, 4, 0, 4, 0, 0, 1, 0, 1,
       2, 1, 2, 4, 4, 0, 0, 4, 4, 3, 3, 1, 1, 0, 4, 1, 4, 4, 3, 1, 4, 0,
       1, 0, 0, 4, 0, 0, 3, 0, 3, 2, 0, 3, 0, 1, 3, 0, 1, 1, 1, 0, 0, 2,
       0, 2, 4, 3, 0, 0, 4, 1, 1, 0, 4, 1, 0, 4, 0, 4, 3, 0, 0, 4, 0, 0,
       4, 0, 0, 3, 2, 1, 1, 0, 2, 4, 0, 0, 0, 1, 1, 0, 0, 3, 0, 4, 0, 3,
       4, 3, 4, 1, 4, 4, 1, 0, 4, 1, 2, 1, 0, 0, 2, 0, 4, 3, 0, 2, 2, 3,
       1, 1, 0, 1, 0, 0, 1, 1, 2, 0, 1, 0, 3, 2, 4, 0, 1, 4, 4, 0, 3, 0,
       0, 0, 1, 1, 0, 0, 3, 0, 0, 4, 1, 2, 0, 0, 0, 1, 0, 0, 0, 4, 1, 1,
       3, 0, 2, 4, 0, 1, 2, 2, 1, 2, 0, 0, 0, 4, 2, 3, 0, 4, 0, 3, 4, 4,
       3, 0, 4, 2, 1, 3, 3, 1, 1, 2, 3, 2, 0, 0, 4, 0, 0, 3, 1, 0, 0, 1,
       1, 3, 0, 1, 4, 1, 0, 0, 0, 0, 0, 0, 3, 1, 3, 0, 3, 4, 0, 0, 4, 0,
       1, 1, 4, 0, 4, 2, 1, 3, 2, 0, 2, 4, 4, 1, 1, 0, 0, 0, 1, 0, 0, 3,
       4, 0, 1, 0, 1, 0, 3, 1, 0, 3, 1, 3, 4, 0, 0, 4, 0, 4, 3, 4, 0, 4,
       3, 0, 0, 4, 0, 1, 0, 1, 1, 0, 0, 4, 0, 2, 0, 3, 2, 0, 3, 3, 3, 4,
       1, 1, 4, 0, 0, 1, 4, 1, 1, 1, 0, 2, 4, 3, 1, 0, 1, 3, 1, 1, 0, 0,
       4, 1, 1, 3, 0, 2, 0, 3, 1, 3, 0, 2, 4, 0, 3, 1, 0, 0, 1, 3, 1, 4,
       3, 0, 0, 2, 3, 0, 2, 4, 0, 0, 3, 2, 2, 2, 0, 0, 0, 2, 4, 0, 0, 0,
       0, 4, 0, 4, 1, 4, 0, 4, 2, 2, 1, 1, 1, 0, 3, 0, 0, 1, 3, 0, 3, 1,
       0, 0, 2, 0, 0, 1, 1, 2, 1, 4, 2, 0, 1, 0, 4, 0, 0, 3, 3, 1, 4, 4,
       0, 0, 0, 1, 0, 4, 4, 3, 1, 0, 3, 2, 3, 0, 2, 4, 3, 4, 1, 1, 2, 0,
       1, 0, 2, 0, 4, 0, 0, 4, 1, 3, 4, 0, 1, 0, 1, 0, 0, 3, 1, 0, 3, 4,
       4, 0, 3, 4, 1, 0, 2, 0, 4, 1, 4, 4, 2, 0, 4, 1, 4, 0, 4, 4, 2, 0,
       0, 0, 0, 4, 2, 4, 0, 0, 0, 1, 1, 0, 0, 0, 1, 4, 0, 0, 0, 1, 2, 0,
       2, 1, 1, 1, 1, 1, 3, 1, 3, 3, 3, 0, 3, 1, 2, 4, 2, 4, 4, 0, 0, 1,
       1, 4, 2, 0, 4, 0, 0, 1, 4, 2, 0, 1, 4, 3, 0, 4, 0, 4, 0, 3, 3, 2,
       0, 0, 0, 0, 2, 0, 0, 3, 1, 0, 4, 1, 1, 3, 1, 3, 0, 1, 1, 3, 2, 1,
       0, 0, 4, 4, 0, 4, 1, 0, 2, 0, 0, 3, 0, 2, 1, 0, 0, 2, 1, 3, 1, 1,
       3, 2, 4, 1, 0, 1, 3, 1, 1, 2, 4, 2, 0, 3, 4, 3, 0, 0, 2, 0],
      dtype=int64)

array([[25.10138249, 21.01382488, 27.84147465],
       [45.82014388, 32.33093525, 33.90359712],
       [28.86486486,  0.33108108, 29.1527027 ],
       [52.08695652,  1.26086957, 31.24782609],
       [27.02906977, 38.09883721, 38.52732558]])

Text(0.5, 0, 'BMI')

from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
KF = cluster.fit_predict(PKP)
KF

array([3, 0, 4, 0, 1, 4, 0, 4, 3, 2, 4, 4, 2, 3, 3, 4, 1, 4, 1, 1, 1, 2,
       2, 1, 3, 3, 2, 0, 3, 4, 3, 1, 0, 4, 3, 1, 4, 3, 1, 3, 0, 4, 3, 3,
       4, 1, 4, 0, 0, 0, 0, 0, 0, 3, 1, 0, 1, 1, 2, 1, 0, 4, 4, 0, 2, 0,
       3, 2, 0, 0, 0, 1, 2, 0, 0, 0, 2, 0, 4, 0, 0, 0, 3, 0, 4, 0, 1, 0,
       3, 0, 4, 0, 1, 2, 0, 3, 0, 0, 0, 1, 4, 4, 4, 0, 4, 0, 4, 3, 0, 0,
       0, 3, 0, 4, 1, 2, 4, 4, 0, 0, 1, 1, 0, 2, 4, 1, 0, 1, 3, 2, 0, 4,
       1, 3, 0, 0, 0, 0, 4, 0, 2, 3, 0, 2, 0, 0, 1, 1, 2, 0, 1, 4, 3, 1,
       2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 4, 3, 0, 4, 4, 0, 4, 0, 0, 1, 0, 1,
       2, 1, 2, 4, 4, 0, 0, 4, 4, 3, 3, 1, 1, 0, 4, 1, 4, 4, 3, 1, 4, 0,
       1, 0, 0, 4, 0, 0, 3, 0, 3, 2, 0, 3, 0, 1, 3, 0, 1, 1, 1, 0, 0, 2,
       0, 2, 4, 3, 0, 0, 4, 1, 1, 0, 4, 1, 0, 4, 0, 4, 3, 0, 0, 4, 0, 0,
       4, 0, 0, 3, 2, 1, 1, 0, 2, 4, 0, 0, 0, 1, 1, 0, 0, 3, 0, 4, 0, 3,
       4, 3, 4, 1, 4, 4, 1, 0, 4, 1, 2, 1, 0, 0, 2, 0, 4, 3, 0, 2, 2, 3,
       1, 1, 0, 1, 0, 0, 1, 1, 2, 0, 1, 0, 3, 2, 4, 0, 1, 4, 4, 0, 3, 0,
       0, 0, 1, 1, 0, 0, 3, 0, 0, 4, 1, 2, 0, 0, 0, 1, 0, 0, 0, 4, 1, 1,
       3, 0, 2, 4, 0, 1, 2, 2, 1, 2, 0, 0, 0, 4, 2, 3, 0, 4, 0, 3, 4, 4,
       3, 0, 4, 2, 1, 3, 3, 1, 1, 2, 3, 2, 0, 0, 4, 0, 0, 3, 1, 0, 0, 1,
       1, 3, 0, 1, 4, 1, 0, 0, 0, 0, 0, 0, 3, 1, 3, 0, 3, 4, 0, 0, 4, 0,
       1, 1, 4, 0, 4, 2, 1, 3, 2, 0, 2, 4, 4, 1, 1, 0, 0, 0, 1, 0, 0, 3,
       4, 0, 1, 0, 1, 0, 3, 1, 0, 3, 1, 3, 4, 0, 0, 4, 0, 4, 3, 4, 0, 4,
       3, 0, 0, 4, 0, 1, 0, 1, 1, 0, 0, 4, 0, 2, 0, 3, 2, 0, 3, 3, 3, 4,
       1, 1, 4, 0, 0, 1, 4, 1, 1, 1, 0, 2, 4, 3, 1, 0, 1, 3, 1, 1, 0, 0,
       4, 1, 1, 3, 0, 2, 0, 3, 1, 3, 0, 2, 4, 0, 3, 1, 0, 0, 1, 3, 1, 4,
       3, 0, 0, 2, 3, 0, 2, 4, 0, 0, 3, 2, 2, 2, 0, 0, 0, 2, 4, 0, 0, 0,
       0, 4, 0, 4, 1, 4, 0, 4, 2, 2, 1, 1, 1, 0, 3, 0, 0, 1, 3, 0, 3, 1,
       0, 0, 2, 0, 0, 1, 1, 2, 1, 4, 2, 0, 1, 0, 4, 0, 0, 3, 3, 1, 4, 4,
       0, 0, 0, 1, 0, 4, 4, 3, 1, 0, 3, 2, 3, 0, 2, 4, 3, 4, 1, 1, 2, 0,
       1, 0, 2, 0, 4, 0, 0, 4, 1, 3, 4, 0, 1, 0, 1, 0, 0, 3, 1, 0, 3, 4,
       4, 0, 3, 4, 1, 0, 2, 0, 4, 1, 4, 4, 2, 0, 4, 1, 4, 0, 4, 4, 2, 0,
       0, 0, 0, 4, 2, 4, 0, 0, 0, 1, 1, 0, 0, 0, 1, 4, 0, 0, 0, 1, 2, 0,
       2, 1, 1, 1, 1, 1, 3, 1, 3, 3, 3, 0, 3, 1, 2, 4, 2, 4, 4, 0, 0, 1,
       1, 4, 2, 0, 4, 0, 0, 1, 4, 2, 0, 1, 4, 3, 0, 4, 0, 4, 0, 3, 3, 2,
       0, 0, 0, 0, 2, 0, 0, 3, 1, 0, 4, 1, 1, 3, 1, 3, 0, 1, 1, 3, 2, 1,
       0, 0, 4, 4, 0, 4, 1, 0, 2, 0, 0, 3, 0, 2, 1, 0, 0, 2, 1, 3, 1, 1,
       3, 2, 4, 1, 0, 1, 3, 1, 1, 2, 4, 2, 0, 3, 4, 3, 0, 0, 2, 0],
      dtype=int64)

# Initializing KMeans
kmeans = KMeans(n_clusters=5)
# Fitting with inputs
kmeans = kmeans.fit(PKP)
# Predicting the clusters
labels = kmeans.predict(PKP)
# Getting the cluster centers
C = kmeans.cluster_centers_

C

array([[25.10138249, 21.01382488, 27.84147465],
       [45.82014388, 32.33093525, 33.90359712],
       [28.86486486,  0.33108108, 29.1527027 ],
       [52.08695652,  1.26086957, 31.24782609],
       [27.02906977, 38.09883721, 38.52732558]])

fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(PKP['Age'], PKP['SkinThickness'], PKP['BMI'], c=KF)
ax.scatter(C[:, 0], C[:, 1], C[:, 2], marker='.', c='red', s=1000)

ax.set_title('Clusters', fontsize= 30, alpha=0.6)
ax.set_xlabel('Age', fontsize= 20, alpha=0.6)
ax.set_ylabel('SkinThickness', fontsize= 20, alpha=0.6)
ax.set_zlabel('BMI', fontsize= 20, alpha=0.6)

Text(0.5, 0, 'BMI')

In [1]:

import matplotlib as mpl
import calmap
import pandas as pd
import matplotlib.pyplot as plt

yahoo

df = pd.read_csv('c:/1/yahoo.txt', parse_dates=['date'])
df.set_index('date', inplace=True)
df.head(3)

plt.figure(figsize=(16,10), dpi= 280)
calmap.calendarplot(df['2014']['VIX.Close'],cmap= 'seismic', fig_kws={'figsize': (16,10)}, yearlabel_kws={'color':'black', 'fontsize':24}, subplot_kws={'title':'Yahoo Stock Prices'})
plt.show()

(,
 array([],
       dtype=object))

df4 = pd.read_excel('c:/3/wtm.xlsx', parse_dates=['Date'])
df4.set_index('Date', inplace=True)
df4.head()

calmap.calendarplot(df4['2018']['Continuous'],cmap= 'YlGn', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Personal calendar'})

calmap.calendarplot(df4['2019']['Continuous'],cmap= 'YlGn', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Personal calendar'})

(,
 array([],
       dtype=object))

paltes: https://matplotlib.org/examples/color/colormaps_reference.html

cmaps = [(’Perceptually Uniform Sequential’, [ 'viridis’, 'plasma’, 'inferno’, 'magma’]), (’Sequential’, [ 'Greys’, 'Purples’, 'Blues’, 'Greens’, 'Oranges’, 'Reds’, 'YlOrBr’, 'YlOrRd’, 'OrRd’, 'PuRd’, 'RdPu’, 'BuPu’, 'GnBu’, 'PuBu’, 'YlGnBu’, 'PuBuGn’, 'BuGn’, 'YlGn’]), (’Sequential (2)’, [ 'binary’, 'gist_yarg’, 'gist_gray’, 'gray’, 'bone’, 'pink’, 'spring’, 'summer’, 'autumn’, 'winter’, 'cool’, 'Wistia’, 'hot’, 'afmhot’, 'gist_heat’, 'copper’]), (’Diverging’, [ 'PiYG’, 'PRGn’, 'BrBG’, 'PuOr’, 'RdGy’, 'RdBu’, 'RdYlBu’, 'RdYlGn’, 'Spectral’, 'coolwarm’, 'bwr’, 'seismic’]), (’Qualitative’, [ 'Pastel1′, 'Pastel2′, 'Paired’, 'Accent’, 'Dark2′, 'Set1′, 'Set2′, 'Set3′, 'tab10′, 'tab20′, 'tab20b’, 'tab20c’]), (’Miscellaneous’, [ 'flag’, 'prism’, 'ocean’, 'gist_earth’, 'terrain’, 'gist_stern’, 'gnuplot’, 'gnuplot2′, 'CMRmap’, 'cubehelix’, 'brg’, 'hsv’, 'gist_rainbow’, 'rainbow’, 'jet’, 'nipy_spectral’, 'gist_ncar’])]

df.head(4)

phone_data

df2 = pd.read_csv('c:/1/phone_data.csv', parse_dates=['date'])
df2.head(3)

df2.set_index('date', inplace=True)
df2.head()

plt.figure(figsize=(16,10), dpi= 280)
calmap.calendarplot(df2['2014']['duration'],cmap= 'BrBG', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'black', 'fontsize':24}
                    , subplot_kws={'title':'phone_data'})
plt.show()

(,
 array([],
       dtype=object))

df4 = pd.read_excel('c:/3/wtm.xlsx', parse_dates=['Date'])
df4.set_index('Date', inplace=True)
df4.head()

calmap.calendarplot(df4['2018']['Continuous'],cmap= 'YlGn', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Personal calendar'})

calmap.calendarplot(df4['2019']['Continuous'],cmap= 'YlGn', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Personal calendar'})

(,
 array([],
       dtype=object))

Energy

df3 = pd.read_csv('c:/2/Energy.csv', index_col=0, parse_dates=['Date'])
df3.set_index('Date', inplace=True)
df3.head()

plt.figure(figsize=(116,100), dpi= 280)

calmap.calendarplot(df3['2007']['Consumption'],cmap= 'YlOrBr', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Daily power consumption'})

calmap.calendarplot(df3['2008']['Consumption'],cmap= 'YlOrBr', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Daily power consumption'})

calmap.calendarplot(df3['2009']['Consumption'],cmap= 'YlOrBr', how='sum'
                    ,fillcolor='white', daylabels='PWŚCPSN'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Daily power consumption'})

(,
 array([],
       dtype=object))

df4 = pd.read_excel('c:/3/wtm.xlsx', parse_dates=['Date'])
df4.set_index('Date', inplace=True)
df4.head()

calmap.calendarplot(df4['2018']['Continuous'],cmap= 'YlGn', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Personal calendar'})

calmap.calendarplot(df4['2019']['Continuous'],cmap= 'YlGn', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Personal calendar'})

(,
 array([],
       dtype=object))

Personal calendar

df4 = pd.read_excel('c:/3/wtm.xlsx', parse_dates=['Date'])
df4.set_index('Date', inplace=True)
df4.head()

df4 = pd.read_excel('c:/3/wtm.xlsx', parse_dates=['Date'])
df4.set_index('Date', inplace=True)
df4.head()

calmap.calendarplot(df4['2018']['Continuous'],cmap= 'YlGn', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Personal calendar'})

calmap.calendarplot(df4['2019']['Continuous'],cmap= 'YlGn', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Personal calendar'})

calmap.calendarplot(df4['2018']['Continuous'],cmap= 'YlGn', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Personal calendar'})

calmap.calendarplot(df4['2019']['Continuous'],cmap= 'YlGn', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Personal calendar'})

(,
 array([],
       dtype=object))

In [1]:

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
import matplotlib.pylab as plt
from pylab import plot, show, subplot, specgram, imshow, savefig
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Imputer
import matplotlib.pyplot as plote

Banking marketing

Analysis of the categorical results. Source of data: https://archive.ics.uci.edu/ml/machine-learning-databases/00222/

df = pd.read_csv('c:/1/bank.csv')
df.head()

CORREL = df.corr().sort_values('y')
CORREL['y'].to_frame().sort_values('y')
CORREL.index

Index(['nr_employed', 'pdays', 'euribor3m', 'emp_var_rate', 'cons_price_idx',
       'campaign', 'Unnamed: 0', 'Unnamed: 0.1', 'age', 'cons_conf_idx',
       'previous', 'duration', 'y'],
      dtype='object')

plt.figure(figsize=(10,6))
CORREL['y'].plot(kind='barh', color='red')
plt.title('Correlation with the result variable', fontsize=20)
plt.xlabel('Correlation level')
plt.ylabel('Continuous independent variables')

Text(0, 0.5, 'Continuous independent variables')

Index(['nr_employed', 'pdays', 'euribor3m', 'emp_var_rate', 'cons_price_idx',
       'campaign', 'Unnamed: 0', 'Unnamed: 0.1', 'age', 'cons_conf_idx',
       'previous', 'duration', 'y'],
      dtype='object')

nr_employed      -0.354678
pdays            -0.324914
euribor3m        -0.307771
emp_var_rate     -0.298334
cons_price_idx   -0.136211
campaign         -0.066357
Unnamed: 0       -0.006165
Unnamed: 0.1     -0.006165
age               0.030399
cons_conf_idx     0.054878
previous          0.230181
duration          0.405274
y                 1.000000
Name: y, dtype: float64

Variables for the chart

lebel=CORREL.index
lebel

Index(['nr_employed', 'pdays', 'euribor3m', 'emp_var_rate', 'cons_price_idx',
       'campaign', 'Unnamed: 0', 'Unnamed: 0.1', 'age', 'cons_conf_idx',
       'previous', 'duration', 'y'],
      dtype='object')

data = CORREL['y']
data

nr_employed      -0.354678
pdays            -0.324914
euribor3m        -0.307771
emp_var_rate     -0.298334
cons_price_idx   -0.136211
campaign         -0.066357
Unnamed: 0       -0.006165
Unnamed: 0.1     -0.006165
age               0.030399
cons_conf_idx     0.054878
previous          0.230181
duration          0.405274
y                 1.000000
Name: y, dtype: float64

title = 'Correlation with the result variable'

# Draw plot, # dpi=80 wykres, który będzie miał wymiary 80 na 80 pikseli
plt.figure(figsize=(8,4), dpi= 80, facecolor='#f4cccc', edgecolor='yellow') 

plt.hlines(y=lebel, xmin=0, xmax=data)
for x, y, tex in zip(data, lebel, data):
    t = plt.text(x, y, round(tex, 2), horizontalalignment='right' if x < 0 else 'left', 
                 verticalalignment='center', fontdict={'color':'#ff0000' if x < 0 else '#38761d', 'size':14})

# Decorations    
plt.yticks(lebel, fontsize=12)
plt.title(title, fontdict={'size':20})
plt.grid(linestyle='--', alpha=0.5)
plt.xlim(-1.0, 1.0)
plt.show()

Trigger

def Hlines(data,lebel,title):
    plt.figure(figsize=(8,4), dpi= 80, facecolor='#f4cccc', edgecolor='yellow') # dpi=80 wykres, który będzie miał wymiary 80 na 80 pikseli
    plt.hlines(y=lebel, xmin=0, xmax=data)
    for x, y, tex in zip(data, lebel, data):
        t = plt.text(x, y, round(tex, 2), horizontalalignment='right' if x < 0 else 'left', 
                     verticalalignment='center', fontdict={'color':'#ff0000' if x < 0 else '#38761d', 'size':14})

    plt.yticks(lebel, fontsize=12, color='#660000', alpha=0.9)
    plt.title(title, fontdict={'size':20}, color='#660000', alpha=0.9)
    plt.grid(linestyle='--', alpha=0.6)
    plt.xlim(-1.0, 1.0) #limit ax
    plt.show()

data = CORREL['y']
lebel=CORREL.index
title = 'Correlation with the result variable'

Hlines(data,lebel,title)

https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/

pip install pywaffle

from pywaffle import Waffle
import squarify 
import pandas as pd
import matplotlib.pyplot as plt

Car market analysis

Source of data: https://github.com/selva86/datasets/blob/master/mpg_ggplot2.csv

GSuite Text and Background Palette: https://yagisanatode.com/2019/08/06/google-apps-script-hexadecimal-color-codes-for-google-docs-sheets-and-slides-standart-palette/

df= pd.read_csv('c:/1/mpg_ggplot2.txt')
df.head()

df = df.groupby('class').size().reset_index(name='counts')
df

# Prepare Data

n_categories = df.shape[0]
colors = [plt.cm.YlGnBu(i/float(n_categories)) for i in range(n_categories)]

# Draw Plot and Decorate
fig = plt.figure(
    FigureClass=Waffle,
    plots={
        '111': {
            'values': df['counts'],
            'labels': ["{0} ({1})".format(n[0], n[1]) for n in df[['class', 'counts']].itertuples()],
            'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.05, 1), 'fontsize': 12},
            'title': {'label': 'Vehicles by Class', 'loc': 'center', 'fontsize':28}
        },
    },
    rows=7,
    colors=colors,
    figsize=(16, 9)
)

7

[(0.988362, 0.998364, 0.644924, 1.0),
 (0.981173, 0.759135, 0.156863, 1.0),
 (0.961293, 0.488716, 0.084289, 1.0),
 (0.832299, 0.283913, 0.257383, 1.0),
 (0.621685, 0.164184, 0.388781, 1.0),
 (0.397674, 0.083257, 0.433183, 1.0),
 (0.15585, 0.044559, 0.325338, 1.0)]

n_categories

7

colors

[(0.988362, 0.998364, 0.644924, 1.0),
 (0.981173, 0.759135, 0.156863, 1.0),
 (0.961293, 0.488716, 0.084289, 1.0),
 (0.832299, 0.283913, 0.257383, 1.0),
 (0.621685, 0.164184, 0.388781, 1.0),
 (0.397674, 0.083257, 0.433183, 1.0),
 (0.15585, 0.044559, 0.325338, 1.0)]

Titanic disaster

We ought to find which passengers have chance to survive according to their affiliation to the established groups.

Source of data: https://www.kaggle.com/shivamp629/traincsv

df2 = pd.read_csv('c:/1/kaggletrain.csv')
df2.head(3)

dfTDK = df2.groupby('Pclass').size().reset_index(name='counts')
dfTDK

paltes: https://matplotlib.org/examples/color/colormaps_reference.html

cmaps = [(’Perceptually Uniform Sequential’, [
’viridis’, 'plasma’, 'inferno’, 'magma’]),
(’Sequential’, [
’Greys’, 'Purples’, 'Blues’, 'Greens’, 'Oranges’, 'Reds’,
’YlOrBr’, 'YlOrRd’, 'OrRd’, 'PuRd’, 'RdPu’, 'BuPu’,
’GnBu’, 'PuBu’, 'YlGnBu’, 'PuBuGn’, 'BuGn’, 'YlGn’]),
(’Sequential (2)’, [
’binary’, 'gist_yarg’, 'gist_gray’, 'gray’, 'bone’, 'pink’,
’spring’, 'summer’, 'autumn’, 'winter’, 'cool’, 'Wistia’,
’hot’, 'afmhot’, 'gist_heat’, 'copper’]),
(’Diverging’, [
’PiYG’, 'PRGn’, 'BrBG’, 'PuOr’, 'RdGy’, 'RdBu’,
’RdYlBu’, 'RdYlGn’, 'Spectral’, 'coolwarm’, 'bwr’, 'seismic’]),
(’Qualitative’, [
’Pastel1′, 'Pastel2′, 'Paired’, 'Accent’,
’Dark2′, 'Set1′, 'Set2′, 'Set3′,
’tab10′, 'tab20′, 'tab20b’, 'tab20c’]),
(’Miscellaneous’, [
’flag’, 'prism’, 'ocean’, 'gist_earth’, 'terrain’, 'gist_stern’,
’gnuplot’, 'gnuplot2′, 'CMRmap’, 'cubehelix’, 'brg’, 'hsv’,
’gist_rainbow’, 'rainbow’, 'jet’, 'nipy_spectral’, 'gist_ncar’])]

# Prepare Data

n_categories = dfTDK.shape[0]
colors2 = [plt.cm.cubehelix(i/float(n_categories)) for i in range(n_categories)]

## paltes: https://matplotlib.org/examples/color/colormaps_reference.html
# Draw Plot and Decorate
fig = plt.figure(dpi=380, FigureClass=Waffle,
    plots={
        '111': {
            'values': dfTDK['counts'],
            'labels': ["{0} ({1})".format(n[0], n[1]) for n in dfTDK[['Pclass', 'counts']].itertuples()],
            'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.0, 1), 'fontsize': 28},
            'title': {'label': 'Structure of passengers population of Titanic', 'loc': 'center', 'fontsize':68,'alpha':0.5}
        },
    },
    rows=10,
    colors=colors2,
    figsize=(28, 7)
)

Embarked: (C = Cherbourg, Q = Queenstown, S = Southampton)

df2.Embarked = df2.Embarked.str.replace('C', 'Cherbourg')
df2.Embarked = df2.Embarked.str.replace('Q', 'Queenstown')
df2.Embarked = df2.Embarked.str.replace('S', 'Southampton')
df2.sample(4)

dfPKP = df2.groupby('Embarked').size().reset_index(name='counts')
dfPKP

# Prepare Data

n_categories = dfPKP.shape[0]
colors2 = [plt.cm.bwr(i/float(n_categories)) for i in range(n_categories)]

## paltes: https://matplotlib.org/examples/color/colormaps_reference.html
# Draw Plot and Decorate
fig = plt.figure(dpi=380, FigureClass=Waffle,
    plots={
        '111': {
            'values': dfPKP['counts'],
            'labels': ["{0} ({1})".format(n[0], n[1]) for n in dfPKP[['Embarked', 'counts']].itertuples()],
            'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.0, 1), 'fontsize': 28},
            'title': {'label': 'Place of embarking of passengers on the Titanic', 'loc': 'center', 'fontsize':58,'alpha':0.5}
        },
    },
    rows=10,
    colors=colors2,
    figsize=(28, 7)
)

df3 = pd.read_csv('c:/1/bank.csv')
df3.head(3)

df_STS = df3.pivot_table(index='job', values = 'Unnamed: 0',aggfunc='count').reset_index()
df_STS.sort_values('Unnamed: 0', ascending=False)

n_categories = df_STS.shape[0]
colors2 = [plt.cm.bwr(i/float(n_categories)) for i in range(n_categories)]

## paltes: https://matplotlib.org/examples/color/colormaps_reference.html
# Draw Plot and Decorate
fig = plt.figure(dpi=180, FigureClass=Waffle,
    plots={
        '111': {
            'values': df_STS['Unnamed: 0'],
            'labels': ["{0} ({1})".format(n[0], n[1]) for n in df_STS[['job', 'Unnamed: 0']].itertuples()],
            'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.0, 1), 'fontsize': 128},
            'title': {'label': 'Structure of bank customers by occupation', 'loc': 'center', 'fontsize':158,'alpha':0.5}
        },
    },
    rows=150,
    colors=colors2,
    figsize=(118, 70)
)

Structure of airports

Source of data: http://ourairports.com/data/airports.csv

df4= pd.read_csv('c:/1/airports.csv')
df4.head(4)

PL = df4[df4['iso_country']=='PL']

PPL = PL.pivot_table(index='type', values='id',aggfunc = 'count').reset_index()
PPL

n_categories = PPL.shape[0]
colors2 = [plt.cm.RdYlBu(i/float(n_categories)) for i in range(n_categories)]

## paltes: https://matplotlib.org/examples/color/colormaps_reference.html
# Draw Plot and Decorate
fig = plt.figure(dpi=380, FigureClass=Waffle,
    plots={
        '111': {
            'values': PPL['id'],
            'labels': ["{0} ({1})".format(n[0], n[1]) for n in PPL[['type', 'id']].itertuples()],
            'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.0, 1), 'fontsize': 28},
            'title': {'label': 'Structure of airports in Poland', 'loc': 'center', 'fontsize':48,'alpha':0.8}
        },
    },
    rows=10,
    colors=colors2,
    figsize=(28, 7)
)

In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import numpy as np

# Import Data
df = pd.read_csv('c:/2/gdppercap.txt')
df

To make vector of data: series 1, series 2 and labels

lebel = df.continent.to_list()
lebel

['Africa', 'Americas', 'Asia', 'Europe', 'Oceania']

series1=np.round(df['1952'].to_list(), decimals=1)
series1

array([ 1252.6,  4079.1,  5195.5,  5661.1, 10298.1])

series2=np.round(df['1957'].to_list(), decimals=1)
series2

array([ 1385.2,  4616. ,  4003.1,  6963. , 11598.5])

Needed parameters

lebel = df.continent.to_list()
series1=np.round(df['1952'].to_list(), decimals=1)
series2=np.round(df['1957'].to_list(), decimals=1)
title = 'Slopechart: Comparing GDP Per Capita between 1952 vs 1957'
ylabel ='Mean GDP Per Capita'
xlabel =["1952", "1957"]

Definition of the trigger

def SlopeChart(series1, series2, title, xlabel, ylabel, lebel):
    
# Labels formatting
    left_label = [str(c) + ', '+ str(round(y)) for c, y in zip(lebel, series1)]
    right_label = [str(c) + ', '+ str(round(y)) for c, y in zip(lebel, series2)]
    klass = ['red' if (y1-y2) < 0 else 'green' for y1, y2 in zip(series1, series2)]

# lines color
    def newline(p1, p2, color='black'):
        ax = plt.gca()
        l = mlines.Line2D([p1[0],p2[0]], [p1[1],p2[1]], color='red' if p1[1]-p2[1] > 0 else 'green', marker='o', markersize=6)
        ax.add_line(l)
        return l

    fig, ax = plt.subplots(1,1,figsize=(14,14), dpi= 380)

# Vertical Lines
    ax.vlines(x=1, ymin=500, ymax=13000, color='black', alpha=0.7, linewidth=1, linestyles='dotted')
    ax.vlines(x=3, ymin=500, ymax=13000, color='black', alpha=0.7, linewidth=1, linestyles='dotted')

# Points
    ax.scatter(y=series1, x=np.repeat(1, df.shape[0]), s=10, color='black', alpha=0.7)
    ax.scatter(y=series2, x=np.repeat(3, df.shape[0]), s=10, color='black', alpha=0.7)

# Line Segmentsand Annotation
    for p1, p2, c in zip(series1, series2, df['continent']):
        newline([1,p1], [3,p2])
        ax.text(1-0.05, p1, c + ', ' + str(round(p1)), horizontalalignment='right', verticalalignment='center', fontdict={'size':14})
        ax.text(3+0.05, p2, c + ', ' + str(round(p2)), horizontalalignment='left', verticalalignment='center', fontdict={'size':14})

# 'Before' and 'After' Annotations
    ax.text(1-0.05, 13000, 'BEFORE', horizontalalignment='right', verticalalignment='center', fontdict={'size':18, 'weight':700})
    ax.text(3+0.05, 13000, 'AFTER', horizontalalignment='left', verticalalignment='center', fontdict={'size':18, 'weight':700})

# Decoration
    ax.set_title(title, fontdict={'size':22})
    ax.set(xlim=(0,4), ylim=(0,14000), ylabel=ylabel)
    ax.set_xticks([1,3])
    ax.set_xticklabels(xlabel)
    plt.yticks(np.arange(500, 13000, 2000), fontsize=12)

# Lighten borders
    plt.gca().spines["top"].set_alpha(.0)
    plt.gca().spines["bottom"].set_alpha(.0)
    plt.gca().spines["right"].set_alpha(.0)
    plt.gca().spines["left"].set_alpha(.0)
    plt.show()

Slope Chart realization

SlopeChart(series1, series2, title, xlabel, ylabel, lebel)

['Australia and New Zealand',
 'Central and Eastern Europe',
 'Eastern Asia',
 'Latin America and Caribbean',
 'Middle East and Northern Africa',
 'North America',
 'Southeastern Asia',
 'Southern Asia',
 'Sub-Saharan Africa',
 'Western Europe']

Index(['Region', 2015.0, 2016.0, 2017.0], dtype='object', name='Year')

array([  9.5,  79. ,  64.5,  46.9,  77.6,  10. ,  81.2, 113.1, 127.9,
        29.5])

array([  9. ,  75.1,  63.6,  50.8,  76.4,  10.5,  73.8, 109.9, 127.9,
        27.1])

df2 = pd.read_csv('c:/1/WorldHappinessReport.csv')
df2.head(5)

kot = df2.pivot_table(index='Region',columns=['Year'], values='Happiness Rank', aggfunc='mean')
kot

PKP = kot.reset_index()
labelPKP = PKP['Region'].to_list()
labelPKP

['Australia and New Zealand',
 'Central and Eastern Europe',
 'Eastern Asia',
 'Latin America and Caribbean',
 'Middle East and Northern Africa',
 'North America',
 'Southeastern Asia',
 'Southern Asia',
 'Sub-Saharan Africa',
 'Western Europe']

PKP.columns

Index(['Region', 2015.0, 2016.0, 2017.0], dtype='object', name='Year')

PKP.columns = ['Region', '2015', '2016', '2017']

series1 = np.round(PKP['2015'].to_list(), decimals=1)
series1

array([  9.5,  79. ,  64.5,  46.9,  77.6,  10. ,  81.2, 113.1, 127.9,
        29.5])

series2 = np.round(PKP['2017'].to_list(), decimals=1)
series2

array([  9. ,  75.1,  63.6,  50.8,  76.4,  10.5,  73.8, 109.9, 127.9,
        27.1])

def SlopeChart(series1, series2, title, xlabel, ylabel, lebelPKP):
    
# Labels formatting
    left_label = [str(c) + ', '+ str(round(y)) for c, y in zip(labelPKP, series1)]
    right_label = [str(c) + ', '+ str(round(y)) for c, y in zip(labelPKP, series2)]
    klass = ['red' if (y1-y2) < 0 else 'green' for y1, y2 in zip(series1, series2)]

# lines color
    def newline(p1, p2, color='black'):
        ax = plt.gca()
        l = mlines.Line2D([p1[0],p2[0]], [p1[1],p2[1]], color='red' if p1[1]-p2[1] > 0 else 'green', marker='o', markersize=6)
        ax.add_line(l)
        return l

    fig, ax = plt.subplots(1,1,figsize=(14,14), dpi= 380)

# Vertical Lines
    ax.vlines(x=1, ymin=0, ymax=120, color='black', alpha=0.7, linewidth=1, linestyles='dotted')
    ax.vlines(x=3, ymin=0, ymax=120, color='black', alpha=0.7, linewidth=1, linestyles='dotted')

# Points
    ax.scatter(y=series1, x=np.repeat(1, PKP.shape[0]), s=10, color='black', alpha=0.7)
    ax.scatter(y=series2, x=np.repeat(3, PKP.shape[0]), s=10, color='black', alpha=0.7)

# Line Segmentsand Annotation
    for p1, p2, c in zip(series1, series2, labelPKP):
        newline([1,p1], [3,p2])
        ax.text(1-0.05, p1, c + ', ' + str(round(p1)), horizontalalignment='right', verticalalignment='center', fontdict={'size':14})
        ax.text(3+0.05, p2, c + ', ' + str(round(p2)), horizontalalignment='left', verticalalignment='center', fontdict={'size':14})

# 'Before' and 'After' Annotations
    ax.text(1-0.05, 140, 'BEFORE', horizontalalignment='right', verticalalignment='center', fontdict={'size':18, 'weight':700})
    ax.text(3+0.05, 140, 'AFTER', horizontalalignment='left', verticalalignment='center', fontdict={'size':18, 'weight':700})

# Decoration
    ax.set_title(title, fontdict={'size':22})
    ax.set(xlim=(0,4), ylim=(0,150), ylabel=ylabel) ## Skala osi Y
    ax.set_xticks([1,3])
    ax.set_xticklabels(xlabel)
    plt.yticks(np.arange(20, 150, 40), fontsize=12)  ## Podziałaka osi Y

# Lighten borders
    plt.gca().spines["top"].set_alpha(.0)
    plt.gca().spines["bottom"].set_alpha(.0)
    plt.gca().spines["right"].set_alpha(.0)
    plt.gca().spines["left"].set_alpha(.0)
    plt.show()

labelPKP
series1
series2
title = 'Changing the the level of happiness: 2015 vs. 2017'
ylabel ='Less points - more happiness'
xlabel =["2015", "2017"]

SlopeChart(series1, series2, title, xlabel, ylabel, labelPKP)

An old Chinese proverb says: one picture says more than one thousands words.

One good plot can rescue entire presentation. One poor picture can drown down all good speech. After plenty of shame appointment and boring presentations I decided to improve my tools of visualisation.

import squarify 
import pandas as pd
import matplotlib.pyplot as plt

df1 = pd.read_csv('c:/11/freeFormResponses.csv', skiprows = 1)

headers = ['Duration (in seconds)', 'Gender', 'Gender2','Age','Country','Education', 'Major_undergraduate','Recent_role', 'Recent_role2', 'Industry','Industry2' ,'Years_of_experience', 'compensation$USD'] 
df = pd.read_csv('c:/11/multipleChoiceResponses.csv', usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12], header=None, names=headers, skiprows=2)
df.head(4)

df.drop(['Gender2','Recent_role2','Industry2'], axis=1, inplace=True)

Correcting data

Every time when we want to do plot we will need to check and improve data. Especially check of unique occurrences and elimination of minority of rubbish and NaN cells (lack of data).

df.isnull().sum()

Duration (in seconds)       0
Gender                      0
Age                         0
Country                     0
Education                 421
Major_undergraduate       912
Recent_role               959
Industry                 2174
Years_of_experience      2758
compensation$USD         3674
dtype: int64

df.dtypes

Duration (in seconds)     int64
Gender                   object
Age                      object
Country                  object
Education                object
Major_undergraduate      object
Recent_role              object
Industry                 object
Years_of_experience      object
compensation$USD         object
dtype: object

Very important is reduction of the class or join some similar groups if it is not bad for the project.

df['Gender']=df['Gender'].replace('Prefer to self-describe', 'Prefer not to say')

df.Education.value_counts(dropna = False)

Master’s degree                                                      10855
Bachelor’s degree                                                     7083
Doctoral degree                                                       3357
Some college/university study without earning a bachelor’s degree      967
Professional degree                                                    599
NaN                                                                    421
I prefer not to answer                                                 345
No formal education past high school                                   232
Name: Education, dtype: int64

We can get assumption if somebody didn’t answer he didn’t want to give information: 'I prefer not to answer’.

import numpy as np

df['Education']=df['Education'].replace(np.NaN, 'I prefer not to answer')

df.Education.value_counts(dropna = False)

Master’s degree                                                      10855
Bachelor’s degree                                                     7083
Doctoral degree                                                       3357
Some college/university study without earning a bachelor’s degree      967
I prefer not to answer                                                 766
Professional degree                                                    599
No formal education past high school                                   232
Name: Education, dtype: int64

df.Education.isnull().sum()

0

df.Major_undergraduate.value_counts(dropna = False)

Computer science (software engineering, etc.)                    9430
Engineering (non-computer focused)                               3705
Mathematics or statistics                                        2950
A business discipline (accounting, economics, finance, etc.)     1791
Physics or astronomy                                             1110
Information technology, networking, or system administration     1029
NaN                                                               912
Medical or life sciences (biology, chemistry, medicine, etc.)     871
Other                                                             770
Social sciences (anthropology, psychology, sociology, etc.)       554
Humanities (history, literature, philosophy, etc.)                269
Environmental science or geology                                  253
I never declared a major                                          128
Fine arts or performing arts                                       87
Name: Major_undergraduate, dtype: int64

Rozumiem, że NaN i 'Other’ jest wtedy, gdy ktoś nie chce zadeklarować swojej specjalizacji:’I never declared a major’

df['Major_undergraduate']=df['Major_undergraduate'].replace(np.NaN, 'I never declared a major')
df['Major_undergraduate']=df['Major_undergraduate'].replace('Other', 'I never declared a major')

df.Major_undergraduate.value_counts(dropna = False, normalize=True).plot(kind='barh')

Student                    5253
Data Scientist             4137
Software Engineer          3130
Data Analyst               1922
Other                      1322
Research Scientist         1189
NaN                         959
Not employed                842
Consultant                  785
Business Analyst            772
Data Engineer               737
Research Assistant          600
Manager                     590
Product/Project Manager     428
Chief Officer               360
Statistician                237
DBA/Database Engineer       145
Developer Advocate          117
Marketing Analyst           115
Salesperson                 102
Principal Investigator       97
Data Journalist              20
Name: Recent_role, dtype: int64

0                                     Computer science 
1                             Mathematics or statistics
2                                A business discipline 
3                                  Physics or astronomy
4                                          Engineering 
5                              I never declared a major
6     Information technology, networking, or system ...
7                                      Social sciences 
8                             Medical or life sciences 
9                                           Humanities 
10                     Environmental science or geology
Name: Major_undergraduate, dtype: object

0                             Computer science n (112)
1                      Mathematics or statisticsn (52)
2                         A business discipline n (34)
3                           Physics or astronomyn (24)
4                                   Engineering n (22)
5                       I never declared a majorn (17)
6     Information technology, networking, or system ...
7                               Social sciences n (12)
8                       Medical or life sciences n (7)
9                                     Humanities n (4)
10               Environmental science or geologyn (3)
dtype: object

df.Recent_role.value_counts(dropna=False)

Student                    5253
Data Scientist             4137
Software Engineer          3130
Data Analyst               1922
Other                      1322
Research Scientist         1189
NaN                         959
Not employed                842
Consultant                  785
Business Analyst            772
Data Engineer               737
Research Assistant          600
Manager                     590
Product/Project Manager     428
Chief Officer               360
Statistician                237
DBA/Database Engineer       145
Developer Advocate          117
Marketing Analyst           115
Salesperson                 102
Principal Investigator       97
Data Journalist              20
Name: Recent_role, dtype: int64

df['Recent_role']=df['Recent_role'].replace(np.NaN, 'Other')

Poland in data

Because I am from Poland, most interesting data for me is information from my country. I separate data about Poland from original data.

PL= df[df.Country=='Poland']

Z5 = PL.pivot_table(index=['Major_undergraduate'], values='Age',aggfunc='count').sort_values('Age', ascending=False)
Z5.head(10)

The Treemap

I came across this publication and decided to do Treemap by this way.
https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/

To prepare perfect pie plot first I will need to pull vectors of data from the pivot table.

PPL=Z5.reset_index()
PPL.head(5)

Cut out too long descriptions

PPL['Major_undergraduate']= PPL['Major_undergraduate'].str.split('(').apply(lambda x: x[0])
PPL['Major_undergraduate']

0                                     Computer science 
1                             Mathematics or statistics
2                                A business discipline 
3                                  Physics or astronomy
4                                          Engineering 
5                              I never declared a major
6     Information technology, networking, or system ...
7                                      Social sciences 
8                             Medical or life sciences 
9                                           Humanities 
10                     Environmental science or geology
Name: Major_undergraduate, dtype: object

Adds numbers of occurrences to the descriptions

label = PPL['Major_undergraduate'].to_list()
label = PPL.apply(lambda x: str(x[0]) + "n (" + str(x[1]) + ")", axis=1)
label

0                             Computer science n (112)
1                      Mathematics or statisticsn (52)
2                         A business discipline n (34)
3                           Physics or astronomyn (24)
4                                   Engineering n (22)
5                       I never declared a majorn (17)
6     Information technology, networking, or system ...
7                               Social sciences n (12)
8                       Medical or life sciences n (7)
9                                     Humanities n (4)
10               Environmental science or geologyn (3)
dtype: object

To pull vectors of data from the pivot table

PPL.reset_index()

label
sizes = PPL['Age'].to_list()

colors = ['#ff0000','#434343','#666666','#999999','#b7b7b7','#cccccc','#d9d9d9','#efefef','#ffffff','#f3f3f3']

import squarify
import matplotlib.pyplot as plt

# Plot
plt.figure(figsize=(12,8), dpi= 380)
squarify.plot(sizes=sizes, label=label, color=colors, alpha=0.9)

plt.title('Data Scientist society in Poland (2018)',  fontdict={'fontsize': 30, 'fontweight': 'medium', 'color':'#d0e0e3','alpha':0.8, 'y':1.02})
plt.axis('off') # brak numerów na osiach
plt.show()

Trigger to create Treemap

Components to create perfect pie plot: labels, sizes, colors, title

To prepare perfect treemap first I will need to pull vectors of data from the pivot table.

To pull vectors of data from the pivot table

PPL.reset_index()

label = label = PPL['Major_undergraduate'].to_list()
label = PPL.apply(lambda x: str(x[0]) + "n (" + str(x[1]) + ")", axis=1)
sizes = PPL['Age'].to_list()
title = 'Data Scientist society in Poland (2018)'

# https://yagisanatode.com/2019/08/06/google-apps-script-hexadecimal-color-codes-for-google-docs-sheets-and-slides-standart-palette/
#colors = ['#274e13','#6aa84f','#93c47d', '#b6d7a8','#d9ead3','#b7b7b7','#38761d'] #green
#colors = ['#0c343d','#134f5c','#45818e','#76a5af','#a2c4c9','#d0e0e3'] #cyan
#colors = ['#7f6000','#bf9000','#f1c232','#ffd966','#ffe599','#fff2cc'] #yelow
#colors = ['#4c1130','#a64d79','#c27ba0','#d5a6bd','#ead1dc','#741b47',] #magenta
#colors = ['#e6b8af','#b6d7a8','#e06666','#747574','#ffd966','#ffcc99','#ea9999']
#colors = ['#93c47d','#b6d7a8','#d9ead3','#d0e0e3','#a2c4c9','#76a5af']
colors = ['#c27ba0','#d5a6bd','#ead1dc','#ffffff','#a64d79','#d9d2e9','#b4a7d6'] #purple
#colors = ['#cfe2f3','#9fc5e8','#6fa8dc'] #blue
#colors = ['#d9ead3','#b6d7a8','#93c47d','#6aa84f']

#colors = ['#ff0000','#434343','#666666','#999999','#b7b7b7','#cccccc','#d9d9d9','#efefef','#ffffff','#f3f3f3'] #=> niemieckie czasopismo

import squarify
import matplotlib.pyplot as plt

def Tmap(sizes, labels, colors, title):
    plt.figure(figsize=(12,8), dpi= 380)
    squarify.plot(sizes=sizes, label=label, color=colors, alpha=0.9)

    plt.title(title,  fontdict={'fontsize': 30, 'fontweight': 'medium', 'color':'#d0e0e3','alpha':0.9, 'y':1.02})
    plt.axis('off') # brak numerów na osiach
    plt.show()

Tmap(sizes, label, colors, title)

Analysis of the categorical results.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Titanic disaster

Analysis of the categorical results.
We ought to find which passengers have chance to survive according to their affiliation to the established groups.

Source of data: https://www.kaggle.com/shivamp629/traincsv

df = pd.read_csv('c:/1/kaggletrain.csv')
df.head()

Woj = ['#b6d7a8','#6aa84f']

# Plot
g = sns.catplot("Survived", col="Pclass", col_wrap=4,
                data=df[df.Pclass.notnull()],
                kind="count", height=3.5, aspect=.8, 
                palette=Woj)

plt.show()

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:5: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """

Banking marketing

Analysis of the categorical results.
Source of data: https://archive.ics.uci.edu/ml/machine-learning-databases/00222/

df2 = pd.read_csv('c:/1/bank.csv')
df2.head(3)

Kot = ['grey', 'red']
plt.figure(dpi= 380)
# Plot
g = sns.catplot("y", col="marital", col_wrap=4,
                data=df2[df2.marital.notnull()],
                kind="count", height=3.5, aspect=.8, 
                palette=Kot,  alpha=0.5, legend=True)

plt.rc("font", size=15)

plt.show()

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:5: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """

Clinical tests

Source of data: https://www.kaggle.com/saurabh00007/diabetescsv

df3 = pd.read_csv('c:/1/diabetes.csv')
df3.head(3)

kot = ['young patient', 'medium patient', 'senior patient']
df3['Age group'] = pd.qcut(df['Age'],3, labels=kot)

Kot = ['#ff9900', '#783f04']
plt.figure(dpi= 380)
# Plot
g = sns.catplot("Outcome", col='Age group', col_wrap=4,
                data=df3[df2.marital.notnull()],
                kind="count", height=5.5, aspect=.7, 
                palette=Kot,  alpha=0.4)

plt.rc("font", size=14)

plt.show()

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:5: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """

An old Chinese proverb says: one picture says more than one thousands words. One good plot can rescue entire presentation. One poor picture can drown down all good speech. After plenty of shame appointment and boring presentations I decided to improve my tools of visualisation.

import pandas as pd

df1 = pd.read_csv('c:/11/freeFormResponses.csv', skiprows = 1)

headers = ['Duration (in seconds)', 'Gender', 'Gender2','Age','Country','Education', 'Major_undergraduate','Recent_role', 'Recent_role2', 'Industry','Industry2' ,'Years_of_experience', 'compensation$USD'] 
df = pd.read_csv('c:/11/multipleChoiceResponses.csv', usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12], header=None, names=headers, skiprows=2)
df.head(4)

df.drop(['Gender2','Recent_role2','Industry2'], axis=1, inplace=True)

Correcting data

Every time when we want to do plot we will need to check and improve data. Especially check of unique occurrences and elimination of minority of rubbish and NaN cells (lack of data).

df.isnull().sum()

Duration (in seconds)       0
Gender                      0
Age                         0
Country                     0
Education                 421
Major_undergraduate       912
Recent_role               959
Industry                 2174
Years_of_experience      2758
compensation$USD         3674
dtype: int64

df.dtypes

Duration (in seconds)     int64
Gender                   object
Age                      object
Country                  object
Education                object
Major_undergraduate      object
Recent_role              object
Industry                 object
Years_of_experience      object
compensation$USD         object
dtype: object

Very important is reduction of the class or join some similar groups if it is not bad for the project.

df['Gender']=df['Gender'].replace('Prefer to self-describe', 'Prefer not to say')

df.Education.value_counts(dropna = False)

Master’s degree                                                      10855
Bachelor’s degree                                                     7083
Doctoral degree                                                       3357
Some college/university study without earning a bachelor’s degree      967
Professional degree                                                    599
NaN                                                                    421
I prefer not to answer                                                 345
No formal education past high school                                   232
Name: Education, dtype: int64

We can get assumption if somebody didn’t answer he didn’t want to give information: 'I prefer not to answer’.

import numpy as np

df['Education']=df['Education'].replace(np.NaN, 'I prefer not to answer')

df.Education.value_counts(dropna = False)

Master’s degree                                                      10855
Bachelor’s degree                                                     7083
Doctoral degree                                                       3357
Some college/university study without earning a bachelor’s degree      967
I prefer not to answer                                                 766
Professional degree                                                    599
No formal education past high school                                   232
Name: Education, dtype: int64

df.Education.isnull().sum()

0

df.Major_undergraduate.value_counts(dropna = False)

Computer science (software engineering, etc.)                    9430
Engineering (non-computer focused)                               3705
Mathematics or statistics                                        2950
A business discipline (accounting, economics, finance, etc.)     1791
Physics or astronomy                                             1110
Information technology, networking, or system administration     1029
NaN                                                               912
Medical or life sciences (biology, chemistry, medicine, etc.)     871
Other                                                             770
Social sciences (anthropology, psychology, sociology, etc.)       554
Humanities (history, literature, philosophy, etc.)                269
Environmental science or geology                                  253
I never declared a major                                          128
Fine arts or performing arts                                       87
Name: Major_undergraduate, dtype: int64

Rozumiem, że NaN i 'Other’ jest wtedy, gdy ktoś nie chce zadeklarować swojej specjalizacji:’I never declared a major’

df['Major_undergraduate']=df['Major_undergraduate'].replace(np.NaN, 'I never declared a major')
df['Major_undergraduate']=df['Major_undergraduate'].replace('Other', 'I never declared a major')

import matplotlib as plt
df.Major_undergraduate.value_counts(dropna = False, normalize=True).plot(kind='barh')

df.Recent_role.value_counts(dropna=False)

Student                    5253
Data Scientist             4137
Software Engineer          3130
Data Analyst               1922
Other                      1322
Research Scientist         1189
NaN                         959
Not employed                842
Consultant                  785
Business Analyst            772
Data Engineer               737
Research Assistant          600
Manager                     590
Product/Project Manager     428
Chief Officer               360
Statistician                237
DBA/Database Engineer       145
Developer Advocate          117
Marketing Analyst           115
Salesperson                 102
Principal Investigator       97
Data Journalist              20
Name: Recent_role, dtype: int64

df['Recent_role']=df['Recent_role'].replace(np.NaN, 'Other')

Z1 = df.pivot_table(index=['Major_undergraduate'], columns = 'Gender', values='Age',aggfunc='count').sort_values('Male',ascending=False)
Z1

Z1.plot(kind='barh', legend=True, title='Data Scientists by Major undergraduate and Gender (Kaggle 2018)', figsize=(7, 4), color=('b','g','y'))

array([],
      dtype=object)

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:10: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  # Remove the CWD from sys.path while we load stuff.

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:13: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  del sys.path[0]

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:20: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations.

Index(['Duration (in seconds)', 'Gender', 'Age', 'Country', 'Education',
       'Major_undergraduate', 'Recent_role', 'Industry', 'Years_of_experience',
       'compensation$USD'],
      dtype='object')

Z2 = df.pivot_table(index=['Country'], columns = 'Gender', values='Age',aggfunc='count', margins=True, margins_name='SUM').sort_values('Male',ascending=False).nlargest(20,'Male')
Z2

Poland in data¶

Because I am from Poland, most interesting data for me is information from my country. I separate data about Poland from original data.

PL= df[df.Country=='Poland']

Z3 = PL.pivot_table(index=['Major_undergraduate'], columns = 'Gender', values='Age',aggfunc='count', margins=True, margins_name='SUM').sort_values('Male',ascending=False)
Z3

Z3 = PL.pivot_table(index=['Recent_role'], columns = 'Gender', values='Age',aggfunc='count', margins=True, margins_name='SUM').sort_values('Male',ascending=False)
Z3

Let’s do standard, quick Pie Plot

We can see banal, predictable visualization.

Z4 = PL.pivot_table(index=['Recent_role'], values='Age',aggfunc='count').sort_values('Age', ascending=False)

Z4.plot(kind='pie', subplots=True, legend=False, title="Data Scientists by Recent_role (Kaggle 2018)",figsize=(15,7), autopct='

array([],
      dtype=object)

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:10: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  # Remove the CWD from sys.path while we load stuff.

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:13: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  del sys.path[0]

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:20: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations.

Index(['Duration (in seconds)', 'Gender', 'Age', 'Country', 'Education',
       'Major_undergraduate', 'Recent_role', 'Industry', 'Years_of_experience',
       'compensation$USD'],
      dtype='object')

Z5 = PL.pivot_table(index=['Major_undergraduate'], values='Age',aggfunc='count').sort_values('Age', ascending=False)
Z5.head(10)

Better Pie Plot with interesting colors

At the beginning we can change colors and give better descriptions.

GSuite Text and Background Palette: https://yagisanatode.com/2019/08/06/google-apps-script-hexadecimal-color-codes-for-google-docs-sheets-and-slides-standart-palette/

import matplotlib.pyplot as plt
## Wielkość wykresu
plt.figure(figsize=(16,8))


## informacja że jest to wykres złożony
ax1 = plt.subplot(aspect='equal')



## ustalenie koloru
colors = ['#a2c4c9','#76a5af','#c9daf8','#a4c2f4', '#cfe2f3']

## równanie podstawowe
Z5.plot(kind='pie',colors =colors , y = 'Age', ax=ax1, autopct='

# opisy, nazwy itp
ax1.set_xlabel('Something to write',  fontsize=15, color='darkred', alpha=1)
ax1.set_ylabel('Something to write', fontsize=11,  color='grey', alpha=0.8)
ax1.set_title('Major_undergraduate in Data Scientists (Kaggle 2018)',  fontsize=18, color='grey', alpha=0.8)
ax1.set_facecolor('#d8dcd6')

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:10: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  # Remove the CWD from sys.path while we load stuff.

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:13: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  del sys.path[0]

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:20: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations.

Index(['Duration (in seconds)', 'Gender', 'Age', 'Country', 'Education',
       'Major_undergraduate', 'Recent_role', 'Industry', 'Years_of_experience',
       'compensation$USD'],
      dtype='object')

The best Pie Plot¶

I came across this publication and decided to do Pie Plot by this way.
https://medium.com/@kvnamipara/a-better-visualisation-of-pie-charts-by-matplotlib-935b7667d77f

To prepare perfect pie plot first I will need to pull vectors of data from the pivot table.

PPL=Z5.reset_index()
PPL.head(5)

To pull vectors of data from the pivot table.

PPL.reset_index()
labels = PPL['Major_undergraduate'].to_list()
sizes = PPL['Age'].to_list()

fig1, ax1 = plt.subplots(figsize=(10,5))


ax1.pie(sizes, labels=labels, autopct='

ax1.axis('equal')  
plt.tight_layout()
plt.show()

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:10: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  # Remove the CWD from sys.path while we load stuff.

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:13: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  del sys.path[0]

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:20: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations.

Index(['Duration (in seconds)', 'Gender', 'Age', 'Country', 'Education',
       'Major_undergraduate', 'Recent_role', 'Industry', 'Years_of_experience',
       'compensation$USD'],
      dtype='object')

Colors changing

# linia wskazuje że będzie to wykres złożony - wymiary: 6:6
fig1, ax1 = plt.subplots(figsize=(10,5))

colors = ['#c27ba0','#d5a6bd','#ead1dc','#ffffff','#a64d79','#d9d2e9','#b4a7d6']

ax1.pie(sizes, colors=colors, labels=labels, autopct='
# Equal aspect ratio ensures that pie is drawn as a circle

ax1.axis('equal')  
plt.tight_layout()
plt.show()

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:10: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  # Remove the CWD from sys.path while we load stuff.

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:13: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  del sys.path[0]

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:20: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations.

Index(['Duration (in seconds)', 'Gender', 'Age', 'Country', 'Education',
       'Major_undergraduate', 'Recent_role', 'Industry', 'Years_of_experience',
       'compensation$USD'],
      dtype='object')

Changing size and color of the all fonts

textprops={’fontsize’: 30, 'color’:”green”}

# linia wskazuje że będzie to wykres złożony - wymiary: 6:6
fig1, ax1 = plt.subplots(figsize=(18,12))

colors = ['#e06666','#ea9999','#f4cccc','#ff0000','#434343']

ax1.pie(sizes, colors=colors, labels=labels, autopct='
# Equal aspect ratio ensures that pie is drawn as a circle

ax1.axis('equal')  
plt.tight_layout()
plt.show()

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:10: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  # Remove the CWD from sys.path while we load stuff.

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:13: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  del sys.path[0]

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:20: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations.

Index(['Duration (in seconds)', 'Gender', 'Age', 'Country', 'Education',
       'Major_undergraduate', 'Recent_role', 'Industry', 'Years_of_experience',
       'compensation$USD'],
      dtype='object')

Changing size and color of the separate fonts¶

for text in texts:
    text.set_color('darkred')
for autotext in autotexts:
    autotext.set_color('grey')

fig1, ax1 = plt.subplots(figsize=(15,12))

colors = ['#93c47d','#b6d7a8','#d9ead3','#d0e0e3','#a2c4c9','#76a5af']

patches, texts, autotexts = ax1.pie(sizes, colors=colors, labels=labels, autopct='

for text in texts:
    text.set_color('darkred')
for autotext in autotexts:
    autotext.set_color('grey')
    
ax1.axis('equal')  
plt.tight_layout()
plt.show()

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:13: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  del sys.path[0]

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:20: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations.

Index(['Duration (in seconds)', 'Gender', 'Age', 'Country', 'Education',
       'Major_undergraduate', 'Recent_role', 'Industry', 'Years_of_experience',
       'compensation$USD'],
      dtype='object')

Changing size and color for the chosen categories

https://medium.com/@kvnamipara/a-better-visualisation-of-pie-charts-by-matplotlib-935b7667d77f

fig1, ax1 = plt.subplots(figsize=(6,6))

colors = ['#ff9999','#747574','#99ff99','#ffcc99','#f1c232']

patches, texts, autotexts = ax1.pie(sizes, colors=colors, labels=labels, autopct='


for text in texts:
    text.set_color('grey')
for autotext in autotexts:
    autotext.set_color('grey')

    
texts[0].set_fontsize(24)
texts[0].set_color('black')
texts[4].set_fontsize(33)
texts[4].set_color('green')
    
ax1.axis('equal')  
plt.tight_layout()
plt.show()

C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:20: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations.

Index(['Duration (in seconds)', 'Gender', 'Age', 'Country', 'Education',
       'Major_undergraduate', 'Recent_role', 'Industry', 'Years_of_experience',
       'compensation$USD'],
      dtype='object')

Making a bagel ¶

fig1, ax1 = plt.subplots(figsize=(18,6))

colors = ['#a2c4c9','#b6d7a8','#747574','#99ff99','#ffcc99','#76a5af']

patches, texts, autotexts = ax1.pie(sizes, colors=colors, labels=labels, autopct='
# Equal aspect ratio ensures that pie is drawn as a circle

for text in texts:
    text.set_color('darkred')
for autotext in autotexts:
    autotext.set_color('grey')
    
ax1.axis('equal')  
plt.tight_layout()

plt.show()

Index(['Duration (in seconds)', 'Gender', 'Age', 'Country', 'Education',
       'Major_undergraduate', 'Recent_role', 'Industry', 'Years_of_experience',
       'compensation$USD'],
      dtype='object')

Making the better bangle

fig1, ax1 = plt.subplots(figsize=(18,8))

colors = ['#e6b8af','#b6d7a8','#e06666','#747574','#ffd966','#ffcc99','#ea9999']

patches, texts, autotexts = ax1.pie(sizes, colors=colors, labels=labels, autopct='


for text in texts:
    text.set_color('grey')
for autotext in autotexts:
    autotext.set_color('black')
    autotext.set_fontsize(22)

texts[0].set_fontsize(18)
texts[0].set_color('black')
    
#draw circle
centre_circle = plt.Circle((0,0),0.40,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)    
    
    
ax1.set_xlabel('Year 2018',  fontsize=15, color='darkred', alpha=1)
ax1.set_ylabel('Data Scientist', fontsize=11,  color='grey', alpha=0.8)
ax1.set_title('Data Scientist by profession',  fontsize=58, color='#d0e0e3', alpha=0.8)
ax1.set_facecolor('#d8dcd6')


ax1.axis('equal')  
plt.tight_layout()

plt.show()

Index(['Duration (in seconds)', 'Gender', 'Age', 'Country', 'Education',
       'Major_undergraduate', 'Recent_role', 'Industry', 'Years_of_experience',
       'compensation$USD'],
      dtype='object')

We enter the gender variable

PL.columns

Index(['Duration (in seconds)', 'Gender', 'Age', 'Country', 'Education',
       'Major_undergraduate', 'Recent_role', 'Industry', 'Years_of_experience',
       'compensation$USD'],
      dtype='object')

Z6 = PL.pivot_table(index=['Major_undergraduate','Gender'], values='Age',aggfunc='count').sort_values('Age', ascending=False)
Z6.head(10)

https://medium.com/@kvnamipara/a-better-visualisation-of-pie-charts-by-matplotlib-935b7667d77f

To prepare perfect pie plot first I will need to pull vectors of data from the pivot table.

PLG=Z6.reset_index()
PLG.head(2)

PLG.reset_index()
labels_gender = PLG['Gender'].to_list()
sizes_gender = PLG['Age'].to_list()

The double bangle

import matplotlib.pyplot as plt


colors_gender = ['#c2c2f0','#ffb3e6']
 

fig1, ax1 = plt.subplots(figsize=(18,6))

colors = ['#ff0000','#747574','#ffd966','#ffcc99','#ea9999']

patches, texts, autotexts = ax1.pie(sizes, colors=colors, labels=labels, autopct='

plt.pie(sizes_gender,colors=colors_gender,radius=0.75,startangle=0)
centre_circle = plt.Circle((0,0),0.5,color='black', fc='white',linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)




for text in texts:
    text.set_color('grey')
for autotext in autotexts:
    autotext.set_color('black')

    
#draw circle
centre_circle = plt.Circle((0,0),0.50,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)    
    
    
ax1.axis('equal')  
plt.tight_layout()

plt.show()

The double bangle is: „one bridge too far”. This plot is beautiful but bangles are not correlated each other. To achieve adequate connection vectors should be come from one pivot tables. At the moment I have no idea how to do it (this groupby, query, pivot ….).

Trigger to create Pie Plot

Components to create perfect pie plot: labels, sizes, colors

To prepare perfect pie plot first I will need to pull vectors of data from the pivot table.

PPL=Z5.reset_index()
PPL.head(5)
PPL.reset_index()

labels = PPL['Major_undergraduate'].to_list()
sizes = PPL['Age'].to_list()

colors = ['#a2c4c9','#76a5af','#c9daf8','#a4c2f4', '#cfe2f3']

def PPieP(sizes,labels,colors):
    fig1, ax1 = plt.subplots(figsize=(18,8))

    patches, texts, autotexts = ax1.pie(sizes, colors=colors, labels=labels, autopct='


    for text in texts:
        text.set_color('grey')
    for autotext in autotexts:
        autotext.set_color('black')
        autotext.set_fontsize(22)

    texts[0].set_fontsize(18)
    texts[0].set_color('black')
    
    #draw circle
    centre_circle = plt.Circle((0,0),0.40,fc='white')
    fig = plt.gcf()
    fig.gca().add_artist(centre_circle)    
    
    
    ax1.set_xlabel('Year 2018',  fontsize=15, color='darkred', alpha=1)
    ax1.set_ylabel('Data Scientist', fontsize=11,  color='grey', alpha=0.8)
    ax1.set_title('Data Scientist by profession',  fontsize=58, color='#d0e0e3', alpha=0.8)
    ax1.set_facecolor('#d8dcd6')


    ax1.axis('equal')  
    plt.tight_layout()

    plt.show()

# Variables to the trigger:

labels = PPL['Major_undergraduate'].to_list()
sizes = PPL['Age'].to_list()
#colors = ['#a2c4c9','#76a5af','#c9daf8','#a4c2f4', '#cfe2f3']
#colors = ['#ff0000','#747574','#ffd966','#ffcc99','#ea9999']
#colors = ['#e6b8af','#b6d7a8','#e06666','#747574','#ffd966','#ffcc99','#ea9999']
#colors = ['#93c47d','#b6d7a8','#d9ead3','#d0e0e3','#a2c4c9','#76a5af']
#colors = ['#c27ba0','#d5a6bd','#ead1dc','#ffffff','#a64d79','#d9d2e9','#b4a7d6']
#colors = ['#cfe2f3','#9fc5e8','#6fa8dc']
colors = ['#d9ead3','#b6d7a8','#93c47d','#6aa84f']



# Trigger:

PPieP(sizes,labels,colors)

Good advice in making presentation is to prepare plots using one standard.

As says man who built my house: messy but equally!

	Unnamed: 0	symboling	normalized_losses	make	fuel_type	aspiration	num_doors	body_style	drive_wheels	engine_location	…	engine_size	fuel_system	bore	stroke	compression_ratio	horsepower	peak_rpm	city_mpg	highway_mpg	price
0	0	3	NaN	alfa-romero	gas	std	two	convertible	rwd	front	…	130	mpfi	3.47	2.68	9.0	111.0	5000.0	21	27	13495.0
1	1	3	NaN	alfa-romero	gas	std	two	convertible	rwd	front	…	130	mpfi	3.47	2.68	9.0	111.0	5000.0	21	27	16500.0
2	2	1	NaN	alfa-romero	gas	std	two	hatchback	rwd	front	…	152	mpfi	2.68	3.47	9.0	154.0	5000.0	19	26	16500.0
3	3	2	164.0	audi	gas	std	four	sedan	fwd	front	…	109	mpfi	3.19	3.40	10.0	102.0	5500.0	24	30	13950.0
4	4	2	164.0	audi	gas	std	four	sedan	4wd	front	…	136	mpfi	3.19	3.40	8.0	115.0	5500.0	18	22	17450.0

	PID	county	state	area	poptotal	popdensity	popwhite	popblack	popamerindian	popasian	…	percprof	poppovertyknown	percpovertyknown	percbelowpoverty	percchildbelowpovert	percadultpoverty	percelderlypoverty	inmetro	category	dot_size
0	561	ADAMS	IL	0.052	66090	1270.961540	63917	1702	98	249	…	4.355859	63628.0	96.274777	13.151443	18.011717	11.009776	12.443812	0.0	AAR	250.944411
1	562	ALEXANDER	IL	0.014	10626	759.000000	7054	3496	19	48	…	2.870315	10529.0	99.087145	32.244278	45.826514	27.385647	25.228976	0.0	LHR	185.781260
2	563	BOND	IL	0.022	14991	681.409091	14477	429	35	16	…	4.488572	14235.0	94.956974	12.068844	14.036061	10.852090	12.697410	0.0	AAR	175.905385
3	564	BOONE	IL	0.017	30806	1812.117650	29344	127	46	150	…	4.197800	30337.0	98.477569	7.209019	11.179536	5.536013	6.217047	1.0	ALU	319.823487
4	565	BROWN	IL	0.018	5836	324.222222	5264	547	14	5	…	3.367680	4815.0	82.505140	13.520249	13.022889	11.143211	19.200000	0.0	AAR	130.442161

	Unnamed: 0	Country	Region	Happiness Rank	Happiness Score	Economy (GDP per Capita)	Family	Health (Life Expectancy)	Freedom	Trust (Government Corruption)	Generosity	Dystopia Residual	Year
493	493	Zambia	Sub-Saharan Africa	116.0	4.514	0.636407	1.003187	0.257836	0.461603	0.078214	0.249580	1.826705	2017.0
494	494	Zimbabwe	Sub-Saharan Africa	138.0	3.875	0.375847	1.083096	0.196764	0.336384	0.095375	0.189143	1.597970	2017.0

	Unnamed: 0	Country Name	Country Code	1961	1962	1963	1964	1965	1966	1967	…	2009	2010	2011	2012	2013	2014	2015	2016	2017	2018
0	0	Aruba	ABW	54211.0	55438.0	56225.0	56695.0	57032.0	57360.0	57715.0	…	101353.0	101453.0	101669.0	102053.0	102577.0	103187.0	103795.0	104341.0	104822.0	105264.0
1	1	Afghanistan	AFG	8996351.0	9166764.0	9345868.0	9533954.0	9731361.0	9938414.0	10152331.0	…	27294031.0	28004331.0	28803167.0	29708599.0	30696958.0	31731688.0	32758020.0	33736494.0	34656032.0	35530081.0

	Unnamed: 0	Country	Region	Happiness Rank	Happiness Score	Economy (GDP per Capita)	Family	Health (Life Expectancy)	Freedom	Trust (Government Corruption)	Generosity	Dystopia Residual	Year	Population2017
332	332	Algeria	Middle East and Northern Africa	53.0	5.872	1.091864	1.146217	0.617585	0.233336	0.146096	0.069437	2.567604	2017.0	406.06052
333	333	Angola	Sub-Saharan Africa	140.0	3.795	0.858428	1.104412	0.049869	0.000000	0.069720	0.097926	1.614482	2017.0	288.13463

	State	Murder	Assault	UrbanPop	Rape
0	Alabama	13.2	236	58	21.2
1	Alaska	10.0	263	48	44.5
2	Arizona	8.1	294	80	31.0
3	Arkansas	8.8	190	50	19.5

	CustomerID	Genre	Age	Annual Income (k$)	Spending Score (1-100)
0	1	Male	19	15	39
1	2	Male	21	15	81
2	3	Female	20	16	6
3	4	Female	23	16	77
4	5	Female	31	17	40

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

	VIX.Open	VIX.High	VIX.Low	VIX.Close	VIX.Volume	VIX.Adjusted	year	month	monthf	weekday	weekdayf	week
date
2007-01-03	12.16	12.75	11.53	12.04	0	12.04	2007	1	Jan	3	Wed	1
2007-01-04	12.40	12.42	11.28	11.51	0	11.51	2007	1	Jan	4	Thu	1
2007-01-05	11.84	12.25	11.68	12.14	0	12.14	2007	1	Jan	5	Fri	1

	index	date	duration	item	month	network	network_type
0	0	2014-10-15 06:58:00	34.429	data	2014-11	data	data
1	1	2014-10-15 06:58:00	13.000	call	2014-11	Vodafone	mobile
2	2	2014-10-15 14:46:00	23.000	call	2014-11	Meteor	mobile

	Consumption	Wind	Solar	Wind+Solar
Date
2006-01-01	1069.184	NaN	NaN	NaN
2006-01-02	1380.521	NaN	NaN	NaN
2006-01-03	1442.533	NaN	NaN	NaN
2006-01-04	1457.217	NaN	NaN	NaN
2006-01-05	1477.131	NaN	NaN	NaN

	Continuous
Date
2018-08-31	65
2018-09-02	60
2018-09-03	75
2018-09-04	120
2018-09-05	120

	Unnamed: 0	Unnamed: 0.1	age	job	marital	education	default	housing	loan	contact	…	campaign	pdays	previous	poutcome	emp_var_rate	cons_price_idx	cons_conf_idx	euribor3m	nr_employed	y
0	0	0	44	blue-collar	married	basic.4y	unknown	yes	no	cellular	…	1	999	0	nonexistent	1.4	93.444	-36.1	4.963	5228.1	0
1	1	1	53	technician	married	unknown	no	no	no	cellular	…	1	999	0	nonexistent	-0.1	93.200	-42.0	4.021	5195.8	0
2	2	2	28	management	single	university.degree	no	yes	no	cellular	…	3	6	2	success	-1.7	94.055	-39.8	0.729	4991.6	1
3	3	3	39	services	married	high.school	no	no	no	cellular	…	2	999	0	nonexistent	-1.8	93.075	-47.1	1.405	5099.1	0
4	4	4	55	retired	married	basic.4y	no	yes	no	cellular	…	1	3	1	success	-2.9	92.201	-31.4	0.869	5076.2	1

	manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
0	audi	a4	1.8	1999	4	auto(l5)	f	18	29	p	compact
1	audi	a4	1.8	1999	4	manual(m5)	f	21	29	p	compact
2	audi	a4	2.0	2008	4	manual(m6)	f	20	31	p	compact
3	audi	a4	2.0	2008	4	auto(av)	f	21	30	p	compact
4	audi	a4	2.8	1999	6	auto(l5)	f	16	26	p	compact

	class	counts
0	2seater	5
1	compact	47
2	midsize	41
3	minivan	11
4	pickup	33
5	subcompact	35
6	suv	62

	Unnamed: 0	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th…	female	38.0	1	PC 17599	71.2833	C85	C
2	2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S

	Pclass	counts
0	1	216
1	2	184
2	3	491

	Unnamed: 0	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
368	368	369	1	3	Jermyn, Miss. Annie	female	NaN	0	0	14313	7.7500	NaN	Queenstown
871	871	872	1	1	Beckwith, Mrs. Richard Leonard (Sallie Monypeny)	female	47.0	1	1	11751	52.5542	D35	Southampton
125	125	126	1	3	Nicola-Yarred, Master. Elias	male	12.0	1	0	2651	11.2417	NaN	Cherbourg
462	462	463	0	1	Gee, Mr. Arthur H	male	47.0	0	0	111320	38.5000	E63	Southampton

	Embarked	counts
0	Cherbourg	168
1	Queenstown	77
2	Southampton	644

	job	Unnamed: 0
0	admin.	10422
1	blue-collar	9254
9	technician	6743
7	services	3969
4	management	2924
5	retired	1720
2	entrepreneur	1456
6	self-employed	1421
3	housemaid	1060
10	unemployed	1014
8	student	875
11	unknown	330

	id	ident	type	name	latitude_deg	longitude_deg	elevation_ft	continent	iso_country	iso_region	municipality	scheduled_service	gps_code	iata_code	local_code	home_link	wikipedia_link	keywords
0	6523	00A	heliport	Total Rf Heliport	40.070801	-74.933601	11.0	NaN	US	US-PA	Bensalem	no	00A	NaN	00A	NaN	NaN	NaN
1	323361	00AA	small_airport	Aero B Ranch Airport	38.704022	-101.473911	3435.0	NaN	US	US-KS	Leoti	no	00AA	NaN	00AA	NaN	NaN	NaN
2	6524	00AK	small_airport	Lowell Field	59.949200	-151.695999	450.0	NaN	US	US-AK	Anchor Point	no	00AK	NaN	00AK	NaN	NaN	NaN
3	6525	00AL	small_airport	Epps Airpark	34.864799	-86.770302	820.0	NaN	US	US-AL	Harvest	no	00AL	NaN	00AL	NaN	NaN	NaN

	type	id
0	closed	25
1	heliport	3
2	large_airport	7
3	medium_airport	22
4	small_airport	218

	continent	1952	1957
0	Africa	1252.572466	1385.236062
1	Americas	4079.062552	4616.043733
2	Asia	5195.484004	4003.132940
3	Europe	5661.057435	6963.012816
4	Oceania	10298.085650	11598.522455

	Unnamed: 0	Country	Region	Happiness Rank	Happiness Score	Economy (GDP per Capita)	Family	Health (Life Expectancy)	Freedom	Trust (Government Corruption)	Generosity	Dystopia Residual	Year
0	0	Afghanistan	Southern Asia	153.0	3.575	0.31982	0.30285	0.30335	0.23414	0.09719	0.36510	1.95210	2015.0
1	1	Albania	Central and Eastern Europe	95.0	4.959	0.87867	0.80434	0.81325	0.35733	0.06413	0.14272	1.89894	2015.0
2	2	Algeria	Middle East and Northern Africa	68.0	5.605	0.93929	1.07772	0.61766	0.28579	0.17383	0.07822	2.43209	2015.0
3	3	Angola	Sub-Saharan Africa	137.0	4.033	0.75778	0.86040	0.16683	0.10384	0.07122	0.12344	1.94939	2015.0
4	4	Argentina	Latin America and Caribbean	30.0	6.574	1.05351	1.24823	0.78723	0.44974	0.08484	0.11451	2.83600	2015.0

Year	2015.0	2016.0	2017.0
Region
Australia and New Zealand	9.500000	8.500000	9.000000
Central and Eastern Europe	79.000000	78.448276	75.068966
Eastern Asia	64.500000	67.166667	63.600000
Latin America and Caribbean	46.909091	48.333333	50.772727
Middle East and Northern Africa	77.600000	78.105263	76.421053
North America	10.000000	9.500000	10.500000
Southeastern Asia	81.222222	80.000000	73.750000
Southern Asia	113.142857	111.714286	109.857143
Sub-Saharan Africa	127.900000	129.657895	127.871795
Western Europe	29.523810	29.190476	27.142857

	Duration (in seconds)	Gender	Gender2	Age	Country	Education	Major_undergraduate	Recent_role	Recent_role2	Industry	Industry2	Years_of_experience	compensation$USD
0	710	Female	-1	45-49	United States of America	Doctoral degree	Other	Consultant	-1	Other	0	NaN	NaN
1	434	Male	-1	30-34	Indonesia	Bachelor’s degree	Engineering (non-computer focused)	Other	0	Manufacturing/Fabrication	-1	5-10	10-20,000
2	718	Female	-1	30-34	United States of America	Master’s degree	Computer science (software engineering, etc.)	Data Scientist	-1	I am a student	-1	0-1	0-10,000
3	621	Male	-1	35-39	United States of America	Master’s degree	Social sciences (anthropology, psychology, soc…	Not employed	-1	NaN	-1	NaN	NaN

	Age
Major_undergraduate
Computer science (software engineering, etc.)	112
Mathematics or statistics	52
A business discipline (accounting, economics, finance, etc.)	34
Physics or astronomy	24
Engineering (non-computer focused)	22
I never declared a major	17
Information technology, networking, or system administration	14
Social sciences (anthropology, psychology, sociology, etc.)	12
Medical or life sciences (biology, chemistry, medicine, etc.)	7
Humanities (history, literature, philosophy, etc.)	4

Gender	Female	Male	Prefer not to say
Major_undergraduate
Computer science (software engineering, etc.)	1463	7837	130
Engineering (non-computer focused)	432	3223	50
Mathematics or statistics	660	2241	49
I never declared a major	297	1438	75
A business discipline (accounting, economics, finance, etc.)	334	1435	22
Physics or astronomy	119	968	23
Information technology, networking, or system administration	186	832	11
Medical or life sciences (biology, chemistry, medicine, etc.)	203	646	22
Social sciences (anthropology, psychology, sociology, etc.)	160	379	15
Environmental science or geology	57	190	6
Humanities (history, literature, philosophy, etc.)	74	185	10
Fine arts or performing arts	25	56	6

Gender	Female	Male	Prefer not to say	SUM
Country
SUM	4010.0	19430.0	419.0	23859
India	657.0	3719.0	41.0	4417
United States of America	1082.0	3530.0	104.0	4716
China	267.0	1337.0	40.0	1644
Other	165.0	849.0	22.0	1036
Russia	113.0	750.0	16.0	879
Brazil	65.0	666.0	5.0	736
Germany	103.0	621.0	10.0	734
Japan	34.0	557.0	6.0	597
United Kingdom of Great Britain and Northern Ireland	131.0	554.0	17.0	702
France	104.0	494.0	6.0	604
Canada	123.0	475.0	6.0	604
Spain	75.0	406.0	4.0	485
Italy	47.0	303.0	5.0	355
Australia	51.0	272.0	7.0	330
Turkey	56.0	267.0	4.0	327
I do not wish to disclose my location	83.0	250.0	61.0	394
Poland	54.0	243.0	4.0	301
Netherlands	41.0	225.0	4.0	270
Ukraine	31.0	218.0	3.0	252

matplotlib - THE DATA SCIENCE LIBRARY

Perfect Plots: Bubble Plot

Autos

Midwest

WorldHappinessReport

Diabetes

Dendrogram and clustering 3d

USArrests

Earning and expenses

Clinical tests

The dendroid chart will tell you how many clusters you want

It seems 5 clusters

Perfect Plots: Calendarplot

yahoo

phone_data

Energy

Personal calendar

Perfect Plots: H_line plot

Banking marketing

Variables for the chart

Trigger

Perfect Plots: Waffle plot

pip install pywaffle

Car market analysis

Titanic disaster

Structure of airports

Perfect Plots: Slope Chart

To make vector of data: series 1, series 2 and labels

Needed parameters

Definition of the trigger

Slope Chart realization

Perfect Plot: Treemap

Correcting data

Poland in data

The Treemap

Cut out too long descriptions

Adds numbers of occurrences to the descriptions

To pull vectors of data from the pivot table

Trigger to create Treemap

Components to create perfect pie plot: labels, sizes, colors, title

To pull vectors of data from the pivot table

Perfect Plots: Categorical Plot

Titanic disaster

Banking marketing

Clinical tests

Perfect Plots: Pie Plot

Correcting data

Poland in data¶

Let’s do standard, quick Pie Plot

Better Pie Plot with interesting colors

The best Pie Plot¶

To pull vectors of data from the pivot table.

Colors changing

Changing size and color of the all fonts

Changing size and color of the separate fonts¶

Changing size and color for the chosen categories

Making a bagel ¶

Making the better bangle

We enter the gender variable

The double bangle

Trigger to create Pie Plot

Components to create perfect pie plot: labels, sizes, colors

As says man who built my house: messy but equally!

How to make my own template for plots

Today we learn how to make my own template for plots

Data preparation

We do template for plots

Pivot table is the best

Use template