Perfect Plots: Bubble Plot

 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
In [2]:
df2= pd.read_csv('c:/1/autos.csv')
df2.head()
Out[2]:
  Unnamed: 0 symboling normalized_losses make fuel_type aspiration num_doors body_style drive_wheels engine_location engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
0 0 3 NaN alfa-romero gas std two convertible rwd front 130 mpfi 3.47 2.68 9.0 111.0 5000.0 21 27 13495.0
1 1 3 NaN alfa-romero gas std two convertible rwd front 130 mpfi 3.47 2.68 9.0 111.0 5000.0 21 27 16500.0
2 2 1 NaN alfa-romero gas std two hatchback rwd front 152 mpfi 2.68 3.47 9.0 154.0 5000.0 19 26 16500.0
3 3 2 164.0 audi gas std four sedan fwd front 109 mpfi 3.19 3.40 10.0 102.0 5500.0 24 30 13950.0
4 4 2 164.0 audi gas std four sedan 4wd front 136 mpfi 3.19 3.40 8.0 115.0 5500.0 18 22 17450.0

5 rows × 27 columns

In [3]:
fig = plt.figure(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    

plt.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
plt.title("Bubble Plot of Autos Arean(color: 'price & size: 'city_mpg')", fontsize=16)
plt.xlabel('horsepower', fontsize=18)
plt.ylabel('engine_size', fontsize=18)
plt.colorbar()

plt.show()    
In [4]:
fig, ax = plt.subplots(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    

ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
ax.set_title("Bubble Plot of Autos Arean(color: 'price & size: 'engine_size')", fontsize=16)
ax.set_xlabel('horsepower', fontsize=18)
ax.set_ylabel('engine_size', fontsize=18)


## Sztuczka żeby mieć colorbar
AA = ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
plt.colorbar(AA)


### DRUGI SPOSÓB
#im = ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
#fig.colorbar(im, ax=ax)

handles, labels = AA.legend_elements(prop="sizes", alpha=0.6)
legend2 = ax.legend(handles, labels, loc="upper left", title="Sizes")

## sztuczka żeby mieć podpisy na kólkach
for i, txt in enumerate(df2['make']):
    ax.annotate(txt, (df2['horsepower'][i],df2['engine_size'] [i]))

plt.show()  
 

Midwest

In [5]:
df = pd.read_csv('c:/2/midwest_filter.csv')
df.head()
Out[5]:
  PID county state area poptotal popdensity popwhite popblack popamerindian popasian percprof poppovertyknown percpovertyknown percbelowpoverty percchildbelowpovert percadultpoverty percelderlypoverty inmetro category dot_size
0 561 ADAMS IL 0.052 66090 1270.961540 63917 1702 98 249 4.355859 63628.0 96.274777 13.151443 18.011717 11.009776 12.443812 0.0 AAR 250.944411
1 562 ALEXANDER IL 0.014 10626 759.000000 7054 3496 19 48 2.870315 10529.0 99.087145 32.244278 45.826514 27.385647 25.228976 0.0 LHR 185.781260
2 563 BOND IL 0.022 14991 681.409091 14477 429 35 16 4.488572 14235.0 94.956974 12.068844 14.036061 10.852090 12.697410 0.0 AAR 175.905385
3 564 BOONE IL 0.017 30806 1812.117650 29344 127 46 150 4.197800 30337.0 98.477569 7.209019 11.179536 5.536013 6.217047 1.0 ALU 319.823487
4 565 BROWN IL 0.018 5836 324.222222 5264 547 14 5 3.367680 4815.0 82.505140 13.520249 13.022889 11.143211 19.200000 0.0 AAR 130.442161

5 rows × 29 columns

In [6]:
# Plot
fig = plt.figure(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    
plt.scatter('area', 'poptotal', data=df, s='dot_size', c='popdensity', cmap='Reds', edgecolors='blue', linewidths=0.8)
plt.title("Bubble Plot of PopTotal vs Arean(color: 'popdensity' & size: 'dot_size' - both are numeric columns in midwest)", fontsize=16)
plt.xlabel('Area', fontsize=18)
plt.ylabel('Poptotal', fontsize=18)
plt.colorbar()
plt.show()   
In [7]:
fig, ax = plt.subplots(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    

ax.scatter('area', 'poptotal', data=df, s='dot_size', c='popdensity', cmap='YlGn', edgecolors='blue', linewidths=0.8)
ax.set_title("Bubble Plot of PopTotal vs Arean color: 'popdensity' & size: 'dot_size'", fontsize=16)
ax.set_xlabel('Area', fontsize=18)
ax.set_ylabel('Poptotal', fontsize=18)
   


## Sztuczka żeby mieć colorbar
BB = ax.scatter('area', 'poptotal', data=df, s='dot_size', c='popdensity', cmap='YlGn', edgecolors='blue', linewidths=0.8)
plt.colorbar(BB)


### DRUGI SPOSÓB
#im = ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
#fig.colorbar(im, ax=ax)

### legenda do wielkości kółek
handles, labels = BB.legend_elements(prop="sizes", alpha=0.6)
legend = ax.legend(handles, labels, loc="lower right", title="Sizes")

## sztuczka żeby mieć podpisy na kólkach
for i, txt in enumerate(df['county']):
    ax.annotate(txt, (df['area'][i],df['poptotal'] [i]))

plt.show()  
C:ProgramDataAnaconda3libsite-packagesmatplotlibcollections.py:995: RuntimeWarning: invalid value encountered in greater_equal
  cond = ((label_values >= func(arr).min()) &
C:ProgramDataAnaconda3libsite-packagesmatplotlibcollections.py:996: RuntimeWarning: invalid value encountered in less_equal
  (label_values <= func(arr).max()))
 

WorldHappinessReport

Source of data: https://worldhappiness.report/download/

 

The best plots appear when we combine various data!

In [8]:
df3= pd.read_csv('c:/1/WorldHappinessReport.csv')
df3 = df3[df3['Year']==2017]
df3.tail(2)
Out[8]:
  Unnamed: 0 Country Region Happiness Rank Happiness Score Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Year
493 493 Zambia Sub-Saharan Africa 116.0 4.514 0.636407 1.003187 0.257836 0.461603 0.078214 0.249580 1.826705 2017.0
494 494 Zimbabwe Sub-Saharan Africa 138.0 3.875 0.375847 1.083096 0.196764 0.336384 0.095375 0.189143 1.597970 2017.0
In [9]:
df4 = pd.read_csv('c:/1/WorldPopulation.csv')
df4.head(2)
Out[9]:
  Unnamed: 0 Country Name Country Code 1961 1962 1963 1964 1965 1966 1967 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
0 0 Aruba ABW 54211.0 55438.0 56225.0 56695.0 57032.0 57360.0 57715.0 101353.0 101453.0 101669.0 102053.0 102577.0 103187.0 103795.0 104341.0 104822.0 105264.0
1 1 Afghanistan AFG 8996351.0 9166764.0 9345868.0 9533954.0 9731361.0 9938414.0 10152331.0 27294031.0 28004331.0 28803167.0 29708599.0 30696958.0 31731688.0 32758020.0 33736494.0 34656032.0 35530081.0

2 rows × 61 columns

 

Only Africa and only 2017.

In [10]:
D3 = df4.set_index('Country Name')['2017'].to_dict()
#D3
In [11]:
df3['Population2017'] = df3['Country'].map(D3) 
df3['Population2017'] = df3['Population2017']/100000
In [12]:
df3.isnull().sum()
df3 = df3.dropna(how='any')
df3.isnull().sum()
Out[12]:
Unnamed: 0                       0
Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
Year                             0
Population2017                   0
dtype: int64
In [13]:
kot = ['Sub-Saharan Africa','Middle East and Northern Africa']
AFR = df3[df3['Region'].isin(kot)]
AFR.head(2)
Out[13]:
  Unnamed: 0 Country Region Happiness Rank Happiness Score Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Year Population2017
332 332 Algeria Middle East and Northern Africa 53.0 5.872 1.091864 1.146217 0.617585 0.233336 0.146096 0.069437 2.567604 2017.0 406.06052
333 333 Angola Sub-Saharan Africa 140.0 3.795 0.858428 1.104412 0.049869 0.000000 0.069720 0.097926 1.614482 2017.0 288.13463
In [14]:
AFR.to_csv('c:/8/AfricaHappinessReport2017.csv')
df10 = pd.read_csv('c:/8/AfricaHappinessReport2017.csv')
df10.head(2)
Out[14]:
  Unnamed: 0 Unnamed: 0.1 Country Region Happiness Rank Happiness Score Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Year Population2017
0 332 332 Algeria Middle East and Northern Africa 53.0 5.872 1.091864 1.146217 0.617585 0.233336 0.146096 0.069437 2.567604 2017.0 406.06052
1 333 333 Angola Sub-Saharan Africa 140.0 3.795 0.858428 1.104412 0.049869 0.000000 0.069720 0.097926 1.614482 2017.0 288.13463
In [15]:
fig, ax = plt.subplots(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    

ax.scatter('Happiness Score', 'Freedom', data=df10, s='Population2017', c='Freedom', cmap='RdYlGn', edgecolors='grey', linewidths=0.8)
ax.set_title("AFRICA 2017 Happiness & Freedomn(color: 'Economy (GDP per Capita)' & size: 'Population2017')", fontsize=16)
ax.set_xlabel('Happiness Score', fontsize=18)
ax.set_ylabel('Freedom', fontsize=18)


## Sztuczka żeby mieć colorbar
CC = ax.scatter('Happiness Score', 'Freedom', data=df10, s='Population2017', c='Freedom', cmap='RdYlGn', edgecolors='grey', linewidths=0.8)
plt.colorbar(CC)


### DRUGI SPOSÓB
#im = ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
#fig.colorbar(im, ax=ax)

### Sztuczka, żeby mieć legende do size - nie działa dla danych ciągłych (musi byc tylko kilka klas)
handles, labels = CC.legend_elements(prop="sizes", alpha=0.1)
legend2 = ax.legend(handles, labels, loc="upper left", title="Sizes")

## sztuczka żeby mieć podpisy na kólkach
for i, txt in enumerate(df10['Country']):
    ax.annotate(txt, (df10['Happiness Score'][i],df10['Freedom'] [i]))

plt.show()  
 

Diabetes

In [16]:
df2= pd.read_csv('c:/1/diabetes.csv')
df2.head(2)
Out[16]:
  Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
 

Adds BMI indicator amplifier

In [17]:
df2['BMI_class'] = ((pd.qcut(df2['BMI'],5, labels=False).astype(int))+1)*70
In [18]:
fig = plt.figure(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    
plt.scatter('Age', 'Glucose', data=df2, s='BMI_class', c='BloodPressure', cmap='YlOrBr', edgecolors='blue', linewidths=0.8)
plt.title("Bubble Plot of Diabetesn color: BloodPressure & size: BMI", fontsize=16)
plt.xlabel('Age', fontsize=18)
plt.ylabel('Glucose', fontsize=18)
plt.colorbar()
plt.show()