matplotlib - THE DATA SCIENCE LIBRARY http://sigmaquality.pl/tag/matplotlib/ Wojciech Moszczyński Mon, 13 Dec 2021 17:49:19 +0000 pl-PL hourly 1 https://wordpress.org/?v=6.8.3 https://sigmaquality.pl/wp-content/uploads/2019/02/cropped-ryba-32x32.png matplotlib - THE DATA SCIENCE LIBRARY http://sigmaquality.pl/tag/matplotlib/ 32 32 Perfect Plots: Bubble Plot https://sigmaquality.pl/data-plots/perfect-plots_-bubble-plot/ Thu, 07 Nov 2019 18:26:00 +0000 http://sigmaquality.pl/perfect-plots_-bubble-plot/   Feel free to read the code on GitHub In [1]: import pandas as pd import matplotlib.pyplot as plt import numpy as np   Autos [...]

Artykuł Perfect Plots: Bubble Plot pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
 

Feel free to read the code on GitHub

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
In [2]:
df2= pd.read_csv('c:/1/autos.csv')
df2.head()
Out[2]:
  Unnamed: 0 symboling normalized_losses make fuel_type aspiration num_doors body_style drive_wheels engine_location engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
0 0 3 NaN alfa-romero gas std two convertible rwd front 130 mpfi 3.47 2.68 9.0 111.0 5000.0 21 27 13495.0
1 1 3 NaN alfa-romero gas std two convertible rwd front 130 mpfi 3.47 2.68 9.0 111.0 5000.0 21 27 16500.0
2 2 1 NaN alfa-romero gas std two hatchback rwd front 152 mpfi 2.68 3.47 9.0 154.0 5000.0 19 26 16500.0
3 3 2 164.0 audi gas std four sedan fwd front 109 mpfi 3.19 3.40 10.0 102.0 5500.0 24 30 13950.0
4 4 2 164.0 audi gas std four sedan 4wd front 136 mpfi 3.19 3.40 8.0 115.0 5500.0 18 22 17450.0

5 rows × 27 columns

In [3]:
fig = plt.figure(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    

plt.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
plt.title("Bubble Plot of Autos Arean(color: 'price & size: 'city_mpg')", fontsize=16)
plt.xlabel('horsepower', fontsize=18)
plt.ylabel('engine_size', fontsize=18)
plt.colorbar()

plt.show()    
In [4]:
fig, ax = plt.subplots(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    

ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
ax.set_title("Bubble Plot of Autos Arean(color: 'price & size: 'engine_size')", fontsize=16)
ax.set_xlabel('horsepower', fontsize=18)
ax.set_ylabel('engine_size', fontsize=18)


## Sztuczka żeby mieć colorbar
AA = ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
plt.colorbar(AA)


### DRUGI SPOSÓB
#im = ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
#fig.colorbar(im, ax=ax)

handles, labels = AA.legend_elements(prop="sizes", alpha=0.6)
legend2 = ax.legend(handles, labels, loc="upper left", title="Sizes")

## sztuczka żeby mieć podpisy na kólkach
for i, txt in enumerate(df2['make']):
    ax.annotate(txt, (df2['horsepower'][i],df2['engine_size'] [i]))

plt.show()  
 

Midwest

In [5]:
df = pd.read_csv('c:/2/midwest_filter.csv')
df.head()
Out[5]:
  PID county state area poptotal popdensity popwhite popblack popamerindian popasian percprof poppovertyknown percpovertyknown percbelowpoverty percchildbelowpovert percadultpoverty percelderlypoverty inmetro category dot_size
0 561 ADAMS IL 0.052 66090 1270.961540 63917 1702 98 249 4.355859 63628.0 96.274777 13.151443 18.011717 11.009776 12.443812 0.0 AAR 250.944411
1 562 ALEXANDER IL 0.014 10626 759.000000 7054 3496 19 48 2.870315 10529.0 99.087145 32.244278 45.826514 27.385647 25.228976 0.0 LHR 185.781260
2 563 BOND IL 0.022 14991 681.409091 14477 429 35 16 4.488572 14235.0 94.956974 12.068844 14.036061 10.852090 12.697410 0.0 AAR 175.905385
3 564 BOONE IL 0.017 30806 1812.117650 29344 127 46 150 4.197800 30337.0 98.477569 7.209019 11.179536 5.536013 6.217047 1.0 ALU 319.823487
4 565 BROWN IL 0.018 5836 324.222222 5264 547 14 5 3.367680 4815.0 82.505140 13.520249 13.022889 11.143211 19.200000 0.0 AAR 130.442161

5 rows × 29 columns

In [6]:
# Plot
fig = plt.figure(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    
plt.scatter('area', 'poptotal', data=df, s='dot_size', c='popdensity', cmap='Reds', edgecolors='blue', linewidths=0.8)
plt.title("Bubble Plot of PopTotal vs Arean(color: 'popdensity' & size: 'dot_size' - both are numeric columns in midwest)", fontsize=16)
plt.xlabel('Area', fontsize=18)
plt.ylabel('Poptotal', fontsize=18)
plt.colorbar()
plt.show()   
In [7]:
fig, ax = plt.subplots(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    

ax.scatter('area', 'poptotal', data=df, s='dot_size', c='popdensity', cmap='YlGn', edgecolors='blue', linewidths=0.8)
ax.set_title("Bubble Plot of PopTotal vs Arean color: 'popdensity' & size: 'dot_size'", fontsize=16)
ax.set_xlabel('Area', fontsize=18)
ax.set_ylabel('Poptotal', fontsize=18)
   


## Sztuczka żeby mieć colorbar
BB = ax.scatter('area', 'poptotal', data=df, s='dot_size', c='popdensity', cmap='YlGn', edgecolors='blue', linewidths=0.8)
plt.colorbar(BB)


### DRUGI SPOSÓB
#im = ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
#fig.colorbar(im, ax=ax)

### legenda do wielkości kółek
handles, labels = BB.legend_elements(prop="sizes", alpha=0.6)
legend = ax.legend(handles, labels, loc="lower right", title="Sizes")

## sztuczka żeby mieć podpisy na kólkach
for i, txt in enumerate(df['county']):
    ax.annotate(txt, (df['area'][i],df['poptotal'] [i]))

plt.show()  
C:ProgramDataAnaconda3libsite-packagesmatplotlibcollections.py:995: RuntimeWarning: invalid value encountered in greater_equal
  cond = ((label_values >= func(arr).min()) &
C:ProgramDataAnaconda3libsite-packagesmatplotlibcollections.py:996: RuntimeWarning: invalid value encountered in less_equal
  (label_values <= func(arr).max()))
 

WorldHappinessReport

Source of data: https://worldhappiness.report/download/

 

The best plots appear when we combine various data!

In [8]:
df3= pd.read_csv('c:/1/WorldHappinessReport.csv')
df3 = df3[df3['Year']==2017]
df3.tail(2)
Out[8]:
  Unnamed: 0 Country Region Happiness Rank Happiness Score Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Year
493 493 Zambia Sub-Saharan Africa 116.0 4.514 0.636407 1.003187 0.257836 0.461603 0.078214 0.249580 1.826705 2017.0
494 494 Zimbabwe Sub-Saharan Africa 138.0 3.875 0.375847 1.083096 0.196764 0.336384 0.095375 0.189143 1.597970 2017.0
In [9]:
df4 = pd.read_csv('c:/1/WorldPopulation.csv')
df4.head(2)
Out[9]:
  Unnamed: 0 Country Name Country Code 1961 1962 1963 1964 1965 1966 1967 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
0 0 Aruba ABW 54211.0 55438.0 56225.0 56695.0 57032.0 57360.0 57715.0 101353.0 101453.0 101669.0 102053.0 102577.0 103187.0 103795.0 104341.0 104822.0 105264.0
1 1 Afghanistan AFG 8996351.0 9166764.0 9345868.0 9533954.0 9731361.0 9938414.0 10152331.0 27294031.0 28004331.0 28803167.0 29708599.0 30696958.0 31731688.0 32758020.0 33736494.0 34656032.0 35530081.0

2 rows × 61 columns

 

Only Africa and only 2017.

In [10]:
D3 = df4.set_index('Country Name')['2017'].to_dict()
#D3
In [11]:
df3['Population2017'] = df3['Country'].map(D3) 
df3['Population2017'] = df3['Population2017']/100000
In [12]:
df3.isnull().sum()
df3 = df3.dropna(how='any')
df3.isnull().sum()
Out[12]:
Unnamed: 0                       0
Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
Year                             0
Population2017                   0
dtype: int64
In [13]:
kot = ['Sub-Saharan Africa','Middle East and Northern Africa']
AFR = df3[df3['Region'].isin(kot)]
AFR.head(2)
Out[13]:
  Unnamed: 0 Country Region Happiness Rank Happiness Score Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Year Population2017
332 332 Algeria Middle East and Northern Africa 53.0 5.872 1.091864 1.146217 0.617585 0.233336 0.146096 0.069437 2.567604 2017.0 406.06052
333 333 Angola Sub-Saharan Africa 140.0 3.795 0.858428 1.104412 0.049869 0.000000 0.069720 0.097926 1.614482 2017.0 288.13463
In [14]:
AFR.to_csv('c:/8/AfricaHappinessReport2017.csv')
df10 = pd.read_csv('c:/8/AfricaHappinessReport2017.csv')
df10.head(2)
Out[14]:
  Unnamed: 0 Unnamed: 0.1 Country Region Happiness Rank Happiness Score Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Year Population2017
0 332 332 Algeria Middle East and Northern Africa 53.0 5.872 1.091864 1.146217 0.617585 0.233336 0.146096 0.069437 2.567604 2017.0 406.06052
1 333 333 Angola Sub-Saharan Africa 140.0 3.795 0.858428 1.104412 0.049869 0.000000 0.069720 0.097926 1.614482 2017.0 288.13463
In [15]:
fig, ax = plt.subplots(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    

ax.scatter('Happiness Score', 'Freedom', data=df10, s='Population2017', c='Freedom', cmap='RdYlGn', edgecolors='grey', linewidths=0.8)
ax.set_title("AFRICA 2017 Happiness & Freedomn(color: 'Economy (GDP per Capita)' & size: 'Population2017')", fontsize=16)
ax.set_xlabel('Happiness Score', fontsize=18)
ax.set_ylabel('Freedom', fontsize=18)


## Sztuczka żeby mieć colorbar
CC = ax.scatter('Happiness Score', 'Freedom', data=df10, s='Population2017', c='Freedom', cmap='RdYlGn', edgecolors='grey', linewidths=0.8)
plt.colorbar(CC)


### DRUGI SPOSÓB
#im = ax.scatter('horsepower', 'engine_size', data=df2, s='engine_size', c='price', cmap='PuBu', edgecolors='grey', linewidths=0.8)
#fig.colorbar(im, ax=ax)

### Sztuczka, żeby mieć legende do size - nie działa dla danych ciągłych (musi byc tylko kilka klas)
handles, labels = CC.legend_elements(prop="sizes", alpha=0.1)
legend2 = ax.legend(handles, labels, loc="upper left", title="Sizes")

## sztuczka żeby mieć podpisy na kólkach
for i, txt in enumerate(df10['Country']):
    ax.annotate(txt, (df10['Happiness Score'][i],df10['Freedom'] [i]))

plt.show()  
 

Diabetes

In [16]:
df2= pd.read_csv('c:/1/diabetes.csv')
df2.head(2)
Out[16]:
  Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
 

Adds BMI indicator amplifier

In [17]:
df2['BMI_class'] = ((pd.qcut(df2['BMI'],5, labels=False).astype(int))+1)*70
In [18]:
fig = plt.figure(figsize=(14, 7), dpi= 280, facecolor='white', edgecolor='black')    
plt.scatter('Age', 'Glucose', data=df2, s='BMI_class', c='BloodPressure', cmap='YlOrBr', edgecolors='blue', linewidths=0.8)
plt.title("Bubble Plot of Diabetesn color: BloodPressure & size: BMI", fontsize=16)
plt.xlabel('Age', fontsize=18)
plt.ylabel('Glucose', fontsize=18)
plt.colorbar()
plt.show()     

Artykuł Perfect Plots: Bubble Plot pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
Dendrogram and clustering 3d https://sigmaquality.pl/data-plots/dendrogram-and-clustering-3d/ Fri, 25 Oct 2019 19:56:00 +0000 http://sigmaquality.pl/dendron1/ In [1]: import scipy.cluster.hierarchy as shc import pandas as pd import matplotlib.pyplot as plt # Import Data df = pd.read_csv('c:/1/USArrests.csv') USArrests Source of data: https://www.kaggle.com/deepakg/usarrests [...]

Artykuł Dendrogram and clustering 3d pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
In [1]:
import scipy.cluster.hierarchy as shc
import pandas as pd
import matplotlib.pyplot as plt

# Import Data
df = pd.read_csv('c:/1/USArrests.csv')

USArrests

Source of data: https://www.kaggle.com/deepakg/usarrests

In [2]:
df.rename(columns = {'Unnamed: 0': 'State'}, inplace=True)
df.head(4)
Out[2]:
State Murder Assault UrbanPop Rape
0 Alabama 13.2 236 58 21.2
1 Alaska 10.0 263 48 44.5
2 Arizona 8.1 294 80 31.0
3 Arkansas 8.8 190 50 19.5
In [3]:
# Plot
plt.figure(figsize=(17, 4), dpi= 280)  
plt.title("USArrests Dendograms", fontsize=22)  
dend = shc.dendrogram(shc.linkage(df[['Murder', 'Assault', 'UrbanPop', 'Rape']], method='ward'), labels=df.State.values, color_threshold=100)  
plt.xticks(fontsize=12)
plt.show()
In [4]:
df3 = pd.read_csv('c:/1/hierarchical-clustering-with-python-and-scikit-learn-shopping-data.csv')
df3.head()
Out[4]:
CustomerID Genre Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40

We have a table that shows gender, age, annual income and expenditure. We take a vector of two coordinates from the DataFrame table: annual income in k $ – a tendency to spend on a scale of 1 to 100.

In [5]:
data = df3.iloc[:, 3:5].values
data
Out[5]:
array([[ 15,  39],
       [ 15,  81],
       [ 16,   6],
       [ 16,  77],
       [ 17,  40],
       [ 17,  76],
       [ 18,   6],
       [ 18,  94],
       [ 19,   3],
       [ 19,  72],
       [ 19,  14],
       [ 19,  99],
       [ 20,  15],
       [ 20,  77],
       [ 20,  13],
       [ 20,  79],
       [ 21,  35],
In [6]:
plt.figure(figsize=(10, 3))
plt.title("Customer Dendograms")
dend = shc.dendrogram(shc.linkage(data, method='ward'))

The dendrogram showed that there are 5 clusters (5 branches) of the bank’s clients. We create a clustering matrix. Since we had five clusters, we have five labels at the output, i.e. 0 to 4.

In [7]:
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
cluster.fit_predict(data)
Out[7]:
array([4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3,
       4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 1,
       4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 0, 2, 0, 2,
       1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2], dtype=int64)
In [8]:
plt.figure(figsize=(10, 7))
plt.scatter(data[:,0], data[:,1], c=cluster.labels_, cmap='rainbow')
plt.title('CUSTOMERS CLUSTERINGS')
plt.xlabel('Annual earnings')
plt.ylabel('Spending')
Out[8]:
Text(0, 0.5, 'Spending')

Purple cluster – (in the lower right corner) a cluster of clients with high earnings but low expenses. Customers in the middle (blue data points) are those with average income and average salary. The largest number of customers belongs to this category.

Clinical tests

Source of data: https://www.kaggle.com/saurabh00007/diabetescsv

In [21]:
df3 = pd.read_csv('c:/1/diabetes.csv')
df3.head()
Out[21]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [22]:
PKP = df3[['Age','SkinThickness','BMI']]
In [23]:
PKP.head()
Out[23]:
Age SkinThickness BMI
0 50 35 33.6
1 31 29 26.6
2 32 0 23.3
3 21 23 28.1
4 33 35 43.1
The dendroid chart will tell you how many clusters you want
In [24]:
plt.figure(figsize=(17, 4), dpi= 280)  
plt.title("Customer Dendograms")
dend = shc.dendrogram(shc.linkage(PKP, method='ward'))
It seems 5 clusters
In [26]:
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(PKP['Age'], PKP['SkinThickness'], PKP['BMI'], color='black',marker='o')

ax.set_title('Clusters', fontsize= 30, alpha=0.6)
ax.set_xlabel('Age', fontsize= 20, alpha=0.6)
ax.set_ylabel('SkinThickness', fontsize= 20, alpha=0.6)
ax.set_zlabel('BMI', fontsize= 20, alpha=0.6)
Out[26]:
Text(0.5, 0, 'BMI')
In [27]:
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
KF = cluster.fit_predict(PKP)
KF
Out[27]:
array([3, 0, 4, 0, 1, 4, 0, 4, 3, 2, 4, 4, 2, 3, 3, 4, 1, 4, 1, 1, 1, 2,
       2, 1, 3, 3, 2, 0, 3, 4, 3, 1, 0, 4, 3, 1, 4, 3, 1, 3, 0, 4, 3, 3,
       4, 1, 4, 0, 0, 0, 0, 0, 0, 3, 1, 0, 1, 1, 2, 1, 0, 4, 4, 0, 2, 0,
       3, 2, 0, 0, 0, 1, 2, 0, 0, 0, 2, 0, 4, 0, 0, 0, 3, 0, 4, 0, 1, 0,
       3, 0, 4, 0, 1, 2, 0, 3, 0, 0, 0, 1, 4, 4, 4, 0, 4, 0, 4, 3, 0, 0,
       0, 3, 0, 4, 1, 2, 4, 4, 0, 0, 1, 1, 0, 2, 4, 1, 0, 1, 3, 2, 0, 4,
       1, 3, 0, 0, 0, 0, 4, 0, 2, 3, 0, 2, 0, 0, 1, 1, 2, 0, 1, 4, 3, 1,
       2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 4, 3, 0, 4, 4, 0, 4, 0, 0, 1, 0, 1,
       2, 1, 2, 4, 4, 0, 0, 4, 4, 3, 3, 1, 1, 0, 4, 1, 4, 4, 3, 1, 4, 0,
       1, 0, 0, 4, 0, 0, 3, 0, 3, 2, 0, 3, 0, 1, 3, 0, 1, 1, 1, 0, 0, 2,
       0, 2, 4, 3, 0, 0, 4, 1, 1, 0, 4, 1, 0, 4, 0, 4, 3, 0, 0, 4, 0, 0,
       4, 0, 0, 3, 2, 1, 1, 0, 2, 4, 0, 0, 0, 1, 1, 0, 0, 3, 0, 4, 0, 3,
       4, 3, 4, 1, 4, 4, 1, 0, 4, 1, 2, 1, 0, 0, 2, 0, 4, 3, 0, 2, 2, 3,
       1, 1, 0, 1, 0, 0, 1, 1, 2, 0, 1, 0, 3, 2, 4, 0, 1, 4, 4, 0, 3, 0,
       0, 0, 1, 1, 0, 0, 3, 0, 0, 4, 1, 2, 0, 0, 0, 1, 0, 0, 0, 4, 1, 1,
       3, 0, 2, 4, 0, 1, 2, 2, 1, 2, 0, 0, 0, 4, 2, 3, 0, 4, 0, 3, 4, 4,
       3, 0, 4, 2, 1, 3, 3, 1, 1, 2, 3, 2, 0, 0, 4, 0, 0, 3, 1, 0, 0, 1,
       1, 3, 0, 1, 4, 1, 0, 0, 0, 0, 0, 0, 3, 1, 3, 0, 3, 4, 0, 0, 4, 0,
       1, 1, 4, 0, 4, 2, 1, 3, 2, 0, 2, 4, 4, 1, 1, 0, 0, 0, 1, 0, 0, 3,
       4, 0, 1, 0, 1, 0, 3, 1, 0, 3, 1, 3, 4, 0, 0, 4, 0, 4, 3, 4, 0, 4,
       3, 0, 0, 4, 0, 1, 0, 1, 1, 0, 0, 4, 0, 2, 0, 3, 2, 0, 3, 3, 3, 4,
       1, 1, 4, 0, 0, 1, 4, 1, 1, 1, 0, 2, 4, 3, 1, 0, 1, 3, 1, 1, 0, 0,
       4, 1, 1, 3, 0, 2, 0, 3, 1, 3, 0, 2, 4, 0, 3, 1, 0, 0, 1, 3, 1, 4,
       3, 0, 0, 2, 3, 0, 2, 4, 0, 0, 3, 2, 2, 2, 0, 0, 0, 2, 4, 0, 0, 0,
       0, 4, 0, 4, 1, 4, 0, 4, 2, 2, 1, 1, 1, 0, 3, 0, 0, 1, 3, 0, 3, 1,
       0, 0, 2, 0, 0, 1, 1, 2, 1, 4, 2, 0, 1, 0, 4, 0, 0, 3, 3, 1, 4, 4,
       0, 0, 0, 1, 0, 4, 4, 3, 1, 0, 3, 2, 3, 0, 2, 4, 3, 4, 1, 1, 2, 0,
       1, 0, 2, 0, 4, 0, 0, 4, 1, 3, 4, 0, 1, 0, 1, 0, 0, 3, 1, 0, 3, 4,
       4, 0, 3, 4, 1, 0, 2, 0, 4, 1, 4, 4, 2, 0, 4, 1, 4, 0, 4, 4, 2, 0,
       0, 0, 0, 4, 2, 4, 0, 0, 0, 1, 1, 0, 0, 0, 1, 4, 0, 0, 0, 1, 2, 0,
       2, 1, 1, 1, 1, 1, 3, 1, 3, 3, 3, 0, 3, 1, 2, 4, 2, 4, 4, 0, 0, 1,
       1, 4, 2, 0, 4, 0, 0, 1, 4, 2, 0, 1, 4, 3, 0, 4, 0, 4, 0, 3, 3, 2,
       0, 0, 0, 0, 2, 0, 0, 3, 1, 0, 4, 1, 1, 3, 1, 3, 0, 1, 1, 3, 2, 1,
       0, 0, 4, 4, 0, 4, 1, 0, 2, 0, 0, 3, 0, 2, 1, 0, 0, 2, 1, 3, 1, 1,
       3, 2, 4, 1, 0, 1, 3, 1, 1, 2, 4, 2, 0, 3, 4, 3, 0, 0, 2, 0],
      dtype=int64)
In [28]:
# Initializing KMeans
kmeans = KMeans(n_clusters=5)
# Fitting with inputs
kmeans = kmeans.fit(PKP)
# Predicting the clusters
labels = kmeans.predict(PKP)
# Getting the cluster centers
C = kmeans.cluster_centers_
In [29]:
C
Out[29]:
array([[25.10138249, 21.01382488, 27.84147465],
       [45.82014388, 32.33093525, 33.90359712],
       [28.86486486,  0.33108108, 29.1527027 ],
       [52.08695652,  1.26086957, 31.24782609],
       [27.02906977, 38.09883721, 38.52732558]])
In [31]:
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(PKP['Age'], PKP['SkinThickness'], PKP['BMI'], c=KF)
ax.scatter(C[:, 0], C[:, 1], C[:, 2], marker='.', c='red', s=1000)

ax.set_title('Clusters', fontsize= 30, alpha=0.6)
ax.set_xlabel('Age', fontsize= 20, alpha=0.6)
ax.set_ylabel('SkinThickness', fontsize= 20, alpha=0.6)
ax.set_zlabel('BMI', fontsize= 20, alpha=0.6)
Out[31]:
Text(0.5, 0, 'BMI')

Artykuł Dendrogram and clustering 3d pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
Perfect Plots: Calendarplot https://sigmaquality.pl/data-plots/perfect-plots_-calendarplot-2/ Thu, 24 Oct 2019 18:50:00 +0000 http://sigmaquality.pl/perfect-plots_-calendarplot-2/ Feel free to read the code on GitHub In [1]: import matplotlib as mpl import calmap import pandas as pd import matplotlib.pyplot as plt   [...]

Artykuł Perfect Plots: Calendarplot pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
Feel free to read the code on GitHub

In [1]:
import matplotlib as mpl
import calmap
import pandas as pd
import matplotlib.pyplot as plt
 

yahoo

In [2]:
df = pd.read_csv('c:/1/yahoo.txt', parse_dates=['date'])
df.set_index('date', inplace=True)
df.head(3)
Out[2]:
  VIX.Open VIX.High VIX.Low VIX.Close VIX.Volume VIX.Adjusted year month monthf weekday weekdayf week
date                        
2007-01-03 12.16 12.75 11.53 12.04 0 12.04 2007 1 Jan 3 Wed 1
2007-01-04 12.40 12.42 11.28 11.51 0 11.51 2007 1 Jan 4 Thu 1
2007-01-05 11.84 12.25 11.68 12.14 0 12.14 2007 1 Jan 5 Fri 1
In [3]:
plt.figure(figsize=(16,10), dpi= 280)
calmap.calendarplot(df['2014']['VIX.Close'],cmap= 'seismic', fig_kws={'figsize': (16,10)}, yearlabel_kws={'color':'black', 'fontsize':24}, subplot_kws={'title':'Yahoo Stock Prices'})
plt.show()
<Figure size 4480x2800 with 0 Axes>
 

paltes: https://matplotlib.org/examples/color/colormaps_reference.html

cmaps = [(’Perceptually Uniform Sequential’, [ 'viridis’, 'plasma’, 'inferno’, 'magma’]), (’Sequential’, [ 'Greys’, 'Purples’, 'Blues’, 'Greens’, 'Oranges’, 'Reds’, 'YlOrBr’, 'YlOrRd’, 'OrRd’, 'PuRd’, 'RdPu’, 'BuPu’, 'GnBu’, 'PuBu’, 'YlGnBu’, 'PuBuGn’, 'BuGn’, 'YlGn’]), (’Sequential (2)’, [ 'binary’, 'gist_yarg’, 'gist_gray’, 'gray’, 'bone’, 'pink’, 'spring’, 'summer’, 'autumn’, 'winter’, 'cool’, 'Wistia’, 'hot’, 'afmhot’, 'gist_heat’, 'copper’]), (’Diverging’, [ 'PiYG’, 'PRGn’, 'BrBG’, 'PuOr’, 'RdGy’, 'RdBu’, 'RdYlBu’, 'RdYlGn’, 'Spectral’, 'coolwarm’, 'bwr’, 'seismic’]), (’Qualitative’, [ 'Pastel1′, 'Pastel2′, 'Paired’, 'Accent’, 'Dark2′, 'Set1′, 'Set2′, 'Set3′, 'tab10′, 'tab20′, 'tab20b’, 'tab20c’]), (’Miscellaneous’, [ 'flag’, 'prism’, 'ocean’, 'gist_earth’, 'terrain’, 'gist_stern’, 'gnuplot’, 'gnuplot2′, 'CMRmap’, 'cubehelix’, 'brg’, 'hsv’, 'gist_rainbow’, 'rainbow’, 'jet’, 'nipy_spectral’, 'gist_ncar’])]

In [4]:
df.head(4)
Out[4]:
  VIX.Open VIX.High VIX.Low VIX.Close VIX.Volume VIX.Adjusted year month monthf weekday weekdayf week
date                        
2007-01-03 12.16 12.75 11.53 12.04 0 12.04 2007 1 Jan 3 Wed 1
2007-01-04 12.40 12.42 11.28 11.51 0 11.51 2007 1 Jan 4 Thu 1
2007-01-05 11.84 12.25 11.68 12.14 0 12.14 2007 1 Jan 5 Fri 1
2007-01-08 12.48 12.83 11.78 12.00 0 12.00 2007 1 Jan 1 Mon 2
 

phone_data

In [5]:
df2 = pd.read_csv('c:/1/phone_data.csv', parse_dates=['date'])
df2.head(3)
Out[5]:
  index date duration item month network network_type
0 0 2014-10-15 06:58:00 34.429 data 2014-11 data data
1 1 2014-10-15 06:58:00 13.000 call 2014-11 Vodafone mobile
2 2 2014-10-15 14:46:00 23.000 call 2014-11 Meteor mobile
In [6]:
df2.set_index('date', inplace=True)
df2.head()
Out[6]:
  index duration item month network network_type
date            
2014-10-15 06:58:00 0 34.429 data 2014-11 data data
2014-10-15 06:58:00 1 13.000 call 2014-11 Vodafone mobile
2014-10-15 14:46:00 2 23.000 call 2014-11 Meteor mobile
2014-10-15 14:48:00 3 4.000 call 2014-11 Tesco mobile
2014-10-15 17:27:00 4 4.000 call 2014-11 Tesco mobile
In [7]:
plt.figure(figsize=(16,10), dpi= 280)
calmap.calendarplot(df2['2014']['duration'],cmap= 'BrBG', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'black', 'fontsize':24}
                    , subplot_kws={'title':'phone_data'})
plt.show()
<Figure size 4480x2800 with 0 Axes>
 

Energy

In [8]:
df3 = pd.read_csv('c:/2/Energy.csv', index_col=0, parse_dates=['Date'])
df3.set_index('Date', inplace=True)
df3.head()
Out[8]:
  Consumption Wind Solar Wind+Solar
Date        
2006-01-01 1069.184 NaN NaN NaN
2006-01-02 1380.521 NaN NaN NaN
2006-01-03 1442.533 NaN NaN NaN
2006-01-04 1457.217 NaN NaN NaN
2006-01-05 1477.131 NaN NaN NaN
In [9]:
plt.figure(figsize=(116,100), dpi= 280)

calmap.calendarplot(df3['2007']['Consumption'],cmap= 'YlOrBr', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Daily power consumption'})

calmap.calendarplot(df3['2008']['Consumption'],cmap= 'YlOrBr', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Daily power consumption'})

calmap.calendarplot(df3['2009']['Consumption'],cmap= 'YlOrBr', how='sum'
                    ,fillcolor='white', daylabels='PWŚCPSN'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Daily power consumption'})
Out[9]:
(<Figure size 1152x720 with 1 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x000001C980465198>],
       dtype=object))
<Figure size 32480x28000 with 0 Axes>
 

Personal calendar

In [10]:
df4 = pd.read_excel('c:/3/wtm.xlsx', parse_dates=['Date'])
df4.set_index('Date', inplace=True)
df4.head()
Out[10]:
  Continuous
Date  
2018-08-31 65
2018-09-02 60
2018-09-03 75
2018-09-04 120
2018-09-05 120
In [11]:
calmap.calendarplot(df4['2018']['Continuous'],cmap= 'YlGn', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Personal calendar'})

calmap.calendarplot(df4['2019']['Continuous'],cmap= 'YlGn', how='sum'
                    ,fillcolor='white'
                    , fig_kws={'figsize': (16,10)}
                    , yearlabel_kws={'color':'gray', 'fontsize':44,'alpha':0.5}
                    , subplot_kws={'title':'Personal calendar'})
Out[11]:
(<Figure size 1152x720 with 1 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x000001C98056CF28>],
       dtype=object))

Artykuł Perfect Plots: Calendarplot pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
Perfect Plots: H_line plot https://sigmaquality.pl/data-plots/perfect-plots_-h_line/ Tue, 22 Oct 2019 19:59:00 +0000 http://sigmaquality.pl/perfect-plots_-h_line/ Feel free to read the code on GitHub In [1]: import numpy as np import pandas as pd import seaborn as sns from sklearn.preprocessing import [...]

Artykuł Perfect Plots: H_line plot pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
import matplotlib.pylab as plt
from pylab import plot, show, subplot, specgram, imshow, savefig
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Imputer
import matplotlib.pyplot as plote

Banking marketing

Analysis of the categorical results. Source of data: https://archive.ics.uci.edu/ml/machine-learning-databases/00222/

In [2]:
df = pd.read_csv('c:/1/bank.csv')
df.head()
Out[2]:
Unnamed: 0 Unnamed: 0.1 age job marital education default housing loan contact campaign pdays previous poutcome emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed y
0 0 0 44 blue-collar married basic.4y unknown yes no cellular 1 999 0 nonexistent 1.4 93.444 -36.1 4.963 5228.1 0
1 1 1 53 technician married unknown no no no cellular 1 999 0 nonexistent -0.1 93.200 -42.0 4.021 5195.8 0
2 2 2 28 management single university.degree no yes no cellular 3 6 2 success -1.7 94.055 -39.8 0.729 4991.6 1
3 3 3 39 services married high.school no no no cellular 2 999 0 nonexistent -1.8 93.075 -47.1 1.405 5099.1 0
4 4 4 55 retired married basic.4y no yes no cellular 1 3 1 success -2.9 92.201 -31.4 0.869 5076.2 1

5 rows × 23 columns

In [3]:
CORREL = df.corr().sort_values('y')
CORREL['y'].to_frame().sort_values('y')
CORREL.index
Out[3]:
Index(['nr_employed', 'pdays', 'euribor3m', 'emp_var_rate', 'cons_price_idx',
       'campaign', 'Unnamed: 0', 'Unnamed: 0.1', 'age', 'cons_conf_idx',
       'previous', 'duration', 'y'],
      dtype='object')
In [4]:
plt.figure(figsize=(10,6))
CORREL['y'].plot(kind='barh', color='red')
plt.title('Correlation with the result variable', fontsize=20)
plt.xlabel('Correlation level')
plt.ylabel('Continuous independent variables')
Out[4]:
Text(0, 0.5, 'Continuous independent variables')

Variables for the chart

In [5]:
lebel=CORREL.index
lebel
Out[5]:
Index(['nr_employed', 'pdays', 'euribor3m', 'emp_var_rate', 'cons_price_idx',
       'campaign', 'Unnamed: 0', 'Unnamed: 0.1', 'age', 'cons_conf_idx',
       'previous', 'duration', 'y'],
      dtype='object')
In [6]:
data = CORREL['y']
data
Out[6]:
nr_employed      -0.354678
pdays            -0.324914
euribor3m        -0.307771
emp_var_rate     -0.298334
cons_price_idx   -0.136211
campaign         -0.066357
Unnamed: 0       -0.006165
Unnamed: 0.1     -0.006165
age               0.030399
cons_conf_idx     0.054878
previous          0.230181
duration          0.405274
y                 1.000000
Name: y, dtype: float64
In [7]:
title = 'Correlation with the result variable'
In [8]:
# Draw plot, # dpi=80 wykres, który będzie miał wymiary 80 na 80 pikseli
plt.figure(figsize=(8,4), dpi= 80, facecolor='#f4cccc', edgecolor='yellow') 

plt.hlines(y=lebel, xmin=0, xmax=data)
for x, y, tex in zip(data, lebel, data):
    t = plt.text(x, y, round(tex, 2), horizontalalignment='right' if x < 0 else 'left', 
                 verticalalignment='center', fontdict={'color':'#ff0000' if x < 0 else '#38761d', 'size':14})

# Decorations    
plt.yticks(lebel, fontsize=12)
plt.title(title, fontdict={'size':20})
plt.grid(linestyle='--', alpha=0.5)
plt.xlim(-1.0, 1.0)
plt.show()

Trigger

In [9]:
def Hlines(data,lebel,title):
    plt.figure(figsize=(8,4), dpi= 80, facecolor='#f4cccc', edgecolor='yellow') # dpi=80 wykres, który będzie miał wymiary 80 na 80 pikseli
    plt.hlines(y=lebel, xmin=0, xmax=data)
    for x, y, tex in zip(data, lebel, data):
        t = plt.text(x, y, round(tex, 2), horizontalalignment='right' if x < 0 else 'left', 
                     verticalalignment='center', fontdict={'color':'#ff0000' if x < 0 else '#38761d', 'size':14})

    plt.yticks(lebel, fontsize=12, color='#660000', alpha=0.9)
    plt.title(title, fontdict={'size':20}, color='#660000', alpha=0.9)
    plt.grid(linestyle='--', alpha=0.6)
    plt.xlim(-1.0, 1.0) #limit ax
    plt.show()
In [10]:
data = CORREL['y']
lebel=CORREL.index
title = 'Correlation with the result variable'

Hlines(data,lebel,title)

Artykuł Perfect Plots: H_line plot pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
Perfect Plots: Waffle plot https://sigmaquality.pl/data-plots/perfect-plots-waffle-plot/ Tue, 22 Oct 2019 19:58:00 +0000 http://sigmaquality.pl/perfect-plots-waffle-plot/ Feel free to read the code on GitHub   https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/   pip install pywaffle In [1]: from pywaffle import Waffle import squarify import pandas as pd [...]

Artykuł Perfect Plots: Waffle plot pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
Feel free to read the code on GitHub

 

pip install pywaffle

In [1]:
from pywaffle import Waffle
import squarify 
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
df= pd.read_csv('c:/1/mpg_ggplot2.txt')
df.head()
Out[2]:
  manufacturer model displ year cyl trans drv cty hwy fl class
0 audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
1 audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
2 audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
3 audi a4 2.0 2008 4 auto(av) f 21 30 p compact
4 audi a4 2.8 1999 6 auto(l5) f 16 26 p compact
In [3]:
df = df.groupby('class').size().reset_index(name='counts')
df
Out[3]:
  class counts
0 2seater 5
1 compact 47
2 midsize 41
3 minivan 11
4 pickup 33
5 subcompact 35
6 suv 62
In [72]:
# Prepare Data

n_categories = df.shape[0]
colors = [plt.cm.YlGnBu(i/float(n_categories)) for i in range(n_categories)]

# Draw Plot and Decorate
fig = plt.figure(
    FigureClass=Waffle,
    plots={
        '111': {
            'values': df['counts'],
            'labels': ["{0} ({1})".format(n[0], n[1]) for n in df[['class', 'counts']].itertuples()],
            'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.05, 1), 'fontsize': 12},
            'title': {'label': 'Vehicles by Class', 'loc': 'center', 'fontsize':28}
        },
    },
    rows=7,
    colors=colors,
    figsize=(16, 9)
)
In [5]:
n_categories
Out[5]:
7
In [6]:
colors
Out[6]:
[(0.988362, 0.998364, 0.644924, 1.0),
 (0.981173, 0.759135, 0.156863, 1.0),
 (0.961293, 0.488716, 0.084289, 1.0),
 (0.832299, 0.283913, 0.257383, 1.0),
 (0.621685, 0.164184, 0.388781, 1.0),
 (0.397674, 0.083257, 0.433183, 1.0),
 (0.15585, 0.044559, 0.325338, 1.0)]
 

Titanic disaster

We ought to find which passengers have chance to survive according to their affiliation to the established groups.

Source of data: https://www.kaggle.com/shivamp629/traincsv

In [7]:
df2 = pd.read_csv('c:/1/kaggletrain.csv')
df2.head(3)
Out[7]:
  Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th… female 38.0 1 0 PC 17599 71.2833 C85 C
2 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
In [8]:
dfTDK = df2.groupby('Pclass').size().reset_index(name='counts')
dfTDK
Out[8]:
  Pclass counts
0 1 216
1 2 184
2 3 491
 

paltes: https://matplotlib.org/examples/color/colormaps_reference.html

cmaps = [(’Perceptually Uniform Sequential’, [
’viridis’, 'plasma’, 'inferno’, 'magma’]),
(’Sequential’, [
’Greys’, 'Purples’, 'Blues’, 'Greens’, 'Oranges’, 'Reds’,
’YlOrBr’, 'YlOrRd’, 'OrRd’, 'PuRd’, 'RdPu’, 'BuPu’,
’GnBu’, 'PuBu’, 'YlGnBu’, 'PuBuGn’, 'BuGn’, 'YlGn’]),
(’Sequential (2)’, [
’binary’, 'gist_yarg’, 'gist_gray’, 'gray’, 'bone’, 'pink’,
’spring’, 'summer’, 'autumn’, 'winter’, 'cool’, 'Wistia’,
’hot’, 'afmhot’, 'gist_heat’, 'copper’]),
(’Diverging’, [
’PiYG’, 'PRGn’, 'BrBG’, 'PuOr’, 'RdGy’, 'RdBu’,
’RdYlBu’, 'RdYlGn’, 'Spectral’, 'coolwarm’, 'bwr’, 'seismic’]),
(’Qualitative’, [
’Pastel1′, 'Pastel2′, 'Paired’, 'Accent’,
’Dark2′, 'Set1′, 'Set2′, 'Set3′,
’tab10′, 'tab20′, 'tab20b’, 'tab20c’]),
(’Miscellaneous’, [
’flag’, 'prism’, 'ocean’, 'gist_earth’, 'terrain’, 'gist_stern’,
’gnuplot’, 'gnuplot2′, 'CMRmap’, 'cubehelix’, 'brg’, 'hsv’,
’gist_rainbow’, 'rainbow’, 'jet’, 'nipy_spectral’, 'gist_ncar’])]

In [26]:
# Prepare Data

n_categories = dfTDK.shape[0]
colors2 = [plt.cm.cubehelix(i/float(n_categories)) for i in range(n_categories)]

## paltes: https://matplotlib.org/examples/color/colormaps_reference.html
# Draw Plot and Decorate
fig = plt.figure(dpi=380, FigureClass=Waffle,
    plots={
        '111': {
            'values': dfTDK['counts'],
            'labels': ["{0} ({1})".format(n[0], n[1]) for n in dfTDK[['Pclass', 'counts']].itertuples()],
            'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.0, 1), 'fontsize': 28},
            'title': {'label': 'Structure of passengers population of Titanic', 'loc': 'center', 'fontsize':68,'alpha':0.5}
        },
    },
    rows=10,
    colors=colors2,
    figsize=(28, 7)
)
 

Embarked: (C = Cherbourg, Q = Queenstown, S = Southampton)

In [10]:
df2.Embarked = df2.Embarked.str.replace('C', 'Cherbourg')
df2.Embarked = df2.Embarked.str.replace('Q', 'Queenstown')
df2.Embarked = df2.Embarked.str.replace('S', 'Southampton')
df2.sample(4)
Out[10]:
  Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
368 368 369 1 3 Jermyn, Miss. Annie female NaN 0 0 14313 7.7500 NaN Queenstown
871 871 872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47.0 1 1 11751 52.5542 D35 Southampton
125 125 126 1 3 Nicola-Yarred, Master. Elias male 12.0 1 0 2651 11.2417 NaN Cherbourg
462 462 463 0 1 Gee, Mr. Arthur H male 47.0 0 0 111320 38.5000 E63 Southampton
In [11]:
dfPKP = df2.groupby('Embarked').size().reset_index(name='counts')
dfPKP
Out[11]:
  Embarked counts
0 Cherbourg 168
1 Queenstown 77
2 Southampton 644
In [17]:
# Prepare Data

n_categories = dfPKP.shape[0]
colors2 = [plt.cm.bwr(i/float(n_categories)) for i in range(n_categories)]

## paltes: https://matplotlib.org/examples/color/colormaps_reference.html
# Draw Plot and Decorate
fig = plt.figure(dpi=380, FigureClass=Waffle,
    plots={
        '111': {
            'values': dfPKP['counts'],
            'labels': ["{0} ({1})".format(n[0], n[1]) for n in dfPKP[['Embarked', 'counts']].itertuples()],
            'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.0, 1), 'fontsize': 28},
            'title': {'label': 'Place of embarking of passengers on the Titanic', 'loc': 'center', 'fontsize':58,'alpha':0.5}
        },
    },
    rows=10,
    colors=colors2,
    figsize=(28, 7)
)
In [41]:
df3 = pd.read_csv('c:/1/bank.csv')
df3.head(3)
Out[41]:
  Unnamed: 0 Unnamed: 0.1 age job marital education default housing loan contact campaign pdays previous poutcome emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed y
0 0 0 44 blue-collar married basic.4y unknown yes no cellular 1 999 0 nonexistent 1.4 93.444 -36.1 4.963 5228.1 0
1 1 1 53 technician married unknown no no no cellular 1 999 0 nonexistent -0.1 93.200 -42.0 4.021 5195.8 0
2 2 2 28 management single university.degree no yes no cellular 3 6 2 success -1.7 94.055 -39.8 0.729 4991.6 1

3 rows × 23 columns

In [48]:
df_STS = df3.pivot_table(index='job', values = 'Unnamed: 0',aggfunc='count').reset_index()
df_STS.sort_values('Unnamed: 0', ascending=False)
Out[48]:
  job Unnamed: 0
0 admin. 10422
1 blue-collar 9254
9 technician 6743
7 services 3969
4 management 2924
5 retired 1720
2 entrepreneur 1456
6 self-employed 1421
3 housemaid 1060
10 unemployed 1014
8 student 875
11 unknown 330
In [55]:
n_categories = df_STS.shape[0]
colors2 = [plt.cm.bwr(i/float(n_categories)) for i in range(n_categories)]

## paltes: https://matplotlib.org/examples/color/colormaps_reference.html
# Draw Plot and Decorate
fig = plt.figure(dpi=180, FigureClass=Waffle,
    plots={
        '111': {
            'values': df_STS['Unnamed: 0'],
            'labels': ["{0} ({1})".format(n[0], n[1]) for n in df_STS[['job', 'Unnamed: 0']].itertuples()],
            'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.0, 1), 'fontsize': 128},
            'title': {'label': 'Structure of bank customers by occupation', 'loc': 'center', 'fontsize':158,'alpha':0.5}
        },
    },
    rows=150,
    colors=colors2,
    figsize=(118, 70)
)
 

Structure of airports

Source of data: http://ourairports.com/data/airports.csv

In [57]:
df4= pd.read_csv('c:/1/airports.csv')
df4.head(4)
Out[57]:
  id ident type name latitude_deg longitude_deg elevation_ft continent iso_country iso_region municipality scheduled_service gps_code iata_code local_code home_link wikipedia_link keywords
0 6523 00A heliport Total Rf Heliport 40.070801 -74.933601 11.0 NaN US US-PA Bensalem no 00A NaN 00A NaN NaN NaN
1 323361 00AA small_airport Aero B Ranch Airport 38.704022 -101.473911 3435.0 NaN US US-KS Leoti no 00AA NaN 00AA NaN NaN NaN
2 6524 00AK small_airport Lowell Field 59.949200 -151.695999 450.0 NaN US US-AK Anchor Point no 00AK NaN 00AK NaN NaN NaN
3 6525 00AL small_airport Epps Airpark 34.864799 -86.770302 820.0 NaN US US-AL Harvest no 00AL NaN 00AL NaN NaN NaN
In [59]:
PL = df4[df4['iso_country']=='PL']
In [63]:
PPL = PL.pivot_table(index='type', values='id',aggfunc = 'count').reset_index()
PPL
Out[63]:
  type id
0 closed 25
1 heliport 3
2 large_airport 7
3 medium_airport 22
4 small_airport 218
In [76]:
n_categories = PPL.shape[0]
colors2 = [plt.cm.RdYlBu(i/float(n_categories)) for i in range(n_categories)]

## paltes: https://matplotlib.org/examples/color/colormaps_reference.html
# Draw Plot and Decorate
fig = plt.figure(dpi=380, FigureClass=Waffle,
    plots={
        '111': {
            'values': PPL['id'],
            'labels': ["{0} ({1})".format(n[0], n[1]) for n in PPL[['type', 'id']].itertuples()],
            'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.0, 1), 'fontsize': 28},
            'title': {'label': 'Structure of airports in Poland', 'loc': 'center', 'fontsize':48,'alpha':0.8}
        },
    },
    rows=10,
    colors=colors2,
    figsize=(28, 7)
)

Artykuł Perfect Plots: Waffle plot pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
Perfect Plots: Slope Chart https://sigmaquality.pl/data-plots/perfect-plot-slope-chart-part1/ Tue, 22 Oct 2019 19:40:00 +0000 http://sigmaquality.pl/perfect-plot-slope-chart-part1/ Feel free to read the code on GitHub   In [1]: import pandas as pd import matplotlib.pyplot as plt import matplotlib.lines as mlines import numpy [...]

Artykuł Perfect Plots: Slope Chart pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
Feel free to read the code on GitHub

 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import numpy as np

# Import Data
df = pd.read_csv('c:/2/gdppercap.txt')
df
Out[1]:
  continent 1952 1957
0 Africa 1252.572466 1385.236062
1 Americas 4079.062552 4616.043733
2 Asia 5195.484004 4003.132940
3 Europe 5661.057435 6963.012816
4 Oceania 10298.085650 11598.522455
 

To make vector of data: series 1, series 2 and labels

In [2]:
lebel = df.continent.to_list()
lebel
Out[2]:
['Africa', 'Americas', 'Asia', 'Europe', 'Oceania']
In [3]:
series1=np.round(df['1952'].to_list(), decimals=1)
series1
Out[3]:
array([ 1252.6,  4079.1,  5195.5,  5661.1, 10298.1])
In [4]:
series2=np.round(df['1957'].to_list(), decimals=1)
series2
Out[4]:
array([ 1385.2,  4616. ,  4003.1,  6963. , 11598.5])
 

Needed parameters

In [5]:
lebel = df.continent.to_list()
series1=np.round(df['1952'].to_list(), decimals=1)
series2=np.round(df['1957'].to_list(), decimals=1)
title = 'Slopechart: Comparing GDP Per Capita between 1952 vs 1957'
ylabel ='Mean GDP Per Capita'
xlabel =["1952", "1957"]
 

Definition of the trigger

In [6]:
def SlopeChart(series1, series2, title, xlabel, ylabel, lebel):
    
# Labels formatting
    left_label = [str(c) + ', '+ str(round(y)) for c, y in zip(lebel, series1)]
    right_label = [str(c) + ', '+ str(round(y)) for c, y in zip(lebel, series2)]
    klass = ['red' if (y1-y2) < 0 else 'green' for y1, y2 in zip(series1, series2)]

# lines color
    def newline(p1, p2, color='black'):
        ax = plt.gca()
        l = mlines.Line2D([p1[0],p2[0]], [p1[1],p2[1]], color='red' if p1[1]-p2[1] > 0 else 'green', marker='o', markersize=6)
        ax.add_line(l)
        return l

    fig, ax = plt.subplots(1,1,figsize=(14,14), dpi= 380)

# Vertical Lines
    ax.vlines(x=1, ymin=500, ymax=13000, color='black', alpha=0.7, linewidth=1, linestyles='dotted')
    ax.vlines(x=3, ymin=500, ymax=13000, color='black', alpha=0.7, linewidth=1, linestyles='dotted')

# Points
    ax.scatter(y=series1, x=np.repeat(1, df.shape[0]), s=10, color='black', alpha=0.7)
    ax.scatter(y=series2, x=np.repeat(3, df.shape[0]), s=10, color='black', alpha=0.7)

# Line Segmentsand Annotation
    for p1, p2, c in zip(series1, series2, df['continent']):
        newline([1,p1], [3,p2])
        ax.text(1-0.05, p1, c + ', ' + str(round(p1)), horizontalalignment='right', verticalalignment='center', fontdict={'size':14})
        ax.text(3+0.05, p2, c + ', ' + str(round(p2)), horizontalalignment='left', verticalalignment='center', fontdict={'size':14})

# 'Before' and 'After' Annotations
    ax.text(1-0.05, 13000, 'BEFORE', horizontalalignment='right', verticalalignment='center', fontdict={'size':18, 'weight':700})
    ax.text(3+0.05, 13000, 'AFTER', horizontalalignment='left', verticalalignment='center', fontdict={'size':18, 'weight':700})

# Decoration
    ax.set_title(title, fontdict={'size':22})
    ax.set(xlim=(0,4), ylim=(0,14000), ylabel=ylabel)
    ax.set_xticks([1,3])
    ax.set_xticklabels(xlabel)
    plt.yticks(np.arange(500, 13000, 2000), fontsize=12)

# Lighten borders
    plt.gca().spines["top"].set_alpha(.0)
    plt.gca().spines["bottom"].set_alpha(.0)
    plt.gca().spines["right"].set_alpha(.0)
    plt.gca().spines["left"].set_alpha(.0)
    plt.show()
 

Slope Chart realization

In [7]:
SlopeChart(series1, series2, title, xlabel, ylabel, lebel)
In [8]:
df2 = pd.read_csv('c:/1/WorldHappinessReport.csv')
df2.head(5)
Out[8]:
  Unnamed: 0 Country Region Happiness Rank Happiness Score Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Year
0 0 Afghanistan Southern Asia 153.0 3.575 0.31982 0.30285 0.30335 0.23414 0.09719 0.36510 1.95210 2015.0
1 1 Albania Central and Eastern Europe 95.0 4.959 0.87867 0.80434 0.81325 0.35733 0.06413 0.14272 1.89894 2015.0
2 2 Algeria Middle East and Northern Africa 68.0 5.605 0.93929 1.07772 0.61766 0.28579 0.17383 0.07822 2.43209 2015.0
3 3 Angola Sub-Saharan Africa 137.0 4.033 0.75778 0.86040 0.16683 0.10384 0.07122 0.12344 1.94939 2015.0
4 4 Argentina Latin America and Caribbean 30.0 6.574 1.05351 1.24823 0.78723 0.44974 0.08484 0.11451 2.83600 2015.0
In [9]:
kot = df2.pivot_table(index='Region',columns=['Year'], values='Happiness Rank', aggfunc='mean')
kot
Out[9]:
Year 2015.0 2016.0 2017.0
Region      
Australia and New Zealand 9.500000 8.500000 9.000000
Central and Eastern Europe 79.000000 78.448276 75.068966
Eastern Asia 64.500000 67.166667 63.600000
Latin America and Caribbean 46.909091 48.333333 50.772727
Middle East and Northern Africa 77.600000 78.105263 76.421053
North America 10.000000 9.500000 10.500000
Southeastern Asia 81.222222 80.000000 73.750000
Southern Asia 113.142857 111.714286 109.857143
Sub-Saharan Africa 127.900000 129.657895 127.871795
Western Europe 29.523810 29.190476 27.142857
In [10]:
PKP = kot.reset_index()
labelPKP = PKP['Region'].to_list()
labelPKP
Out[10]:
['Australia and New Zealand',
 'Central and Eastern Europe',
 'Eastern Asia',
 'Latin America and Caribbean',
 'Middle East and Northern Africa',
 'North America',
 'Southeastern Asia',
 'Southern Asia',
 'Sub-Saharan Africa',
 'Western Europe']
In [11]:
PKP.columns
Out[11]:
Index(['Region', 2015.0, 2016.0, 2017.0], dtype='object', name='Year')
In [12]:
PKP.columns = ['Region', '2015', '2016', '2017']
In [13]:
series1 = np.round(PKP['2015'].to_list(), decimals=1)
series1
Out[13]:
array([  9.5,  79. ,  64.5,  46.9,  77.6,  10. ,  81.2, 113.1, 127.9,
        29.5])
In [14]:
series2 = np.round(PKP['2017'].to_list(), decimals=1)
series2
Out[14]:
array([  9. ,  75.1,  63.6,  50.8,  76.4,  10.5,  73.8, 109.9, 127.9,
        27.1])
In [15]:
def SlopeChart(series1, series2, title, xlabel, ylabel, lebelPKP):
    
# Labels formatting
    left_label = [str(c) + ', '+ str(round(y)) for c, y in zip(labelPKP, series1)]
    right_label = [str(c) + ', '+ str(round(y)) for c, y in zip(labelPKP, series2)]
    klass = ['red' if (y1-y2) < 0 else 'green' for y1, y2 in zip(series1, series2)]

# lines color
    def newline(p1, p2, color='black'):
        ax = plt.gca()
        l = mlines.Line2D([p1[0],p2[0]], [p1[1],p2[1]], color='red' if p1[1]-p2[1] > 0 else 'green', marker='o', markersize=6)
        ax.add_line(l)
        return l

    fig, ax = plt.subplots(1,1,figsize=(14,14), dpi= 380)

# Vertical Lines
    ax.vlines(x=1, ymin=0, ymax=120, color='black', alpha=0.7, linewidth=1, linestyles='dotted')
    ax.vlines(x=3, ymin=0, ymax=120, color='black', alpha=0.7, linewidth=1, linestyles='dotted')

# Points
    ax.scatter(y=series1, x=np.repeat(1, PKP.shape[0]), s=10, color='black', alpha=0.7)
    ax.scatter(y=series2, x=np.repeat(3, PKP.shape[0]), s=10, color='black', alpha=0.7)

# Line Segmentsand Annotation
    for p1, p2, c in zip(series1, series2, labelPKP):
        newline([1,p1], [3,p2])
        ax.text(1-0.05, p1, c + ', ' + str(round(p1)), horizontalalignment='right', verticalalignment='center', fontdict={'size':14})
        ax.text(3+0.05, p2, c + ', ' + str(round(p2)), horizontalalignment='left', verticalalignment='center', fontdict={'size':14})

# 'Before' and 'After' Annotations
    ax.text(1-0.05, 140, 'BEFORE', horizontalalignment='right', verticalalignment='center', fontdict={'size':18, 'weight':700})
    ax.text(3+0.05, 140, 'AFTER', horizontalalignment='left', verticalalignment='center', fontdict={'size':18, 'weight':700})

# Decoration
    ax.set_title(title, fontdict={'size':22})
    ax.set(xlim=(0,4), ylim=(0,150), ylabel=ylabel) ## Skala osi Y
    ax.set_xticks([1,3])
    ax.set_xticklabels(xlabel)
    plt.yticks(np.arange(20, 150, 40), fontsize=12)  ## Podziałaka osi Y

# Lighten borders
    plt.gca().spines["top"].set_alpha(.0)
    plt.gca().spines["bottom"].set_alpha(.0)
    plt.gca().spines["right"].set_alpha(.0)
    plt.gca().spines["left"].set_alpha(.0)
    plt.show()
In [16]:
labelPKP
series1
series2
title = 'Changing the the level of happiness: 2015 vs. 2017'
ylabel ='Less points - more happiness'
xlabel =["2015", "2017"]
In [17]:
SlopeChart(series1, series2, title, xlabel, ylabel, labelPKP)

Source IPYNB file

Artykuł Perfect Plots: Slope Chart pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
Perfect Plot: Treemap https://sigmaquality.pl/data-plots/perfect-plot_-treemap/ Tue, 22 Oct 2019 19:11:00 +0000 http://sigmaquality.pl/perfect-plot_-treemap/ Feel free to read the code on GitHub An old Chinese proverb says: one picture says more than one thousands words.   One good plot [...]

Artykuł Perfect Plot: Treemap pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
Feel free to read the code on GitHub

An old Chinese proverb says: one picture says more than one thousands words.
 
One good plot can rescue entire presentation. One poor picture can drown down all good speech. After plenty of shame appointment and boring presentations I decided to improve my tools of visualisation.
In [1]:
import squarify 
import pandas as pd
import matplotlib.pyplot as plt

df1 = pd.read_csv('c:/11/freeFormResponses.csv', skiprows = 1)
In [2]:
headers = ['Duration (in seconds)', 'Gender', 'Gender2','Age','Country','Education', 'Major_undergraduate','Recent_role', 'Recent_role2', 'Industry','Industry2' ,'Years_of_experience', 'compensation$USD'] 
df = pd.read_csv('c:/11/multipleChoiceResponses.csv', usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12], header=None, names=headers, skiprows=2)
df.head(4)
Out[2]:
  Duration (in seconds) Gender Gender2 Age Country Education Major_undergraduate Recent_role Recent_role2 Industry Industry2 Years_of_experience compensation$USD
0 710 Female -1 45-49 United States of America Doctoral degree Other Consultant -1 Other 0 NaN NaN
1 434 Male -1 30-34 Indonesia Bachelor’s degree Engineering (non-computer focused) Other 0 Manufacturing/Fabrication -1 5-10 10-20,000
2 718 Female -1 30-34 United States of America Master’s degree Computer science (software engineering, etc.) Data Scientist -1 I am a student -1 0-1 0-10,000
3 621 Male -1 35-39 United States of America Master’s degree Social sciences (anthropology, psychology, soc… Not employed -1 NaN -1 NaN NaN
In [3]:
df.drop(['Gender2','Recent_role2','Industry2'], axis=1, inplace=True)
 

Correcting data

Every time when we want to do plot we will need to check and improve data. Especially check of unique occurrences and elimination of minority of rubbish and NaN cells (lack of data).

In [4]:
df.isnull().sum()
Out[4]:
Duration (in seconds)       0
Gender                      0
Age                         0
Country                     0
Education                 421
Major_undergraduate       912
Recent_role               959
Industry                 2174
Years_of_experience      2758
compensation$USD         3674
dtype: int64
In [5]:
df.dtypes
Out[5]:
Duration (in seconds)     int64
Gender                   object
Age                      object
Country                  object
Education                object
Major_undergraduate      object
Recent_role              object
Industry                 object
Years_of_experience      object
compensation$USD         object
dtype: object
 

Very important is reduction of the class or join some similar groups if it is not bad for the project.

In [6]:
df['Gender']=df['Gender'].replace('Prefer to self-describe', 'Prefer not to say')
In [7]:
df.Education.value_counts(dropna = False)
Out[7]:
Master’s degree                                                      10855
Bachelor’s degree                                                     7083
Doctoral degree                                                       3357
Some college/university study without earning a bachelor’s degree      967
Professional degree                                                    599
NaN                                                                    421
I prefer not to answer                                                 345
No formal education past high school                                   232
Name: Education, dtype: int64
 

We can get assumption if somebody didn’t answer he didn’t want to give information: 'I prefer not to answer’.

In [8]:
import numpy as np

df['Education']=df['Education'].replace(np.NaN, 'I prefer not to answer')
In [9]:
df.Education.value_counts(dropna = False)
Out[9]:
Master’s degree                                                      10855
Bachelor’s degree                                                     7083
Doctoral degree                                                       3357
Some college/university study without earning a bachelor’s degree      967
I prefer not to answer                                                 766
Professional degree                                                    599
No formal education past high school                                   232
Name: Education, dtype: int64
In [10]:
df.Education.isnull().sum()
Out[10]:
0
In [11]:
df.Major_undergraduate.value_counts(dropna = False)
Out[11]:
Computer science (software engineering, etc.)                    9430
Engineering (non-computer focused)                               3705
Mathematics or statistics                                        2950
A business discipline (accounting, economics, finance, etc.)     1791
Physics or astronomy                                             1110
Information technology, networking, or system administration     1029
NaN                                                               912
Medical or life sciences (biology, chemistry, medicine, etc.)     871
Other                                                             770
Social sciences (anthropology, psychology, sociology, etc.)       554
Humanities (history, literature, philosophy, etc.)                269
Environmental science or geology                                  253
I never declared a major                                          128
Fine arts or performing arts                                       87
Name: Major_undergraduate, dtype: int64
 

Rozumiem, że NaN i 'Other’ jest wtedy, gdy ktoś nie chce zadeklarować swojej specjalizacji:’I never declared a major’

In [12]:
df['Major_undergraduate']=df['Major_undergraduate'].replace(np.NaN, 'I never declared a major')
df['Major_undergraduate']=df['Major_undergraduate'].replace('Other', 'I never declared a major')
In [13]:
df.Major_undergraduate.value_counts(dropna = False, normalize=True).plot(kind='barh')
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x287b9519550>
In [14]:
df.Recent_role.value_counts(dropna=False)
Out[14]:
Student                    5253
Data Scientist             4137
Software Engineer          3130
Data Analyst               1922
Other                      1322
Research Scientist         1189
NaN                         959
Not employed                842
Consultant                  785
Business Analyst            772
Data Engineer               737
Research Assistant          600
Manager                     590
Product/Project Manager     428
Chief Officer               360
Statistician                237
DBA/Database Engineer       145
Developer Advocate          117
Marketing Analyst           115
Salesperson                 102
Principal Investigator       97
Data Journalist              20
Name: Recent_role, dtype: int64
In [15]:
df['Recent_role']=df['Recent_role'].replace(np.NaN, 'Other')
 

Poland in data

Because I am from Poland, most interesting data for me is information from my country. I separate data about Poland from original data.

In [16]:
PL= df[df.Country=='Poland']
In [17]:
Z5 = PL.pivot_table(index=['Major_undergraduate'], values='Age',aggfunc='count').sort_values('Age', ascending=False)
Z5.head(10)
Out[17]:
  Age
Major_undergraduate  
Computer science (software engineering, etc.) 112
Mathematics or statistics 52
A business discipline (accounting, economics, finance, etc.) 34
Physics or astronomy 24
Engineering (non-computer focused) 22
I never declared a major 17
Information technology, networking, or system administration 14
Social sciences (anthropology, psychology, sociology, etc.) 12
Medical or life sciences (biology, chemistry, medicine, etc.) 7
Humanities (history, literature, philosophy, etc.) 4
 

The Treemap

I came across this publication and decided to do Treemap by this way.
https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/

To prepare perfect pie plot first I will need to pull vectors of data from the pivot table.

In [18]:
PPL=Z5.reset_index()
PPL.head(5)
Out[18]:
  Major_undergraduate Age
0 Computer science (software engineering, etc.) 112
1 Mathematics or statistics 52
2 A business discipline (accounting, economics, … 34
3 Physics or astronomy 24
4 Engineering (non-computer focused) 22
 

Cut out too long descriptions

In [19]:
PPL['Major_undergraduate']= PPL['Major_undergraduate'].str.split('(').apply(lambda x: x[0])
PPL['Major_undergraduate']
Out[19]:
0                                     Computer science 
1                             Mathematics or statistics
2                                A business discipline 
3                                  Physics or astronomy
4                                          Engineering 
5                              I never declared a major
6     Information technology, networking, or system ...
7                                      Social sciences 
8                             Medical or life sciences 
9                                           Humanities 
10                     Environmental science or geology
Name: Major_undergraduate, dtype: object
 

Adds numbers of occurrences to the descriptions

In [20]:
label = PPL['Major_undergraduate'].to_list()
label = PPL.apply(lambda x: str(x[0]) + "n (" + str(x[1]) + ")", axis=1)
label
Out[20]:
0                             Computer science n (112)
1                      Mathematics or statisticsn (52)
2                         A business discipline n (34)
3                           Physics or astronomyn (24)
4                                   Engineering n (22)
5                       I never declared a majorn (17)
6     Information technology, networking, or system ...
7                               Social sciences n (12)
8                       Medical or life sciences n (7)
9                                     Humanities n (4)
10               Environmental science or geologyn (3)
dtype: object
 

To pull vectors of data from the pivot table

In [21]:
PPL.reset_index()

label
sizes = PPL['Age'].to_list()

colors = ['#ff0000','#434343','#666666','#999999','#b7b7b7','#cccccc','#d9d9d9','#efefef','#ffffff','#f3f3f3']
In [22]:
import squarify
import matplotlib.pyplot as plt
In [23]:
# Plot
plt.figure(figsize=(12,8), dpi= 380)
squarify.plot(sizes=sizes, label=label, color=colors, alpha=0.9)

plt.title('Data Scientist society in Poland (2018)',  fontdict={'fontsize': 30, 'fontweight': 'medium', 'color':'#d0e0e3','alpha':0.8, 'y':1.02})
plt.axis('off') # brak numerów na osiach
plt.show()
 

Trigger to create Treemap

Components to create perfect pie plot: labels, sizes, colors, title

To prepare perfect treemap first I will need to pull vectors of data from the pivot table.

 

To pull vectors of data from the pivot table

In [24]:
PPL.reset_index()

label = label = PPL['Major_undergraduate'].to_list()
label = PPL.apply(lambda x: str(x[0]) + "n (" + str(x[1]) + ")", axis=1)
sizes = PPL['Age'].to_list()
title = 'Data Scientist society in Poland (2018)'

# https://yagisanatode.com/2019/08/06/google-apps-script-hexadecimal-color-codes-for-google-docs-sheets-and-slides-standart-palette/
#colors = ['#274e13','#6aa84f','#93c47d', '#b6d7a8','#d9ead3','#b7b7b7','#38761d'] #green
#colors = ['#0c343d','#134f5c','#45818e','#76a5af','#a2c4c9','#d0e0e3'] #cyan
#colors = ['#7f6000','#bf9000','#f1c232','#ffd966','#ffe599','#fff2cc'] #yelow
#colors = ['#4c1130','#a64d79','#c27ba0','#d5a6bd','#ead1dc','#741b47',] #magenta
#colors = ['#e6b8af','#b6d7a8','#e06666','#747574','#ffd966','#ffcc99','#ea9999']
#colors = ['#93c47d','#b6d7a8','#d9ead3','#d0e0e3','#a2c4c9','#76a5af']
colors = ['#c27ba0','#d5a6bd','#ead1dc','#ffffff','#a64d79','#d9d2e9','#b4a7d6'] #purple
#colors = ['#cfe2f3','#9fc5e8','#6fa8dc'] #blue
#colors = ['#d9ead3','#b6d7a8','#93c47d','#6aa84f']

#colors = ['#ff0000','#434343','#666666','#999999','#b7b7b7','#cccccc','#d9d9d9','#efefef','#ffffff','#f3f3f3'] #=> niemieckie czasopismo
In [25]:
import squarify
import matplotlib.pyplot as plt

def Tmap(sizes, labels, colors, title):
    plt.figure(figsize=(12,8), dpi= 380)
    squarify.plot(sizes=sizes, label=label, color=colors, alpha=0.9)

    plt.title(title,  fontdict={'fontsize': 30, 'fontweight': 'medium', 'color':'#d0e0e3','alpha':0.9, 'y':1.02})
    plt.axis('off') # brak numerów na osiach
    plt.show()
In [26]:
Tmap(sizes, label, colors, title)

Artykuł Perfect Plot: Treemap pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
Perfect Plots: Categorical Plot https://sigmaquality.pl/data-plots/perfect-plot-categorical-plot/ Tue, 22 Oct 2019 18:20:00 +0000 http://sigmaquality.pl/perfect-plot-categorical-plot/ Feel free to read the code on GitHub Analysis of the categorical results. In [1]: import pandas as pd import matplotlib.pyplot as plt import seaborn as [...]

Artykuł Perfect Plots: Categorical Plot pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>

Analysis of the categorical results.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Titanic disaster

Analysis of the categorical results.
We ought to find which passengers have chance to survive according to their affiliation to the established groups.

Source of data: https://www.kaggle.com/shivamp629/traincsv

In [2]:
df = pd.read_csv('c:/1/kaggletrain.csv')
df.head()
Out[2]:
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th… female 38.0 1 0 PC 17599 71.2833 C85 C
2 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [3]:
Woj = ['#b6d7a8','#6aa84f']

# Plot
g = sns.catplot("Survived", col="Pclass", col_wrap=4,
                data=df[df.Pclass.notnull()],
                kind="count", height=3.5, aspect=.8, 
                palette=Woj)

plt.show()

Banking marketing

Analysis of the categorical results.
Source of data: https://archive.ics.uci.edu/ml/machine-learning-databases/00222/

In [4]:
df2 = pd.read_csv('c:/1/bank.csv')
df2.head(3)
Out[4]:
Unnamed: 0 Unnamed: 0.1 age job marital education default housing loan contact campaign pdays previous poutcome emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed y
0 0 0 44 blue-collar married basic.4y unknown yes no cellular 1 999 0 nonexistent 1.4 93.444 -36.1 4.963 5228.1 0
1 1 1 53 technician married unknown no no no cellular 1 999 0 nonexistent -0.1 93.200 -42.0 4.021 5195.8 0
2 2 2 28 management single university.degree no yes no cellular 3 6 2 success -1.7 94.055 -39.8 0.729 4991.6 1

3 rows × 23 columns

In [5]:
Kot = ['grey', 'red']
plt.figure(dpi= 380)
# Plot
g = sns.catplot("y", col="marital", col_wrap=4,
                data=df2[df2.marital.notnull()],
                kind="count", height=3.5, aspect=.8, 
                palette=Kot,  alpha=0.5, legend=True)

plt.rc("font", size=15)

plt.show()
<Figure size 2280x1520 with 0 Axes>

Clinical tests

Source of data: https://www.kaggle.com/saurabh00007/diabetescsv

In [6]:
df3 = pd.read_csv('c:/1/diabetes.csv')
df3.head(3)
Out[6]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
In [10]:
kot = ['young patient', 'medium patient', 'senior patient']
df3['Age group'] = pd.qcut(df['Age'],3, labels=kot)
In [11]:
Kot = ['#ff9900', '#783f04']
plt.figure(dpi= 380)
# Plot
g = sns.catplot("Outcome", col='Age group', col_wrap=4,
                data=df3[df2.marital.notnull()],
                kind="count", height=5.5, aspect=.7, 
                palette=Kot,  alpha=0.4)

plt.rc("font", size=14)

plt.show()
C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:5: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """
<Figure size 2280x1520 with 0 Axes>

Artykuł Perfect Plots: Categorical Plot pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
Perfect Plots: Pie Plot https://sigmaquality.pl/data-plots/perfect-plots_-pie-plot/ Thu, 17 Oct 2019 19:22:00 +0000 http://sigmaquality.pl/perfect-plots_-pie-plot/ Feel free to read the code on GitHub   An old Chinese proverb says: one picture says more than one thousands words. One good plot [...]

Artykuł Perfect Plots: Pie Plot pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
Feel free to read the code on GitHub

 

An old Chinese proverb says: one picture says more than one thousands words. One good plot can rescue entire presentation. One poor picture can drown down all good speech. After plenty of shame appointment and boring presentations I decided to improve my tools of visualisation.

In [1]:
import pandas as pd

df1 = pd.read_csv('c:/11/freeFormResponses.csv', skiprows = 1)
In [2]:
headers = ['Duration (in seconds)', 'Gender', 'Gender2','Age','Country','Education', 'Major_undergraduate','Recent_role', 'Recent_role2', 'Industry','Industry2' ,'Years_of_experience', 'compensation$USD'] 
df = pd.read_csv('c:/11/multipleChoiceResponses.csv', usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12], header=None, names=headers, skiprows=2)
df.head(4)
Out[2]:
  Duration (in seconds) Gender Gender2 Age Country Education Major_undergraduate Recent_role Recent_role2 Industry Industry2 Years_of_experience compensation$USD
0 710 Female -1 45-49 United States of America Doctoral degree Other Consultant -1 Other 0 NaN NaN
1 434 Male -1 30-34 Indonesia Bachelor’s degree Engineering (non-computer focused) Other 0 Manufacturing/Fabrication -1 5-10 10-20,000
2 718 Female -1 30-34 United States of America Master’s degree Computer science (software engineering, etc.) Data Scientist -1 I am a student -1 0-1 0-10,000
3 621 Male -1 35-39 United States of America Master’s degree Social sciences (anthropology, psychology, soc… Not employed -1 NaN -1 NaN NaN
In [3]:
df.drop(['Gender2','Recent_role2','Industry2'], axis=1, inplace=True)
 

Correcting data

Every time when we want to do plot we will need to check and improve data. Especially check of unique occurrences and elimination of minority of rubbish and NaN cells (lack of data).

In [4]:
df.isnull().sum()
Out[4]:
Duration (in seconds)       0
Gender                      0
Age                         0
Country                     0
Education                 421
Major_undergraduate       912
Recent_role               959
Industry                 2174
Years_of_experience      2758
compensation$USD         3674
dtype: int64
In [5]:
df.dtypes
Out[5]:
Duration (in seconds)     int64
Gender                   object
Age                      object
Country                  object
Education                object
Major_undergraduate      object
Recent_role              object
Industry                 object
Years_of_experience      object
compensation$USD         object
dtype: object
 

Very important is reduction of the class or join some similar groups if it is not bad for the project.

In [6]:
df['Gender']=df['Gender'].replace('Prefer to self-describe', 'Prefer not to say')
In [7]:
df.Education.value_counts(dropna = False)
Out[7]:
Master’s degree                                                      10855
Bachelor’s degree                                                     7083
Doctoral degree                                                       3357
Some college/university study without earning a bachelor’s degree      967
Professional degree                                                    599
NaN                                                                    421
I prefer not to answer                                                 345
No formal education past high school                                   232
Name: Education, dtype: int64
 

We can get assumption if somebody didn’t answer he didn’t want to give information: 'I prefer not to answer’.

In [8]:
import numpy as np

df['Education']=df['Education'].replace(np.NaN, 'I prefer not to answer')
In [9]:
df.Education.value_counts(dropna = False)
Out[9]:
Master’s degree                                                      10855
Bachelor’s degree                                                     7083
Doctoral degree                                                       3357
Some college/university study without earning a bachelor’s degree      967
I prefer not to answer                                                 766
Professional degree                                                    599
No formal education past high school                                   232
Name: Education, dtype: int64
In [10]:
df.Education.isnull().sum()
Out[10]:
0
In [11]:
df.Major_undergraduate.value_counts(dropna = False)
Out[11]:
Computer science (software engineering, etc.)                    9430
Engineering (non-computer focused)                               3705
Mathematics or statistics                                        2950
A business discipline (accounting, economics, finance, etc.)     1791
Physics or astronomy                                             1110
Information technology, networking, or system administration     1029
NaN                                                               912
Medical or life sciences (biology, chemistry, medicine, etc.)     871
Other                                                             770
Social sciences (anthropology, psychology, sociology, etc.)       554
Humanities (history, literature, philosophy, etc.)                269
Environmental science or geology                                  253
I never declared a major                                          128
Fine arts or performing arts                                       87
Name: Major_undergraduate, dtype: int64
 

Rozumiem, że NaN i 'Other’ jest wtedy, gdy ktoś nie chce zadeklarować swojej specjalizacji:’I never declared a major’

In [12]:
df['Major_undergraduate']=df['Major_undergraduate'].replace(np.NaN, 'I never declared a major')
df['Major_undergraduate']=df['Major_undergraduate'].replace('Other', 'I never declared a major')
In [13]:
import matplotlib as plt
df.Major_undergraduate.value_counts(dropna = False, normalize=True).plot(kind='barh')
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x260cea62cc0>
In [14]:
df.Recent_role.value_counts(dropna=False)
Out[14]:
Student                    5253
Data Scientist             4137
Software Engineer          3130
Data Analyst               1922
Other                      1322
Research Scientist         1189
NaN                         959
Not employed                842
Consultant                  785
Business Analyst            772
Data Engineer               737
Research Assistant          600
Manager                     590
Product/Project Manager     428
Chief Officer               360
Statistician                237
DBA/Database Engineer       145
Developer Advocate          117
Marketing Analyst           115
Salesperson                 102
Principal Investigator       97
Data Journalist              20
Name: Recent_role, dtype: int64
In [15]:
df['Recent_role']=df['Recent_role'].replace(np.NaN, 'Other')
In [16]:
Z1 = df.pivot_table(index=['Major_undergraduate'], columns = 'Gender', values='Age',aggfunc='count').sort_values('Male',ascending=False)
Z1
Out[16]:
Gender Female Male Prefer not to say
Major_undergraduate      
Computer science (software engineering, etc.) 1463 7837 130
Engineering (non-computer focused) 432 3223 50
Mathematics or statistics 660 2241 49
I never declared a major 297 1438 75
A business discipline (accounting, economics, finance, etc.) 334 1435 22
Physics or astronomy 119 968 23
Information technology, networking, or system administration 186 832 11
Medical or life sciences (biology, chemistry, medicine, etc.) 203 646 22
Social sciences (anthropology, psychology, sociology, etc.) 160 379 15
Environmental science or geology 57 190 6
Humanities (history, literature, philosophy, etc.) 74 185 10
Fine arts or performing arts 25 56 6
In [17]:
Z1.plot(kind='barh', legend=True, title='Data Scientists by Major undergraduate and Gender (Kaggle 2018)', figsize=(7, 4), color=('b','g','y'))
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x260cef146d8>
In [18]:
Z2 = df.pivot_table(index=['Country'], columns = 'Gender', values='Age',aggfunc='count', margins=True, margins_name='SUM').sort_values('Male',ascending=False).nlargest(20,'Male')
Z2
Out[18]:
Gender Female Male Prefer not to say SUM
Country        
SUM 4010.0 19430.0 419.0 23859
India 657.0 3719.0 41.0 4417
United States of America 1082.0 3530.0 104.0 4716
China 267.0 1337.0 40.0 1644
Other 165.0 849.0 22.0 1036
Russia 113.0 750.0 16.0 879
Brazil 65.0 666.0 5.0 736
Germany 103.0 621.0 10.0 734
Japan 34.0 557.0 6.0 597
United Kingdom of Great Britain and Northern Ireland 131.0 554.0 17.0 702
France 104.0 494.0 6.0 604
Canada 123.0 475.0 6.0 604
Spain 75.0 406.0 4.0 485
Italy 47.0 303.0 5.0 355
Australia 51.0 272.0 7.0 330
Turkey 56.0 267.0 4.0 327
I do not wish to disclose my location 83.0 250.0 61.0 394
Poland 54.0 243.0 4.0 301
Netherlands 41.0 225.0 4.0 270
Ukraine 31.0 218.0 3.0 252
 

Poland in data

Because I am from Poland, most interesting data for me is information from my country. I separate data about Poland from original data.

In [19]:
PL= df[df.Country=='Poland']
In [20]:
Z3 = PL.pivot_table(index=['Major_undergraduate'], columns = 'Gender', values='Age',aggfunc='count', margins=True, margins_name='SUM').sort_values('Male',ascending=False)
Z3
Out[20]:
Gender Female Male Prefer not to say SUM
Major_undergraduate        
SUM 54.0 243.0 4.0 301
Computer science (software engineering, etc.) 15.0 96.0 1.0 112
Mathematics or statistics 12.0 39.0 1.0 52
A business discipline (accounting, economics, finance, etc.) 9.0 24.0 1.0 34
Physics or astronomy 4.0 20.0 NaN 24
Engineering (non-computer focused) 3.0 18.0 1.0 22
I never declared a major 2.0 15.0 NaN 17
Information technology, networking, or system administration 3.0 11.0 NaN 14
Medical or life sciences (biology, chemistry, medicine, etc.) NaN 7.0 NaN 7
Social sciences (anthropology, psychology, sociology, etc.) 5.0 7.0 NaN 12
Humanities (history, literature, philosophy, etc.) NaN 4.0 NaN 4
Environmental science or geology 1.0 2.0 NaN 3
In [21]:
Z3 = PL.pivot_table(index=['Recent_role'], columns = 'Gender', values='Age',aggfunc='count', margins=True, margins_name='SUM').sort_values('Male',ascending=False)
Z3
Out[21]:
Gender Female Male Prefer not to say SUM
Recent_role        
SUM 54.0 243.0 4.0 301
Data Scientist 15.0 58.0 NaN 73
Software Engineer 4.0 46.0 NaN 50
Student 5.0 28.0 NaN 33
Other 5.0 21.0 NaN 26
Data Analyst 9.0 19.0 1.0 29
Research Scientist 2.0 15.0 NaN 17
Consultant 1.0 10.0 NaN 11
Business Analyst 2.0 9.0 1.0 12
Manager 1.0 6.0 NaN 7
Research Assistant 2.0 6.0 NaN 8
Data Engineer 2.0 5.0 1.0 8
Not employed 3.0 5.0 1.0 9
Chief Officer 1.0 4.0 NaN 5
Product/Project Manager 1.0 3.0 NaN 4
DBA/Database Engineer NaN 3.0 NaN 3
Statistician NaN 2.0 NaN 2
Data Journalist NaN 1.0 NaN 1
Principal Investigator NaN 1.0 NaN 1
Salesperson 1.0 1.0 NaN 2
 

Let’s do standard, quick Pie Plot

We can see banal, predictable visualization.

In [22]:
Z4 = PL.pivot_table(index=['Recent_role'], values='Age',aggfunc='count').sort_values('Age', ascending=False)

Z4.plot(kind='pie', subplots=True, legend=False, title="Data Scientists by Recent_role (Kaggle 2018)",figsize=(15,7), autopct='
Out[22]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x00000260CF010F60>],
      dtype=object)
In [23]:
Z5 = PL.pivot_table(index=['Major_undergraduate'], values='Age',aggfunc='count').sort_values('Age', ascending=False)
Z5.head(10)
Out[23]:
  Age
Major_undergraduate  
Computer science (software engineering, etc.) 112
Mathematics or statistics 52
A business discipline (accounting, economics, finance, etc.) 34
Physics or astronomy 24
Engineering (non-computer focused) 22
I never declared a major 17
Information technology, networking, or system administration 14
Social sciences (anthropology, psychology, sociology, etc.) 12
Medical or life sciences (biology, chemistry, medicine, etc.) 7
Humanities (history, literature, philosophy, etc.) 4
 

Better Pie Plot with interesting colors

At the beginning we can change colors and give better descriptions.

GSuite Text and Background Palette: https://yagisanatode.com/2019/08/06/google-apps-script-hexadecimal-color-codes-for-google-docs-sheets-and-slides-standart-palette/

In [24]:
import matplotlib.pyplot as plt
## Wielkość wykresu
plt.figure(figsize=(16,8))


## informacja że jest to wykres złożony
ax1 = plt.subplot(aspect='equal')



## ustalenie koloru
colors = ['#a2c4c9','#76a5af','#c9daf8','#a4c2f4', '#cfe2f3']

## równanie podstawowe
Z5.plot(kind='pie',colors =colors , y = 'Age', ax=ax1, autopct='

# opisy, nazwy itp
ax1.set_xlabel('Something to write',  fontsize=15, color='darkred', alpha=1)
ax1.set_ylabel('Something to write', fontsize=11,  color='grey', alpha=0.8)
ax1.set_title('Major_undergraduate in Data Scientists (Kaggle 2018)',  fontsize=18, color='grey', alpha=0.8)
ax1.set_facecolor('#d8dcd6')
 

The best Pie Plot

I came across this publication and decided to do Pie Plot by this way.
https://medium.com/@kvnamipara/a-better-visualisation-of-pie-charts-by-matplotlib-935b7667d77f

To prepare perfect pie plot first I will need to pull vectors of data from the pivot table.

In [25]:
PPL=Z5.reset_index()
PPL.head(5)
Out[25]:
  Major_undergraduate Age
0 Computer science (software engineering, etc.) 112
1 Mathematics or statistics 52
2 A business discipline (accounting, economics, … 34
3 Physics or astronomy 24
4 Engineering (non-computer focused) 22
 

To pull vectors of data from the pivot table.

In [26]:
PPL.reset_index()
labels = PPL['Major_undergraduate'].to_list()
sizes = PPL['Age'].to_list()

fig1, ax1 = plt.subplots(figsize=(10,5))


ax1.pie(sizes, labels=labels, autopct='

ax1.axis('equal')  
plt.tight_layout()
plt.show()
 

Colors changing

In [27]:
# linia wskazuje że będzie to wykres złożony - wymiary: 6:6
fig1, ax1 = plt.subplots(figsize=(10,5))

colors = ['#c27ba0','#d5a6bd','#ead1dc','#ffffff','#a64d79','#d9d2e9','#b4a7d6']

ax1.pie(sizes, colors=colors, labels=labels, autopct='
# Equal aspect ratio ensures that pie is drawn as a circle

ax1.axis('equal')  
plt.tight_layout()
plt.show()
 

Changing size and color of the all fonts

textprops={’fontsize’: 30, 'color’:”green”}

In [28]:
# linia wskazuje że będzie to wykres złożony - wymiary: 6:6
fig1, ax1 = plt.subplots(figsize=(18,12))

colors = ['#e06666','#ea9999','#f4cccc','#ff0000','#434343']

ax1.pie(sizes, colors=colors, labels=labels, autopct='
# Equal aspect ratio ensures that pie is drawn as a circle

ax1.axis('equal')  
plt.tight_layout()
plt.show()
C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:10: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  # Remove the CWD from sys.path while we load stuff.
 

Changing size and color of the separate fonts

for text in texts:
    text.set_color('darkred')
for autotext in autotexts:
    autotext.set_color('grey')
In [29]:
fig1, ax1 = plt.subplots(figsize=(15,12))

colors = ['#93c47d','#b6d7a8','#d9ead3','#d0e0e3','#a2c4c9','#76a5af']

patches, texts, autotexts = ax1.pie(sizes, colors=colors, labels=labels, autopct='

for text in texts:
    text.set_color('darkred')
for autotext in autotexts:
    autotext.set_color('grey')
    
ax1.axis('equal')  
plt.tight_layout()
plt.show()
C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:13: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
  del sys.path[0]
 

Changing size and color for the chosen categories

In [30]:
fig1, ax1 = plt.subplots(figsize=(6,6))

colors = ['#ff9999','#747574','#99ff99','#ffcc99','#f1c232']

patches, texts, autotexts = ax1.pie(sizes, colors=colors, labels=labels, autopct='


for text in texts:
    text.set_color('grey')
for autotext in autotexts:
    autotext.set_color('grey')

    
texts[0].set_fontsize(24)
texts[0].set_color('black')
texts[4].set_fontsize(33)
texts[4].set_color('green')
    
ax1.axis('equal')  
plt.tight_layout()
plt.show()
C:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:20: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
 

Making a bagel

In [31]:
fig1, ax1 = plt.subplots(figsize=(18,6))

colors = ['#a2c4c9','#b6d7a8','#747574','#99ff99','#ffcc99','#76a5af']

patches, texts, autotexts = ax1.pie(sizes, colors=colors, labels=labels, autopct='
# Equal aspect ratio ensures that pie is drawn as a circle

for text in texts:
    text.set_color('darkred')
for autotext in autotexts:
    autotext.set_color('grey')
    
ax1.axis('equal')  
plt.tight_layout()

plt.show()
 

Making the better bangle

In [32]:
fig1, ax1 = plt.subplots(figsize=(18,8))

colors = ['#e6b8af','#b6d7a8','#e06666','#747574','#ffd966','#ffcc99','#ea9999']

patches, texts, autotexts = ax1.pie(sizes, colors=colors, labels=labels, autopct='


for text in texts:
    text.set_color('grey')
for autotext in autotexts:
    autotext.set_color('black')
    autotext.set_fontsize(22)

texts[0].set_fontsize(18)
texts[0].set_color('black')
    
#draw circle
centre_circle = plt.Circle((0,0),0.40,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)    
    
    
ax1.set_xlabel('Year 2018',  fontsize=15, color='darkred', alpha=1)
ax1.set_ylabel('Data Scientist', fontsize=11,  color='grey', alpha=0.8)
ax1.set_title('Data Scientist by profession',  fontsize=58, color='#d0e0e3', alpha=0.8)
ax1.set_facecolor('#d8dcd6')


ax1.axis('equal')  
plt.tight_layout()

plt.show()
 

We enter the gender variable

In [33]:
PL.columns
Out[33]:
Index(['Duration (in seconds)', 'Gender', 'Age', 'Country', 'Education',
       'Major_undergraduate', 'Recent_role', 'Industry', 'Years_of_experience',
       'compensation$USD'],
      dtype='object')
In [34]:
Z6 = PL.pivot_table(index=['Major_undergraduate','Gender'], values='Age',aggfunc='count').sort_values('Age', ascending=False)
Z6.head(10)
Out[34]:
    Age
Major_undergraduate Gender  
Computer science (software engineering, etc.) Male 96
Mathematics or statistics Male 39
A business discipline (accounting, economics, finance, etc.) Male 24
Physics or astronomy Male 20
Engineering (non-computer focused) Male 18
Computer science (software engineering, etc.) Female 15
I never declared a major Male 15
Mathematics or statistics Female 12
Information technology, networking, or system administration Male 11
A business discipline (accounting, economics, finance, etc.) Female 9
 

To prepare perfect pie plot first I will need to pull vectors of data from the pivot table.

In [35]:
PLG=Z6.reset_index()
PLG.head(2)
Out[35]:
  Major_undergraduate Gender Age
0 Computer science (software engineering, etc.) Male 96
1 Mathematics or statistics Male 39
In [36]:
PLG.reset_index()
labels_gender = PLG['Gender'].to_list()
sizes_gender = PLG['Age'].to_list()
 

The double bangle

In [37]:
import matplotlib.pyplot as plt


colors_gender = ['#c2c2f0','#ffb3e6']
 

fig1, ax1 = plt.subplots(figsize=(18,6))

colors = ['#ff0000','#747574','#ffd966','#ffcc99','#ea9999']

patches, texts, autotexts = ax1.pie(sizes, colors=colors, labels=labels, autopct='

plt.pie(sizes_gender,colors=colors_gender,radius=0.75,startangle=0)
centre_circle = plt.Circle((0,0),0.5,color='black', fc='white',linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)




for text in texts:
    text.set_color('grey')
for autotext in autotexts:
    autotext.set_color('black')

    
#draw circle
centre_circle = plt.Circle((0,0),0.50,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)    
    
    
ax1.axis('equal')  
plt.tight_layout()

plt.show()
 

The double bangle is: „one bridge too far”. This plot is beautiful but bangles are not correlated each other. To achieve adequate connection vectors should be come from one pivot tables. At the moment I have no idea how to do it (this groupby, query, pivot ….).

 

Trigger to create Pie Plot

Components to create perfect pie plot: labels, sizes, colors

To prepare perfect pie plot first I will need to pull vectors of data from the pivot table.

In [38]:
PPL=Z5.reset_index()
PPL.head(5)
PPL.reset_index()

labels = PPL['Major_undergraduate'].to_list()
sizes = PPL['Age'].to_list()

colors = ['#a2c4c9','#76a5af','#c9daf8','#a4c2f4', '#cfe2f3']
In [39]:
def PPieP(sizes,labels,colors):
    fig1, ax1 = plt.subplots(figsize=(18,8))

    patches, texts, autotexts = ax1.pie(sizes, colors=colors, labels=labels, autopct='


    for text in texts:
        text.set_color('grey')
    for autotext in autotexts:
        autotext.set_color('black')
        autotext.set_fontsize(22)

    texts[0].set_fontsize(18)
    texts[0].set_color('black')
    
    #draw circle
    centre_circle = plt.Circle((0,0),0.40,fc='white')
    fig = plt.gcf()
    fig.gca().add_artist(centre_circle)    
    
    
    ax1.set_xlabel('Year 2018',  fontsize=15, color='darkred', alpha=1)
    ax1.set_ylabel('Data Scientist', fontsize=11,  color='grey', alpha=0.8)
    ax1.set_title('Data Scientist by profession',  fontsize=58, color='#d0e0e3', alpha=0.8)
    ax1.set_facecolor('#d8dcd6')


    ax1.axis('equal')  
    plt.tight_layout()

    plt.show()
In [40]:
# Variables to the trigger:

labels = PPL['Major_undergraduate'].to_list()
sizes = PPL['Age'].to_list()
#colors = ['#a2c4c9','#76a5af','#c9daf8','#a4c2f4', '#cfe2f3']
#colors = ['#ff0000','#747574','#ffd966','#ffcc99','#ea9999']
#colors = ['#e6b8af','#b6d7a8','#e06666','#747574','#ffd966','#ffcc99','#ea9999']
#colors = ['#93c47d','#b6d7a8','#d9ead3','#d0e0e3','#a2c4c9','#76a5af']
#colors = ['#c27ba0','#d5a6bd','#ead1dc','#ffffff','#a64d79','#d9d2e9','#b4a7d6']
#colors = ['#cfe2f3','#9fc5e8','#6fa8dc']
colors = ['#d9ead3','#b6d7a8','#93c47d','#6aa84f']



# Trigger:

PPieP(sizes,labels,colors)
 

Good advice in making presentation is to prepare plots using one standard.

As says man who built my house: messy but equally!

Artykuł Perfect Plots: Pie Plot pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>
How to make my own template for plots https://sigmaquality.pl/uncategorized/how-to-make-my-own-template-for-plots/ Wed, 05 Sep 2018 19:24:00 +0000 http://sigmaquality.pl/?p=6020 Today we learn how to make my own template for plots I have to confess something. I have a problem with plots, graphics, visualizations. I [...]

Artykuł How to make my own template for plots pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>

Today we learn how to make my own template for plots

I have to confess something. I have a problem with plots, graphics, visualizations. I have no problem with image or with decision what I have to create. I have a problem with realization.

In frankly speaking there are so many methods of creating plots in python, I can remember what to use it. Sure, if I could do some more exercises it would be easier for me. Never mind!

Fortunately somebody invented computer, who can remember this pretty mess. I decided to create special library of plots. This solution gave me independence.

I can make presentation faster because I don't have thought about colors or plot size. Every plot is the same, a have, prepared earlier my own style.

Are you convenience? Let's go to do template for plots !

Data preparation

At the first step we open data and needed libraries.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## data source: https://s3.amazonaws.com/dq-blog-files/fortune500.csv
df = pd.read_csv('c:/2/fortune500.csv')
df.columns = ['year', 'rank', 'company', 'revenue', 'profit']
df.head(3)

We routinely check how formats have our columns. Turn out we have non numeric data in column: 'profit'. The reason of that may be any words or signs in place of numbers. We have to find out what kind of contamination are there.

df.dtypes

df.profit.value_counts

I detected contamination. So I wipe it out and exchange format from str in to float.

df.loc[df.profit=='N.A.']
df.profit.replace('N.A.',np.nan, inplace = True)
df = df.dropna(how='any')
df['profit'] = df['profit'].apply(pd.to_numeric)

Ok, we have data ready to next steps!

We do template for plots

I prepared template for linear plots. I use them most frequently because I am a financial analyst.

This ready for using template I put to my repository.

Now we need to have adequate prepared data to put into the template.

def LinearPlot(x, y, ax, title, x_label, y_label):
    ax.set_title(title, color='darkred', alpha=1)
    ax.set_ylabel(y_label, color='grey', alpha=0.6)
    ax.set_xlabel(x_label, color='grey', alpha=0.6)
    ax.plot(x, y, color='black', alpha=0.6, linestyle='dashed')
    ax.grid(linewidth=0.85, alpha=0.2)
    ax.margins(x=0, y=0)

Pivot table is the best

To have good linear plot we need three things: x arrow, y arrow and data. Additionally, a title and descriptions of axes could be useful. Now we create pivot table, next exchange it into dataframe. Next easily query I separate x and y and data.

Ewa = df.pivot_table(index='year', values=['revenue', 'profit'], aggfunc='mean')
df2 = Ewa.reset_index()

x = df2.year
y = df2.profit
title = 'Profit fortune500'
y_label = 'Profit (millions)'
x_label = 'Years'

Use template

fig, ax = plt.subplots(figsize=(6, 2))
LinearPlot(x, y, ax, title, x_label, y_label)

 

x = df2.year
y = df2.revenue
title = 'Revenue fortune500'
y_label = 'Profit (millions)'
x_label = 'Years'

fig, ax = plt.subplots(figsize=(6, 2))
LinearPlot(x, y, ax, title, x_label, y_label)

I hope this is good solution to do template for plots!

Entire code:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  
## data source: https://s3.amazonaws.com/dq-blog-files/fortune500.csv 
df = pd.read_csv('c:/2/fortune500.csv')
df.columns = ['year', 'rank', 'company', 'revenue', 'profit']
df.head(3)

df.dtypes

df.profit.value_counts

df.profit.value_counts
df.loc[df.profit=='N.A.']
df.profit.replace('N.A.',np.nan, inplace = True)
df = df.dropna(how='any')
df['profit'] = df['profit'].apply(pd.to_numeric)

def LinearPlot(x, y, ax, title, x_label, y_label):
    ax.set_title(title, color='darkred', alpha=1)
    ax.set_ylabel(y_label, color='grey', alpha=0.6)
    ax.set_xlabel(x_label, color='grey', alpha=0.6)
    ax.plot(x, y, color='black', alpha=0.6, linestyle='dashed')
    ax.grid(linewidth=0.85, alpha=0.2)
    ax.margins(x=0, y=0)

Ewa = df.pivot_table(index='year', values=['revenue', 'profit'], aggfunc='mean')
df2 = Ewa.reset_index()

x = df2.year
y = df2.profit
title = 'Profit fortune500'
y_label = 'Profit (millions)'
x_label = 'Years'

fig, ax = plt.subplots(figsize=(6, 2))
LinearPlot(x, y, ax, title, x_label, y_label)

x = df2.year
y = df2.revenue
title = 'Revenue fortune500'
y_label = 'Profit (millions)'
x_label = 'Years'

fig, ax = plt.subplots(figsize=(6, 2))
LinearPlot(x, y, ax, title, x_label, y_label)

 

Artykuł How to make my own template for plots pochodzi z serwisu THE DATA SCIENCE LIBRARY.

]]>