Feel free to read the code on GitHub
An old Chinese proverb says: a picture is worth a thousand words.
import squarify
import pandas as pd
import matplotlib.pyplot as plt
# Free-form answers; the first row of the file is skipped.
df1 = pd.read_csv('c:/11/freeFormResponses.csv', skiprows=1)

# Names assigned to the first 13 columns of the multiple-choice file.
headers = [
    'Duration (in seconds)',
    'Gender',
    'Gender2',
    'Age',
    'Country',
    'Education',
    'Major_undergraduate',
    'Recent_role',
    'Recent_role2',
    'Industry',
    'Industry2',
    'Years_of_experience',
    'compensation$USD',
]

# Read only those 13 columns, skip the two leading rows of the file and
# apply the names above instead of the file's own header.
df = pd.read_csv(
    'c:/11/multipleChoiceResponses.csv',
    usecols=list(range(13)),
    header=None,
    names=headers,
    skiprows=2,
)
df.head(4)

# Remove the three auxiliary columns.
df.drop(columns=['Gender2', 'Recent_role2', 'Industry2'], inplace=True)
Correcting data
Every time we want to make a plot we need to check and clean the data first, especially by inspecting the unique values and eliminating the small share of rubbish entries and NaN cells (missing data).
# Number of missing values in each column.
df.isna().sum()
# Data type of each column.
df.dtypes
It is also very important to reduce the number of classes by merging similar groups, as long as this does not harm the project.
# Fold the 'Prefer to self-describe' answers into 'Prefer not to say'.
df['Gender'] = df['Gender'].replace({'Prefer to self-describe': 'Prefer not to say'})
# Frequency of every education level, missing values included.
df['Education'].value_counts(dropna=False)
We can assume that anyone who did not answer did not want to share this information, so missing values become 'I prefer not to answer'.
import numpy as np
# Treat a missing education answer as a refusal to answer.
# fillna() is used instead of replace(np.NaN, ...): the np.NaN alias was
# removed in NumPy 2.0, so the old spelling raises AttributeError there.
df['Education'] = df['Education'].fillna('I prefer not to answer')
# Check the result: class frequencies and the number of remaining NaN cells.
df.Education.value_counts(dropna=False)
df.Education.isnull().sum()
# Frequency of every undergraduate major, missing values included.
df.Major_undergraduate.value_counts(dropna=False)
I assume that NaN and 'Other' appear when somebody does not want to declare their major, so both are mapped to 'I never declared a major'.
# Missing values and 'Other' are both folded into 'I never declared a major'.
# fillna() replaces the former replace(np.NaN, ...) calls: the np.NaN alias
# was removed in NumPy 2.0.
df['Major_undergraduate'] = (
    df['Major_undergraduate']
    .fillna('I never declared a major')
    .replace('Other', 'I never declared a major')
)
# Horizontal bar chart of the share of each major.
df.Major_undergraduate.value_counts(dropna=False, normalize=True).plot(kind='barh')
df.Recent_role.value_counts(dropna=False)
# A missing recent role is counted as 'Other'.
df['Recent_role'] = df['Recent_role'].fillna('Other')
# Respondents from Poland only.
PL = df[df.Country == 'Poland']
# Count of non-missing 'Age' entries per major, largest group first.
Z5 = PL.pivot_table(
    index=['Major_undergraduate'], values='Age', aggfunc='count'
).sort_values('Age', ascending=False)
Z5.head(10)
The Treemap
I came across the publication below and decided to build the treemap this way.
https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/
To prepare the treemap, I first need to pull the data vectors out of the pivot table.
# Move the pivot-table index back into an ordinary column.
PPL = Z5.reset_index(drop=False)
PPL.head()
Cut out too long descriptions
# Keep only the text that precedes the first '(' in every major name.
PPL['Major_undergraduate'] = (
    PPL['Major_undergraduate']
    .str.split('(')
    .str[0]
)
PPL['Major_undergraduate']
Adds numbers of occurrences to the descriptions
# Tile captions: the major name, a line break, then the respondent count in
# parentheses.
# - The separator is "\n (": the former "n (" had lost its backslash and
#   printed a stray letter "n" instead of breaking the line.
# - Columns are addressed by name; positional x[0] / x[1] on a labelled row
#   is deprecated in pandas.
# - The earlier to_list() assignment was overwritten at once and is dropped.
label = PPL['Major_undergraduate'].astype(str) + "\n (" + PPL['Age'].astype(str) + ")"
label
To pull vectors of data from the pivot table
# NOTE: the frame returned by reset_index() is not assigned, so PPL itself is
# left as it was.
PPL.reset_index()
label
# Tile areas: the respondent count of each major.
sizes = PPL['Age'].tolist()
# Tile colours: red followed by shades of grey and white.
colors = [
    '#ff0000',
    '#434343',
    '#666666',
    '#999999',
    '#b7b7b7',
    '#cccccc',
    '#d9d9d9',
    '#efefef',
    '#ffffff',
    '#f3f3f3',
]
import squarify
import matplotlib.pyplot as plt
# Draw the treemap.
plt.figure(figsize=(12, 8), dpi=380)
squarify.plot(sizes=sizes, label=label, color=colors, alpha=0.9)
plt.title(
    'Data Scientist society in Poland (2018)',
    fontdict={
        'fontsize': 30,
        'fontweight': 'medium',
        'color': '#d0e0e3',
        'alpha': 0.8,
        'y': 1.02,
    },
)
plt.axis('off')  # no numbers on the axes
plt.show()
Trigger to create Treemap
Components needed to create the treemap: labels, sizes, colors, title
To prepare perfect treemap first I will need to pull vectors of data from the pivot table.
To pull vectors of data from the pivot table
# Inputs for the treemap.
# Removed from the original: a reset_index() call whose result was discarded,
# and a duplicated `label = label = ...` assignment that was overwritten on
# the next line.
# Tile captions are the major name, a line break, then the count in
# parentheses. The separator is "\n (" (the former "n (" had lost its
# backslash), and columns are addressed by name rather than by position.
label = PPL['Major_undergraduate'].astype(str) + "\n (" + PPL['Age'].astype(str) + ")"
# Tile areas: the respondent count of each major.
sizes = PPL['Age'].to_list()
title = 'Data Scientist society in Poland (2018)'
# Palette source:
# https://yagisanatode.com/2019/08/06/google-apps-script-hexadecimal-color-codes-for-google-docs-sheets-and-slides-standart-palette/
#
# Alternative palettes (uncomment one to use it instead of the active one):
# colors = ['#274e13','#6aa84f','#93c47d', '#b6d7a8','#d9ead3','#b7b7b7','#38761d']  # green
# colors = ['#0c343d','#134f5c','#45818e','#76a5af','#a2c4c9','#d0e0e3']  # cyan
# colors = ['#7f6000','#bf9000','#f1c232','#ffd966','#ffe599','#fff2cc']  # yellow
# colors = ['#4c1130','#a64d79','#c27ba0','#d5a6bd','#ead1dc','#741b47',]  # magenta
# colors = ['#e6b8af','#b6d7a8','#e06666','#747574','#ffd966','#ffcc99','#ea9999']
# colors = ['#93c47d','#b6d7a8','#d9ead3','#d0e0e3','#a2c4c9','#76a5af']
# colors = ['#cfe2f3','#9fc5e8','#6fa8dc']  # blue
# colors = ['#d9ead3','#b6d7a8','#93c47d','#6aa84f']
# colors = ['#ff0000','#434343','#666666','#999999','#b7b7b7','#cccccc','#d9d9d9','#efefef','#ffffff','#f3f3f3']  # German magazine look
#
# Active palette: purple.
colors = [
    '#c27ba0',
    '#d5a6bd',
    '#ead1dc',
    '#ffffff',
    '#a64d79',
    '#d9d2e9',
    '#b4a7d6',
]
import squarify
import matplotlib.pyplot as plt
def Tmap(sizes, labels, colors, title):
    """Draw a treemap with squarify and show it.

    Args:
        sizes: Values passed to squarify as the tile sizes.
        labels: Caption of each tile, in the same order as ``sizes``.
        colors: Tile colours passed to squarify.
        title: Text drawn above the chart.
    """
    plt.figure(figsize=(12, 8), dpi=380)
    # Use the ``labels`` argument: the original read the global ``label`` here,
    # so whatever the caller passed in was silently ignored.
    squarify.plot(sizes=sizes, label=labels, color=colors, alpha=0.9)
    plt.title(
        title,
        fontdict={
            'fontsize': 30,
            'fontweight': 'medium',
            'color': '#d0e0e3',
            'alpha': 0.9,
            'y': 1.02,
        },
    )
    plt.axis('off')  # no numbers on the axes
    plt.show()


Tmap(sizes, label, colors, title)


