Perfect Plots: Violinplot

In [1]:
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

Titanic disaster

Source of data: https://www.kaggle.com/shivamp629/traincsv

In [2]:
# Read the Titanic passenger table from the local CSV and preview its first rows.
titanic_csv = 'c:/1/kaggletrain.csv'
df = pd.read_csv(titanic_csv)
df.head()

In [3]:

# Count the rows with a non-null Fare for every (Sex, Age) pair. After
# reset_index() the tally sits in the 'Fare' column next to 'Sex' and 'Age'.
sex_age_counts = df.pivot_table(values='Fare', index=['Sex', 'Age'], aggfunc='count')
SKS = sex_age_counts.reset_index()
SKS.head()
Out[3]:
Sex Age Fare
0 female 0.75 2
1 female 1.00 2
2 female 2.00 6
3 female 3.00 2
4 female 4.00 5
In [4]:
# Counts plot: one marker per (Sex, Age) row of SKS, with the marker size
# driven by the tally stored in SKS['Fare'].
fig, ax = plt.subplots(figsize=(6, 5), dpi=80)

# stripplot() documents `size` as a single marker radius, so handing it a
# per-row Series does not reliably size each point. scatterplot() maps a
# column to marker size row by row, which is what this figure needs.
sns.scatterplot(
    data=SKS,
    x='Sex',
    y='Age',
    hue='Sex',        # keep one colour per category, as a strip plot would
    size='Fare',      # 'Fare' holds the count produced by the pivot table
    sizes=(20, 400),  # smallest / largest marker area in points^2
    legend=False,
    ax=ax,
)

# Decorations
ax.set_title('Counts Plot - Size of circle is bigger as bigger is subpopulation', fontsize=12)
plt.show()
In [5]:
# Violin plot of the ages listed in SKS, one violin per sex.
# NOTE(review): SKS has one row per distinct (Sex, Age) pair, so every age is
# weighted once no matter how many passengers share it - confirm this is the
# intended distribution rather than the per-passenger one in df.
fig, ax = plt.subplots(figsize=(6, 4), dpi=80)

# The plotted values come from SKS, so SKS (not df) is the matching `data`.
sns.violinplot(x='Sex', y='Age', data=SKS, scale='width', inner='quartile', ax=ax)

# Decoration
ax.set_title('Age of the Titanic passengers by sex', fontsize=12)
plt.show()
In [6]:
# Count the rows with a non-null Fare for every (Pclass, Sex, Age) combination
# and expose that tally under the column name 'Count'.
PKP = (
    df.pivot_table(values='Fare', index=['Pclass', 'Sex', 'Age'], aggfunc='count')
    .reset_index()
    .rename(columns={'Fare': 'Count'})
)
PKP.head()
Out[6]:
Pclass Sex Age Count
0 1 female 2.0 1
1 1 female 14.0 1
2 1 female 15.0 1
3 1 female 16.0 3
4 1 female 17.0 2
In [7]:
# Violin plot of the ages listed in PKP, one violin per passenger class.
# NOTE(review): PKP has one row per distinct (Pclass, Sex, Age) combination,
# not one row per passenger - confirm that is the intended input.
fig, ax = plt.subplots(figsize=(6, 4), dpi=80)

# The plotted values come from PKP, so PKP (not df) is the matching `data`.
sns.violinplot(
    x='Pclass', y='Age', data=PKP,
    scale='width', inner='quartile', palette="husl", ax=ax,
)

# Decoration - the grouping variable on the x-axis is Pclass, not Sex.
ax.set_title('Age of the Titanic passengers by class', fontsize=18)
plt.show()

Source of the hexadecimal colour codes used for the palettes: https://yagisanatode.com/2019/08/06/google-apps-script-hexadecimal-color-codes-for-google-docs-sheets-and-slides-standart-palette/

# Hand-picked colour palettes as lists of hex colour codes. Lists with these
# names (blue, purple, cyan, grey, lightCornflower) are re-declared inside the
# plotting cells and passed to seaborn as `palette=`.
# NOTE: `colors` is assigned three times in this block; only the last
# assignment survives, and no visible cell reads `colors`.
green = ['#274e13','#6aa84f','#93c47d', '#b6d7a8','#d9ead3','#b7b7b7','#38761d'] # green
cyan = ['#0c343d','#134f5c','#45818e','#76a5af','#a2c4c9','#d0e0e3'] # cyan
yellow = ['#7f6000','#bf9000','#f1c232','#ffd966','#ffe599','#fff2cc'] # yellow
magenta = ['#4c1130','#a64d79','#c27ba0','#d5a6bd','#ead1dc','#741b47',] # magenta
colors = ['#e6b8af','#b6d7a8','#e06666','#747574','#ffd966','#ffcc99','#ea9999']  # overwritten below
colors = ['#93c47d','#b6d7a8','#d9ead3','#d0e0e3','#a2c4c9','#76a5af']  # overwritten below
purple = ['#c27ba0','#d5a6bd','#ead1dc','#ffffff','#a64d79','#d9d2e9','#b4a7d6'] # purple
blue = ['#cfe2f3','#9fc5e8','#6fa8dc'] # blue
colors = ['#d9ead3','#b6d7a8','#93c47d','#6aa84f']  # final value of `colors`
grey = ['#000000', '#434343', '#666666', '#999999', '#b7b7b7', '#cccccc', '#d9d9d9','#efefef','#f3f3f3']
lightCornflower = ['#1c4587', '#1155cc', '#3c78d8', '#6d9eeb', '#a4c2f4', '#c9daf8', '#4a86e8', '#d9d9d9']

# Disabled alternative palette; the author's note labels it "German magazine".
#colors = ['#ff0000','#434343','#666666','#999999','#b7b7b7','#cccccc','#d9d9d9','#efefef','#ffffff','#f3f3f3'] #=> German magazine
In [8]:
# Same class-vs-age violin plot as before, drawn with a custom blue palette.
blue = ['#cfe2f3', '#9fc5e8', '#6fa8dc']

# The plotted values come from PKP, so PKP (not df) is the matching `data`.
# NOTE(review): `alpha` is passed through as an extra keyword; whether it
# changes the violins depends on the installed seaborn version - confirm.
sns.violinplot(
    x='Pclass', y='Age', data=PKP,
    scale='width', inner='quartile', palette=blue, alpha=0.1,
)

# Decoration - the grouping variable on the x-axis is Pclass, not Sex.
plt.title('Age of the Titanic passengers by class', fontsize=18)
plt.show()
In [9]:
# Class-vs-age violins with the individual PKP rows overlaid as a swarm.
purple = ['#c27ba0', '#d5a6bd', '#ead1dc', '#ffffff', '#a64d79', '#d9d2e9', '#b4a7d6']

fig, ax = plt.subplots()

# Both layers read the same PKP columns and share one Axes so they overlap.
sns.violinplot(
    x='Pclass', y='Age', data=PKP,
    scale='width', inner='quartile', palette=purple, ax=ax,
)
sns.swarmplot(x='Pclass', y='Age', data=PKP, color='lightblue', alpha=0.9, ax=ax)

# Decoration; plt.show() also keeps the Axes repr out of the cell output.
ax.set_title('Age of the Titanic passengers by class', fontsize=18)
plt.show()
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x1abd5251eb8>

World Happiness Report

In [10]:
# Load the World Happiness Report CSV and keep only the rows whose Year is 2017.
happiness_all_years = pd.read_csv('c:/1/WorldHappinessReport.csv')
is_2017 = happiness_all_years['Year'] == 2017
df3 = happiness_all_years[is_2017]
df3.head(3)
Out[10]:
Unnamed: 0 Country Region Happiness Rank Happiness Score Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Year
330 330 Afghanistan Southern Asia 141.0 3.794 0.401477 0.581543 0.180747 0.106180 0.061158 0.311871 2.150801 2017.0
331 331 Albania Central and Eastern Europe 109.0 4.644 0.996193 0.803685 0.731160 0.381499 0.039864 0.201313 1.490442 2017.0
332 332 Algeria Middle East and Northern Africa 53.0 5.872 1.091864 1.146217 0.617585 0.233336 0.146096 0.069437 2.567604 2017.0
In [11]:
# Tally the non-null Country entries for every (Region, Happiness Score) pair;
# the tally is stored in the 'Country' column of the flattened table.
region_score_counts = df3.pivot_table(
    values='Country',
    index=['Region', 'Happiness Score'],
    aggfunc='count',
)
NIK = region_score_counts.reset_index()
print(NIK.head())

# Distinct region names, in the order they first appear in NIK.
REG = NIK['Region'].unique()
                       Region  Happiness Score  Country
0   Australia and New Zealand            7.284        1
1   Australia and New Zealand            7.314        1
2  Central and Eastern Europe            4.096        1
3  Central and Eastern Europe            4.286        1
4  Central and Eastern Europe            4.376        1
In [12]:
# Happiness score per region for 2017: violins with the NIK rows overlaid.
fig, ax = plt.subplots(figsize=(16, 4), dpi=80)

cyan = ['#0c343d', '#134f5c', '#45818e', '#76a5af', '#a2c4c9', '#d0e0e3']

# The plotted values come from NIK, so NIK (not df3) is the matching `data`.
sns.violinplot(
    x='Region', y='Happiness Score', data=NIK,
    scale='width', inner='quartile', palette=cyan, ax=ax,
)
sns.swarmplot(x='Region', y='Happiness Score', data=NIK, color='white', alpha=0.4, ax=ax)

# REG lists the regions in their order of first appearance in NIK.
# NOTE(review): this relies on seaborn drawing the categories in that same
# order - confirm the tick labels line up with the violins.
ax.set_xticklabels(REG, rotation=90, horizontalalignment='right', fontsize=18)
ax.set_title("Happiness Score by regions 2017", fontsize=25, alpha=0.4)
ax.set_ylabel('Happiness Score')
plt.show()

Drinks by country

In [13]:
# Read the drinks-by-country table from the local CSV and preview three rows.
drinks_csv = 'c:/1/drinksbycountry.csv'
df4 = pd.read_csv(drinks_csv)
df4.head(3)
Out[13]:
Unnamed: 0 country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 0 Afghanistan 0 0 0 0.0 Asia
1 1 Albania 89 132 54 4.9 Europe
2 2 Algeria 25 0 14 0.7 Africa
In [14]:
# Tally the non-null country entries for every
# (continent, total_litres_of_pure_alcohol) pair and name that tally 'count'.
PKS = (
    df4.pivot_table(
        values='country',
        index=['continent', 'total_litres_of_pure_alcohol'],
        aggfunc='count',
    )
    .reset_index()
    .rename(columns={'country': 'count'})
)
PKS.head()
Out[14]:
continent total_litres_of_pure_alcohol count
0 Africa 0.0 3
1 Africa 0.1 2
2 Africa 0.2 2
3 Africa 0.3 1
4 Africa 0.4 1
In [15]:
# Litres of pure alcohol per continent: violins with the PKS rows overlaid.
fig, ax = plt.subplots(figsize=(10, 4), dpi=280)

grey = ['#000000', '#434343', '#666666', '#999999', '#b7b7b7', '#cccccc', '#d9d9d9', '#efefef', '#f3f3f3']

sns.violinplot(
    x='continent', y='total_litres_of_pure_alcohol', data=PKS,
    scale='width', inner='quartile', palette=grey, ax=ax,
)
sns.swarmplot(
    x='continent', y='total_litres_of_pure_alcohol', data=PKS,
    color='yellow', alpha=0.4, ax=ax,
)

ax.set_title("Litres Of Pure Alcohol per person", fontsize=22, alpha=0.4)
# The y-axis shows total_litres_of_pure_alcohol, not the 'count' column.
ax.set_ylabel('Litres of pure alcohol')
plt.show()

IMDb ratings

In [16]:
# Read the IMDb ratings table from the local CSV and preview three rows.
imdb_csv = 'c:/1/imdbratings.csv'
df5 = pd.read_csv(imdb_csv)
df5.head(3)
Out[16]:
Unnamed: 0 star_rating title content_rating genre duration actors_list
0 0 9.3 The Shawshank Redemption R Crime 142 [u’Tim Robbins’, u’Morgan Freeman’, u’Bob Gunt…
1 1 9.2 The Godfather R Crime 175 [u’Marlon Brando’, u’Al Pacino’, u’James Caan’]
2 2 9.1 The Godfather: Part II R Crime 200 [u’Al Pacino’, u’Robert De Niro’, u’Robert Duv…
In [17]:
# Tally the non-null titles for every (genre, duration) pair; the tally is
# stored in the 'title' column of the flattened table.
genre_duration_counts = df5.pivot_table(
    values='title',
    index=['genre', 'duration'],
    aggfunc='count',
)
SKO = genre_duration_counts.reset_index()

# Distinct genre names, in the order they first appear in SKO.
KOT = SKO['genre'].unique()
SKO.head()
Out[17]:
genre duration title
0 Action 80 1
1 Action 92 1
2 Action 93 2
3 Action 94 1
4 Action 98 1
In [18]:
# Film duration per genre: violins with the SKO rows overlaid as a swarm.
fig, ax = plt.subplots(figsize=(10, 4), dpi=280)

lightCornflower = ['#1c4587', '#1155cc', '#3c78d8', '#6d9eeb', '#a4c2f4', '#c9daf8', '#4a86e8', '#d9d9d9']

# The stray `markers='d'` keyword was dropped: violinplot() documents no such
# parameter, so it was either ignored or rejected depending on the version.
sns.violinplot(
    x='genre', y='duration', data=SKO,
    scale='width', inner='quartile', palette=lightCornflower, ax=ax,
)
sns.swarmplot(x='genre', y='duration', data=SKO, color='black', alpha=0.4, ax=ax)

# KOT lists the genres in their order of first appearance in SKO.
# NOTE(review): this relies on seaborn drawing the categories in that same
# order - confirm the tick labels line up with the violins.
ax.set_xticklabels(KOT, rotation=90, horizontalalignment='right', fontsize=14, color='#45818e')
ax.set_title("Duration films by genre", fontsize=22, alpha=0.4)
ax.set_ylabel('Duration')
plt.show()