# Formuła Pandas: groupby part 2

In [4]:
import pandas as pd
import numpy as np

df = pd.DataFrame({'A' : ['foo', 'foo', 'bar', 'foo', 'bar', 'foo'],
'B' : ['one', 'two', 'three','two', 'two', 'one'],
'C' : np.random.randint(5, size=6),
'D' : np.random.randint(5, size=6),
'E' : np.random.randint(5, size=6)})
print (df)

     A      B  C  D  E
0  foo    one  4  4  1
1  foo    two  3  1  4
2  bar  three  3  3  4
3  foo    two  3  1  4
4  bar    two  1  0  0
5  foo    one  4  2  1

In [5]:
df1 = df.groupby(['A', 'B'], as_index=False)['C'].sum()
print (df1)

     A      B  C
0  bar  three  3
1  bar    two  1
2  foo    one  8
3  foo    two  6

In [6]:
df2 = df.groupby(['A', 'B'], as_index=False).sum()
print (df2)

     A      B  C  D  E
0  bar  three  3  3  4
1  bar    two  1  0  0
2  foo    one  8  6  2
3  foo    two  6  2  8

In [7]:
df3 = df.groupby(['A', 'B'], as_index=False)['C','D'].sum()
print (df3)

     A      B  C  D
0  bar  three  3  3
1  bar    two  1  0
2  foo    one  8  6
3  foo    two  6  2

In [8]:
df1 = df.groupby(['A', 'B'], as_index=False)['C'].agg('sum')
print (df1)

     A      B  C
0  bar  three  3
1  bar    two  1
2  foo    one  8
3  foo    two  6

In [9]:
df2 = df.groupby(['A', 'B'], as_index=False).agg('sum')
print (df2)

     A      B  C  D  E
0  bar  three  3  3  4
1  bar    two  1  0  0
2  foo    one  8  6  2
3  foo    two  6  2  8

In [12]:
df4 = (df.groupby(['A', 'B'])['C'].agg([('average','mean'),('total','sum')]).reset_index())
print (df4)

     A      B  average  total
0  bar  three        3      3
1  bar    two        1      1
2  foo    one        4      8
3  foo    two        3      6

In [11]:
df5 = (df.groupby(['A', 'B']).agg([('average','mean'),('total','sum')]))

print (df5)

                C             D             E
average total average total average total
A   B
bar three       3     3       3     3       4     4
two         1     1       0     0       0     0
foo one         4     8       3     6       1     2
two         3     6       1     2       4     8

In [17]:
# łączenie w jedną GŁOWĘ TABELI
df5.columns = df5.columns.map('_'.join)
df5 = df5.reset_index()
print (df5)

     A      B  C_average  C_total  D_average  D_total  E_average  E_total
0  bar  three          3        3          3        3          4        4
1  bar    two          1        1          0        0          0        0
2  foo    one          4        8          3        6          1        2
3  foo    two          3        6          1        2          4        8


ZMIANA NAZWY KOLUMN

In [18]:
df5 = df.groupby(['A', 'B']).agg(['mean','sum'])

df5.columns = (df5.columns.map('_'.join)
.str.replace('sum','total')
.str.replace('mean','average'))

df5 = df5.reset_index()
print (df5)

     A      B  C_average  C_total  D_average  D_total  E_average  E_total
0  bar  three          3        3          3        3          4        4
1  bar    two          1        1          0        0          0        0
2  foo    one          4        8          3        6          1        2
3  foo    two          3        6          1        2          4        8


ZMIANA NAZWY KOLUMN BEZPOŚREDNIO

In [19]:
df6 = (df.groupby(['A', 'B'], as_index=False)
.agg({'C':'sum','D':'mean'})
.rename(columns={'C':'C_total', 'D':'D_average'}))
print (df6)

     A      B  C_total  D_average
0  bar  three        3          3
1  bar    two        1          0
2  foo    one        8          3
3  foo    two        6          1


TEGO NIE ROZUMIEM

In [20]:
def func(x):
return x.iat[0] + x.iat[-1]

df7 = (df.groupby(['A', 'B'], as_index=False)
.agg({'C':'sum','D': func})
.rename(columns={'C':'C_total', 'D':'D_sum_first_and_last'}))
print (df7)

     A      B  C_total  D_sum_first_and_last
0  bar  three        3                     6
1  bar    two        1                     0
2  foo    one        8                     6
3  foo    two        6                     2


# No DataFrame after aggregation! What happened?

In [22]:
df1 = df.groupby(['A', 'B'])['C'].sum()
print (df1)

A    B
bar  three    3
two      1
foo  one      8
two      6
Name: C, dtype: int32

In [24]:
# Z DODATKOWYM INDEKSEM
df1 = df.groupby(['A', 'B'], as_index=False)['C'].sum()
print (df1)

     A      B  C
0  bar  three  3
1  bar    two  1
2  foo    one  8
3  foo    two  6

In [25]:
# TO SAMO RESET INDEX
df1 = df.groupby(['A', 'B'])['C'].sum().reset_index()
print (df1)

     A      B  C
0  bar  three  3
1  bar    two  1
2  foo    one  8
3  foo    two  6

In [26]:
df2 = df.groupby('A')['C'].sum()
print (df2)

A
bar     4
foo    14
Name: C, dtype: int32

In [31]:
df2.index

Out[31]:
Index(['bar', 'foo'], dtype='object', name='A')
In [32]:
print (type(df2))

<class 'pandas.core.series.Series'>

In [33]:
df2 = df.groupby('A', as_index=False)['C'].sum()
print (df2)

     A   C
0  bar   4
1  foo  14

In [34]:
df2 = df.groupby('A')['C'].sum().reset_index()
print (df2)

     A   C
0  bar   4
1  foo  14


# How to aggregate mainly strings columns (to lists, tuples, strings with separator)?

In [35]:
df = pd.DataFrame({'A' : ['a', 'c', 'b', 'b', 'a', 'c', 'b'],
'B' : ['one', 'two', 'three','two', 'two', 'one', 'three'],
'C' : ['three', 'one', 'two', 'two', 'three','two', 'one'],
'D' : [1,2,3,2,3,1,2]})
print (df)

   A      B      C  D
0  a    one  three  1
1  c    two    one  2
2  b  three    two  3
3  b    two    two  2
4  a    two  three  3
5  c    one    two  1
6  b  three    one  2


WYŚWIETLANIE ZBIORU WYRAZÓW

In [36]:
df1 = df.groupby('A')['B'].agg(list).reset_index()
print (df1)

   A                    B
0  a           [one, two]
1  b  [three, two, three]
2  c           [two, one]


TO SAMO ALE BEZ NAWIASÓW KWADRATOWYCH

In [37]:
df2 = df.groupby('A')['B'].agg(','.join).reset_index()
print (df2)

   A                B
0  a          one,two
1  b  three,two,three
2  c          two,one


PRZEKSZTAŁCENIE WYRAZÓW NA CYFRY

In [38]:
df3 = (df.groupby('A')['D']
.agg(lambda x: ','.join(x.astype(str)))
.reset_index())
print (df3)

   A      D
0  a    1,3
1  b  3,2,2
2  c    2,1


TO SAMO INNYM SPOSOBEM

In [39]:
df3 = (df.assign(D = df['D'].astype(str))
.groupby('A')['D']
.agg(','.join).reset_index())
print (df3)

   A      D
0  a    1,3
1  b  3,2,2
2  c    2,1

In [40]:
df

Out[40]:
A B C D
0 a one three 1
1 c two one 2
2 b three two 3
3 b two two 2
4 a two three 3
5 c one two 1
6 b three one 2
In [41]:
df4 = df.groupby('A').agg(','.join).reset_index()
print (df4)

   A                B            C
0  a          one,two  three,three
1  b  three,two,three  two,two,one
2  c          two,one      one,two

In [42]:
df5 = (df.groupby('A')
.agg(lambda x: ','.join(x.astype(str)))
.reset_index())
print (df5)

   A                B            C      D
0  a          one,two  three,three    1,3
1  b  three,two,three  two,two,one  3,2,2
2  c          two,one      one,two    2,1


# How to aggregate counts?

In [43]:
df = pd.DataFrame({'A' : ['a', 'c', 'b', 'b', 'a', 'c', 'b'],
'B' : ['one', 'two', 'three','two', 'two', 'one', 'three'],
'C' : ['three', np.nan, np.nan, 'two', 'three','two', 'one'],
'D' : [np.nan,2,3,2,3,np.nan,2]})
print (df)

   A      B      C    D
0  a    one  three  NaN
1  c    two    NaN  2.0
2  b  three    NaN  3.0
3  b    two    two  2.0
4  a    two  three  3.0
5  c    one    two  NaN
6  b  three    one  2.0


Pokazuje ilości występowania – ile razy wystąpiło

In [44]:
df1 = df.groupby('A').size().reset_index(name='COUNT')
print (df1)

   A  COUNT
0  a      2
1  b      3
2  c      2


ILE RAZY BYŁO DLA A BEZ PUSTYCH KOMÓREK

In [45]:
df2 = df.groupby('A')['C'].count().reset_index(name='COUNT')
print (df2)

   A  COUNT
0  a      2
1  b      2
2  c      1

In [48]:
df

Out[48]:
A B C D
0 a one three NaN
1 c two NaN 2.0
2 b three NaN 3.0
3 b two two 2.0
4 a two three 3.0
5 c one two NaN
6 b three one 2.0

Dla każdej kolumny ile razy było a, b i c

Tylko dla wartości bez pustych komórek

In [46]:
df3 = df.groupby('A').count().add_suffix('_COUNT').reset_index()
print (df3)

   A  B_COUNT  C_COUNT  D_COUNT
0  a        2        2        1
1  b        3        2        3
2  c        2        1        1

In [50]:
df

Out[50]:
A B C D
0 a one three NaN
1 c two NaN 2.0
2 b three NaN 3.0
3 b two two 2.0
4 a two three 3.0
5 c one two NaN
6 b three one 2.0

Sama kolumna A

ile razy pojawiła się literka ale sortuje od największej do najmniejszej

In [49]:
df4 = (df['A'].value_counts()
.rename_axis('A')
.reset_index(name='COUNT'))
print (df4)

   A  COUNT
0  b      3
1  a      2
2  c      2