
In [4]:
import pandas as pd
import numpy as np
df = pd.DataFrame({'A' : ['foo', 'foo', 'bar', 'foo', 'bar', 'foo'],
'B' : ['one', 'two', 'three','two', 'two', 'one'],
'C' : np.random.randint(5, size=6),
'D' : np.random.randint(5, size=6),
'E' : np.random.randint(5, size=6)})
print (df)
In [5]:
df1 = df.groupby(['A', 'B'], as_index=False)['C'].sum()
print (df1)
In [6]:
df2 = df.groupby(['A', 'B'], as_index=False).sum()
print (df2)
In [7]:
df3 = df.groupby(['A', 'B'], as_index=False)['C','D'].sum()
print (df3)
In [8]:
df1 = df.groupby(['A', 'B'], as_index=False)['C'].agg('sum')
print (df1)
In [9]:
df2 = df.groupby(['A', 'B'], as_index=False).agg('sum')
print (df2)
In [12]:
df4 = (df.groupby(['A', 'B'])['C'].agg([('average','mean'),('total','sum')]).reset_index())
print (df4)
In [11]:
df5 = (df.groupby(['A', 'B']).agg([('average','mean'),('total','sum')]))
print (df5)
In [17]:
# łączenie w jedną GŁOWĘ TABELI
df5.columns = df5.columns.map('_'.join)
df5 = df5.reset_index()
print (df5)
ZMIANA NAZWY KOLUMN
In [18]:
df5 = df.groupby(['A', 'B']).agg(['mean','sum'])
df5.columns = (df5.columns.map('_'.join)
.str.replace('sum','total')
.str.replace('mean','average'))
df5 = df5.reset_index()
print (df5)
ZMIANA NAZWY KOLUMN BEZPOŚREDNIO
In [19]:
df6 = (df.groupby(['A', 'B'], as_index=False)
.agg({'C':'sum','D':'mean'})
.rename(columns={'C':'C_total', 'D':'D_average'}))
print (df6)
TEGO NIE ROZUMIEM
In [20]:
def func(x):
return x.iat[0] + x.iat[-1]
df7 = (df.groupby(['A', 'B'], as_index=False)
.agg({'C':'sum','D': func})
.rename(columns={'C':'C_total', 'D':'D_sum_first_and_last'}))
print (df7)
No DataFrame after aggregation! What happened?
In [22]:
df1 = df.groupby(['A', 'B'])['C'].sum()
print (df1)
In [24]:
# Z DODATKOWYM INDEKSEM
df1 = df.groupby(['A', 'B'], as_index=False)['C'].sum()
print (df1)
In [25]:
# TO SAMO RESET INDEX
df1 = df.groupby(['A', 'B'])['C'].sum().reset_index()
print (df1)
In [26]:
df2 = df.groupby('A')['C'].sum()
print (df2)
In [31]:
df2.index
Out[31]:
In [32]:
print (type(df2))
In [33]:
df2 = df.groupby('A', as_index=False)['C'].sum()
print (df2)
In [34]:
df2 = df.groupby('A')['C'].sum().reset_index()
print (df2)
How to aggregate mainly strings columns (to lists, tuples, strings with separator)?
In [35]:
df = pd.DataFrame({'A' : ['a', 'c', 'b', 'b', 'a', 'c', 'b'],
'B' : ['one', 'two', 'three','two', 'two', 'one', 'three'],
'C' : ['three', 'one', 'two', 'two', 'three','two', 'one'],
'D' : [1,2,3,2,3,1,2]})
print (df)
WYŚWIETLANIE ZBIORU WYRAZÓW
In [36]:
df1 = df.groupby('A')['B'].agg(list).reset_index()
print (df1)
TO SAMO ALE BEZ NAWIASÓW KWADRATOWYCH
In [37]:
df2 = df.groupby('A')['B'].agg(','.join).reset_index()
print (df2)
PRZEKSZTAŁCENIE WYRAZÓW NA CYFRY
In [38]:
df3 = (df.groupby('A')['D']
.agg(lambda x: ','.join(x.astype(str)))
.reset_index())
print (df3)
TO SAMO INNYM SPOSOBEM
In [39]:
df3 = (df.assign(D = df['D'].astype(str))
.groupby('A')['D']
.agg(','.join).reset_index())
print (df3)
In [40]:
df
Out[40]:
In [41]:
df4 = df.groupby('A').agg(','.join).reset_index()
print (df4)
In [42]:
df5 = (df.groupby('A')
.agg(lambda x: ','.join(x.astype(str)))
.reset_index())
print (df5)
How to aggregate counts?
In [43]:
df = pd.DataFrame({'A' : ['a', 'c', 'b', 'b', 'a', 'c', 'b'],
'B' : ['one', 'two', 'three','two', 'two', 'one', 'three'],
'C' : ['three', np.nan, np.nan, 'two', 'three','two', 'one'],
'D' : [np.nan,2,3,2,3,np.nan,2]})
print (df)
Pokazuje ilości występowania – ile razy wystąpiło
In [44]:
df1 = df.groupby('A').size().reset_index(name='COUNT')
print (df1)
ILE RAZY BYŁO DLA A BEZ PUSTYCH KOMÓREK
In [45]:
df2 = df.groupby('A')['C'].count().reset_index(name='COUNT')
print (df2)
In [48]:
df
Out[48]:
Dla każdej kolumny ile razy było a, b i c
Tylko dla wartości bez pustych komórek
In [46]:
df3 = df.groupby('A').count().add_suffix('_COUNT').reset_index()
print (df3)
In [50]:
df
Out[50]:
Sama kolumna A
ile razy pojawiła się literka ale sortuje od największej do najmniejszej
In [49]:
df4 = (df['A'].value_counts()
.rename_axis('A')
.reset_index(name='COUNT'))
print (df4)