import pandas as pd
import dateutil
import numpy as np

df = pd.read_csv('c:/1/phone_data.csv')
df.head(4)

# Parse the day-first timestamps (e.g. "15/10/14 06:58") in one vectorized
# call. The previous row-wise `dateutil.parser.parse` only worked because
# pandas happens to import the `dateutil.parser` submodule; this file itself
# only does `import dateutil`.
df['date'] = pd.to_datetime(df['date'], dayfirst=True)

# Number of rows and columns in the dataset.
df.shape

(830, 7)

# Longest single duration value in the dataset.
durations = df['duration']
durations.max()

10528.0

# Show the full record(s) holding the maximum duration.
longest = df['duration'].max()
df.loc[df['duration'] == longest]

# Total duration of all records whose item type is 'call'.
df.loc[df['item'] == 'call', 'duration'].sum()

92321.0

# The 10 largest call durations (index label -> duration).
call_durations = df.loc[df['item'] == 'call', 'duration']
call_durations.nlargest(10)

816    10528.0
742     2328.0
252     2120.0
59      1940.0
648     1863.0
398     1859.0
31      1714.0
809     1325.0
548     1247.0
105     1234.0
Name: duration, dtype: float64

# The 10 longest calls, as full rows. Taking the top 10 by duration replaces
# the hard-coded `duration > 1200` cutoff, which matched 11 calls rather
# than 10.
calls = df.loc[df['item'] == 'call']
calls.nlargest(10, 'duration')

# The 10 longest calls on the Tesco network. Both conditions are combined
# into one mask aligned with `df`, instead of applying a full-length mask to
# an already-filtered Series.
tesco_calls = (df['item'] == 'call') & (df['network'] == 'Tesco')
df.loc[tesco_calls, 'duration'].nlargest(10)

105    1234.0
27      783.0
631     777.0
615     700.0
83      637.0
272     600.0
438     566.0
417     543.0
428     489.0
603     411.0
Name: duration, dtype: float64

# Second approach: select the Tesco call rows first, then take the 10
# largest durations.
is_call = df['item'] == 'call'
is_tesco = df['network'] == 'Tesco'
df.loc[is_call & is_tesco, 'duration'].nlargest(10)

105    1234.0
27      783.0
631     777.0
615     700.0
83      637.0
272     600.0
438     566.0
417     543.0
428     489.0
603     411.0
Name: duration, dtype: float64

# Number of records in each month. This counts rows of every item type;
# it does not sum talk time.
df['month'].value_counts()

2014-11    230
2015-01    205
2014-12    157
2015-02    137
2015-03    101
Name: month, dtype: int64

# Total duration per month and network, with one column per item type.
# The aggregation is named by string: newer pandas releases warn when NumPy
# reducers such as np.sum are passed as aggregation functions.
df.pivot_table(index=['month', 'network'], values='duration', columns='item', aggfunc=['sum'])

# How many distinct networks appear in the data?
df['network'].nunique()

9

df.describe()

# Duration statistics per network. String names are used for every
# aggregation; passing NumPy callables such as np.std is deprecated in
# newer pandas releases.
df.pivot_table(index='network', values='duration', aggfunc=['sum', 'max', 'min', 'std', 'median'])

# Summary statistics for the duration column.
df['duration'].agg(['min', 'max', 'mean', 'median', 'std'])

min           1.000000
max       10528.000000
mean        117.804036
median       24.500000
std         444.129560
Name: duration, dtype: float64

# Column labels of the DataFrame.
df.columns

Index(['index', 'date', 'duration', 'item', 'month', 'network',
       'network_type'],
      dtype='object')

# Line plot of duration by month; hue distinguishes the network and line
# size the item type.
import seaborn as sns
sns.relplot(
    data=df,
    x='month',
    y='duration',
    hue='network',
    size='item',
    kind="line",
)

<seaborn.axisgrid.FacetGrid at 0x1efe7764390>

Groupby

# The tutorial's way of listing the months: keys of the groupby groups.
# Grouping by the scalar 'month' rather than a one-element list keeps the
# keys as plain strings. NOTE(review): newer pandas releases deprecate scalar
# keys for list-of-one groupers, so confirm against the installed version.
df.groupby('month').groups.keys()

dict_keys(['2014-11', '2014-12', '2015-01', '2015-02', '2015-03'])

# My way: the distinct values of the month column.
df['month'].unique()

array(['2014-11', '2014-12', '2015-01', '2015-02', '2015-03'],
      dtype=object)

# The tutorial's way: number of rows in the '2014-11' group.
# Grouping by the scalar key keeps the string lookup below valid.
# NOTE(review): with a one-element list, newer pandas releases are moving to
# tuple keys; confirm against the installed version.
len(df.groupby('month').groups['2014-11'])

230

# My way: filter the rows for '2014-11' and count them.
in_november = df['month'] == '2014-11'
df.loc[in_november, 'month'].count()

230

# First record in each month.
by_month = df.groupby('month')
by_month.first()

df.dtypes

index                    int64
date            datetime64[ns]
duration               float64
item                    object
month                   object
network                 object
network_type            object
dtype: object

# Count missing values per column to check that the data is complete.
missing = df.isna()
missing.sum()

index           0
date            0
duration        0
item            0
month           0
network         0
network_type    0
dtype: int64

# The 10 longest calls, as full rows. Grouping by duration (the previous
# approach) merged rows sharing a duration and included non-call items.
df[df['item'] == 'call'].nlargest(10, 'duration')

# Total duration per month, summed over every item type (not only calls).
df.groupby('month')['duration'].sum()

month
2014-11    26639.441
2014-12    14641.870
2015-01    18223.299
2015-02    15522.299
2015-03    22750.441
Name: duration, dtype: float64

# Number of records per month, counted on the date column.
dates = df['date']
dates.groupby(df['month']).count()

month
2014-11    230
2014-12    157
2015-01    205
2015-02    137
2015-03    101
Name: date, dtype: int64

# Total duration of phone calls only, per network.
calls_only = df.loc[df['item'] == 'call']
calls_only.groupby('network')['duration'].sum()

network
Meteor        7200.0
Tesco        13828.0
Three        36464.0
Vodafone     14621.0
landline     18433.0
voicemail     1775.0
Name: duration, dtype: float64

# Number of records per month and item type.
month_and_item = ['month', 'item']
df.groupby(month_and_item)['date'].count()

month    item
2014-11  call    107
         data     29
         sms      94
2014-12  call     79
         data     30
         sms      48
2015-01  call     88
         data     31
         sms      86
2015-02  call     67
         data     31
         sms      39
2015-03  call     47
         data     29
         sms      25
Name: date, dtype: int64

# Number of records per month and item type, this time with a pivot table.
df.pivot_table(
    index=['month', 'item'],
    values='duration',
    aggfunc=['count'],
)

# Number of records per month and network type. Selecting a single column
# with ['date'] produces a Series.
by_network_type = df.groupby(['month', 'network_type'])
by_network_type['date'].count()

month    network_type
2014-11  data             29
         landline          5
         mobile          189
         special           1
         voicemail         6
2014-12  data             30
         landline          7
         mobile          108
         voicemail         8
         world             4
2015-01  data             31
         landline         11
         mobile          160
         voicemail         3
2015-02  data             31
         landline          8
         mobile           90
         special           2
         voicemail         6
2015-03  data             29
         landline         11
         mobile           54
         voicemail         4
         world             3
Name: date, dtype: int64

# Number of records per month and network type. Selecting with [['date']]
# produces a DataFrame instead of a Series.
by_network_type = df.groupby(['month', 'network_type'])
by_network_type[['date']].count()

# The same counts per month and network type from a pivot table.
df.pivot_table(
    index=['month', 'network_type'],
    values='duration',
    aggfunc=['count'],
)

# Total duration per month, returned as a Series.
by_month = df.groupby('month')
by_month['duration'].sum()

month
2014-11    26639.441
2014-12    14641.870
2015-01    18223.299
2015-02    15522.299
2015-03    22750.441
Name: duration, dtype: float64

# Total duration per month, returned as a DataFrame.
by_month = df.groupby('month')
by_month[['duration']].sum()

# as_index=False keeps 'month' as an ordinary column instead of the index.
flat_by_month = df.groupby('month', as_index=False)
flat_by_month.agg({"duration": "sum"})

# Advanced grouping: a different aggregation for each column.
# (Author's note: I do not know how to do this with a pivot table.)
# Aggregations are named by string; passing the builtin `sum` is deprecated
# in newer pandas releases.
df.groupby(['month', 'item']).agg({'duration': 'sum', 'network_type': "count", 'date': 'first'})

# Total duration and number of records per month and network.
df.groupby(['month', 'network']).agg({'duration': 'sum', 'item': "count"})

# Equivalent figures from a pivot table.
df.pivot_table(index=['month', 'network'], values='duration', aggfunc=['sum', 'count'])

# Total duration and number of records per month, network and item type.
df.groupby(['month', 'network', 'item']).agg({'duration': 'sum', 'item': "count"})

df.dtypes

index                    int64
date            datetime64[ns]
duration               float64
item                    object
month                   object
network                 object
network_type            object
dtype: object

# Longest duration per month, minus one. The built-in 'max' aggregation
# replaces a lambda that called Python's max() on every group.
df.groupby('month').agg({'duration': 'max'}) - 1

# Longest duration per month.
df.groupby('month').agg({'duration': 'max'})

# Total duration per month.
df.groupby('month').agg({'duration': 'sum'})

# Total duration per network.
df.groupby('network').agg({'duration': 'sum'})

# Several statistics per column at once, per month and item type.
# String names replace the builtins min/max/sum, which newer pandas
# releases warn about.
df.groupby(['month', 'item']).agg({'duration': ['min', 'max', 'sum'], 'network_type': "count", 'date': ['min', 'first', 'nunique']})

# Minimum, maximum and mean duration per month.
grouped = df.groupby('month').agg({'duration': ['min', 'max', 'mean']})
# Relabel the statistic names; the top-level 'duration' label stays in place.
# Alternatives: assign a flat list of names to grouped.columns, or drop the
# top level with grouped.columns.droplevel(level=0).
grouped = grouped.rename(columns={"min": "min_duration", "max": "max_duration", "mean": "mean_duration"})
grouped.head()

# Same statistics, with the two column levels joined into flat names such as
# "duration_min". Iterating the column index yields the (level0, level1)
# tuples directly, so no ravel() call is needed.
grouped = df.groupby('month').agg({'duration': ['min', 'max', 'mean']})
grouped.columns = ["_".join(levels) for levels in grouped.columns]
grouped

# Per-month statistics for calls only, using named aggregation:
# output_column=(source_column, function).
# The previous nested-dict renaming form is deprecated and is rejected by
# current pandas releases. The result now has flat column names instead of
# a two-level column index.
calls = df[df['item'] == 'call']
calls.groupby('month').agg(
    total_duration=('duration', 'sum'),
    average_duration=('duration', 'mean'),
    num_calls=('duration', 'count'),
    max_date=('date', 'max'),
    min_date=('date', 'min'),
    # Time span between the first and last call of the month.
    num_days=('date', lambda dates: dates.max() - dates.min()),
    network_count=('network', 'count'),
    network_max=('network', 'max'),
)

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\generic.py:1315: FutureWarning: using a dict with renaming is deprecated and will be removed in a future version
  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)

		sum
	item	call	data	sms
month	network
2014-11	Meteor	1521.0	NaN	10.0
	Tesco	4045.0	NaN	3.0
	Three	12458.0	NaN	25.0
	Vodafone	4316.0	NaN	55.0
	data	NaN	998.441	NaN
	landline	2906.0	NaN	NaN
	special	NaN	NaN	1.0
	voicemail	301.0	NaN	NaN
2014-12	Meteor	2010.0	NaN	12.0
	Tesco	1819.0	NaN	1.0
	Three	6316.0	NaN	13.0
	Vodafone	1302.0	NaN	18.0
	data	NaN	1032.870	NaN
	landline	1424.0	NaN	NaN
	voicemail	690.0	NaN	NaN
	world	NaN	NaN	4.0
2015-01	Meteor	2207.0	NaN	10.0
	Tesco	2904.0	NaN	3.0
	Three	6445.0	NaN	33.0
	Vodafone	3626.0	NaN	40.0
	data	NaN	1067.299	NaN
	landline	1603.0	NaN	NaN
	voicemail	285.0	NaN	NaN
2015-02	Meteor	1188.0	NaN	1.0
	Tesco	4087.0	NaN	2.0
	Three	6279.0	NaN	11.0
	Vodafone	1864.0	NaN	23.0
	data	NaN	1067.299	NaN
	landline	730.0	NaN	NaN
	special	NaN	NaN	2.0
	voicemail	268.0	NaN	NaN
2015-03	Meteor	274.0	NaN	NaN
	Tesco	973.0	NaN	4.0
	Three	4966.0	NaN	5.0
	Vodafone	3513.0	NaN	13.0
	data	NaN	998.441	NaN
	landline	11770.0	NaN	NaN
	voicemail	231.0	NaN	NaN
	world	NaN	NaN	3.0

THE DATA SCIENCE LIBRARY

Wojciech Moszczyński

Formuła Pandas: groupby part 1

Groupby

	index	date	duration	item	month	network	network_type
0	0	15/10/14 06:58	34.429	data	2014-11	data	data
1	1	15/10/14 06:58	13.000	call	2014-11	Vodafone	mobile
2	2	15/10/14 14:46	23.000	call	2014-11	Meteor	mobile
3	3	15/10/14 14:48	4.000	call	2014-11	Tesco	mobile

	index	date	duration	item	month	network	network_type
31	31	2014-10-18 13:10:00	1714.0	call	2014-11	Three	mobile
59	59	2014-10-23 08:34:00	1940.0	call	2014-11	landline	landline
105	105	2014-10-31 13:27:00	1234.0	call	2014-11	Tesco	mobile
171	171	2014-11-07 09:33:00	1205.0	call	2014-11	Vodafone	mobile
252	252	2014-11-19 18:56:00	2120.0	call	2014-12	Three	mobile
398	398	2014-12-17 18:08:00	1859.0	call	2015-01	Vodafone	mobile
548	548	2015-01-08 20:31:00	1247.0	call	2015-01	Three	mobile
648	648	2015-01-25 16:55:00	1863.0	call	2015-02	Three	mobile
742	742	2015-02-17 19:09:00	2328.0	call	2015-03	Three	mobile
809	809	2015-03-03 14:34:00	1325.0	call	2015-03	Vodafone	mobile
816	816	2015-03-04 12:29:00	10528.0	call	2015-03	landline	landline

	index	duration
count	830.000000	830.000000
mean	414.500000	117.804036
std	239.744656	444.129560
min	0.000000	1.000000
25%	207.250000	1.000000
50%	414.500000	24.500000
75%	621.750000	55.000000
max	829.000000	10528.000000

	sum	max	min	std	median
	duration	duration	duration	duration	duration
network
Meteor	7233.00	1090.000	1.000	169.690291	5.000
Tesco	13841.00	1234.000	1.000	227.471611	72.500
Three	36551.00	2328.000	1.000	368.311638	4.000
Vodafone	14770.00	1859.000	1.000	232.527090	1.000
data	5164.35	34.429	34.429	0.000000	34.429
landline	18433.00	10528.000	3.000	1631.415609	75.000
special	3.00	1.000	1.000	0.000000	1.000
voicemail	1775.00	174.000	1.000	44.294984	63.000
world	7.00	1.000	1.000	0.000000	1.000

	index	date	duration	item	network	network_type
month
2014-11	0	2014-10-15 06:58:00	34.429	data	data	data
2014-12	228	2014-11-13 06:58:00	34.429	data	data	data
2015-01	381	2014-12-13 06:58:00	34.429	data	data	data
2015-02	577	2015-01-13 06:58:00	34.429	data	data	data
2015-03	729	2015-02-12 20:15:00	69.000	call	landline	landline

		date
month	network_type
2014-11	data	29
	landline	5
	mobile	189
	special	1
	voicemail	6
2014-12	data	30
	landline	7
	mobile	108
	voicemail	8
	world	4
2015-01	data	31
	landline	11
	mobile	160
	voicemail	3
2015-02	data	31
	landline	8
	mobile	90
	special	2
	voicemail	6
2015-03	data	29
	landline	11
	mobile	54
	voicemail	4
	world	3

	duration
month
2014-11	26639.441
2014-12	14641.870
2015-01	18223.299
2015-02	15522.299
2015-03	22750.441

	duration
month
2014-11	1939.0
2014-12	2119.0
2015-01	1858.0
2015-02	1862.0
2015-03	10527.0

		duration			network_type	date
		min	max	sum	count	min	first	nunique
month	item
2014-11	call	1.000	1940.000	25547.000	107	2014-10-15 06:58:00	2014-10-15 06:58:00	104
	data	34.429	34.429	998.441	29	2014-10-15 06:58:00	2014-10-15 06:58:00	29
	sms	1.000	1.000	94.000	94	2014-10-16 22:18:00	2014-10-16 22:18:00	79
2014-12	call	2.000	2120.000	13561.000	79	2014-11-14 17:24:00	2014-11-14 17:24:00	76
	data	34.429	34.429	1032.870	30	2014-11-13 06:58:00	2014-11-13 06:58:00	30
	sms	1.000	1.000	48.000	48	2014-11-14 17:28:00	2014-11-14 17:28:00	41
2015-01	call	2.000	1859.000	17070.000	88	2014-12-15 20:03:00	2014-12-15 20:03:00	84
	data	34.429	34.429	1067.299	31	2014-12-13 06:58:00	2014-12-13 06:58:00	31
	sms	1.000	1.000	86.000	86	2014-12-15 19:56:00	2014-12-15 19:56:00	58
2015-02	call	1.000	1863.000	14416.000	67	2015-01-15 10:36:00	2015-01-15 10:36:00	67
	data	34.429	34.429	1067.299	31	2015-01-13 06:58:00	2015-01-13 06:58:00	31
	sms	1.000	1.000	39.000	39	2015-01-15 12:23:00	2015-01-15 12:23:00	27
2015-03	call	2.000	10528.000	21727.000	47	2015-02-12 20:15:00	2015-02-12 20:15:00	47
	data	34.429	34.429	998.441	29	2015-02-13 06:58:00	2015-02-13 06:58:00	29
	sms	1.000	1.000	25.000	25	2015-02-19 18:46:00	2015-02-19 18:46:00	17

	duration
	min_duration	max_duration	mean_duration
month
2014-11	1.0	1940.0	115.823657
2014-12	1.0	2120.0	93.260318
2015-01	1.0	1859.0	88.894141
2015-02	1.0	1863.0	113.301453
2015-03	1.0	10528.0	225.251891

	duration			date			network
	total_duration	average_duration	num_calls	max_date	min_date	num_days	count	max
month
2014-11	25547.0	238.757009	107	2014-11-12 19:01:00	2014-10-15 06:58:00	28 days 12:03:00	107	voicemail
2014-12	13561.0	171.658228	79	2014-12-14 19:54:00	2014-11-14 17:24:00	30 days 02:30:00	79	voicemail
2015-01	17070.0	193.977273	88	2015-01-14 20:47:00	2014-12-15 20:03:00	30 days 00:44:00	88	voicemail
2015-02	14416.0	215.164179	67	2015-02-09 17:54:00	2015-01-15 10:36:00	25 days 07:18:00	67	voicemail
2015-03	21727.0	462.276596	47	2015-03-04 12:29:00	2015-02-12 20:15:00	19 days 16:14:00	47	voicemail