import pandas as pd
import numpy as np
pd.set_option("display.notebook_repr_html", False)
df = pd.DataFrame({'state' : ['Texas', 'Texas', 'Kansas', 'Texas', 'Missouri', 'Missouri', 'North Carolina'],
'city' : ['Austin', 'San Antonio', 'Kansas City', 'Llano', 'Kansas City', 'St. Louis', 'Raleigh'],
'bbq_rating' : [8, 9, 7, 10, 7, 6, 5],
'outdoor_rating' : [8, 7, 5, 3, 5, 7, 8]})
df
|
state |
city |
bbq_rating |
outdoor_rating |
0 |
Texas |
Austin |
8 |
8 |
1 |
Texas |
San Antonio |
9 |
7 |
2 |
Kansas |
Kansas City |
7 |
5 |
3 |
Texas |
Llano |
10 |
3 |
4 |
Missouri |
Kansas City |
7 |
5 |
5 |
Missouri |
St. Louis |
6 |
7 |
6 |
North Carolina |
Raleigh |
5 |
8 |
# groupby object is a pd.DataFrame if a list or array is passed to be computed
grouped_df = df.groupby('state')[['bbq_rating']]
grouped_df
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc461866400>
# groupby object is a pd.Series if only a single column is passed to be computed
grouped_series = df.groupby('state')['bbq_rating']
grouped_series
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fc461866550>
# creates a GROUPBY object that can be iterated over to compile aggregrates
grouped = df.groupby(df['state'])
grouped
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc461866610>
# provides a statistical overview of the groupby object
grouped.describe()
|
bbq_rating |
outdoor_rating |
|
count |
mean |
std |
min |
25% |
50% |
75% |
max |
count |
mean |
std |
min |
25% |
50% |
75% |
max |
state |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Kansas |
1.0 |
7.0 |
NaN |
7.0 |
7.00 |
7.0 |
7.00 |
7.0 |
1.0 |
5.0 |
NaN |
5.0 |
5.0 |
5.0 |
5.0 |
5.0 |
Missouri |
2.0 |
6.5 |
0.707107 |
6.0 |
6.25 |
6.5 |
6.75 |
7.0 |
2.0 |
6.0 |
1.414214 |
5.0 |
5.5 |
6.0 |
6.5 |
7.0 |
North Carolina |
1.0 |
5.0 |
NaN |
5.0 |
5.00 |
5.0 |
5.00 |
5.0 |
1.0 |
8.0 |
NaN |
8.0 |
8.0 |
8.0 |
8.0 |
8.0 |
Texas |
3.0 |
9.0 |
1.000000 |
8.0 |
8.50 |
9.0 |
9.50 |
10.0 |
3.0 |
6.0 |
2.645751 |
3.0 |
5.0 |
7.0 |
7.5 |
8.0 |
# applying mean() method to groupby object grouped
grouped.mean()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
bbq_rating |
outdoor_rating |
state |
|
|
Kansas |
7.0 |
5.0 |
Missouri |
6.5 |
6.0 |
North Carolina |
5.0 |
8.0 |
Texas |
9.0 |
6.0 |
# grouping by 'outdoor_rating' field, passing multiple group keys, finding average; creates a pd.Series
means = df['outdoor_rating'].groupby([df['state'], df['city']]).mean()
means
state city
Kansas Kansas City 5
Missouri Kansas City 5
St. Louis 7
North Carolina Raleigh 8
Texas Austin 8
Llano 3
San Antonio 7
Name: outdoor_rating, dtype: int64
# displays the 'size' or count of each group
df.groupby(['state']).size()
state
Kansas 1
Missouri 2
North Carolina 1
Texas 3
dtype: int64
# using 'agg' with groupby object to aggregrate multi columns with multiple aggregrations
agg = {
'city': 'nunique',
'bbq_rating': ['mean', 'max'],
'outdoor_rating': ['mean', 'max']
}
# grouping by state, passing agg dict into agg() method
agg_df = df.groupby('state').agg(agg)
agg_df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
|
city |
bbq_rating |
outdoor_rating |
|
nunique |
mean |
max |
mean |
max |
state |
|
|
|
|
|
Kansas |
1 |
7.0 |
7 |
5 |
5 |
Missouri |
2 |
6.5 |
7 |
6 |
7 |
North Carolina |
1 |
5.0 |
5 |
8 |
8 |
Texas |
3 |
9.0 |
10 |
6 |
8 |
# list comphrension that creates new column names with column_aggregration notation; reset_index() flattens the summary
agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
agg_df.reset_index()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
state |
city_nunique |
bbq_rating_mean |
bbq_rating_max |
outdoor_rating_mean |
outdoor_rating_max |
0 |
Kansas |
1 |
7.0 |
7 |
5 |
5 |
1 |
Missouri |
2 |
6.5 |
7 |
6 |
7 |
2 |
North Carolina |
1 |
5.0 |
5 |
8 |
8 |
3 |
Texas |
3 |
9.0 |
10 |
6 |
8 |
# groupby object supports iteration; generates a tuple of group key name, associated data
for state, group in df.groupby('state'):
print(f'State: {state}')
print(group, '\n')
State: Kansas
state city bbq_rating outdoor_rating
2 Kansas Kansas City 7 5
State: Missouri
state city bbq_rating outdoor_rating
4 Missouri Kansas City 7 5
5 Missouri St. Louis 6 7
State: North Carolina
state city bbq_rating outdoor_rating
6 North Carolina Raleigh 5 8
State: Texas
state city bbq_rating outdoor_rating
0 Texas Austin 8 8
1 Texas San Antonio 9 7
3 Texas Llano 10 3
# iterating over groupby object with multiple passed group keys
for (state, city), group in df.groupby(['state', 'city']):
print(f'State: {state}')
print(f'Specific City: {city}')
print(group, '\n')
State: Kansas
Specific City: Kansas City
state city bbq_rating outdoor_rating
2 Kansas Kansas City 7 5
State: Missouri
Specific City: Kansas City
state city bbq_rating outdoor_rating
4 Missouri Kansas City 7 5
State: Missouri
Specific City: St. Louis
state city bbq_rating outdoor_rating
5 Missouri St. Louis 6 7
State: North Carolina
Specific City: Raleigh
state city bbq_rating outdoor_rating
6 North Carolina Raleigh 5 8
State: Texas
Specific City: Austin
state city bbq_rating outdoor_rating
0 Texas Austin 8 8
State: Texas
Specific City: LLano
state city bbq_rating outdoor_rating
3 Texas LLano 10 3
State: Texas
Specific City: San Antonio
state city bbq_rating outdoor_rating
1 Texas San Antonio 9 7
# computing a dict of the data pieces
pieces = dict(list(df.groupby('state')))
pieces['Texas']
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
state |
city |
bbq_rating |
outdoor_rating |
0 |
Texas |
Austin |
8 |
8 |
1 |
Texas |
San Antonio |
9 |
7 |
3 |
Texas |
LLano |
10 |
3 |
# groupby object is a pd.DataFrame if a list or array is passed to be computed
grouped_df = df.groupby('state')[['bbq_rating']]
grouped_df
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc6319e4610>
# groupby object is a pd.Series if only a single column is passed to be computed
grouped_series = df.groupby('state')['bbq_rating']
grouped_series
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fc6319f7d90>
df.groupby('state').quantile(0.9)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
bbq_rating |
outdoor_rating |
state |
|
|
Kansas |
7.0 |
5.0 |
Missouri |
6.9 |
6.8 |
North Carolina |
5.0 |
8.0 |
Texas |
9.8 |
7.8 |