Skip to content

Latest commit

 

History

History
791 lines (636 loc) · 13.8 KB

pandas.md

File metadata and controls

791 lines (636 loc) · 13.8 KB
import pandas as pd
import numpy as np

pd.set_option("display.notebook_repr_html", False)
df = pd.DataFrame({'state' : ['Texas', 'Texas', 'Kansas', 'Texas', 'Missouri', 'Missouri', 'North Carolina'],
                   'city' : ['Austin', 'San Antonio', 'Kansas City', 'Llano', 'Kansas City', 'St. Louis', 'Raleigh'],
                   'bbq_rating' : [8, 9, 7, 10, 7, 6, 5],
                   'outdoor_rating' : [8, 7, 5, 3, 5, 7, 8]})

df
state city bbq_rating outdoor_rating
0 Texas Austin 8 8
1 Texas San Antonio 9 7
2 Kansas Kansas City 7 5
3 Texas Llano 10 3
4 Missouri Kansas City 7 5
5 Missouri St. Louis 6 7
6 North Carolina Raleigh 5 8

Groupby

# groupby object is a pd.DataFrame if a list or array is passed to be computed
grouped_df = df.groupby('state')[['bbq_rating']]
grouped_df
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc461866400>
# groupby object is a pd.Series if only a single column is passed to be computed
grouped_series = df.groupby('state')['bbq_rating']
grouped_series
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fc461866550>
# creates a GROUPBY object that can be iterated over to compile aggregrates
grouped = df.groupby(df['state'])
grouped
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc461866610>
# provides a statistical overview of the groupby object
grouped.describe()
bbq_rating outdoor_rating
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
state
Kansas 1.0 7.0 NaN 7.0 7.00 7.0 7.00 7.0 1.0 5.0 NaN 5.0 5.0 5.0 5.0 5.0
Missouri 2.0 6.5 0.707107 6.0 6.25 6.5 6.75 7.0 2.0 6.0 1.414214 5.0 5.5 6.0 6.5 7.0
North Carolina 1.0 5.0 NaN 5.0 5.00 5.0 5.00 5.0 1.0 8.0 NaN 8.0 8.0 8.0 8.0 8.0
Texas 3.0 9.0 1.000000 8.0 8.50 9.0 9.50 10.0 3.0 6.0 2.645751 3.0 5.0 7.0 7.5 8.0
# applying mean() method to groupby object grouped
grouped.mean()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
bbq_rating outdoor_rating
state
Kansas 7.0 5.0
Missouri 6.5 6.0
North Carolina 5.0 8.0
Texas 9.0 6.0
# grouping by 'outdoor_rating' field, passing multiple group keys, finding average; creates a pd.Series
means = df['outdoor_rating'].groupby([df['state'], df['city']]).mean()
means
state           city       
Kansas          Kansas City    5
Missouri        Kansas City    5
                St. Louis      7
North Carolina  Raleigh        8
Texas           Austin         8
                Llano          3
                San Antonio    7
Name: outdoor_rating, dtype: int64
# displays the 'size' or count of each group
df.groupby(['state']).size()
state
Kansas            1
Missouri          2
North Carolina    1
Texas             3
dtype: int64

Groupby agg

# using 'agg' with groupby object to aggregrate multi columns with multiple aggregrations
agg = {
    'city': 'nunique',
    'bbq_rating': ['mean', 'max'],
    'outdoor_rating': ['mean', 'max']
      }

# grouping by state, passing agg dict into agg() method
agg_df = df.groupby('state').agg(agg)
agg_df
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
city bbq_rating outdoor_rating
nunique mean max mean max
state
Kansas 1 7.0 7 5 5
Missouri 2 6.5 7 6 7
North Carolina 1 5.0 5 8 8
Texas 3 9.0 10 6 8
# list comphrension that creates new column names with column_aggregration notation; reset_index() flattens the summary
agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
agg_df.reset_index()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
state city_nunique bbq_rating_mean bbq_rating_max outdoor_rating_mean outdoor_rating_max
0 Kansas 1 7.0 7 5 5
1 Missouri 2 6.5 7 6 7
2 North Carolina 1 5.0 5 8 8
3 Texas 3 9.0 10 6 8

Iterating over Groups

# groupby object supports iteration; generates a tuple of group key name, associated data
for state, group in df.groupby('state'):
    print(f'State: {state}')
    print(group, '\n')
State: Kansas
    state         city  bbq_rating  outdoor_rating
2  Kansas  Kansas City           7               5 

State: Missouri
      state         city  bbq_rating  outdoor_rating
4  Missouri  Kansas City           7               5
5  Missouri    St. Louis           6               7 

State: North Carolina
            state     city  bbq_rating  outdoor_rating
6  North Carolina  Raleigh           5               8 

State: Texas
   state         city  bbq_rating  outdoor_rating
0  Texas       Austin           8               8
1  Texas  San Antonio           9               7
3  Texas        Llano          10               3 
# iterating over groupby object with multiple passed group keys
for (state, city), group in df.groupby(['state', 'city']):
    print(f'State: {state}')
    print(f'Specific City: {city}')
    print(group, '\n')
State: Kansas
Specific City: Kansas City
    state         city  bbq_rating  outdoor_rating
2  Kansas  Kansas City           7               5 

State: Missouri
Specific City: Kansas City
      state         city  bbq_rating  outdoor_rating
4  Missouri  Kansas City           7               5 

State: Missouri
Specific City: St. Louis
      state       city  bbq_rating  outdoor_rating
5  Missouri  St. Louis           6               7 

State: North Carolina
Specific City: Raleigh
            state     city  bbq_rating  outdoor_rating
6  North Carolina  Raleigh           5               8 

State: Texas
Specific City: Austin
   state    city  bbq_rating  outdoor_rating
0  Texas  Austin           8               8 

State: Texas
Specific City: LLano
   state   city  bbq_rating  outdoor_rating
3  Texas  LLano          10               3 

State: Texas
Specific City: San Antonio
   state         city  bbq_rating  outdoor_rating
1  Texas  San Antonio           9               7 

Creating Groupby Dict

# computing a dict of the data pieces
pieces = dict(list(df.groupby('state')))

pieces['Texas']
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
state city bbq_rating outdoor_rating
0 Texas Austin 8 8
1 Texas San Antonio 9 7
3 Texas LLano 10 3
# groupby object is a pd.DataFrame if a list or array is passed to be computed
grouped_df = df.groupby('state')[['bbq_rating']]
grouped_df
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc6319e4610>
# groupby object is a pd.Series if only a single column is passed to be computed
grouped_series = df.groupby('state')['bbq_rating']
grouped_series
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fc6319f7d90>
df.groupby('state').quantile(0.9)
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
bbq_rating outdoor_rating
state
Kansas 7.0 5.0
Missouri 6.9 6.8
North Carolina 5.0 8.0
Texas 9.8 7.8