---- Working With Data ------
#data cleanup on import - e.g. remove dollar signs or extraneous commas;
#pass them to df.replace(), specifying each character and its replacement
#(regex=True is a pandas argument; no separate regex import is needed):
df[cols] = df[cols].replace({r'\$': '', ',': ''}, regex=True)
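#a minimal sketch of the cleanup above, using an invented 'price' column:
import pandas as pd
toy = pd.DataFrame({'price': ['$1,200', '$950']})
toy[['price']] = toy[['price']].replace({r'\$': '', ',': ''}, regex=True)
toy['price'] = toy['price'].astype(float)  #safe to cast once the symbols are gone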
#open a file manually before bringing it into pandas
with open('file.csv', 'r') as f:
    exam_lines = f.readlines()
#Rename Columns
df.rename(columns={'$b': 'B'}, inplace=True)
#Null Values
#quick way to count the total number of null values (assumes import numpy as np)
np.count_nonzero(df.isnull())
#can pass a column
np.count_nonzero(df['col1'].isnull())
#fill na with value, in this case, zero
df.fillna(0)
#fill forward or fill backward with adjacent data
df.ffill()  #or df.bfill(); fillna(method='ffill') is deprecated in pandas 2.x
df.dropna() #drops missing values
#when performing calculations, skipna=True by default
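#quick illustration with a toy Series: ffill copies the last valid value forward
s = pd.Series([1.0, None, None, 4.0])
s.ffill()  #-> 1.0, 1.0, 1.0, 4.0
s.bfill()  #-> 1.0, 4.0, 4.0, 4.0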
#Drops columns based on column index
df.drop(df.columns[1:10], axis=1, inplace=True)
#view rows with null values
df.loc[pd.isnull(df['col1']),:]
#apply a function along an axis of the dataframe
df.apply(my_function, axis=0)  #axis=0 applies column-wise; axis=1 applies row-wise
#to transform a single column, apply on the Series itself
df['col1'] = df['col1'].apply(my_function)
#merge and join data
new_df = df1.merge(df2, left_on='keycol_fromdf1', right_on='keycol_fromdf2')
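#toy merge sketch with invented frames; the how= parameter defaults to 'inner'
left = pd.DataFrame({'key': ['a', 'b'], 'x': [1, 2]})
right = pd.DataFrame({'key': ['b', 'c'], 'y': [3, 4]})
left.merge(right, on='key', how='outer')  #'outer' keeps unmatched keys from both sides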
----- Slicing and Selecting Data ------
#loc selects rows by label; iloc selects rows by integer position
#display the 100th row (label 99 under the default RangeIndex)
print(df.loc[99])
#display multiple rows
print(df.loc[[0,99,999]])
#slice or subset columns. Note the colon, which selects all rows for the listed columns
subset = df.loc[:,['col1', 'col2']]
#subset or slice columns with iloc, passing numeric values
subset = df.iloc[:, [2,4,-1]]
#or
subset = df.iloc[:,:3]
#to select non-contiguous columns with ranges,
pd.concat([df[['col1','col2']], df.loc[:,'col4':'col50']], axis=1)
#alternatively, use numpy arange to create a list of columns
initial_col = [0,1]
others = np.arange(3,7).tolist()
columns = initial_col + others
new_df = df.iloc[:,columns]
#subset columns - selects all rows; a row indexer can be passed here as well
df.loc[:, ['col1', 'col2', 'col_n']]
#select all columns "except" those listed
df2 = df[df.columns.difference(['col_a', 'col_f','col_g'])]
#conditional selection
#each condition is enclosed in its own parentheses
df.loc[df.column == 'Value_1']
df.loc[(df.column1 == 'Value_1') & (df.column2 >= 30)]
df.loc[(df.column1 == 'Value_1') | (df.column2 >= 30)]
df.loc[df.column2.isin(['Value_1', 'Value_2'])]
df.loc[df.column.notnull()] #or isnull
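#equivalent conditions via .query(), assuming the column names are valid identifiers
df.query("column1 == 'Value_1' and column2 >= 30")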
#split and combine columns
col1_split = df['col1'].str.split('_', expand=True)
col1_split.columns = ['new_col1', 'new_col2']
joined_df = pd.concat([df, col1_split], axis=1)
#alternate way
new_col = df['col_name'].str.split('?',expand=True, n=1)
df_joined = pd.concat([new_col, df], axis=1)
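#worked sketch with invented data: split one underscore-delimited column into two
locs = pd.DataFrame({'loc': ['Austin_TX', 'Boise_ID']})
parts = locs['loc'].str.split('_', expand=True)
parts.columns = ['city', 'state']
pd.concat([locs, parts], axis=1)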
------- Groupby and Aggregations -----------
df.groupby('col_for_grouping')['col_for_aggregating'].sum()
#create a multi-column grouped df that groups by cols 1 and 2, then sums cols 3 and 4
#This will create a multi-index dataframe
df2 = df.groupby(['col1', 'col2'])[['col3', 'col4']].sum()
#use reset_index to flatten the dataframe and avoid multi-index
#Use aggregation to look at multiple attributes of a second column
newdf = df.groupby('col1').col2.agg(['min', 'max'])
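#named aggregation (pandas >= 0.25) labels the output columns directly
df.groupby('col1').agg(col2_min=('col2', 'min'), col2_max=('col2', 'max'))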
#boolean subset of DataFrame
df[df['col1'] > criteria]
#Groupby
#standard syntax is to specify first the column to group by, then the aggregation function to occur
grouped_df = df.groupby(['col1'])['col2'].sum()
#alternatively, create a groupby object first and aggregate later
grouped = df.groupby('col1')
#to inspect the groups or perform calculations
grouped.groups  #or grouped.sum(), grouped.mean(), grouped.min()
#reshape data from wide to long (tall), a.k.a. "tidy data"
pd.melt(df, id_vars='col_to_keep_unchanged')
#id_vars is a container (list, tuple, ndarray) of the variables that will remain as-is
#value_vars identifies the columns you want to melt down. By default, pandas will melt
#any columns not specified in the id_vars parameter
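#toy example to make melt concrete (invented columns):
wide = pd.DataFrame({'name': ['a', 'b'], '2019': [1, 2], '2020': [3, 4]})
pd.melt(wide, id_vars='name', var_name='year', value_name='value')
#each year column becomes rows under 'year' / 'value'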
#filter data based on a specific criteria. Here it filters on having a minimum of 30 samples.
new_df = df.groupby('col1').filter(lambda x: x['col1'].count() >=30)
#load multiple csv files into a single dataframe
import glob
list_of_filenames = glob.glob('filepath/*.csv')
list_filename_df = []
for csv_filename in list_of_filenames:
    df = pd.read_csv(csv_filename)
    list_filename_df.append(df)
df_loop_concat = pd.concat(list_filename_df)
#same loop as a list comprehension
list_filename_df_comp = [pd.read_csv(data) for data in list_of_filenames]
df_loop_concat_comp = pd.concat(list_filename_df_comp)
#pull filenames and strip the '.csv' extension, adding the names to a list
filenames = []
for name in glob.glob('*.csv'):
    filenames.append(name[:-4])
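#equivalent with pathlib from the standard library; .stem drops the extension
from pathlib import Path
filenames = [p.stem for p in Path('.').glob('*.csv')]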
---- Time-based Calculations ------
#datetime objects have a number of built-in attributes
d = pd.to_datetime('2016-02-29')
d.month  #also .year, .day, .dayofweek, .day_name()
#for custom string output, see the strftime format codes (they come from Python,
#not pandas): d.strftime('%Y-%m')
#extract a portion of a date for a new column
df['year']= df['Date'].dt.year
#creating new columns based on date attributes
df['closing_quarter'] = df['Closing Date'].dt.quarter
#subsetting data by dates - two examples
df.loc[(df.Date.dt.year == 2010) & (df.Date.dt.month ==6)]
df.loc[(df.Date.dt.year >= 2010) & (df.Date.dt.year < 2011)]
#setting the date column as the index gives even more flexibility. Example selects all rows from June 2010
df.index = df['Date']
df.loc['2010-06']  #partial string indexing on a DatetimeIndex
#resample date frequencies. If you resample to a higher frequency than the data, NaN fills missing values.
new = df.resample('M').sum()
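#minimal resample sketch with generated data ('ME' replaces 'M' in pandas >= 2.2)
idx = pd.date_range('2020-01-01', periods=90, freq='D')
pd.Series(range(90), index=idx).resample('M').sum()  #one row per month-end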
#convert an H:MM:SS string to total seconds (plain Python)
def get_sec(time_str):
    h, m, s = time_str.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)
print(get_sec('1:23:45'))
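#pandas can do the same conversion directly via Timedelta
pd.to_timedelta('1:23:45').total_seconds()  #5025.0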
#Time-Date Handling
#Trim a datetime to date only (two steps: strip the time, then restore datetime dtype)
df['Date_col'] = df['Date_col'].dt.date
df['Date_col'] = pd.to_datetime(df['Date_col'])
#equivalently, df['Date_col'].dt.normalize() keeps datetime dtype with time set to midnight
#TimeZones
#specify timezones and perform conversions (pandas accepts IANA/pytz zone names directly)
depart = pd.Timestamp('2017-08-29 07:00', tz='US/Eastern')
#mixing tz-naive and tz-aware timestamps raises an error; converting both sides to a
#common zone keeps timedelta arithmetic unambiguous. Hypothetical Pacific arrival:
arrive = pd.Timestamp('2017-08-29 10:00', tz='US/Pacific')
duration = arrive.tz_convert('US/Eastern') - depart
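#related sketch: tz_localize attaches a zone to a naive timestamp
naive = pd.Timestamp('2017-08-29 10:00')
aware = naive.tz_localize('US/Pacific')
aware.tz_convert('US/Eastern')  #13:00 Eastern, the same instant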
#Linear Regression
import statsmodels.formula.api as smf
model = smf.ols(formula='tip ~ total_bill', data=tips)  #dependent ~ independent, separated by ~
results = model.fit()
print(results.summary())
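#individual statistics are attributes on the fitted results object (statsmodels OLS)
results.params    #intercept and slope
results.rsquared  #coefficient of determination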
----- Correlation and Statistical Analysis -------
#compute the correlation matrix, then print the values in descending order for a target column
corr_matrix = df.corr(numeric_only=True)  #numeric_only avoids errors on non-numeric columns (pandas >= 1.5)
corr_matrix['target_column'].sort_values(ascending=False)
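#corr() also accepts rank-based methods
df.corr(method='spearman', numeric_only=True)  #or method='kendall'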