-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtools.py
153 lines (121 loc) · 4.89 KB
/
tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import numpy as np
import pandas as pd
def clean_portfolio(df, channel_types=['web', 'email', 'mobile', 'social']):
"""
Cleans the raw portfolio dataframe through various cleaning steps.
Input:
df - the raw portfolio dataframe from portfolio.json
channel_types - a list of str's for the different media channels
Output:
clean_df - the cleaned dataframe.
Cleaning steps:
- Takes channels column and converts it into 0/1's columns for each channel.
- Reorder the columns.
- Rename the id column to "offer_id".
"""
# Get a dataframe for channel data
# Iterates through each list in the .channels series
# to pull out whether each channel is present or not.
campaigns = []
for campaign in df.channels:
campaign = set(campaign)
channels = []
for channel in channel_types:
if channel in campaign:
channels.append(1)
else:
channels.append(0)
campaigns.append(channels)
channel_frame = pd.DataFrame(campaigns)
channel_frame.columns = channel_types
# Reorder the original dataframe as well replacing the channels column
# with the new channel dataframe
clean_df = pd.concat([df[['id']],
channel_frame,
df[['offer_type', 'duration', 'difficulty', 'reward']]],
axis=1)
# Rename id as offer_id
clean_df = clean_df.rename(columns={'id':'offer_id'})
# Convert duration in days to hours
clean_df['duration'] = (clean_df['duration'] * 24).astype('int')
return clean_df
def clean_profile(df):
"""
Cleans the raw portfolio dataframe through various cleaning steps.
Input:
df - the raw profile dataframe from profile.json
Output:
clean_df - the cleaned dataframe.
Cleaning steps:
- Standardize the different types of NAs (None, age == 118) into np.nan.
- Convert 'became_member_on' into datetime objects.
- Reorder the columns.
- Rename the user id.
"""
df = df.copy()
# Standardize NAs to numpy nan
df['age'] = df['age'].replace(118, np.nan)
df['gender'] = df['gender'].fillna(np.nan)
# Convert str to datetime
df['became_member_on'] = pd.to_datetime(
df['became_member_on'], format='%Y%m%d')
# Reorder the columns
clean_df = df[['id', 'gender', 'age', 'income', 'became_member_on']]
# Relabel id as user_id
clean_df = clean_df.rename(columns={'id': 'customer_id'})
return clean_df
def clean_transcript(df):
"""
Cleans the raw transcript dataframe through various cleaning steps.
Input:
df - the raw transcript dataframe from transcript.json
Output:
clean_df - the cleaned dataframe.
Cleaning steps:
- Replace spaces in event strings to underscores.
- Expand out "value" column.
"""
df = df.copy()
# Replace space with underscore in event column
df['event'] = df['event'].str.replace(' ', '_')
# Expand the value column into multiple columns
value_frame = pd.DataFrame(df['value'].to_list())
# Combine 'offer id' and 'offer_id' columns
# First get the non nulls from each and check that they don't overlap
a = value_frame['offer id'].notnull()
b = value_frame['offer_id'].notnull()
assert ~((a & b).any()) # True if no overlap
# Make the combined column and drop 'offer id'
value_frame['offer_id'] = value_frame['offer_id'].combine_first(
value_frame['offer id'])
value_frame = value_frame.drop(labels=['offer id'], axis=1)
# Concatenate and reorder the columns
clean_df = pd.concat([df, value_frame], axis=1)
clean_df = clean_df[['person', 'time',
'event', 'amount', 'reward', 'offer_id']]
# Rename the 'person column'
clean_df = clean_df.rename(columns={'person': 'customer_id'})
return clean_df
def reconciliate_ids(primary_df, secondary_df, id_column_name):
"""
Takes hashed ids from two dataframes, and changes them to numbers.
The primary dataframe should be either portfolio or profile.
The secondary dataframe should be the transcript dataframe.
Input:
primary_df - either portfolio or profile dataframe
secondary_df - the transcript dataframe
id_column_name - a str representing the column you want to reconciliate
Output:
df1, df2 - dataframes with new id numbers, of the primary and secondary dfs respectively.
"""
# Renaming for convenience
df1 = primary_df.copy()
df2 = secondary_df.copy()
# Get unique ids from primary df
unique_ids = df1[id_column_name].unique()
# Create a dict of sequential ids
id_dict = dict(zip(unique_ids, range(1, len(unique_ids) + 1)))
# Map primary and secondary dataframe values using the new id dict
df1[id_column_name] = df1[id_column_name].map(id_dict)
df2[id_column_name] = df2[id_column_name].map(id_dict)
return df1, df2