-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathone_hot_encoding.py
40 lines (22 loc) · 1.17 KB
/
one_hot_encoding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# import packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# Demonstrates one hot encoding of the categorical columns of a small
# dataframe: the categorical columns are replaced by 0/1 indicator columns
# while the numeric column is carried over unchanged.

# create a sample dataframe
X = pd.DataFrame({
    "input1": [1, 2, 3, 4, 5],
    "input2": ["A", "A", "B", "B", "C"],
    "input3": ["X", "X", "X", "Y", "Y"],
})

# put categorical variables in a list
categorical_vars = ["input2", "input3"]

# instantiate the one hot encoder
# drop="first" removes the first category of each variable, so a variable
# with k categories produces k - 1 indicator columns.
try:
    # scikit-learn >= 1.2: the dense-output switch is called `sparse_output`
    one_hot_encoder = OneHotEncoder(sparse_output=False, drop="first")
except TypeError:
    # older scikit-learn only accepts the original `sparse` keyword
    one_hot_encoder = OneHotEncoder(sparse=False, drop="first")

# apply the one hot encoder logic (returns a dense 2-D array)
encoder_vars_array = one_hot_encoder.fit_transform(X[categorical_vars])

# create object for the feature names using the categorical variables
# `get_feature_names` was removed in scikit-learn 1.2; prefer its
# replacement and only fall back on installs that predate it.
if hasattr(one_hot_encoder, "get_feature_names_out"):
    encoder_feature_names = one_hot_encoder.get_feature_names_out(categorical_vars)
else:
    encoder_feature_names = one_hot_encoder.get_feature_names(categorical_vars)

# create a dataframe to hold the one hot encoded variables
encoder_vars_df = pd.DataFrame(encoder_vars_array, columns=encoder_feature_names)

# concatenate the new dataframe back to the original input variables dataframe
# both indexes are reset so the rows line up by position
X_new = pd.concat(
    [X.reset_index(drop=True), encoder_vars_df.reset_index(drop=True)],
    axis=1,
)

# drop the original input2 and input3 columns as they are not needed anymore
X_new = X_new.drop(columns=categorical_vars)