-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessor.py
43 lines (31 loc) · 1.34 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import numpy as np
import pandas as pd
import ast
from tqdm import tqdm
from api import *
from data_preprocess import *
import time
def preprocess_main():
    """Run the preprocessing pipeline end to end and return its result.

    A ``Preprocess`` object is created for a fixed list of log columns, fed
    the answer log and the recommendation log, and then driven through its
    steps in a fixed order: label creation, the three side-information
    merges, feature engineering, and coin-feature creation. A completion
    banner and the elapsed time in seconds are printed before returning.

    ``Preprocess``, ``load_answerlog`` and ``load_recommendlog`` are expected
    to be provided by the module's star imports.

    Returns:
        The value of ``preprocess.get_fe_data()`` (presumably a pandas
        DataFrame of engineered features -- confirm in ``data_preprocess``).
    """
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    started_at = time.perf_counter()

    # Columns of the recommendation log / answer log used by the pipeline.
    log_use_features = [
        'rec_timestamp', 'answer', 'rectype',
        'user', 'label', 'rec_log_id', 'question_from_user',
        'company', 'job_small', 'question_type',
    ]
    preprocess = Preprocess(log_use_features)

    # Starting from the answer-click log, fetch the matching
    # recommendation-button click log entries.
    fe_concat = preprocess.map_answerlog_last_reclog(
        load_answerlog(),
        load_recommendlog(),
    )

    # Build labels from the exposed items: positive = 1, negative = 0.
    preprocess.make_negative_answerlog(fe_concat)

    # Merge side information (user, answer, job).
    preprocess.merge_side_information_user()
    preprocess.merge_side_information_answer()
    preprocess.merge_side_information_job()

    # Per the original note, feature engineering has to run before the coin
    # feature step so that variable names can be told apart during the coin
    # calculation -- keep this order.
    preprocess._feature_engineering_data()
    preprocess.make_coin_feature()

    # Collect the feature-engineered data.
    fe_data = preprocess.get_fe_data()

    print("---------------------Complete Preprocessing---------------------")
    print("total time : ", time.perf_counter() - started_at)
    return fe_data
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported. The returned data is not used here.
if __name__ == "__main__":
    preprocess_main()