Commit a29628a (1 parent: 3a085a0), showing 16 changed files with 751 additions and 90 deletions.
@@ -0,0 +1,45 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')


def fe_accumulate(G_df_dict, G_data_info, G_hist, is_train, remain_time):
    # Build derived features on the G_df_dict['BIG'] table

    start = time.time()
    log('[+] feature engineer, accumulate')
    time_col = G_data_info['target_time']

    if is_train:
        G_hist['FE_Accumulate'] = {}
        G_hist['FE_Accumulate']['normal'] = []
        G_hist['FE_Accumulate']['time'] = []

        for col in tqdm(G_hist['big_cols_cat']):
            G_hist['FE_Accumulate']['normal'].append(col)
        log("accumulate normal features: {}".format(G_hist['FE_Accumulate']['normal']))

        if G_data_info['time_series_data'] == 'true':
            if G_hist['big_data_type'][time_col] == 'Unix_timestamp':
                G_df_dict['BIG'] = G_df_dict['BIG'].sort_values(by=time_col)

                for col in tqdm(G_hist['big_cols_cat']):
                    G_hist['FE_Accumulate']['time'].append(col)
                log("window features: {}".format(G_hist['FE_Accumulate']['time']))

    G_df_dict['FE_Accumulate'] = pd.DataFrame()
    # Running count of prior occurrences of each categorical value.
    for col in tqdm(G_hist['FE_Accumulate']['normal']):
        G_df_dict['FE_Accumulate'][f'{col}_acc_cnt'] = G_df_dict['BIG'].groupby(col).cumcount()

    # Accumulated count divided by time elapsed since the value first appeared.
    for col in tqdm(G_hist['FE_Accumulate']['time']):
        G_df_dict['FE_Accumulate'][f'{col}_min_{time_col}'] = G_df_dict['BIG'].groupby(col)[time_col].transform('min')
        G_df_dict['FE_Accumulate'][f'{col}_acc_cnt_div_delta_time'] = G_df_dict['FE_Accumulate'][f'{col}_acc_cnt'] / \
            (G_df_dict['BIG'][time_col] - G_df_dict['FE_Accumulate'][f'{col}_min_{time_col}'] + 1)

    end = time.time()
    remain_time -= (end - start)
    log("remain_time: {} s".format(remain_time))
    return remain_time
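For context, a minimal standalone sketch of the accumulate idea (the toy data and column names are illustrative, not from the repo): groupby(col).cumcount() counts prior occurrences of each categorical value, and dividing by the time elapsed since that value's first timestamp yields a rate-like feature.

import pandas as pd

# Toy event table: 'user' is a categorical column, 'ts' a Unix timestamp (both hypothetical).
df = pd.DataFrame({'user': ['a', 'b', 'a', 'a', 'b'],
                   'ts':   [100, 101, 105, 110, 120]}).sort_values('ts')

acc_cnt = df.groupby('user').cumcount()                # occurrences seen so far per user
first_ts = df.groupby('user')['ts'].transform('min')   # each user's first timestamp
rate = acc_cnt / (df['ts'] - first_ts + 1)             # accumulated count per elapsed second

print(pd.DataFrame({'acc_cnt': acc_cnt, 'rate': rate}))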
@@ -0,0 +1,52 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')


def fe_count_map(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Build derived features on the G_df_dict['BIG'] table

    start = time.time()
    log('[+] feature engineer, count')

    Id = G_data_info['target_id']
    target = G_data_info['target_label']

    if is_train:
        G_hist['FE_count'] = {}
        G_hist['FE_count']['feature_map'] = {}
        G_hist['FE_count']['cnt_features'] = []
        size_of_big = G_df_dict['BIG'].shape[0]

        # Select int columns of moderate cardinality, skipping id/target
        # columns and previously joined '_in_' columns.
        cnt_features = []
        for col in G_df_dict['BIG'].columns:
            if col in [target] + Id:
                continue
            if '_in_' in col:
                continue
            if 'int' in str(G_df_dict['BIG'][col].dtype):
                if G_df_dict['BIG'][col].nunique() < size_of_big * 0.8 and G_df_dict['BIG'][col].nunique() < 200000:
                    cnt_features.append(col)
        G_hist['FE_count']['cnt_features'] = cnt_features
        log("count features: {}".format(cnt_features))

        # Fit a value -> count map per feature on the training data.
        for f in cnt_features:
            temp = pd.DataFrame(G_df_dict['BIG'][f])
            temp[f + '_cnt'] = temp.groupby([f])[f].transform('count')
            temp.index = temp[f]
            temp = temp.drop(f, axis=1)
            feature_map = temp.to_dict()[f + '_cnt']
            G_hist['FE_count']['feature_map'][f] = feature_map

    if not AMPERE:
        G_df_dict['FE_count'] = pd.DataFrame()
        for f in G_hist['FE_count']['cnt_features']:
            G_df_dict['FE_count'][f + "_cnt"] = G_df_dict['BIG'][f].map(G_hist['FE_count']['feature_map'][f])

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
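A minimal sketch of the count-map technique on toy data (illustrative, not from the repo): the value-to-count mapping is fit once on training data, then applied to new data with .map(), so unseen values become NaN instead of leaking test-time counts. value_counts() is used here as shorthand for the transform('count') construction above.

import pandas as pd

train = pd.Series([1, 1, 2, 3, 3, 3], name='f')
test = pd.Series([1, 3, 4], name='f')

# Fit the value -> count map on train only.
feature_map = train.value_counts().to_dict()  # {3: 3, 1: 2, 2: 1}

# Apply it to unseen data; the value 4 was never seen in train, so it maps to NaN.
print(test.map(feature_map))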
@@ -0,0 +1,33 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')


def fe_count_ratio(G_df_dict, G_data_info, G_hist, is_train, remain_time):
    # Build derived features on the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, count ratio')

    if is_train:
        G_hist['FE_count_ratio'] = {}
        size_of_big = G_df_dict['BIG'].shape[0]

        cnt_ratio_features = []
        for col in G_hist['big_cols_cat'] + G_hist['big_cols_num']:
            if G_df_dict['BIG'][col].nunique() < size_of_big * 0.8:
                cnt_ratio_features.append(col)
        G_hist['FE_count_ratio'] = cnt_ratio_features
        log("count ratio features: {}".format(cnt_ratio_features))

    G_df_dict['FE_count_ratio'] = pd.DataFrame()
    for col in tqdm(G_hist['FE_count_ratio']):
        G_df_dict['FE_count_ratio'][col + "_cnt_ratio"] = G_df_dict['BIG'].groupby(col)[col].transform('count') / \
            G_df_dict['BIG'].shape[0]

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
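For intuition, a standalone sketch of the count-ratio transform on toy data (names are illustrative): each row gets the share of the table that carries its value.

import pandas as pd

df = pd.DataFrame({'col': ['x', 'x', 'y', 'x']})
# Per-row count of the row's value, divided by table size.
df['col_cnt_ratio'] = df.groupby('col')['col'].transform('count') / len(df)
print(df)  # 'x' rows get 0.75, the 'y' row gets 0.25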
@@ -0,0 +1,67 @@
import warnings
import pandas as pd
warnings.filterwarnings('ignore')
import time
from autox.autox_server.util import log


def _groupby_agg_rolling(df, keys, col, op, k, col_time):
    name = 'WIN_{}_{}_({})_({})'.format(k, op.upper(), '_'.join(keys), col)
    if type(k) == int:
        s = df.groupby(keys)[[col]]
        df_gp = s.rolling(window=k).agg(op)  # rolling by number of rows
    else:
        closed = 'left'  # [left, right)
        # closed = 'both'  # [left, right]
        s = df.groupby(keys)[[col_time, col]]
        df_gp = s.rolling(window=k, on=col_time, closed=closed).agg(op).iloc[:, -1:]  # rolling by time window
    df_gp.columns = [name]
    df_gp = df_gp.sort_index(level=1).reset_index(drop=True)
    return df_gp


def fe_frequency(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Build derived features on the G_df_dict['BIG'] table

    start = time.time()
    log('[+] feature engineer, frequency')

    big_size = G_df_dict['BIG'].shape[0]
    time_col = G_data_info['target_time']

    if is_train:
        G_hist['FE_frequency'] = {}
        G_hist['FE_frequency']['keys'] = []
        G_hist['FE_frequency']['cols'] = []

        if G_data_info['time_series_data'] == 'true':
            # NOTE: sort the df first
            G_df_dict['BIG'] = G_df_dict['BIG'].sort_values(by=time_col)

        # Key columns: low-to-moderate cardinality categoricals.
        keys_features = []
        for col in G_hist['big_cols_cat']:
            if big_size * 0.005 < G_df_dict['BIG'][col].nunique() < big_size * 0.01:
                keys_features.append(col)
        G_hist['FE_frequency']['keys'] = keys_features
        log("FE_frequency keys: {}".format(keys_features))

        # Value columns: high-cardinality categoricals.
        cols_features = []
        for col in G_hist['big_cols_cat']:
            if big_size * 0.6 < G_df_dict['BIG'][col].nunique() < big_size * 0.8:
                cols_features.append(col)
        G_hist['FE_frequency']['cols'] = cols_features
        log("FE_frequency cols: {}".format(cols_features))

    if not AMPERE:
        G_df_dict['FE_frequency'] = pd.DataFrame()
        for col in G_hist['FE_frequency']['cols']:
            for key_ in G_hist['FE_frequency']['keys']:
                df = G_df_dict['BIG'][[key_, col]].copy()
                keys = [key_]
                # Share of rows within each key taken by each (key, col) pair,
                # then the maximum share per key.
                df['x'] = df.groupby(keys + [col])[col].transform('count') / df.groupby(keys)[col].transform('count')
                df['y'] = df.groupby(keys)['x'].transform('max')
                G_df_dict['FE_frequency'][f'{key_}__with__{col}__frequency'] = df['y']

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
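A toy illustration of the frequency feature computed in the loop above (column names are invented): for each key value, it measures the largest share any single col value takes within that key.

import pandas as pd

df = pd.DataFrame({'shop': ['s1', 's1', 's1', 's2', 's2'],
                   'item': ['a', 'a', 'b', 'c', 'd']})

# Share of each (shop, item) pair within its shop, then the max share per shop.
x = df.groupby(['shop', 'item'])['item'].transform('count') / \
    df.groupby('shop')['item'].transform('count')
df['shop__with__item__frequency'] = x.groupby(df['shop']).transform('max')
print(df)  # s1 rows -> 2/3 ('a' dominates), s2 rows -> 1/2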
@@ -0,0 +1,37 @@
import time
import warnings

import pandas as pd
from tqdm import tqdm
from autox.autox_server.util import log
warnings.filterwarnings('ignore')


def fe_hash_discrete(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Build derived features on the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, hash_discrete')

    if is_train:
        G_hist['FE_hash_discrete'] = []
        col_hash_discrete = []
        if G_hist['super_big_data']:
            for col in G_hist['big_cols_cat']:
                # Features with nunique > 10000: truncate, keeping the last 4 characters.
                if G_df_dict['BIG'][col].nunique() > 10000:
                    col_hash_discrete.append(col)

        G_hist['FE_hash_discrete'] = col_hash_discrete
        log("hash_discrete features: {}".format(G_hist['FE_hash_discrete']))

    if not AMPERE:
        G_df_dict['FE_hash_discrete'] = pd.DataFrame()
        for col in tqdm(G_hist['FE_hash_discrete']):
            G_df_dict['FE_hash_discrete'][f"{col}_hash_discrete"] = G_df_dict['BIG'][col].apply(lambda x: str(x)[-4:])

    # TODO: map to the corresponding FEQL direct discrete signature

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
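The truncation itself is a one-liner; a quick sketch on toy values:

import pandas as pd

s = pd.Series([1234567, 987, 'ABCDEFGH'])
# Keep the last 4 characters of the string form, collapsing a
# high-cardinality column into a much coarser bucket.
print(s.apply(lambda x: str(x)[-4:]))  # ['4567', '987', 'EFGH']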
@@ -0,0 +1,66 @@
import warnings
import pandas as pd
import numpy as np
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from pypinyin import pinyin, lazy_pinyin, Style


def str2map(s):
    if str(s) == 'None':
        return {}
    return {si.split(':')[0]: si.split(':')[1] for si in s.split(',')}


def get_keys(kv):
    return list(kv.keys())


def fe_kv(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Build derived features on the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, kv')

    if is_train:
        G_hist['FE_kv'] = {}
        G_hist['FE_kv']['cols'] = []
        G_hist['FE_kv']['col_top_keys'] = {}

        cols_kv = [x for x in G_hist['big_cols_kv'] if x in G_df_dict['BIG'].columns]
        G_hist['FE_kv']['cols'] = cols_kv
        log("kv features: {}".format(G_hist['FE_kv']['cols']))

        for col in cols_kv:
            temp = G_df_dict['BIG'][[col]].copy()
            temp[col] = temp[col].apply(lambda x: str2map(x))
            temp[col + '_keys'] = temp[col].apply(lambda x: get_keys(x))

            # Keep the 100 most frequent keys per kv column.
            vectorizer = CountVectorizer(max_features=100)
            vectorizer.fit_transform(temp[col + '_keys'].astype(str))
            G_hist['FE_kv']['col_top_keys'][col] = vectorizer.get_feature_names()

    if not AMPERE:
        G_df_dict['FE_kv'] = pd.DataFrame()
        for col in tqdm(G_hist['FE_kv']['cols']):
            for key_ in G_hist['FE_kv']['col_top_keys'][col]:
                temp = G_df_dict['BIG'][[col]].copy()
                temp[col] = temp[col].apply(lambda x: str2map(x))
                try:
                    G_df_dict['FE_kv'][f"{col}__{key_}__kv"] = temp[col].apply(lambda x: float(x.get(key_, np.nan)))
                except:
                    pass

        # Rename columns to pinyin so downstream consumers accept them,
        # keeping a reverse map for interpretability.
        G_hist['FE_kv']['rename'] = {}
        cols_name = []
        for i, col in enumerate(G_df_dict['FE_kv'].columns):
            col_rename = ''.join(lazy_pinyin(col)) + f'__idx{i}'
            cols_name.append(col_rename)
            G_hist['FE_kv']['rename'][col_rename] = col
        G_df_dict['FE_kv'].columns = cols_name

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
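A condensed sketch of the kv pipeline on toy data (the column content is invented): parse 'k:v' strings into dicts, pick the most frequent keys with CountVectorizer, then emit one numeric column per top key. Note that get_feature_names() was removed in newer scikit-learn; this sketch uses get_feature_names_out(), available from scikit-learn 1.0.

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

s = pd.Series(['k1:1,k2:3', 'k1:5', None])

def str2map(v):
    if str(v) == 'None':
        return {}
    return {kv.split(':')[0]: kv.split(':')[1] for kv in v.split(',')}

maps = s.apply(str2map)
keys = maps.apply(lambda m: list(m.keys()))

vectorizer = CountVectorizer(max_features=100)
vectorizer.fit(keys.astype(str))
for key in vectorizer.get_feature_names_out():
    # One numeric column per frequent key; rows missing the key become NaN.
    print(key, maps.apply(lambda m: float(m.get(key, np.nan))).tolist())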
autox/autox_server/feature_engineer/fe_stat_for_same_prefix.py (48 additions, 0 deletions)
@@ -0,0 +1,48 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')
import re


def fe_stat_for_same_prefix(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Build derived features on the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, stat_for_same_prefix')

    if is_train:
        G_hist['FE_stat_for_same_prefix'] = []

        # Group sibling columns that share a prefix and end in _1, _2, ...
        cols_agg_list = []
        cols = G_df_dict['BIG'].columns
        c_1_list = [col for col in cols if bool(re.search(r'_1$', str(col)))]
        for c_1 in c_1_list:
            c_list = [c_1]
            for i in range(2, 20):
                c_i = c_1.replace('_1', '_{}'.format(i))
                if c_i in cols:
                    c_list.append(c_i)
            # Only keep groups where every sibling column is numeric.
            num_flag = True
            for item in c_list:
                if str(G_df_dict['BIG'][item].dtype) == 'object':
                    num_flag = False
            if num_flag and 3 <= len(c_list) <= 3:  # bounds kept as-is; currently requires exactly 3 columns
                cols_agg_list.append(c_list)
        G_hist['FE_stat_for_same_prefix'] = cols_agg_list
        log("stat_for_same_prefix features: {}".format(G_hist['FE_stat_for_same_prefix']))

    if not AMPERE:
        G_df_dict['FE_stat_for_same_prefix'] = pd.DataFrame()
        for cols_agg in tqdm(G_hist['FE_stat_for_same_prefix']):
            G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__mean'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].mean(axis=1)
            # G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__median'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].median(axis=1)
            G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__min'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].min(axis=1)
            G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__max'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].max(axis=1)
            # G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__std'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].std(axis=1)

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
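The row-wise aggregation itself is plain pandas; a toy sketch with invented sibling columns amt_1..amt_3:

import pandas as pd

df = pd.DataFrame({'amt_1': [1, 4], 'amt_2': [2, 5], 'amt_3': [3, 9]})

group = ['amt_1', 'amt_2', 'amt_3']
name = '__col__'.join(group)
stats = pd.DataFrame({
    f'{name}__stat_for_same_prefix__mean': df[group].mean(axis=1),
    f'{name}__stat_for_same_prefix__min': df[group].min(axis=1),
    f'{name}__stat_for_same_prefix__max': df[group].max(axis=1),
})
print(stats)  # row-wise mean/min/max across the sibling columns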
@@ -0,0 +1,37 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')


def fe_time_count(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Build derived features on the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, time count')
    time_col = G_data_info['target_time']

    if is_train:
        G_hist['FE_time_count'] = []
        size_of_big = G_df_dict['BIG'].shape[0]
        if G_data_info['time_series_data'] == 'true':
            G_df_dict['BIG'] = G_df_dict['BIG'].sort_values(by=time_col)
            for col in G_hist['big_cols_cat']:
                if G_df_dict['BIG'][col].nunique() < size_of_big * 0.8:
                    G_hist['FE_time_count'].append(col)

        # Skip this feature family entirely on big data.
        if G_hist['big_data']:
            G_hist['FE_time_count'] = []

        log("time count features: {}".format(G_hist['FE_time_count']))

    if not AMPERE:
        G_df_dict['FE_time_count'] = pd.DataFrame()
        for col in tqdm(G_hist['FE_time_count']):
            G_df_dict['FE_time_count'][f'{col}__time_count'] = G_df_dict['BIG'].groupby([col, time_col])[col].transform('count')

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
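A toy sketch of the time-count feature (names invented): rows sharing the same (category, timestamp) pair get the size of that burst of simultaneous events.

import pandas as pd

df = pd.DataFrame({'user': ['a', 'a', 'a', 'b'],
                   'ts':   [100, 100, 105, 100]})
df['user__time_count'] = df.groupby(['user', 'ts'])['user'].transform('count')
print(df)  # the two simultaneous 'a' events get 2, the rest get 1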