Commit a29628a (1 parent: 3a085a0), showing 16 changed files with 751 additions and 90 deletions.
@@ -0,0 +1,45 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')


def fe_accumulate(G_df_dict, G_data_info, G_hist, is_train, remain_time):
    # Build derived features on the G_df_dict['BIG'] table

    start = time.time()
    log('[+] feature engineer, accumulate')
    time_col = G_data_info['target_time']

    if is_train:
        G_hist['FE_Accumulate'] = {}
        G_hist['FE_Accumulate']['normal'] = []
        G_hist['FE_Accumulate']['time'] = []

        for col in tqdm(G_hist['big_cols_cat']):
            G_hist['FE_Accumulate']['normal'].append(col)
        log("accumulate normal features: {}".format(G_hist['FE_Accumulate']['normal']))

        if G_data_info['time_series_data'] == 'true':
            if G_hist['big_data_type'][time_col] == 'Unix_timestamp':
                G_df_dict['BIG'] = G_df_dict['BIG'].sort_values(by=time_col)

                for col in tqdm(G_hist['big_cols_cat']):
                    G_hist['FE_Accumulate']['time'].append(col)
                log("window features: {}".format(G_hist['FE_Accumulate']['time']))

    G_df_dict['FE_Accumulate'] = pd.DataFrame()
    # Running count of prior occurrences of each categorical value.
    for col in tqdm(G_hist['FE_Accumulate']['normal']):
        G_df_dict['FE_Accumulate'][f'{col}_acc_cnt'] = G_df_dict['BIG'].groupby(col).cumcount()

    # Accumulated count divided by time elapsed since the value first appeared.
    for col in tqdm(G_hist['FE_Accumulate']['time']):
        G_df_dict['FE_Accumulate'][f'{col}_min_{time_col}'] = G_df_dict['BIG'].groupby(col)[time_col].transform('min')
        G_df_dict['FE_Accumulate'][f'{col}_acc_cnt_div_delta_time'] = G_df_dict['FE_Accumulate'][f'{col}_acc_cnt'] / \
            (G_df_dict['BIG'][time_col] - G_df_dict['FE_Accumulate'][f'{col}_min_{time_col}'] + 1)

    end = time.time()
    remain_time -= (end - start)
    log("remain_time: {} s".format(remain_time))
    return remain_time
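For context, a minimal standalone sketch of the accumulate idea (the toy data and column names are illustrative, not from the repo): groupby(col).cumcount() counts prior occurrences of each categorical value, and dividing by the time elapsed since that value's first timestamp yields a rate-like feature.

import pandas as pd

# Toy event table: 'user' is a categorical column, 'ts' a Unix timestamp (both hypothetical).
df = pd.DataFrame({'user': ['a', 'b', 'a', 'a', 'b'],
                   'ts':   [100, 101, 105, 110, 120]}).sort_values('ts')

acc_cnt = df.groupby('user').cumcount()                # occurrences seen so far per user
first_ts = df.groupby('user')['ts'].transform('min')   # each user's first timestamp
rate = acc_cnt / (df['ts'] - first_ts + 1)             # accumulated count per elapsed second

print(pd.DataFrame({'acc_cnt': acc_cnt, 'rate': rate}))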
@@ -0,0 +1,52 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')


def fe_count_map(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Build derived features on the G_df_dict['BIG'] table

    start = time.time()
    log('[+] feature engineer, count')

    Id = G_data_info['target_id']
    target = G_data_info['target_label']

    if is_train:
        G_hist['FE_count'] = {}
        G_hist['FE_count']['feature_map'] = {}
        G_hist['FE_count']['cnt_features'] = []
        size_of_big = G_df_dict['BIG'].shape[0]

        # Select int columns of moderate cardinality, skipping id/target
        # columns and previously joined '_in_' columns.
        cnt_features = []
        for col in G_df_dict['BIG'].columns:
            if col in [target] + Id:
                continue
            if '_in_' in col:
                continue
            if 'int' in str(G_df_dict['BIG'][col].dtype):
                if G_df_dict['BIG'][col].nunique() < size_of_big * 0.8 and G_df_dict['BIG'][col].nunique() < 200000:
                    cnt_features.append(col)
        G_hist['FE_count']['cnt_features'] = cnt_features
        log("count features: {}".format(cnt_features))

        # Fit a value -> count map per feature on the training data.
        for f in cnt_features:
            temp = pd.DataFrame(G_df_dict['BIG'][f])
            temp[f + '_cnt'] = temp.groupby([f])[f].transform('count')
            temp.index = temp[f]
            temp = temp.drop(f, axis=1)
            feature_map = temp.to_dict()[f + '_cnt']
            G_hist['FE_count']['feature_map'][f] = feature_map

    if not AMPERE:
        G_df_dict['FE_count'] = pd.DataFrame()
        for f in G_hist['FE_count']['cnt_features']:
            G_df_dict['FE_count'][f + "_cnt"] = G_df_dict['BIG'][f].map(G_hist['FE_count']['feature_map'][f])

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
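A minimal sketch of the count-map technique on toy data (illustrative, not from the repo): the value-to-count mapping is fit once on training data, then applied to new data with .map(), so unseen values become NaN instead of leaking test-time counts. value_counts() is used here as shorthand for the transform('count') construction above.

import pandas as pd

train = pd.Series([1, 1, 2, 3, 3, 3], name='f')
test = pd.Series([1, 3, 4], name='f')

# Fit the value -> count map on train only.
feature_map = train.value_counts().to_dict()  # {3: 3, 1: 2, 2: 1}

# Apply it to unseen data; the value 4 was never seen in train, so it maps to NaN.
print(test.map(feature_map))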
@@ -0,0 +1,33 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')


def fe_count_ratio(G_df_dict, G_data_info, G_hist, is_train, remain_time):
    # Build derived features on the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, count ratio')

    if is_train:
        G_hist['FE_count_ratio'] = {}
        size_of_big = G_df_dict['BIG'].shape[0]

        cnt_ratio_features = []
        for col in G_hist['big_cols_cat'] + G_hist['big_cols_num']:
            if G_df_dict['BIG'][col].nunique() < size_of_big * 0.8:
                cnt_ratio_features.append(col)
        G_hist['FE_count_ratio'] = cnt_ratio_features
        log("count ratio features: {}".format(cnt_ratio_features))

    G_df_dict['FE_count_ratio'] = pd.DataFrame()
    for col in tqdm(G_hist['FE_count_ratio']):
        G_df_dict['FE_count_ratio'][col + "_cnt_ratio"] = G_df_dict['BIG'].groupby(col)[col].transform('count') / \
            G_df_dict['BIG'].shape[0]

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
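For intuition, a standalone sketch of the count-ratio transform on toy data (names are illustrative): each row gets the share of the table that carries its value.

import pandas as pd

df = pd.DataFrame({'col': ['x', 'x', 'y', 'x']})
# Per-row count of the row's value, divided by table size.
df['col_cnt_ratio'] = df.groupby('col')['col'].transform('count') / len(df)
print(df)  # 'x' rows get 0.75, the 'y' row gets 0.25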
@@ -0,0 +1,67 @@
import warnings
import pandas as pd
warnings.filterwarnings('ignore')
import time
from autox.autox_server.util import log


def _groupby_agg_rolling(df, keys, col, op, k, col_time):
    name = 'WIN_{}_{}_({})_({})'.format(k, op.upper(), '_'.join(keys), col)
    if type(k) == int:
        s = df.groupby(keys)[[col]]
        df_gp = s.rolling(window=k).agg(op)  # rolling by number of rows
    else:
        closed = 'left'  # [left, right)
        # closed = 'both'  # [left, right]
        s = df.groupby(keys)[[col_time, col]]
        df_gp = s.rolling(window=k, on=col_time, closed=closed).agg(op).iloc[:, -1:]  # rolling by time window
    df_gp.columns = [name]
    df_gp = df_gp.sort_index(level=1).reset_index(drop=True)
    return df_gp


def fe_frequency(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Build derived features on the G_df_dict['BIG'] table

    start = time.time()
    log('[+] feature engineer, frequency')

    big_size = G_df_dict['BIG'].shape[0]
    time_col = G_data_info['target_time']

    if is_train:
        G_hist['FE_frequency'] = {}
        G_hist['FE_frequency']['keys'] = []
        G_hist['FE_frequency']['cols'] = []

        if G_data_info['time_series_data'] == 'true':
            # NOTE: sort the df first
            G_df_dict['BIG'] = G_df_dict['BIG'].sort_values(by=time_col)

        # Key columns: low-to-moderate cardinality categoricals.
        keys_features = []
        for col in G_hist['big_cols_cat']:
            if big_size * 0.005 < G_df_dict['BIG'][col].nunique() < big_size * 0.01:
                keys_features.append(col)
        G_hist['FE_frequency']['keys'] = keys_features
        log("FE_frequency keys: {}".format(keys_features))

        # Value columns: high-cardinality categoricals.
        cols_features = []
        for col in G_hist['big_cols_cat']:
            if big_size * 0.6 < G_df_dict['BIG'][col].nunique() < big_size * 0.8:
                cols_features.append(col)
        G_hist['FE_frequency']['cols'] = cols_features
        log("FE_frequency cols: {}".format(cols_features))

    if not AMPERE:
        G_df_dict['FE_frequency'] = pd.DataFrame()
        for col in G_hist['FE_frequency']['cols']:
            for key_ in G_hist['FE_frequency']['keys']:
                df = G_df_dict['BIG'][[key_, col]].copy()
                keys = [key_]
                # Share of rows within each key taken by each (key, col) pair,
                # then the maximum share per key.
                df['x'] = df.groupby(keys + [col])[col].transform('count') / df.groupby(keys)[col].transform('count')
                df['y'] = df.groupby(keys)['x'].transform('max')
                G_df_dict['FE_frequency'][f'{key_}__with__{col}__frequency'] = df['y']

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
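A toy illustration of the frequency feature computed in the loop above (column names are invented): for each key value, it measures the largest share any single col value takes within that key.

import pandas as pd

df = pd.DataFrame({'shop': ['s1', 's1', 's1', 's2', 's2'],
                   'item': ['a', 'a', 'b', 'c', 'd']})

# Share of each (shop, item) pair within its shop, then the max share per shop.
x = df.groupby(['shop', 'item'])['item'].transform('count') / \
    df.groupby('shop')['item'].transform('count')
df['shop__with__item__frequency'] = x.groupby(df['shop']).transform('max')
print(df)  # s1 rows -> 2/3 ('a' dominates), s2 rows -> 1/2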
@@ -0,0 +1,37 @@
import time
import warnings

import pandas as pd
from tqdm import tqdm
from autox.autox_server.util import log
warnings.filterwarnings('ignore')


def fe_hash_discrete(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Build derived features on the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, hash_discrete')

    if is_train:
        G_hist['FE_hash_discrete'] = []
        col_hash_discrete = []
        if G_hist['super_big_data']:
            for col in G_hist['big_cols_cat']:
                # Features with nunique > 10000: truncate, keeping the last 4 characters.
                if G_df_dict['BIG'][col].nunique() > 10000:
                    col_hash_discrete.append(col)

        G_hist['FE_hash_discrete'] = col_hash_discrete
        log("hash_discrete features: {}".format(G_hist['FE_hash_discrete']))

    if not AMPERE:
        G_df_dict['FE_hash_discrete'] = pd.DataFrame()
        for col in tqdm(G_hist['FE_hash_discrete']):
            G_df_dict['FE_hash_discrete'][f"{col}_hash_discrete"] = G_df_dict['BIG'][col].apply(lambda x: str(x)[-4:])

    # TODO: map to the corresponding FEQL direct discrete signature

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
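The truncation itself is a one-liner; a quick sketch on toy values:

import pandas as pd

s = pd.Series([1234567, 987, 'ABCDEFGH'])
# Keep the last 4 characters of the string form, collapsing a
# high-cardinality column into a much coarser bucket.
print(s.apply(lambda x: str(x)[-4:]))  # ['4567', '987', 'EFGH']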
@@ -0,0 +1,66 @@
import warnings
import pandas as pd
import numpy as np
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from pypinyin import pinyin, lazy_pinyin, Style


def str2map(s):
    if str(s) == 'None':
        return {}
    return {si.split(':')[0]: si.split(':')[1] for si in s.split(',')}


def get_keys(kv):
    return list(kv.keys())


def fe_kv(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Build derived features on the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, kv')

    if is_train:
        G_hist['FE_kv'] = {}
        G_hist['FE_kv']['cols'] = []
        G_hist['FE_kv']['col_top_keys'] = {}

        cols_kv = [x for x in G_hist['big_cols_kv'] if x in G_df_dict['BIG'].columns]
        G_hist['FE_kv']['cols'] = cols_kv
        log("kv features: {}".format(G_hist['FE_kv']['cols']))

        for col in cols_kv:
            temp = G_df_dict['BIG'][[col]].copy()
            temp[col] = temp[col].apply(lambda x: str2map(x))
            temp[col + '_keys'] = temp[col].apply(lambda x: get_keys(x))

            # Keep the 100 most frequent keys per kv column.
            vectorizer = CountVectorizer(max_features=100)
            vectorizer.fit_transform(temp[col + '_keys'].astype(str))
            G_hist['FE_kv']['col_top_keys'][col] = vectorizer.get_feature_names()

    if not AMPERE:
        G_df_dict['FE_kv'] = pd.DataFrame()
        for col in tqdm(G_hist['FE_kv']['cols']):
            for key_ in G_hist['FE_kv']['col_top_keys'][col]:
                temp = G_df_dict['BIG'][[col]].copy()
                temp[col] = temp[col].apply(lambda x: str2map(x))
                try:
                    G_df_dict['FE_kv'][f"{col}__{key_}__kv"] = temp[col].apply(lambda x: float(x.get(key_, np.nan)))
                except:
                    pass

        # Rename columns to pinyin so downstream consumers accept them,
        # keeping a reverse map for interpretability.
        G_hist['FE_kv']['rename'] = {}
        cols_name = []
        for i, col in enumerate(G_df_dict['FE_kv'].columns):
            col_rename = ''.join(lazy_pinyin(col)) + f'__idx{i}'
            cols_name.append(col_rename)
            G_hist['FE_kv']['rename'][col_rename] = col
        G_df_dict['FE_kv'].columns = cols_name

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
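A condensed sketch of the kv pipeline on toy data (the column content is invented): parse 'k:v' strings into dicts, pick the most frequent keys with CountVectorizer, then emit one numeric column per top key. Note that get_feature_names() was removed in newer scikit-learn; this sketch uses get_feature_names_out(), available from scikit-learn 1.0.

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

s = pd.Series(['k1:1,k2:3', 'k1:5', None])

def str2map(v):
    if str(v) == 'None':
        return {}
    return {kv.split(':')[0]: kv.split(':')[1] for kv in v.split(',')}

maps = s.apply(str2map)
keys = maps.apply(lambda m: list(m.keys()))

vectorizer = CountVectorizer(max_features=100)
vectorizer.fit(keys.astype(str))
for key in vectorizer.get_feature_names_out():
    # One numeric column per frequent key; rows missing the key become NaN.
    print(key, maps.apply(lambda m: float(m.get(key, np.nan))).tolist())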
autox/autox_server/feature_engineer/fe_stat_for_same_prefix.py (48 additions, 0 deletions)
@@ -0,0 +1,48 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')
import re


def fe_stat_for_same_prefix(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Build derived features on the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, stat_for_same_prefix')

    if is_train:
        G_hist['FE_stat_for_same_prefix'] = []

        # Group sibling columns that share a prefix and end in _1, _2, ...
        cols_agg_list = []
        cols = G_df_dict['BIG'].columns
        c_1_list = [col for col in cols if bool(re.search(r'_1$', str(col)))]
        for c_1 in c_1_list:
            c_list = [c_1]
            for i in range(2, 20):
                c_i = c_1.replace('_1', '_{}'.format(i))
                if c_i in cols:
                    c_list.append(c_i)
            # Only keep groups where every sibling column is numeric.
            num_flag = True
            for item in c_list:
                if str(G_df_dict['BIG'][item].dtype) == 'object':
                    num_flag = False
            if num_flag and 3 <= len(c_list) <= 3:  # bounds kept as-is; currently requires exactly 3 columns
                cols_agg_list.append(c_list)
        G_hist['FE_stat_for_same_prefix'] = cols_agg_list
        log("stat_for_same_prefix features: {}".format(G_hist['FE_stat_for_same_prefix']))

    if not AMPERE:
        G_df_dict['FE_stat_for_same_prefix'] = pd.DataFrame()
        for cols_agg in tqdm(G_hist['FE_stat_for_same_prefix']):
            G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__mean'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].mean(axis=1)
            # G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__median'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].median(axis=1)
            G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__min'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].min(axis=1)
            G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__max'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].max(axis=1)
            # G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__std'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].std(axis=1)

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
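The row-wise aggregation itself is plain pandas; a toy sketch with invented sibling columns amt_1..amt_3:

import pandas as pd

df = pd.DataFrame({'amt_1': [1, 4], 'amt_2': [2, 5], 'amt_3': [3, 9]})

group = ['amt_1', 'amt_2', 'amt_3']
name = '__col__'.join(group)
stats = pd.DataFrame({
    f'{name}__stat_for_same_prefix__mean': df[group].mean(axis=1),
    f'{name}__stat_for_same_prefix__min': df[group].min(axis=1),
    f'{name}__stat_for_same_prefix__max': df[group].max(axis=1),
})
print(stats)  # row-wise mean/min/max across the sibling columns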
@@ -0,0 +1,37 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')


def fe_time_count(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Build derived features on the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, time count')
    time_col = G_data_info['target_time']

    if is_train:
        G_hist['FE_time_count'] = []
        size_of_big = G_df_dict['BIG'].shape[0]
        if G_data_info['time_series_data'] == 'true':
            G_df_dict['BIG'] = G_df_dict['BIG'].sort_values(by=time_col)
            for col in G_hist['big_cols_cat']:
                if G_df_dict['BIG'][col].nunique() < size_of_big * 0.8:
                    G_hist['FE_time_count'].append(col)

        # Skip this feature family entirely on big data.
        if G_hist['big_data']:
            G_hist['FE_time_count'] = []

        log("time count features: {}".format(G_hist['FE_time_count']))

    if not AMPERE:
        G_df_dict['FE_time_count'] = pd.DataFrame()
        for col in tqdm(G_hist['FE_time_count']):
            G_df_dict['FE_time_count'][f'{col}__time_count'] = G_df_dict['BIG'].groupby([col, time_col])[col].transform('count')

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
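A toy sketch of the time-count feature (names invented): rows sharing the same (category, timestamp) pair get the size of that burst of simultaneous events.

import pandas as pd

df = pd.DataFrame({'user': ['a', 'a', 'a', 'b'],
                   'ts':   [100, 100, 105, 100]})
df['user__time_count'] = df.groupby(['user', 'ts'])['user'].transform('count')
print(df)  # the two simultaneous 'a' events get 2, the rest get 1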