-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
89 lines (74 loc) · 3.31 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import numpy as np
import pandas as pd
import os
from collections import defaultdict
def optimize_dataframe(df):
    """
    Optimize the data types of a pandas DataFrame to reduce memory usage.

    Integer columns are downcast to the smallest signed integer type whose
    range covers the column's min and max. Float columns are downcast the
    same way across float16/float32/float64. Object columns are converted to
    'category' when fewer than half of their values are distinct.

    NOTE: the float check is range-based only, so narrowing a float column
    (especially to float16) can lose precision.

    Parameters:
    df (pd.DataFrame): The DataFrame to optimize. It is copied, not modified.

    Returns:
    pd.DataFrame: A new DataFrame with optimized data types.
    """
    optimized_df = df.copy()

    # Optimize numeric columns
    for col in optimized_df.select_dtypes(include=['int', 'float']).columns:
        column = optimized_df[col]
        col_min = column.min()
        col_max = column.max()

        # Candidate types from narrowest to widest; the widest is the fallback.
        if pd.api.types.is_integer_dtype(column):
            candidates = [(t, np.iinfo(t)) for t in (np.int8, np.int16, np.int32)]
            target = np.int64
        else:
            candidates = [(t, np.finfo(t)) for t in (np.float16, np.float32)]
            target = np.float64

        # For an empty or all-NaN column min/max are NaN, every comparison is
        # False, and the widest type is kept.
        for dtype, limits in candidates:
            if col_min >= limits.min and col_max <= limits.max:
                target = dtype
                break

        optimized_df[col] = column.astype(target)

    # Optimize object columns
    for col in optimized_df.select_dtypes(include=['object']).columns:
        column = optimized_df[col]
        num_total_values = len(column)
        if num_total_values == 0:
            # Nothing to measure; also avoids dividing by zero below.
            continue
        num_unique_values = len(column.unique())
        if num_unique_values / num_total_values < 0.5:
            optimized_df[col] = column.astype('category')

    return optimized_df
def optimize_dfs(data_dict, category, output_dir=None):
    """
    Convert each CSV listed under ``data_dict[category]`` into an optimized pickle.

    Every path is read with ``pd.read_csv``, passed through
    ``optimize_dataframe`` and written to
    ``{output_dir}/{category}_{index}.pkl``.

    Parameters:
    data_dict (dict): Maps a category name to an iterable of CSV paths.
        It is mutated: the pickle path of the i-th CSV is stored under the
        integer key ``i``.
    category: Key of ``data_dict`` whose CSV files are converted.
    output_dir (str, optional): Directory that receives the pickle files.
        When omitted, the module-level name ``optimized_df_path`` is used;
        it must be defined by then, otherwise a NameError is raised.

    Returns:
    list: Paths of the pickle files written, in input order.
    """
    if output_dir is None:
        # Legacy behaviour: the destination comes from a module-level global.
        output_dir = optimized_df_path
    if output_dir:
        # to_pickle does not create missing directories.
        os.makedirs(output_dir, exist_ok=True)

    written_paths = []
    for index, path in enumerate(data_dict[category]):
        print(path)
        temp_df = pd.read_csv(path)
        optimized_df = optimize_dataframe(temp_df)
        pickle_path = f'{output_dir}/{category}_{index}.pkl'
        optimized_df.to_pickle(pickle_path)
        # NOTE(review): results are keyed by the integer index, not by
        # category, so a second category overwrites the first one's entries.
        # Kept for backward compatibility; prefer the returned list.
        data_dict[index] = pickle_path
        written_paths.append(pickle_path)
    return written_paths
def load_data(data_dict, category):
    """
    Load the pickled DataFrames listed under ``data_dict[category]`` and
    store them back under the same key, replacing the list of paths.

    For the 'credit' category the frames are grouped by column count into a
    dict of lists: 79 columns -> key 0, 19 -> key 1, 6 -> key 2, 45 -> key 3.
    Frames with any other column count are dropped. For every other category
    the result is a plain list of DataFrames in input order.

    NOTE: ``pd.read_pickle`` can execute arbitrary code while unpickling;
    only pass paths to files from a trusted source.

    Parameters:
    data_dict (dict): Maps a category name to an iterable of pickle paths.
        It is mutated in place.
    category: Key of ``data_dict`` to load.
    """
    if category == 'credit':
        # Column count -> group key.
        group_by_column_count = {79: 0, 19: 1, 6: 2, 45: 3}
        # A list factory is required so the first append to a group works.
        credit_dict = defaultdict(list)
        for path in data_dict[category]:
            temp_df = pd.read_pickle(path)
            group = group_by_column_count.get(temp_df.shape[1])
            if group is not None:
                credit_dict[group].append(temp_df)
        data_dict[category] = credit_dict
    else:
        data_dict[category] = [pd.read_pickle(path) for path in data_dict[category]]