-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlevel2.py
248 lines (197 loc) · 11.6 KB
/
level2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV
import math
class AccountFeatures:
    """Per-account summary statistics together with an aggregate anomaly score.

    A plain record: every constructor argument is stored unchanged on the
    attribute of the same name.

    Args:
        account_no: Identifier of the account.
        avg_transactions_per_month: Average number of transactions per month.
        avg_withdrawals_per_month: Average number of withdrawals per month.
        avg_deposits_per_month: Average number of deposits per month.
        avg_withdrawal_amount: Mean withdrawal amount.
        avg_deposit_amount: Mean deposit amount.
        avg_balance_amount_upper: Upper balance bound. Despite the ``avg_``
            prefix, the script in this file passes the account's maximum
            balance here.
        avg_balance_amount_lower: Lower balance bound. Despite the ``avg_``
            prefix, the script in this file passes the account's minimum
            balance here.
        anomaly_score: Aggregate anomaly score of the account.
    """

    def __init__(self, account_no,
                 avg_transactions_per_month, avg_withdrawals_per_month, avg_deposits_per_month,
                 avg_withdrawal_amount, avg_deposit_amount,
                 avg_balance_amount_upper, avg_balance_amount_lower, anomaly_score):
        self.account_no = account_no
        self.avg_transactions_per_month = avg_transactions_per_month
        self.avg_withdrawals_per_month = avg_withdrawals_per_month
        self.avg_deposits_per_month = avg_deposits_per_month
        self.avg_withdrawal_amount = avg_withdrawal_amount
        self.avg_deposit_amount = avg_deposit_amount
        self.avg_balance_amount_upper = avg_balance_amount_upper
        self.avg_balance_amount_lower = avg_balance_amount_lower
        self.anomaly_score = anomaly_score

    def __repr__(self):
        """Return ``AccountFeatures(name=value, ...)`` listing every stored attribute."""
        fields = ", ".join(f"{name}={value!r}" for name, value in vars(self).items())
        return f"{type(self).__name__}({fields})"
# Path of the Excel workbook that holds the transaction history.
excel_file_path = 'iit_data.xlsx'

# Read the whole workbook into a DataFrame.
df = pd.read_excel(excel_file_path)

# Replace missing deposit / withdrawal amounts with 0 so both columns are fully numeric.
df['DEPOSIT AMT'] = df['DEPOSIT AMT'].fillna(0)
df['WITHDRAWAL AMT'] = df['WITHDRAWAL AMT'].fillna(0)

# Feature matrix used for anomaly detection.
X = df[['WITHDRAWAL AMT', 'DEPOSIT AMT', 'BALANCE AMT']]


def _mean_normality_score(estimator, X_fold, y=None):
    """Label-free scorer for the grid search: mean ``score_samples`` on the fold.

    The model is fitted without targets (``grid_search.fit(X)`` passes no ``y``),
    so a supervised metric such as ``neg_mean_absolute_error`` has nothing to
    compare predictions against. This scorer only needs the fitted estimator
    and the held-out rows.

    NOTE(review): ranking candidates by the average held-out score is a
    heuristic, not a ground-truth metric; replace it with a labelled metric if
    known anomalies become available.
    """
    return estimator.score_samples(X_fold).mean()


# Tune the Isolation Forest over tree count and sub-sample size.
model = IsolationForest(contamination='auto', random_state=42)
parameters = {'n_estimators': [50, 100, 1000], 'max_samples': ['auto', 100, 1000]}
grid_search = GridSearchCV(model, parameters, scoring=_mean_normality_score, cv=5)
grid_search.fit(X)

# Best parameter combination found by the search.
best_params = grid_search.best_params_

# Train the final model on the full data set with the selected parameters.
best_model = IsolationForest(
    contamination='auto',
    random_state=42,
    n_estimators=best_params['n_estimators'],
    max_samples=best_params['max_samples'],
)
best_model.fit(X)
# Distinct account numbers present in the data set.
unique_account_numbers = list(df['Account No'].unique())
print("No of unique Accounts: ", len(unique_account_numbers))

# One AccountFeatures record is appended per account.
account_features_list = []

for account_number in unique_account_numbers:
    # Transactions that belong to the current account.
    account_data = df[df['Account No'] == account_number]

    # Number of distinct calendar months with activity; computed once and
    # reused as the denominator of every per-month average below.
    months_active = len(account_data['DATE'].dt.to_period("M").unique())

    avg_transactions_per_month = len(account_data) / months_active
    # Count only rows that carry an amount. Missing amounts were replaced by 0
    # earlier in this script, so a plain .count() would equal the total number
    # of rows for both columns.
    avg_withdrawals_per_month = (account_data['WITHDRAWAL AMT'] > 0).sum() / months_active
    avg_deposits_per_month = (account_data['DEPOSIT AMT'] > 0).sum() / months_active

    # Mean amounts over all rows of the account.
    # NOTE(review): rows whose amount is 0 are included in these means - confirm
    # that this dilution is intended.
    avg_withdrawal_amount = account_data['WITHDRAWAL AMT'].mean()
    avg_deposit_amount = account_data['DEPOSIT AMT'].mean()

    # Balance bounds: despite the "avg" in the names these are the max and min.
    avg_balance_amount_upper = account_data['BALANCE AMT'].max()
    avg_balance_amount_lower = account_data['BALANCE AMT'].min()

    # Score every transaction of the account and average the transformed scores.
    # NOTE(review): the transform is 1 / (1 - score), not a logistic function;
    # confirm that this is the intended mapping.
    predictions = best_model.decision_function(account_data[['WITHDRAWAL AMT', 'DEPOSIT AMT', 'BALANCE AMT']])
    anomaly_probability = 1 / (1 + (-predictions))
    overall_anomaly_score = anomaly_probability.mean()

    # Store the record; the stored score is the mean shifted down by 0.15.
    account_features = AccountFeatures(
        account_number,
        avg_transactions_per_month,
        avg_withdrawals_per_month,
        avg_deposits_per_month,
        avg_withdrawal_amount,
        avg_deposit_amount,
        avg_balance_amount_upper,
        avg_balance_amount_lower,
        anomaly_score=overall_anomaly_score - 0.15
    )
    account_features_list.append(account_features)

# Print the account features
print("\nAccount Features:")
def View_Process(ano, accounts=None):
    """Return the extracted features of one account as a label -> value dict.

    Args:
        ano: Account number to look up; compared with ``==`` against each
            record's ``account_no``.
        accounts: Optional iterable of feature records to search. When
            omitted, the module-level ``account_features_list`` is used.

    Returns:
        A dict with the eight feature labels of the first matching record, or
        an empty dict when no record matches.
    """
    if accounts is None:
        accounts = account_features_list
    for account in accounts:
        if account.account_no == ano:
            # Real assignments via a dict literal: a statement of the form
            # `dt[key]: value` is only an annotation and stores nothing.
            return {
                "Avg Transactions/Month": account.avg_transactions_per_month,
                "Avg Withdrawals/Month": account.avg_withdrawals_per_month,
                "Avg Deposits/Month": account.avg_deposits_per_month,
                "Avg Withdrawal Amount": account.avg_withdrawal_amount,
                "Avg Deposit Amount": account.avg_deposit_amount,
                "Avg Balance Amount (Upper)": account.avg_balance_amount_upper,
                "Avg Balance Amount (Lower)": account.avg_balance_amount_lower,
                "Overall Anomaly Score": account.anomaly_score,
            }
    return {}
# Dump the extracted features of every account, one labelled value per line,
# followed by a blank separator.
for account in account_features_list:
    report_lines = (
        f"Account No: {account.account_no}",
        f"Avg Transactions/Month: {account.avg_transactions_per_month}",
        f"Avg Withdrawals/Month: {account.avg_withdrawals_per_month}",
        f"Avg Deposits/Month: {account.avg_deposits_per_month}",
        f"Avg Withdrawal Amount: {account.avg_withdrawal_amount}",
        f"Avg Deposit Amount: {account.avg_deposit_amount}",
        f"Avg Balance Amount (Upper): {account.avg_balance_amount_upper}",
        f"Avg Balance Amount (Lower): {account.avg_balance_amount_lower}",
        f"Overall Anomaly Score: {account.anomaly_score}",
    )
    for line in report_lines:
        print(line)
    print("\n")
print("PRE-PROCESSING DATA-SET WITH FEATURE EXTRACTION OVER HERE")
print("INPUT NEW TRANSACTION FOR ANOMALY DETECTION 2ND LAYER INTERFACE INPUT")

# Read the account and its new transactions from the console.
account_no = input("Enter Account No: ")
num_transactions = int(input("Enter the number of new transactions: "))
if num_transactions < 1:
    # With no rows the DataFrame built below has no columns, and the first
    # column access would fail with an unhelpful KeyError.
    raise SystemExit("At least one new transaction is required.")

new_transactions_list = []
for _ in range(num_transactions):
    date = input("Enter Date (YYYY-MM-DD): ")
    withdrawal_amt = float(input("Enter Withdrawal Amount: "))
    deposit_amt = float(input("Enter Deposit Amount: "))
    balance_amt = float(input("Enter Balance Amount: "))
    new_transaction = {
        'Account No': account_no,
        'DATE': pd.to_datetime(date),
        'WITHDRAWAL AMT': withdrawal_amt,
        'DEPOSIT AMT': deposit_amt,
        'BALANCE AMT': balance_amt
    }
    new_transactions_list.append(new_transaction)

new_transactions_df = pd.DataFrame(new_transactions_list)
new_transactions_df['DEPOSIT AMT'] = new_transactions_df['DEPOSIT AMT'].fillna(0)
new_transactions_df['WITHDRAWAL AMT'] = new_transactions_df['WITHDRAWAL AMT'].fillna(0)

# Average number of new transactions per calendar month.
new_transactions_df['Month'] = new_transactions_df['DATE'].dt.to_period("M")
avg_transactions_per_month_new = new_transactions_df.groupby('Month').size().mean()

# Compare the new monthly volume with the entered account's own history.
# The account number arrives as text from input(), so the stored account
# numbers are compared as text as well.
account_history = df[df['Account No'].astype(str).str.strip() == account_no.strip()]
if account_history.empty:
    minn = maxx = None
    print(f"\n\n No transaction history for account {account_no}; transaction-count pattern not checked.\n\n\n")
else:
    months_in_history = len(account_history['DATE'].dt.to_period("M").unique())
    typical_monthly_count = math.ceil(len(account_history) / months_in_history)
    # Accept a band of +/- 25 % around the historical monthly count.
    minn = math.floor(typical_monthly_count - 0.25 * typical_monthly_count)
    maxx = math.ceil(typical_monthly_count + 0.25 * typical_monthly_count)
    if not (minn <= avg_transactions_per_month_new <= maxx):
        print(f"\n\n Anomaly pattern- Number of transactions is not within the range.\n\n\n")

# Score the new transactions with the trained model.
# NOTE(review): the transform is 1 / (1 - score), not a logistic function, and
# per the scikit-learn documentation decision_function is negative for outliers,
# so this value grows for inliers - confirm the orientation and the 0.3 / 0.15 /
# 0.5 constants below before relying on the label.
predictions_new = best_model.decision_function(new_transactions_df[['WITHDRAWAL AMT', 'DEPOSIT AMT', 'BALANCE AMT']])
anomaly_probability_new = 1 / (1 + (-predictions_new))
overall_anomaly_score_new = anomaly_probability_new.mean()
if overall_anomaly_score_new > 0.3:
    overall_anomaly_score_new -= 0.15

# Label the batch: 1 when the adjusted score exceeds the threshold, else 0.
threshold = 0.5
anomaly_label_new = 1 if overall_anomaly_score_new > threshold else 0
# Report every new transaction together with the batch-level score and a
# rule-based reason derived from the account's own history.
print("INPUT TRANSACTIONS ANALYSIS ")
print("\nNew Transaction Prediction:")

# Stored account numbers as stripped text, computed once. The entered account
# number is text from input(), so comparing against a numeric column would
# never match.
known_account_keys = df['Account No'].astype(str).str.strip()

for idx, row in new_transactions_df.iterrows():
    print(f"Account No: {row['Account No']}")
    print(f"Date: {row['DATE']}")
    print(f"Withdrawal Amount: {row['WITHDRAWAL AMT']}")
    print(f"Deposit Amount: {row['DEPOSIT AMT']}")
    print(f"Balance Amount: {row['BALANCE AMT']}")
    print(f"Overall Anomaly Probability: {overall_anomaly_score_new}")
    print(f"Anomaly Label: {anomaly_label_new}")

    # Historical reference values for the transaction's account.
    current_account_data = df[known_account_keys == str(row['Account No']).strip()]
    avg_withdrawal_amount_current = current_account_data['WITHDRAWAL AMT'].mean()
    avg_deposit_amount_current = current_account_data['DEPOSIT AMT'].mean()
    avg_balance_amount_upper_current = current_account_data['BALANCE AMT'].max()
    avg_balance_amount_lower_current = current_account_data['BALANCE AMT'].min()

    withdrawal = row['WITHDRAWAL AMT']
    deposit = row['DEPOSIT AMT']
    balance = row['BALANCE AMT']

    # Only the first matching reason is reported. An amount of 0 means "no
    # withdrawal / deposit" and is never flagged; otherwise a value outside
    # +/- 25 % of the account's reference value is flagged. The and/or grouping
    # is explicit here; it gives the same result as the ungrouped form as long
    # as the reference averages are not negative.
    if current_account_data.empty:
        # Without history every reference value is NaN and no rule could fire.
        print("Reason: No transaction history available for this account")
    elif withdrawal != 0 and (withdrawal < 0.75 * avg_withdrawal_amount_current
                              or withdrawal > 1.25 * avg_withdrawal_amount_current):
        print("Reason: Anomaly in Withdrawal-amount pattern behavior")
    elif deposit != 0 and (deposit < 0.75 * avg_deposit_amount_current
                           or deposit > 1.25 * avg_deposit_amount_current):
        print("Reason: Anomaly in Deposit-amount pattern behavior")
    elif balance < 0.75 * avg_balance_amount_lower_current or balance > 1.25 * avg_balance_amount_upper_current:
        print("Reason: Anomaly in Balance-amount pattern behavior")
    else:
        print("Reason: No anomaly detected")
    print("\n")
# Data-set wide summary: per account, how many per-feature predictions were
# anomalous versus non-anomalous.
print("DATA-SET ANOMALIES VS NON-ANOMALIES ANALYSIS")

# Attach the per-transaction transformed scores to the new transactions.
new_transactions_df['Anomaly Probability'] = anomaly_probability_new

# Fit the base estimator `model` on each feature separately and store its
# per-row prediction in a '<feature>_anomaly' column. `model` is refitted in
# place on every pass.
feature_columns = ['WITHDRAWAL AMT', 'DEPOSIT AMT', 'BALANCE AMT']
for feature in feature_columns:
    model.fit(df[[feature]])
    predictions = model.predict(df[[feature]])
    df[f'{feature}_anomaly'] = predictions

# A negative prediction counts as an anomaly, anything else as a non-anomaly.
# Count both per row across the three features, then sum per account. This
# replaces a row-by-row loop that produced no result and two groupby/apply
# passes.
anomaly_columns = [f'{name}_anomaly' for name in feature_columns]
anomalies_per_row = (df[anomaly_columns] < 0).sum(axis=1)
non_anomalies_per_row = (df[anomaly_columns] >= 0).sum(axis=1)

total_anomaly_counts = (
    anomalies_per_row.groupby(df['Account No']).sum().reset_index(name='Total Anomalies')
)
total_non_anomaly_counts = (
    non_anomalies_per_row.groupby(df['Account No']).sum().reset_index(name='Total Non-Anomalies')
)

# Combine both counts and express each as a percentage of all predictions.
total_counts = pd.merge(total_anomaly_counts, total_non_anomaly_counts, on='Account No')
predictions_per_account = total_counts['Total Anomalies'] + total_counts['Total Non-Anomalies']
total_counts['Anomaly Percentage'] = (total_counts['Total Anomalies'] / predictions_per_account) * 100
total_counts['Non-Anomaly Percentage'] = (total_counts['Total Non-Anomalies'] / predictions_per_account) * 100

print("\nAccount Anomaly and Non-Anomaly Percentages:")
print(total_counts)