-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathf1_calculation.py
114 lines (92 loc) · 3.62 KB
/
f1_calculation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import pandas as pd
import os.path
def check_filepath(filename):
    """Confirm that *filename* refers to an existing regular file.

    Args:
        filename: Path of the file to check.

    Returns:
        True when the path exists and is a regular file.

    Raises:
        FileNotFoundError: If the path does not exist or is not a regular
            file (for example, a directory). FileNotFoundError is a subclass
            of Exception, so callers that catch Exception keep working.
    """
    if not os.path.isfile(filename):
        raise FileNotFoundError(f"File {filename} doesn't exist\n")
    return True
def read_unstructured_csv(filename):
    """Parse a comma-separated report into a nested dictionary.

    The first line of the file is treated as a header and discarded. Every
    following line is split on commas: the first field becomes the outer
    key, the second field (when non-empty) becomes the single inner key, and
    the remaining fields form the inner value list. Fields are used exactly
    as they appear between commas; only the line as a whole is stripped of
    a trailing newline and surrounding spaces.

    Args:
        filename: Path of the text file to read.

    Returns:
        dict mapping each first field to ``{second_field: [rest, ...]}``, or
        to an empty dict when the line has no non-empty second field.
        An empty or header-only file yields an empty dict.
    """
    with open(filename, 'r') as file:
        # Slicing (instead of pop(0)) drops the header without raising
        # IndexError when the file is completely empty.
        lines = file.readlines()[1:]

    mapping_dictionary = {}
    for line in lines:
        fields = line.strip('\n').strip(' ').split(',')
        idx = fields.pop(0)
        if not idx and not fields:
            # Blank line: nothing to record.
            continue
        if fields and fields[0]:
            # The second field is the inner key; whatever is left is the value.
            inner_key = fields.pop(0)
            mapping_dictionary[idx] = {inner_key: fields}
        else:
            # Line with only an ID, or with an empty second field.
            mapping_dictionary[idx] = {}
    return mapping_dictionary
def intersection_of_lists(ti, ta):
    """Return the distinct elements that appear in both *ti* and *ta*.

    The result is a list built from a set, so duplicates are collapsed and
    the order of the returned elements is not defined.
    """
    shared = set(ti)
    # Keep only the members of ti that are also present in ta.
    shared.intersection_update(ta)
    return list(shared)
def get_excel_data(db_handler, sheet_name):
    """Return the "ATT&CK Technique" column of one sheet as a list.

    Args:
        db_handler: Anything ``pd.read_excel`` accepts as its first argument
            (this file passes a ``pd.ExcelFile``).
        sheet_name: Name of the sheet to read.

    Returns:
        list of the values in the "ATT&CK Technique" column. If the column
        cannot be located, a message is printed and an empty list is
        returned, so callers can always take ``len()`` of the result.
        Errors raised by ``pd.read_excel`` itself (for example, an unknown
        sheet name) are not caught and propagate to the caller.
    """
    df = pd.read_excel(db_handler, sheet_name=sheet_name)
    try:
        if "ATT&CK Technique" not in df.columns:
            # The header is not on the first row: drop every row that has an
            # empty cell and promote the first remaining row to column names.
            tempdf = df.dropna().reset_index(drop=True)
            tempdf.columns = tempdf.iloc[0].tolist()
            df = tempdf[1:]
        return df["ATT&CK Technique"].tolist()
    except Exception:
        # Deliberate best-effort: a sheet without the expected column is
        # reported and treated as containing no techniques.
        print("Exception occur while reading the excel sheet")
        return []
def cal_f1(num_matches, num_input, num_apt):
    """
    Compute the F1 score (harmonic mean of precision and recall).

    num_matches: number of elements common to both collections
    num_input: size of the input collection (denominator of precision)
    num_apt: size of the APT collection (denominator of recall)

    Returns the integer 0 when there are no matches, otherwise a float.
    """
    if num_matches != 0:
        apt_ratio = num_matches / num_apt      # recall
        input_ratio = num_matches / num_input  # precision
        return 2 * apt_ratio * input_ratio / (apt_ratio + input_ratio)
    # No matches: return before any division takes place.
    return 0
def calculate_score(database_filename, sheet_name):
    """
    calculate_score: driver function that uses the helper functions to
    calculate an F1 score for the sheets of the database workbook.

    database_filename: Excel workbook whose sheets are scored.
    sheet_name: despite its name, this is the path of a comma-separated text
        report that is opened and parsed here (per the original author, the
        file produced by generate_report.py). Its first line is a header;
        on every other line the first field is a technique ID, the second
        field is discarded and the remaining fields are sheet names.

    Returns a dict with at most three entries, mapping sheet name to an
    integer score (F1 * 100, truncated), ordered from highest to lowest.
    Sheets that no report line references keep a score of 0.
    """
    # The context manager closes the workbook handle even if scoring fails.
    with pd.ExcelFile(database_filename) as db_xls:
        database_sheets_list = db_xls.sheet_names

        with open(sheet_name, 'r') as file:
            # Slicing (instead of pop(0)) drops the header without raising
            # IndexError when the report is completely empty.
            lines = file.readlines()[1:]

        # technique ID -> list of sheet names that reference it
        mapping_dictionary = {}
        for line in lines:
            # NOTE(review): fields are split on ',' only, so a ", " separator
            # would leave a leading space on each sheet name - confirm the
            # report format.
            fields = line.strip('\n').strip(' ').split(',')
            idx = fields.pop(0)
            if not idx and not fields:
                # Blank line: nothing to record.
                continue
            if fields and fields[0]:
                # Skip the second field; the rest are sheet names.
                mapping_dictionary[idx] = fields[1:]
            else:
                mapping_dictionary[idx] = []

        # All technique IDs found in the first column of the report.
        ti = list(mapping_dictionary.keys())

        f1_score = {sheet: 0 for sheet in database_sheets_list}

        # A sheet's score depends only on ti and the sheet's own content, so
        # each referenced sheet is read and scored once, however many
        # report lines mention it.
        scored_sheets = set()
        for sheet_list in mapping_dictionary.values():
            for referenced_sheet in sheet_list:
                if referenced_sheet in scored_sheets:
                    continue
                scored_sheets.add(referenced_sheet)
                # Tolerate a helper that yields None when it cannot read
                # the technique column.
                ta = get_excel_data(db_handler=db_xls, sheet_name=referenced_sheet) or []
                common_elements = intersection_of_lists(ti, ta)
                f1_score[referenced_sheet] = int(
                    cal_f1(len(common_elements), len(ti), len(ta)) * 100
                )

    # Keep the three best-scoring sheets; sorted() is stable, so ties stay
    # in their original insertion order.
    ranked = sorted(f1_score.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:3])