# f1_calculation.py

import pandas as pd
import os.path


def check_filepath(filename):
    """
    Confirm that `filename` refers to an existing regular file.

    Returns True when os.path.isfile accepts the path; otherwise raises a
    plain Exception whose message names the missing file.
    """
    # Guard clause: fail first, so the success path is the fall-through.
    if not os.path.isfile(filename):
        raise Exception(f"File {filename} doesn't exist\n")
    return True


def read_unstructured_csv(filename):
    """
    Parse a comma-separated mapping file into a nested dictionary.

    The first line of the file is treated as a header and discarded. Every
    other line is stripped of its newline and surrounding spaces and split
    on ','.  The first field becomes the outer key.  When the second field is
    non-empty it becomes the single inner key, mapped to the list of the
    remaining fields (kept verbatim, including any spaces around them);
    otherwise the outer key maps to an empty dict.

    Blank lines are skipped, a line holding only an ID maps to {}, and an
    empty file yields {} -- none of these raise IndexError.

    filename: path of the file to read.
    Returns: dict of {id: {first_file: [other_files, ...]}} or {id: {}}.
    Raises: OSError (from open) when the file cannot be read.
    """
    with open(filename, 'r') as file:
        # Slicing drops the header line and, unlike pop(0), tolerates an
        # empty file.
        lines = file.readlines()[1:]

    mapping_dictionary = {}
    for line in lines:
        line = line.strip('\n').strip(' ')
        if not line:
            # Nothing to record for a blank line (e.g. trailing newline).
            continue
        idx, *rest = line.split(',')
        mapping_dictionary[idx] = {}
        # `rest` is empty when the line has no comma at all; check that
        # before looking at its first element.
        if rest and rest[0]:
            mapping_dictionary[idx] = {rest[0]: rest[1:]}
    return mapping_dictionary


def intersection_of_lists(ti, ta):
    """
    Return the elements that occur in both `ti` and `ta`.

    The result is a list built from a set, so duplicates are removed and the
    order of the returned elements is not guaranteed.
    """
    unique_ti = set(ti)
    shared = unique_ti.intersection(ta)
    return list(shared)


def get_excel_data(db_handler, sheet_name):
    """
    Read one sheet of the database and return its "ATT&CK Technique" column.

    db_handler: whatever pandas.read_excel accepts as its first argument
        (calculate_score passes a pandas.ExcelFile).
    sheet_name: the sheet to read.

    When the sheet's header row does not contain "ATT&CK Technique", rows
    with missing values are dropped and the first remaining row is promoted
    to be the header before the column is looked up.

    Returns: the column values as a list.  If the column still cannot be
    located, a message is printed and an empty list is returned (previously
    the function fell through and returned None, which broke callers that
    take len() of the result).
    Raises: errors from pandas.read_excel itself are not caught here.
    """
    df = pd.read_excel(db_handler, sheet_name)
    try:
        if "ATT&CK Technique" not in df.columns:
            # The real header is not on the first row: drop incomplete rows
            # and use the first surviving row as the column names.
            promoted = df.dropna().reset_index(drop=True)
            promoted.columns = promoted.iloc[0].tolist()
            df = promoted[1:]

        return df["ATT&CK Technique"].tolist()
    except Exception:
        # Best-effort by design: report and hand back "no techniques".
        print("Exception occur while reading the excel sheet")
        return []


def cal_f1(num_matches, num_input, num_apt):
    """
    Compute the F1 score from raw counts.

    num_matches: number of items common to both collections.
    num_input: size of the input collection (denominator of precision).
    num_apt: size of the reference collection (denominator of recall).

    Returns 0 when there are no matches, otherwise the harmonic mean of
    precision and recall as a float.
    """
    # No overlap means a zero score; returning early also avoids 0 / 0 below.
    if num_matches == 0:
        return 0

    precision = num_matches / num_input
    recall = num_matches / num_apt

    numerator = 2 * recall * precision
    denominator = recall + precision
    return numerator / denominator


def _read_technique_sheets(path):
    """Parse the mapping file at `path` into {technique_id: [sheet names]}."""
    with open(path, 'r') as file:
        # Slicing drops the header line and, unlike pop(0), tolerates an
        # empty file.
        lines = file.readlines()[1:]

    mapping = {}
    for line in lines:
        line = line.strip('\n').strip(' ')
        if not line:
            # Blank lines carry no technique.
            continue
        idx, *rest = line.split(',')
        # The second field is discarded; only the fields after it are used as
        # sheet names.  An empty or absent second field means "no sheets".
        mapping[idx] = rest[1:] if rest and rest[0] else []
    return mapping


def calculate_score(database_filename, sheet_name):
    """
    Driver that uses the helper functions to compute an F1 score (as an
    integer percentage) per database sheet and returns the best three.

    database_filename: Excel workbook opened with pandas.ExcelFile; every
        sheet in it starts with a score of 0.
    sheet_name: despite the name, this is the path of a comma-separated text
        file that is opened and parsed here.  Its first line is discarded as
        a header; on each other line the first field is a technique ID, the
        second field is dropped and the remaining fields are used as sheet
        names of the workbook.

    Returns: dict {sheet: score} with at most the three highest-scoring
        sheets, ordered from highest to lowest score.
    Raises: whatever open / pandas raise for unreadable files or for a sheet
        name that is not present in the workbook.
    """
    # The context manager closes the workbook once scoring is finished.
    with pd.ExcelFile(database_filename) as db_xls:
        mapping_dictionary = _read_technique_sheets(sheet_name)

        # All technique IDs found in the first column of the mapping file.
        ti = list(mapping_dictionary)

        f1_score = {sheet: 0 for sheet in db_xls.sheet_names}

        # A sheet's score depends only on `ti` and the sheet contents, so each
        # referenced sheet is read and scored once, however many techniques
        # mention it.
        scored = set()
        for sheet_list in mapping_dictionary.values():
            for sheet in sheet_list:
                if sheet in scored:
                    continue
                scored.add(sheet)
                # get_excel_data may hand back nothing usable when the
                # technique column is missing; treat that as "no techniques".
                ta = get_excel_data(db_handler=db_xls, sheet_name=sheet) or []
                common_elements = intersection_of_lists(ti, ta)
                # NOTE(review): len(ta) counts every row of the column,
                # duplicates included -- confirm that is the intended recall
                # denominator.
                f1_score[sheet] = int(
                    cal_f1(len(common_elements), len(ti), len(ta)) * 100
                )

    # Keep the three best sheets, highest score first.
    ranked = sorted(f1_score.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:3])