From 262364c92606cfb420774b016d6fda16f11a20cd Mon Sep 17 00:00:00 2001 From: Luke Friedrichs Date: Sat, 16 Nov 2024 13:44:36 +0100 Subject: [PATCH 1/2] fixes memory issue by writing csv to disk and sample ratio issue for small datasets (in retrieval_eval.py) --- examples/retrieval_eval.py | 53 ++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/examples/retrieval_eval.py b/examples/retrieval_eval.py index a7228933..43d62f68 100644 --- a/examples/retrieval_eval.py +++ b/examples/retrieval_eval.py @@ -64,7 +64,12 @@ import random import itertools import ast - +# Set pandas options to ensure full output +pd.set_option('display.max_rows', None) +pd.set_option('display.max_columns', None) +pd.set_option('display.width', None) +pd.set_option('display.colheader_justify', 'left') +pd.set_option('display.expand_frame_repr', False) def execute(args): # (1) Initialize knowledge base. @@ -86,9 +91,9 @@ def execute(args): object_properties = sorted({i for i in symbolic_kb.get_object_properties()}) # (3.1) Subsample if required. - if args.ratio_sample_object_prop: + if args.ratio_sample_object_prop and len(object_properties) > 0: object_properties = {i for i in random.sample(population=list(object_properties), - k=max(0, int(len(object_properties) * args.ratio_sample_object_prop)))} + k=max(1, int(len(object_properties) * args.ratio_sample_object_prop)))} object_properties = set(object_properties) @@ -103,9 +108,9 @@ def execute(args): - if args.ratio_sample_nc: + if args.ratio_sample_nc and len(nc) > 0: # (6.1) Subsample if required. - nc = {i for i in random.sample(population=list(nc), k=max(0, int(len(nc) * args.ratio_sample_nc)))} + nc = {i for i in random.sample(population=list(nc), k=max(1, int(len(nc) * args.ratio_sample_nc)))} nc = set(nc) # return to a set # (7) NC⁻: Complement of NC. 
@@ -211,7 +216,10 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: # () Shuffled the data so that the progress bar is not influenced by the order of concepts. random.shuffle(concepts) - + # Check if the CSV already exists and delete it, because we want to overwrite it + if os.path.exists(args.path_report): + os.remove(args.path_report) + file_exists = False # () Iterate over single OWL Class Expressions in ALCQIHO for expression in (tqdm_bar := tqdm(concepts, position=0, leave=True)): retrieval_y: Set[str] @@ -225,8 +233,8 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: # () Compute the F1-score. f1_sim = f1_set_similarity(retrieval_y, retrieval_neural_y) # () Store the data. - data.append( - { + df_row = pd.DataFrame( + [{ "Expression": owl_expression_to_dl(expression), "Type": type(expression).__name__, "Jaccard Similarity": jaccard_sim, @@ -235,34 +243,35 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: "Runtime Neural": runtime_neural_y, "Symbolic_Retrieval": retrieval_y, "Symbolic_Retrieval_Neural": retrieval_neural_y, - } - ) + }]) + # Append the row to the CSV file + df_row.to_csv(args.path_report, mode='a', header=not file_exists, index=False) + file_exists = True # () Update the progress bar. tqdm_bar.set_description_str( f"Expression: {owl_expression_to_dl(expression)} | Jaccard Similarity:{jaccard_sim:.4f} | F1 :{f1_sim:.4f} | Runtime Benefits:{runtime_y - runtime_neural_y:.3f}" ) # () Read the data into pandas dataframe - df = pd.DataFrame(data) - assert df["Jaccard Similarity"].mean() >= args.min_jaccard_similarity - # () Save the experimental results into csv file. - df.to_csv(args.path_report) - del df - # () Load the saved CSV file. df = pd.read_csv(args.path_report, index_col=0, converters={'Symbolic_Retrieval': lambda x: ast.literal_eval(x), - 'Symbolic_Retrieval_Neural': lambda x: ast.literal_eval( - x)}) - # () A retrieval result can be parsed into set of instances to python object. 
+ 'Symbolic_Retrieval_Neural': lambda x: ast.literal_eval(x)}) + # () Assert that the mean Jaccard Similarity meets the threshold + assert df["Jaccard Similarity"].mean() >= args.min_jaccard_similarity + + # () Ensure 'Symbolic_Retrieval_Neural' contains sets x = df["Symbolic_Retrieval_Neural"].iloc[0] assert isinstance(x, set) - # () Extract the numerical features. + + # () Extract numerical features numerical_df = df.select_dtypes(include=["number"]) - # () Extract the type of owl concepts + + # () Group by the type of OWL concepts df_g = df.groupby(by="Type") print(df_g["Type"].count()) + + # () Compute mean of numerical columns per group mean_df = df_g[numerical_df.columns].mean() print(mean_df) - def get_default_arguments(): parser = ArgumentParser() parser.add_argument("--path_kg", type=str, default="KGs/Family/father.owl") From 718a1b80b826cb325e5e51db70cb88089b7ac888 Mon Sep 17 00:00:00 2001 From: Luke Friedrichs Date: Sat, 16 Nov 2024 13:54:37 +0100 Subject: [PATCH 2/2] fixed typo --- examples/retrieval_eval_under_incomplete.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/retrieval_eval_under_incomplete.py b/examples/retrieval_eval_under_incomplete.py index 35b3437c..500480a0 100644 --- a/examples/retrieval_eval_under_incomplete.py +++ b/examples/retrieval_eval_under_incomplete.py @@ -113,7 +113,7 @@ def execute(args): data = [] if args.sample == "Yes": - subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path, "--ratio_sample_nc","0.1", "--ratio_sample_object_prob", "0.2", "--path_report", path_report]) + subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path, "--ratio_sample_nc","0.1", "--ratio_sample_object_prop", "0.2", "--path_report", path_report]) else: subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path, "--path_report", path_report]) @@ -235,7 +235,7 @@ def get_default_arguments(): parser.add_argument("--path_kg", type=str, 
default="KGs/Family/family-benchmark_rich_background.owl") parser.add_argument("--seed", type=int, default=1) parser.add_argument("--ratio_sample_nc", type=float, default=None, help="To sample OWL Classes.") - parser.add_argument("--ratio_sample_object_prob", type=float, default=None, help="To sample OWL Object Properties.") + parser.add_argument("--ratio_sample_object_prop", type=float, default=None, help="To sample OWL Object Properties.") parser.add_argument("--path_report", type=str, default="ALCQHI_Retrieval_Incomplete_Results.csv") parser.add_argument("--number_of_subgraphs", type=int, default=1) parser.add_argument("--ratio", type=float, default=0.1, \