Merge pull request #493 from dice-group/retrieval_eval_fixes

fixes memory issue by writing csv to disk and sample ratio issue for …
dice-group · Nov 16, 2024 · b2782e4 · b2782e4
2 parents 77fae24 + 718a1b8
commit b2782e4
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 24 deletions.
diff --git a/examples/retrieval_eval.py b/examples/retrieval_eval.py
@@ -64,7 +64,12 @@
 import random
 import itertools
 import ast
-
+# Set pandas options to ensure full output
+pd.set_option('display.max_rows', None)
+pd.set_option('display.max_columns', None)
+pd.set_option('display.width', None)
+pd.set_option('display.colheader_justify', 'left')
+pd.set_option('display.expand_frame_repr', False)
 
 def execute(args):
     # (1) Initialize knowledge base.
@@ -86,9 +91,9 @@ def execute(args):
     object_properties = sorted({i for i in symbolic_kb.get_object_properties()})
 
     # (3.1) Subsample if required.
-    if args.ratio_sample_object_prop:
+    if args.ratio_sample_object_prop and len(object_properties) > 0:
         object_properties = {i for i in random.sample(population=list(object_properties),
-                                                      k=max(0, int(len(object_properties) * args.ratio_sample_object_prop)))}
+                                                      k=max(1, int(len(object_properties) * args.ratio_sample_object_prop)))}
 
     object_properties = set(object_properties)    
 
@@ -103,9 +108,9 @@ def execute(args):
 
 
 
-    if args.ratio_sample_nc:
+    if args.ratio_sample_nc and len(nc) > 0:
         # (6.1) Subsample if required.
-        nc = {i for i in random.sample(population=list(nc), k=max(0, int(len(nc) * args.ratio_sample_nc)))}
+        nc = {i for i in random.sample(population=list(nc), k=max(1, int(len(nc) * args.ratio_sample_nc)))}
 
     nc = set(nc) # return to a set
     # (7) NC⁻: Complement of NC.
@@ -211,7 +216,10 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]:
     # () Shuffled the data so that the progress bar is not influenced by the order of concepts.
 
     random.shuffle(concepts)
-
+    # Check whether the CSV report already exists and delete it, because we want to overwrite it.
+    if os.path.exists(args.path_report):
+        os.remove(args.path_report)
+    file_exists = False
     # () Iterate over single OWL Class Expressions in ALCQIHO
     for expression in (tqdm_bar := tqdm(concepts, position=0, leave=True)):
         retrieval_y: Set[str]
@@ -225,8 +233,8 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]:
         # () Compute the F1-score.
         f1_sim = f1_set_similarity(retrieval_y, retrieval_neural_y)
         # () Store the data.
-        data.append(
-            {
+        df_row = pd.DataFrame(
+            [{
                 "Expression": owl_expression_to_dl(expression),
                 "Type": type(expression).__name__,
                 "Jaccard Similarity": jaccard_sim,
@@ -235,34 +243,35 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]:
                 "Runtime Neural": runtime_neural_y,
                 "Symbolic_Retrieval": retrieval_y,
                 "Symbolic_Retrieval_Neural": retrieval_neural_y,
-            }
-        )
+            }])
+        # Append the row to the CSV file
+        df_row.to_csv(args.path_report, mode='a', header=not file_exists, index=False)
+        file_exists = True
         # () Update the progress bar.
         tqdm_bar.set_description_str(
             f"Expression: {owl_expression_to_dl(expression)} | Jaccard Similarity:{jaccard_sim:.4f} | F1 :{f1_sim:.4f} | Runtime Benefits:{runtime_y - runtime_neural_y:.3f}"
         )
     # () Read the data into pandas dataframe
-    df = pd.DataFrame(data)
-    assert df["Jaccard Similarity"].mean() >= args.min_jaccard_similarity
-    # () Save the experimental results into csv file.
-    df.to_csv(args.path_report)
-    del df
-    # () Load the saved CSV file.
     df = pd.read_csv(args.path_report, index_col=0, converters={'Symbolic_Retrieval': lambda x: ast.literal_eval(x),
-                                                                'Symbolic_Retrieval_Neural': lambda x: ast.literal_eval(
-                                                                    x)})
-    # () A retrieval result can be parsed into  set of instances to python object.
+                                                                'Symbolic_Retrieval_Neural': lambda x: ast.literal_eval(x)})
+    # () Assert that the mean Jaccard Similarity meets the threshold
+    assert df["Jaccard Similarity"].mean() >= args.min_jaccard_similarity
+
+    # () Ensure 'Symbolic_Retrieval_Neural' contains sets
     x = df["Symbolic_Retrieval_Neural"].iloc[0]
     assert isinstance(x, set)
-    # () Extract the numerical features.
+
+    # () Extract numerical features
     numerical_df = df.select_dtypes(include=["number"])
-    # () Extract the type of owl concepts
+
+    # () Group by the type of OWL concepts
     df_g = df.groupby(by="Type")
     print(df_g["Type"].count())
+
+    # () Compute mean of numerical columns per group
     mean_df = df_g[numerical_df.columns].mean()
     print(mean_df)
 
-
 def get_default_arguments():
     parser = ArgumentParser()
     parser.add_argument("--path_kg", type=str, default="KGs/Family/father.owl")

diff --git a/examples/retrieval_eval_under_incomplete.py b/examples/retrieval_eval_under_incomplete.py
@@ -113,7 +113,7 @@ def execute(args):
         data = []
 
         if args.sample == "Yes":
-            subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path, "--ratio_sample_nc","0.1", "--ratio_sample_object_prob", "0.2", "--path_report", path_report])
+            subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path, "--ratio_sample_nc","0.1", "--ratio_sample_object_prop", "0.2", "--path_report", path_report])
         else:
             subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path, "--path_report", path_report])
 
@@ -235,7 +235,7 @@ def get_default_arguments():
     parser.add_argument("--path_kg", type=str, default="KGs/Family/family-benchmark_rich_background.owl")
     parser.add_argument("--seed", type=int, default=1)
     parser.add_argument("--ratio_sample_nc", type=float, default=None, help="To sample OWL Classes.")
-    parser.add_argument("--ratio_sample_object_prob", type=float, default=None, help="To sample OWL Object Properties.")
+    parser.add_argument("--ratio_sample_object_prop", type=float, default=None, help="To sample OWL Object Properties.")
     parser.add_argument("--path_report", type=str, default="ALCQHI_Retrieval_Incomplete_Results.csv")
     parser.add_argument("--number_of_subgraphs", type=int, default=1)
     parser.add_argument("--ratio", type=float, default=0.1, \