update handling of missing dict keys

ssciwr · Jun 13, 2024 · d8d740f · d8d740f
1 parent be918b5
commit d8d740f
Show file tree

Hide file tree

Showing 6 changed files with 31 additions and 57 deletions.
diff --git a/ammico/faces.py b/ammico/faces.py
@@ -340,6 +340,7 @@ def clean_subdict(self, result: dict) -> dict:
                         if confidence_value > self.emotion_threshold and not wears_mask
                         else None
                     )
+                    print("emotion confidence", confidence_value, outcome)
                     # also set the emotion category
                     if outcome:
                         self.subdict["emotion (category)"].append(

diff --git a/ammico/notebooks/DemoNotebook_ammico.ipynb b/ammico/notebooks/DemoNotebook_ammico.ipynb
@@ -29,9 +29,10 @@
     "    # install setuptools\n",
     "    # %pip install setuptools==61 -qqq\n",
     "    # uninstall some pre-installed packages due to incompatibility\n",
-    "    %pip uninstall --yes tensorflow-probability dopamine-rl lida pandas-gbq torchaudio torchdata torchtext orbax-checkpoint flex-y -qqq\n",
+    "    %pip uninstall --yes tensorflow-probability dopamine-rl lida pandas-gbq torchaudio torchdata torchtext orbax-checkpoint flex-y jax jaxlib -qqq\n",
     "    # install ammico\n",
     "    %pip install git+https://github.com/ssciwr/ammico.git -qqq\n",
+    "    # install older version of jax to support transformers use of diffusers\n",
     "    # mount google drive for data and API key\n",
     "    from google.colab import drive\n",
     "\n",
@@ -95,6 +96,9 @@
    "outputs": [],
    "source": [
     "import os\n",
+    "# jax also sometimes leads to problems on google colab\n",
+    "# if this is the case, try restarting the kernel and executing this \n",
+    "# and the above two code cells again\n",
     "import ammico\n",
     "# for displaying a progress bar\n",
     "from tqdm import tqdm"
@@ -254,8 +258,9 @@
    "outputs": [],
    "source": [
     "for num, key in tqdm(enumerate(image_dict.keys()), total=len(image_dict)):    # loop through all images\n",
+    "    print(image_dict[key])\n",
     "    image_dict[key] = ammico.EmotionDetector(image_dict[key]).analyse_image() # analyse image with EmotionDetector and update dict\n",
-    "    \n",
+    "    print(image_dict[key])\n",
     "    if num % dump_every == 0 or num == len(image_dict) - 1:      # save results every dump_every to dump_file\n",
     "        image_df = ammico.get_dataframe(image_dict)\n",
     "        image_df.to_csv(dump_file)"

diff --git a/ammico/test/data/example_faces.json b/ammico/test/data/example_faces.json
@@ -18,8 +18,8 @@
         "wears_mask": ["No", "No"],
         "gender": ["Man", "Man"], 
         "race": ["asian", "white"], 
-        "emotion": [null, "angry"], 
-        "emotion (category)": [null, "Negative"]
+        "emotion": [null, null], 
+        "emotion (category)": [null, null]
         },
 "pexels-maksgelatin-4750169":
         {

diff --git a/ammico/test/test_faces.py b/ammico/test/test_faces.py
@@ -101,7 +101,7 @@ def test_analyse_faces(get_path, monkeypatch):
     for key in mydict.keys():
         mydict[key].update(
             fc.EmotionDetector(
-                mydict[key], emotion_threshold=60, accept_disclosure="OTHER_VAR"
+                mydict[key], emotion_threshold=80, accept_disclosure="OTHER_VAR"
             ).analyse_image()
         )
 

diff --git a/ammico/test/test_utils.py b/ammico/test/test_utils.py
@@ -90,30 +90,19 @@ def test_check_for_missing_keys():
         "file2": {"faces": "No", "text_english": "Otherthing"},
     }
     # check that dict is not changed
-    mydict2 = ut.check_for_missing_keys(mydict)
+    mydict2 = ut._check_for_missing_keys(mydict)
     assert mydict2 == mydict
     # check that dict is updated if key is missing
     mydict = {
         "file1": {"faces": "Yes", "text_english": "Something"},
         "file2": {"faces": "No"},
     }
-    mydict2 = ut.check_for_missing_keys(mydict)
+    mydict2 = ut._check_for_missing_keys(mydict)
     assert mydict2["file2"] == {"faces": "No", "text_english": None}
     # check that dict is updated if more than one key is missing
     mydict = {"file1": {"faces": "Yes", "text_english": "Something"}, "file2": {}}
-    mydict2 = ut.check_for_missing_keys(mydict)
+    mydict2 = ut._check_for_missing_keys(mydict)
     assert mydict2["file2"] == {"faces": None, "text_english": None}
-    # now test the exceptions
-    with pytest.raises(ValueError):
-        ut.check_for_missing_keys({"File": "path"})
-    with pytest.raises(ValueError):
-        ut.check_for_missing_keys({"File": {}})
-    mydict = {
-        "file1": {"faces": "Yes"},
-        "file2": {"faces": "No", "text_english": "Something"},
-    }
-    with pytest.raises(ValueError):
-        ut.check_for_missing_keys(mydict)
 
 
 def test_append_data_to_dict(get_path):

diff --git a/ammico/utils.py b/ammico/utils.py
@@ -156,52 +156,31 @@ def initialize_dict(filelist: list) -> dict:
     return mydict
 
 
-def check_for_missing_keys(mydict: dict) -> dict:
+def _check_for_missing_keys(mydict: dict) -> dict:
     """Check the nested dictionary for any missing keys in the subdicts.
 
     Args:
         mydict(dict): The nested dictionary with keys to check.
     Returns:
         dict: The dictionary with keys appended."""
     # check that we actually got a nested dict
-    if not isinstance(mydict[next(iter(mydict))], dict):
-        raise ValueError(
-            "Please provide a nested dictionary - you provided {}".format(
-                next(iter(mydict))
+    # and collect the union of the keys of all subdicts (in first-seen order);
+    # the dictionary is traversed twice, but compared to the rest of the
+    # code this is negligible
+    all_keys = {}
+    for key in mydict.keys():
+        if not isinstance(mydict[key], dict):
+            raise ValueError(
+                "Please provide a nested dictionary - you provided {}".format(key)
             )
-        )
-    # gather all existing keys of first item in a list
-    subdict = mydict[next(iter(mydict))]
-    if len(list(subdict.keys())) < 1:
-        raise ValueError(
-            "Could not get any keys to compare to - please check if your nested dict is empty!"
-        )
+        all_keys.update(dict.fromkeys(mydict[key]))
+    # set every key that a subdict lacks to None; using the union instead of
+    # the longest key list also handles subdicts with differing key sets of
+    # equal length and an empty input dictionary
     for key in mydict.keys():
-        # compare keys of next item with first item
-        if subdict.keys() != mydict[key].keys():
-            # print a warning if key is not found and set to None
-            keys_a = set(subdict.keys())
-            keys_b = set(mydict[key].keys())
-            missing_keys_in_b = keys_a - keys_b
-            if missing_keys_in_b:
-                print(
-                    "Found missing key(s) {} in subdict {} - setting to None.".format(
-                        missing_keys_in_b, key
-                    )
-                )
-                for missing_key in missing_keys_in_b:
-                    mydict[key][missing_key] = None
-            # check that there are no other keys in the subdicts -
-            # this would only happen if there is a key missing in the first subdict
-            # then we would need to start over so best to
-            # abort if this happens - this is a very unlikely case
-            missing_keys_in_a = keys_b - keys_a
-            if missing_keys_in_a:
-                raise ValueError(
-                    "Could not update missing keys - first item already missing {}".format(
-                        missing_keys_in_a
-                    )
-                )
+        for mkey in all_keys:
+            if mkey not in mydict[key]:
+                mydict[key][mkey] = None
     return mydict
 
 
@@ -223,7 +202,7 @@ def dump_df(mydict: dict) -> DataFrame:
 
 
 def get_dataframe(mydict: dict) -> DataFrame:
-    check_for_missing_keys(mydict)
+    _check_for_missing_keys(mydict)
     outdict = append_data_to_dict(mydict)
     return dump_df(outdict)