Merge branch 'olgalidev' into master
olgaliak authored Oct 20, 2018
2 parents 9b65da1 + aa83c9c commit 1d35180
Showing 5 changed files with 64 additions and 20 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -58,7 +58,9 @@ This will include cloning of https://github.com/tensorflow/models. (On my machine
_"python_file_directory"_ config value should point to the _"train"_ scripts from this project.
Example:
`python_file_directory=/home/olgali/repos/models/research/active-learning-detect/train`
3) pip install azure-blob packages: azure.storage.blob
4) pip install azure.storage.blob
pip install opencv-python
pip install pandas

### Tagger machine(s) (could be same as Linux box or separate boxes\vms)
1) Have Python 3.6 up and running.
1 change: 1 addition & 0 deletions config.ini
@@ -6,6 +6,7 @@ label_container_name=activelearninglabels
# IMAGE INFORMATION
user_folders=True
classes=knots,date
ideal_class_balance=0.7,0.3
filetype=*.png
# TAGGING MACHINE
tagging_location=C:\Users\t-yapand\Desktop\NewTag
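For reference, the new `ideal_class_balance` value is a comma-separated list of fractions that is expected to line up one-to-one with `classes` (here 70% knots, 30% date). Below is a minimal sketch of how the two config strings combine into per-class row quotas, assuming the flat `config_file` mapping the scripts use; the `quotas` dict is illustrative and not part of the repo:

```python
# Minimal sketch (illustrative, not from the repo): derive per-class quotas
# from the two comma-separated config strings.
classes = "knots,date".split(",")                    # config_file["classes"]
ideal_class_balance = "0.7,0.3".split(",")           # config_file["ideal_class_balance"]

num_rows = 20  # how many images to hand to the tagger this round
quotas = {cls: int(num_rows * float(frac))
          for cls, frac in zip(classes, ideal_class_balance)}
print(quotas)  # {'knots': 14, 'date': 6}
```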
59 changes: 49 additions & 10 deletions tag/download_vott_json.py
@@ -29,6 +29,7 @@ def make_vott_output(all_predictions, output_location, user_folders, image_loc,
output_location = Path(output_location)/folder_name
else:
output_location = Path(output_location)/"Images"

output_location.mkdir(parents=True, exist_ok=True)
using_blob_storage = blob_credentials is not None
if using_blob_storage:
@@ -108,20 +109,51 @@ def make_vott_output(all_predictions, output_location, user_folders, image_loc,
with open(str(output_location)+".json","w") as json_out:
json.dump(dirjson, json_out)

def get_top_rows(file_location, num_rows, user_folders, pick_max):
def get_top_rows(file_location, num_rows, user_folders, pick_max, tag_names, ideal_class_balance):
if ideal_class_balance is not None and len(tag_names) != len(ideal_class_balance):
print("WARNING: Mismatch between number of classes:{} and ideal_class_balance: {}".format(tag_names,
ideal_class_balance))
print("Defaulting to ideal_class_balance: None")
ideal_class_balance = None

with (file_location/"totag.csv").open(mode='r') as file:
reader = csv.reader(file)
header = next(reader)
csv_list = list(reader)
all_files_per_class = {}
if user_folders:
all_files = defaultdict(lambda: defaultdict(list))
for row in csv_list:
all_files[row[FOLDER_LOCATION]][row[0]].append(row)
all_lists = []
class_balances_cnt = 1
if ideal_class_balance is not None:
class_balances_cnt = len(ideal_class_balance)
if pick_max:
for folder_name in all_files:
all_lists.append(nlargest(num_rows, all_files[folder_name].values(), key=lambda x:float(x[0][CONFIDENCE_LOCATION])))
top_rows = max(all_lists,key=lambda x:sum(float(row[0][CONFIDENCE_LOCATION]) for row in x))
for k, v in all_files[folder_name].items():
v_arr = np.array(v)
classes = v_arr[:, TAG_LOCATION]
for i in range(class_balances_cnt):
class_i = tag_names[i]
if class_i in classes:
if class_i not in all_files_per_class:
#all_files_per_class[class_i] = {}
all_files_per_class[class_i] = []
#all_files_per_class[class_i][k] = v
all_files_per_class[class_i].append(v)
break;


for i in range(class_balances_cnt):
num_rows_i = int(num_rows * float(ideal_class_balance[i]))
class_i = tag_names[i]
top = nlargest(num_rows_i, all_files_per_class[class_i],
key=lambda x: float(x[0][CONFIDENCE_LOCATION]))
all_lists = all_lists + top
#all_lists.append(nlargest(num_rows, all_files[folder_name].values(), key=lambda x:float(x[0][CONFIDENCE_LOCATION])))
#top_rows = max(all_lists,key=lambda x:sum(float(row[0][CONFIDENCE_LOCATION]) for row in x))
top_rows = all_lists
else:
for folder_name in all_files:
all_lists.append(nsmallest(num_rows, all_files[folder_name].values(), key=lambda x:float(x[0][CONFIDENCE_LOCATION])))
@@ -145,13 +177,16 @@ def get_top_rows(file_location, num_rows, user_folders, pick_max):
(tagging_writer if row[0] in tagging_files else untagged_writer).writerow(row)
return top_rows

def create_vott_json(file_location, num_rows, user_folders, pick_max, image_loc, output_location, blob_credentials=None, tag_names = ["stamp"], max_tags_per_pixel=None):
all_files = get_top_rows(file_location, num_rows, user_folders, pick_max)
def create_vott_json(file_location, num_rows, user_folders, pick_max, image_loc, output_location, blob_credentials=None,
tag_names = ["stamp"], max_tags_per_pixel=None, ideal_class_balance=None):
all_files = get_top_rows(file_location, num_rows, user_folders, pick_max, tag_names, ideal_class_balance)
# The tag_colors list generates random colors for each tag. To ensure that these colors stand out / are easy to see on a picture, the colors are generated
# in the hls format, with the random numbers biased towards a high luminosity (>=.8) and saturation (>=.75).
make_vott_output(all_files, output_location, user_folders, image_loc, blob_credentials=blob_credentials, tag_names=tag_names,
tag_colors=['#%02x%02x%02x' % (int(256*r), int(256*g), int(256*b)) for
r,g,b in [colorsys.hls_to_rgb(random.random(),0.8 + random.random()/5.0, 0.75 + random.random()/4.0) for _ in tag_names]], max_tags_per_pixel=max_tags_per_pixel)
colors = ['#%02x%02x%02x' % (int(256*r), int(256*g), int(256*b)) for
r,g,b in [colorsys.hls_to_rgb(random.random(),0.8 + random.random()/5.0, 0.75 + random.random()/4.0) for _ in tag_names]]

make_vott_output(all_files, output_location, user_folders, image_loc, blob_credentials=blob_credentials,
tag_names=tag_names, tag_colors=colors, max_tags_per_pixel=max_tags_per_pixel)

if __name__ == "__main__":
#create_vott_json(r"C:\Users\t-yapand\Desktop\GAUCC1_1533070087147.csv",20, True, r"C:\Users\t-yapand\Desktop\GAUCC", r"C:\Users\t-yapand\Desktop\Output\GAUCC")
@@ -177,10 +212,14 @@ def create_vott_json(file_location, num_rows, user_folders, pick_max, image_loc,
block_blob_service.get_blob_to_path(container_name, max(file_date, key=lambda x:x[1])[0], str(csv_file_loc/"totag.csv"))
container_name = config_file["image_container_name"]
file_date = [(blob.name, blob.properties.last_modified) for blob in block_blob_service.list_blobs(container_name) if re.match(r'tagging_(.*).csv', blob.name)]
ideal_class_balance = config_file["ideal_class_balance"].split(",")
if file_date:
block_blob_service.get_blob_to_path(container_name, max(file_date, key=lambda x:x[1])[0], str(csv_file_loc/"tagging.csv"))
create_vott_json(csv_file_loc, int(sys.argv[1]), config_file["user_folders"]=="True", config_file["pick_max"]=="True", "", config_file["tagging_location"],
blob_credentials=(block_blob_service, container_name), tag_names=config_file["classes"].split(","), max_tags_per_pixel=config_file.get("max_tags_per_pixel",None))
create_vott_json(csv_file_loc, int(sys.argv[1]), config_file["user_folders"]=="True", config_file["pick_max"]=="True", "",
config_file["tagging_location"], blob_credentials=(block_blob_service, container_name),
tag_names=config_file["classes"].split(","),
max_tags_per_pixel=config_file.get("max_tags_per_pixel"),
ideal_class_balance =config_file.get("ideal_class_balance").split(","))
container_name = config_file["label_container_name"]
block_blob_service.create_blob_from_path(container_name, "{}_{}.{}".format("tagging",int(time.time() * 1000),"csv"), str(csv_file_loc/"tagging.csv"))
block_blob_service.create_blob_from_path(container_name, "{}_{}.{}".format("totag",int(time.time() * 1000),"csv"), str(csv_file_loc/"totag.csv"))
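
In short, the new code path in `get_top_rows` groups candidate images by the class of their prediction and lets each class contribute about `num_rows * ideal_class_balance[i]` of the highest-confidence rows. The sketch below is a simplified illustration of that idea, assuming plain `(filename, class, confidence)` tuples rather than the CSV rows and per-folder grouping the real function works with:

```python
from heapq import nlargest

def pick_balanced(rows, num_rows, tag_names, ideal_class_balance):
    """Simplified stand-in for the balanced selection in get_top_rows."""
    # Group candidate rows by their predicted class.
    per_class = {name: [] for name in tag_names}
    for row in rows:
        if row[1] in per_class:
            per_class[row[1]].append(row)
    # Give each class a share of num_rows and take its most confident rows.
    picked = []
    for name, frac in zip(tag_names, ideal_class_balance):
        quota = int(num_rows * float(frac))
        picked += nlargest(quota, per_class[name], key=lambda r: r[2])
    return picked

rows = [("a.png", "knots", 0.91), ("b.png", "date", 0.88),
        ("c.png", "knots", 0.40), ("d.png", "date", 0.35),
        ("e.png", "knots", 0.75)]
print(pick_balanced(rows, 4, ["knots", "date"], ["0.7", "0.3"]))
# -> two "knots" rows and one "date" row, mirroring the 0.7/0.3 split
```

Note that the commit applies this balancing only on the `pick_max` branch; when `pick_max` is False, the diff keeps the original per-folder `nsmallest` selection.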
8 changes: 4 additions & 4 deletions tag/upload_vott_json.py
@@ -95,11 +95,11 @@ def select_jsons(image_directory, user_folders, file_location):
csv_file_loc = Path(config_file["tagging_location"])
file_date = [(blob.name, blob.properties.last_modified) for blob in block_blob_service.list_blobs(container_name) if re.match(r'tagged_(.*).csv', blob.name)]
if file_date:
block_blob_service.get_blob_to_path(container_name, max(file_date, key=lambda x:x[1])[0], csv_file_loc/"tagged.csv")
block_blob_service.get_blob_to_path(container_name, max(file_date, key=lambda x:x[1])[0], str(csv_file_loc/"tagged.csv"))
file_date = [(blob.name, blob.properties.last_modified) for blob in block_blob_service.list_blobs(container_name) if re.match(r'tagging_(.*).csv', blob.name)]
if file_date:
block_blob_service.get_blob_to_path(container_name, max(file_date, key=lambda x:x[1])[0], csv_file_loc/"tagging.csv")
block_blob_service.get_blob_to_path(container_name, max(file_date, key=lambda x:x[1])[0], str(csv_file_loc/"tagging.csv"))
#TODO: Ensure this parses folder recursively when given tagging location. Remove the .json part
select_jsons(config_file["tagging_location"],config_file["user_folders"]=="True",csv_file_loc)
block_blob_service.create_blob_from_path(container_name, "{}_{}.{}".format("tagged",int(time.time() * 1000),"csv"), csv_file_loc/"tagged.csv")
block_blob_service.create_blob_from_path(container_name, "{}_{}.{}".format("tagging",int(time.time() * 1000),"csv"), csv_file_loc/"tagging.csv")
block_blob_service.create_blob_from_path(container_name, "{}_{}.{}".format("tagged",int(time.time() * 1000),"csv"), str(csv_file_loc/"tagged.csv"))
block_blob_service.create_blob_from_path(container_name, "{}_{}.{}".format("tagging",int(time.time() * 1000),"csv"), str(csv_file_loc/"tagging.csv"))
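
The upload_vott_json.py change is purely mechanical: the `Path` objects are now wrapped in `str()` before being passed to `get_blob_to_path` / `create_blob_from_path`, matching how download_vott_json.py already calls the `azure.storage.blob` `BlockBlobService` API. A small sketch of the pattern (the account credentials, container name, and local folder are placeholders):

```python
from pathlib import Path
from azure.storage.blob import BlockBlobService  # azure-storage-blob API used by this repo

csv_file_loc = Path("/tmp/tagging")  # placeholder local folder
block_blob_service = BlockBlobService(account_name="myaccount", account_key="...")  # placeholder credentials

# Pass plain string paths to the SDK, as the commit now does.
block_blob_service.get_blob_to_path("labels", "tagged_1540000000000.csv",
                                    str(csv_file_loc / "tagged.csv"))
```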
12 changes: 7 additions & 5 deletions test/test_download_vott_json.py
@@ -27,8 +27,9 @@ def setUp(self):
shutil.copyfile("./totag_source.csv", str(self.csv_file_loc / "totag.csv"))

self.csv_file_loc.mkdir(parents=True, exist_ok=True)
#self.ideal_class_balance = self.config_file["ideal_class_balance"].split(",")
self.max_tags_per_pixel = self.config_file.get("max_tags_per_pixel")

self.ideal_class_balance = self.config_file["ideal_class_balance"].split(",")
self. max_tags_per_pixel = self.config_file.get("max_tags_per_pixel")
self.tag_names = self.config_file["classes"].split(",")
self.user_folders = self.config_file["user_folders"] == "True"
self.pick_max = self.config_file["pick_max"] == "True"
@@ -41,18 +42,19 @@ def test_get_top_rows(self):
N_ROWS = 3
N_FILES = 3
all_files = get_top_rows(self.csv_file_loc, N_ROWS, self.user_folders ,
self.pick_max)
self.pick_max, self.tag_names, self.ideal_class_balance)
self.assertEqual(len(all_files), N_FILES)

def test_create_vott_json(self):
N_ROWS = 3
N_FILES = 3

FOLDER_NAME = "board_images_png"
create_vott_json(self.csv_file_loc, N_ROWS, self.user_folders ,
self.pick_max, "",
self.tagging_location, blob_credentials=None,
tag_names= self.tag_names,
max_tags_per_pixel=self. max_tags_per_pixel)
max_tags_per_pixel=self. max_tags_per_pixel ,
ideal_class_balance=self.ideal_class_balance)

res_folder = os.path.join(self.tagging_location, FOLDER_NAME)
res_immages_cnt = sum([len(files) for r, d, files in os.walk(res_folder)])
