Release 1.3.0 with added DeepScores dataset

apacha · Jun 10, 2020 · 6d7a5e3 · 6d7a5e3
1 parent 6935f4b
commit 6d7a5e3
Show file tree

Hide file tree

Showing 5 changed files with 50 additions and 15 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,6 +1,12 @@
 Changelog
 =========
 
+1.3.0
+-----
+Added the ability to download the DeepScores V1 dataset with extended
+vocabulary. The downloader is now open: you can download custom datasets and
+call helper methods that were previously private.
+
 1.2.2
 -----
 Fixed incorrect import statement in `__init__.py`

diff --git a/omrdatasettools/Downloader.py b/omrdatasettools/Downloader.py
@@ -27,19 +27,33 @@ def download_and_extract_dataset(self, dataset: OmrDataset, destination_director
         >>> downloader.download_and_extract_dataset(OmrDataset.HOMUS_V2, "data")
 
         """
-        if not os.path.exists(dataset.get_dataset_filename()):
-            print("Downloading {0} dataset...".format(dataset.name))
-            self.__download_file(dataset.get_dataset_download_url(), dataset.get_dataset_filename())
-
-        print("Extracting {0} dataset...".format(dataset.name))
-        self.__extract_dataset(os.path.abspath(destination_directory), dataset.get_dataset_filename())
+        self.download_and_extract_custom_dataset(dataset.name, dataset.get_dataset_download_url(),
+                                                 dataset.get_dataset_filename(), destination_directory)
 
         if dataset is OmrDataset.Fornes:
             self.__fix_capital_file_endings(os.path.join(os.path.abspath(destination_directory), "Music_Symbols"))
 
         if dataset in [OmrDataset.MuscimaPlusPlus_V1, OmrDataset.MuscimaPlusPlus_V2]:
             self.__download_muscima_pp_images(dataset, destination_directory)
 
+    def download_and_extract_custom_dataset(self, dataset_name: str, dataset_url: str, dataset_filename: str,
+                                            destination_directory: str):
+        """ Starts the download of a custom dataset and extracts it into the specified directory.
+
+        Examples
+        --------
+        >>> from omrdatasettools import Downloader
+        >>> downloader = Downloader()
+        >>> downloader.download_and_extract_custom_dataset("MyNewOmrDataset", "https://example.org/dataset.zip", "dataset.zip", "data/MyNewOmrDataset")
+
+        """
+        if not os.path.exists(dataset_filename):
+            print("Downloading {0} dataset...".format(dataset_name))
+            self.download_file(dataset_url, dataset_filename)
+
+        print("Extracting {0} dataset...".format(dataset_name))
+        self.extract_dataset(os.path.abspath(destination_directory), dataset_filename)
+
     def download_images_from_mei_annotation(self, dataset: OmrDataset, dataset_directory: str, base_url: str):
         """ Crawls the images of an Edirom dataset, if provided with the respective URL. To avoid repetitive crawling,
             this URL has to be provided manually. If you are interested in these datasets, please contact the authors.
@@ -84,15 +98,15 @@ def __download_edirom_images(self, base, base_url, source):
     def __download_muscima_pp_images(self, dataset: OmrDataset, destination_directory: str):
         # Automatically download the images and measure annotations with the MUSCIMA++ dataset
         muscima_pp_images_filename = dataset.dataset_file_names()["MuscimaPlusPlus_Images"]
-        self.__download_file(dataset.dataset_download_urls()["MuscimaPlusPlus_Images"], muscima_pp_images_filename)
+        self.download_file(dataset.dataset_download_urls()["MuscimaPlusPlus_Images"], muscima_pp_images_filename)
         absolute_path_to_temp_folder = os.path.abspath('MuscimaPpImages')
-        self.__extract_dataset(absolute_path_to_temp_folder, muscima_pp_images_filename)
+        self.extract_dataset(absolute_path_to_temp_folder, muscima_pp_images_filename)
         if dataset is OmrDataset.MuscimaPlusPlus_V1:
             target_folder = os.path.join(os.path.abspath(destination_directory), "v1.0", "data", "images")
         if dataset is OmrDataset.MuscimaPlusPlus_V2:
             target_folder = os.path.join(os.path.abspath(destination_directory), "v2.0", "data", "images")
-        self.__copytree(os.path.join(absolute_path_to_temp_folder, "fulls"), target_folder)
-        self.__clean_up_temp_directory(absolute_path_to_temp_folder)
+        self.copytree(os.path.join(absolute_path_to_temp_folder, "fulls"), target_folder)
+        self.clean_up_temp_directory(absolute_path_to_temp_folder)
 
     def __fix_capital_file_endings(self, absolute_path_to_temp_folder):
         image_with_capital_file_ending = [y for x in os.walk(absolute_path_to_temp_folder) for y in
@@ -101,7 +115,7 @@ def __fix_capital_file_endings(self, absolute_path_to_temp_folder):
             os.rename(image, image[:-3] + "bmp")
 
     @staticmethod
-    def __copytree(src, dst):
+    def copytree(src, dst):
         if not os.path.exists(dst):
             os.makedirs(dst)
         for item in os.listdir(src):
@@ -114,18 +128,18 @@ def __copytree(src, dst):
                     shutil.copy2(s, d)
 
     @staticmethod
-    def __extract_dataset(absolute_path_to_folder: str, dataset_filename: str):
+    def extract_dataset(absolute_path_to_folder: str, dataset_filename: str):
         archive = ZipFile(dataset_filename, "r")
         archive.extractall(absolute_path_to_folder)
         archive.close()
 
     @staticmethod
-    def __clean_up_temp_directory(temp_directory):
+    def clean_up_temp_directory(temp_directory):
         print("Deleting temporary directory {0}".format(temp_directory))
         shutil.rmtree(temp_directory, ignore_errors=True)
 
     @staticmethod
-    def __download_file(url, destination_filename=None) -> str:
+    def download_file(url, destination_filename=None) -> str:
         u = urllib2.urlopen(url)
         scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
         filename = os.path.basename(path)

diff --git a/omrdatasettools/OmrDataset.py b/omrdatasettools/OmrDataset.py
@@ -71,6 +71,9 @@ class OmrDataset(Enum):
     #: The Rebelo dataset (part 2) with music symbols from http://www.inescporto.pt/~arebelo/index.php, Copyright 2017 by Ana Rebelo under CC BY-SA 4.0 license
     Rebelo2 = auto()
 
+    #: The DeepScores dataset (version 1) with extended vocabulary from https://tuggeluk.github.io/downloads/, License unspecified.
+    DeepScores_V1_Extended = auto()
+
     def get_dataset_download_url(self) -> str:
         """ Returns the url of the selected dataset.
             Example usage: OmrDataset.Fornes.get_dataset_download_url() """
@@ -132,6 +135,8 @@ def dataset_download_urls(self) -> Dict[str, str]:
 
             "Rebelo1": "https://github.com/apacha/OMR-Datasets/releases/download/datasets/Rebelo-Music-Symbol-Dataset1.zip",
             "Rebelo2": "https://github.com/apacha/OMR-Datasets/releases/download/datasets/Rebelo-Music-Symbol-Dataset2.zip",
+
+            "DeepScores_V1_Extended": "https://repository.cloudlab.zhaw.ch/artifactory/deepscores/ds_extended.zip"
         }
 
     def dataset_file_names(self) -> Dict[str, str]:
@@ -156,4 +161,5 @@ def dataset_file_names(self) -> Dict[str, str]:
             "Printed": "PrintedMusicSymbolsDataset.zip",
             "Rebelo1": "Rebelo-Music-Symbol-Dataset1.zip",
             "Rebelo2": "Rebelo-Music-Symbol-Dataset2.zip",
+            "DeepScores_V1_Extended": "ds_extended.zip"
         }
diff --git a/omrdatasettools/_version.py b/omrdatasettools/_version.py
@@ -1 +1 @@
-__version__ = '1.2.2'
+__version__ = '1.3.0'
diff --git a/omrdatasettools/tests/test_downloader.py b/omrdatasettools/tests/test_downloader.py
@@ -204,6 +204,15 @@ def test_download_of_rebelo2_dataset(self):
         self.download_dataset_and_verify_correct_extraction(destination_directory, number_of_samples_in_the_dataset,
                                                             target_file_extension, dataset)
 
+    def test_download_of_deepscores_dataset(self):
+        destination_directory = "DeepScoresV1Extended"
+        dataset = OmrDataset.DeepScores_V1_Extended
+        number_of_samples_in_the_dataset = 3408
+        target_file_extension = "*.png"
+
+        self.download_dataset_and_verify_correct_extraction(destination_directory, number_of_samples_in_the_dataset,
+                                                            target_file_extension, dataset)
+
     def download_dataset_and_verify_correct_extraction(self: unittest.TestCase, destination_directory: str,
                                                        number_of_samples_in_the_dataset: int,
                                                        target_file_extension: str, dataset: OmrDataset):