Download dataset scripts FIX + new option to download datasets from benchmark configs #129

Closed
26 changes: 25 additions & 1 deletion README.md
@@ -118,12 +118,36 @@ The configuration of benchmarks allows you to select the frameworks to run, sele
|**[GradientBoostingClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)**|gbt|:x:|:x:|:x:|:x:|:white_check_mark:|
|**[GradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)**|gbt|:x:|:x:|:x:|:x:|:white_check_mark:|

### Scikit-learn benchmakrs
### Scikit-learn benchmarks

When you run scikit-learn benchmarks on CPU, [Intel(R) Extension for Scikit-learn](https://github.com/intel/scikit-learn-intelex) is used by default. Use the ``--no-intel-optimized`` option to run the benchmarks without the extension.
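
For example, here is a minimal sketch of both modes, assuming the repository's `runner.py` entry point (the configuration file name is illustrative):

```bash
# Default on CPU: benchmarks run with Intel(R) Extension for Scikit-learn
python runner.py --configs configs/skl_config.json

# Run the same benchmarks with stock scikit-learn instead
python runner.py --configs configs/skl_config.json --no-intel-optimized
```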

For the algorithms with both CPU and GPU support, you may use the same [configuration file](https://github.com/IntelPython/scikit-learn_bench/blob/master/configs/skl_xpu_config.json) to run the scikit-learn benchmarks on CPU and GPU.


## Downloading Data

All the datasets required by a benchmark can be downloaded separately, ahead of time.

This considerably speeds up the overall execution and avoids any networking
issues that might occur during the experiments.

To download the public datasets included in the benchmark, run:

```bash
DATASETSROOT=/path/to/local/data/folder python -m datasets.load_datasets -d <DS_NAME_1> <DS_NAME_2>
```

Alternatively, you can automatically download all the datasets referenced in
benchmark configuration files:

```bash
DATASETSROOT=/path/to/local/data/folder python -m datasets.load_datasets -c config_1.json config_2.json ...
```

Please refer to the [documentation](./datasets/README.md) for further instructions.

## Algorithm parameters

You can launch benchmarks for each algorithm separately.
43 changes: 43 additions & 0 deletions datasets/README.md
@@ -0,0 +1,43 @@
# Download Datasets for scikit-learn_bench

To download selected public datasets included in the benchmark, please run the following command:

```bash
DATASETSROOT=/path/to/local/download/directory python -m datasets.load_datasets -d <DS_NAME_1> <DS_NAME_2>
```

The script relies on the `DATASETSROOT` environment variable to indicate the local path
where datasets will be automatically downloaded.

Alternatively, you can export this variable in your shell environment **before** running the script:

```shell
export DATASETSROOT=/path/to/download/directory
```

## Important Note

Please **do not** run the `load_datasets` script from within the `datasets` folder: it will fail
due to relative imports.

Instead, execute the `load_datasets` script from the repository's _main_ folder, using the [`-m`](https://docs.python.org/3/using/cmdline.html#cmdoption-m) option of the Python interpreter.
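
For example, a minimal sketch of both cases (the dataset name `mnist` is illustrative; see `--list` below for the actual names):

```bash
# From the repository root: works
DATASETSROOT=./data python -m datasets.load_datasets -d mnist

# From within the datasets folder: fails due to relative imports
cd datasets
DATASETSROOT=../data python load_datasets.py -d mnist
```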


## List of available datasets

To list all the datasets included in the benchmark, use the `--list` option:

```bash
python -m datasets.load_datasets --list
```
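
The printed names are the identifiers accepted by the `-d` option, one per line. For example (abridged; the exact list depends on the registered loaders):

```
a9a
abalone
airline
...
```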

## Download datasets included in configuration files

It is also possible to gather the list of datasets to download directly from
benchmark configuration files, using the `--configs` (`-c`) option:

```bash
DATASETSROOT=/path/to/download/dir python -m datasets.load_datasets -c config_1.json config_2.json ...
```

This method overrides the `-d` option and is highly recommended when
running multiple benchmark experiments.
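
For reference, dataset names are gathered from the `cases` entries of each configuration file, and synthetic datasets are skipped since they are generated at run time. A minimal sketch of the relevant JSON structure (field values here are illustrative):

```json
{
  "cases": [
    {
      "dataset": [
        {"source": "npy", "name": "higgs1m"},
        {"source": "synthetic", "type": "classification"}
      ]
    }
  ]
}
```

In this sketch only `higgs1m` would be scheduled for download.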
158 changes: 129 additions & 29 deletions datasets/load_datasets.py
@@ -18,23 +18,71 @@
import logging
import os
import sys
import json
from pathlib import Path
from typing import Callable, Dict

from .loader_classification import (a_nine_a, airline, airline_ohe, bosch,
                                    census, cifar_binary, codrnanorm, covtype_binary, creditcard,
                                    epsilon, epsilon_16K, epsilon_30K, epsilon_80K, epsilon_100K,
                                    fraud, gisette, hepmass_150K,
                                    higgs, higgs_one_m, higgs_150K, ijcnn, klaverjas,
                                    santander, skin_segmentation, susy)
from .loader_multiclass import (cifar_10, connect, covertype, covtype, letters, mlsr,
                                mnist, msrank, plasticc, sensit)
from .loader_regression import (abalone, california_housing, fried, higgs_10500K,
                                medical_charges_nominal, mortgage_first_q,
                                twodplanes, year_prediction_msd, yolanda, airline_regression)
from .loader_clustering import (cifar_cluster, epsilon_50K_cluster, higgs_one_m_clustering,
                                hepmass_1M_cluster, hepmass_10K_cluster, mnist_10K_cluster,
                                road_network_20K_cluster, susy_cluster)
from .loader_classification import (
    a_nine_a,
    airline,
    airline_ohe,
    bosch,
    census,
    cifar_binary,
    codrnanorm,
    covtype_binary,
    creditcard,
    epsilon,
    epsilon_16K,
    epsilon_30K,
    epsilon_80K,
    epsilon_100K,
    fraud,
    gisette,
    hepmass_150K,
    higgs,
    higgs_one_m,
    higgs_150K,
    ijcnn,
    klaverjas,
    santander,
    skin_segmentation,
    susy,
)
from .loader_multiclass import (
    cifar_10,
    connect,
    covertype,
    covtype,
    letters,
    mlsr,
    mnist,
    msrank,
    plasticc,
    sensit,
)
from .loader_regression import (
    abalone,
    california_housing,
    fried,
    higgs_10500K,
    medical_charges_nominal,
    mortgage_first_q,
    twodplanes,
    year_prediction_msd,
    yolanda,
    airline_regression,
)
from .loader_clustering import (
    cifar_cluster,
    epsilon_50K_cluster,
    higgs_one_m_clustering,
    hepmass_1M_cluster,
    hepmass_10K_cluster,
    mnist_10K_cluster,
    road_network_20K_cluster,
    susy_cluster,
)

dataset_loaders: Dict[str, Callable[[Path], bool]] = {
"a9a": a_nine_a,
Expand Down Expand Up @@ -101,31 +149,83 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool:
logging.warning(f"Internal error loading dataset:\n{ex}")
return False
else:
logging.warning(f"There is no script to download the dataset: {dataset_name}. "
"You need to add a dataset or script to load it.")
logging.warning(
f"There is no script to download the dataset: {dataset_name}. "
"You need to add a dataset or script to load it."
)
return False


if __name__ == '__main__':
def extract_dataset_names(config_file: str) -> set[str]:
    """Collect the names of all non-synthetic datasets referenced in a benchmark configuration file."""
    with open(config_file) as json_config_file:
        experiment = json.load(json_config_file)

    if "cases" not in experiment:
        return set()

    datasets = []
    for case in experiment["cases"]:
        if "dataset" not in case:
            continue
        for ds in case["dataset"]:
            # Synthetic datasets are generated at run time, so there is nothing to download.
            if ds.get("source") == "synthetic" or "name" not in ds:
                continue
            datasets.append(ds["name"])
    return set(datasets)  # remove duplicates
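
# Example (illustrative): for a configuration file whose "cases" reference the
# datasets "higgs1m" and "airline-ohe", extract_dataset_names would return
# {"higgs1m", "airline-ohe"}; synthetic entries are ignored.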


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Use \'-d\' or \'--datasets\' option to enumerate '
'dataset(s) that should be downloaded')
parser.add_argument('-l', '--list', action='store_const',
const=True, help='The list of available datasets')
parser.add_argument('-d', '--datasets', type=str, nargs='*',
help='The datasets that should be downloaded.')
description="Utility to download selected publicly available datasets "
"included in the benchmark."
)
parser.add_argument(
"-l",
"--list",
action="store_const",
const=True,
help="The list of available datasets",
)
parser.add_argument(
"-d",
"--datasets",
type=str,
nargs="*",
help="The datasets that should be downloaded.",
)
parser.add_argument(
"-c",
"--configs",
type=str,
nargs="*",
help="The benchmark configuration file(s) to gather dataset name(s) to download.",
)
args = parser.parse_args()

    if args.list:
        for key in dataset_loaders:
            print(key)
        sys.exit(0)

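    # The download destination is read from the required DATASETSROOT
    # environment variable (a KeyError is raised if it is not set).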
    root_dir = Path(os.environ['DATASETSROOT'])
    root_dir = Path(os.environ["DATASETSROOT"])

    if args.datasets is not None:
        for val in dataset_loaders.values():
            val(root_dir)
    if args.datasets is None and args.configs is None:
        logging.warning("No dataset specified: use '-d' or '-c' to select dataset(s) to download.")
    else:
        logging.warning(
            'Warning: Enumerate dataset(s) that should be downloaded')
        if args.configs:
            print(f"Dataset name(s) to download will be gathered from: {args.configs}")
            ds_names: set[str] = set()
            for config_file in args.configs:
                ds_names = ds_names.union(extract_dataset_names(config_file))
        else:
            ds_names = set(args.datasets)
        print(
            f"{len(ds_names)} dataset{'s' if len(ds_names) != 1 else ''} requested for download"
        )
        print(f"Download location: {root_dir}")

        for i, name in enumerate(sorted(ds_names), start=1):
            print(f'{i}. Dataset "{name}"')
            downloaded = try_load_dataset(name, root_dir)
            if downloaded:
                print(f'Dataset "{name}" successfully downloaded.')