From 482628b79a15e6f3506f4c60232a791765540d6f Mon Sep 17 00:00:00 2001 From: 0x2b3bfa0 <0x2b3bfa0+git@googlemail.com> Date: Fri, 15 Sep 2023 09:19:01 +0200 Subject: [PATCH] Expose img2dataset distributor --- download_upstream.py | 16 ++++++++++++++++ environment.yml | 4 ++-- environment_osx.yml | 4 ++-- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/download_upstream.py b/download_upstream.py index 8c0780f..ca45291 100644 --- a/download_upstream.py +++ b/download_upstream.py @@ -143,6 +143,20 @@ def cleanup_dir(path): default="datacomp", help="Name of W&B project used (default datacomp)", ) + parser.add_argument( + "--distributor", + type=str, + required=False, + default="multiprocessing", + help="Distributor to use for img2dataset", + ) + parser.add_argument( + "--subjob_size", + type=int, + required=False, + default=1000, + help="Subjob size for img2dataset", + ) args = parser.parse_args() @@ -235,6 +249,8 @@ def cleanup_dir(path): retries=args.retries, enable_wandb=args.enable_wandb, wandb_project=args.wandb_project, + distributor=args.distributor, + subjob_size=args.subjob_size, ) else: print(f"Skipping image data download.") diff --git a/environment.yml b/environment.yml index 4704218..2a41727 100644 --- a/environment.yml +++ b/environment.yml @@ -85,7 +85,7 @@ dependencies: - huggingface-hub==0.14.1 - idna==3.4 - imageio==2.22.4 - - img2dataset==1.40.0 + - img2dataset==1.42.0 - importlib-resources==5.10.0 - isodate==0.6.1 - jmespath==1.0.1 @@ -138,7 +138,7 @@ dependencies: - python-dateutil==2.8.2 - pytz==2022.6 - pywavelets==1.4.1 - - pyyaml==5.4.1 + - pyyaml==6.0.1 - qudida==0.0.4 - regex==2022.10.31 - requests==2.28.1 diff --git a/environment_osx.yml b/environment_osx.yml index d558dc5..a2d0355 100644 --- a/environment_osx.yml +++ b/environment_osx.yml @@ -79,7 +79,7 @@ dependencies: - huggingface-hub==0.14.1 - idna==3.4 - imageio==2.22.4 - - img2dataset==1.40.0 + - img2dataset==1.42.0 - importlib-resources==5.10.0 - isodate==0.6.1 - jmespath==1.0.1 @@ -128,7 +128,7 @@ dependencies: - python-dateutil==2.8.2 - pytz==2022.6 - pywavelets==1.4.1 - - pyyaml==5.4.1 + - pyyaml==6.0.1 - qudida==0.0.4 - regex==2022.10.31 - requests==2.28.1