defer to s5cmd cp defaults, instead of freezing defaults in this library

NOAA-GSL · Sep 9, 2024 · e325731 · e325731
1 parent ab5675d
commit e325731
Showing 1 changed file with 15 additions and 6 deletions.
diff --git a/python/idsse_common/idsse/common/aws_utils.py b/python/idsse_common/idsse/common/aws_utils.py
@@ -68,8 +68,8 @@ def aws_ls(self, path: str, prepend_path: bool = True) -> Sequence[str]:
     def aws_cp(self,
                path: str,
                dest: str,
-               concurrency: int  = 5,
-               chunk_size: int = 50) -> bool:
+               concurrency: int | None = None,
+               chunk_size: int | None = None) -> bool:
         """Execute a 'cp' on the AWS s3 bucket specified by path, dest. Attempts to use
         [s5cmd](https://github.com/peak/s5cmd) to copy the file from S3 with parallelization,
         but falls back to (slower) aws-cli if s5cmd is not installed or throws an error.
@@ -78,18 +78,27 @@ def aws_cp(self,
             path (str): Relative or Absolute path to the object to be copied
             dest (str): The destination location
             concurrency (optional, int): Number of parallel threads for s5cmd to use to copy
-                the file down from AWS (may be helpful to tweak for large files). Default is 5.
+                the file down from AWS (may be helpful to tweak for large files).
+                Default is None (s5cmd default).
             chunk_size (optional, int): Size of chunks (in MB) for s5cmd to split up the source AWS
-                S3 file so it can download quicker with more threads. Default is 50 MB.
+                S3 file so it can download quicker with more threads.
+                Default is None (s5cmd default).
 
         Returns:
             bool: Returns True if copy is successful
         """
         try:
+            commands = ['s5cmd', '--no-sign-request',  'cp']
             logger.debug('First attempt with s5cmd, concurrency: %d, chunk_size: %s',
                          concurrency, chunk_size)
-            commands = ['s5cmd', '--no-sign-request',  'cp', '--concurrency', concurrency,
-                        '--part_size', chunk_size, path, dest]
+
+            # if concurrency and/or chunk_size options were provided, append to s5cmd before paths
+            if concurrency:
+                commands += ['--concurrency', concurrency]
+            if chunk_size:
+                commands += ['--part_size', chunk_size]
+            commands += [path, dest]  # finish the command list with the src and destination
+
             exec_cmd(commands)
             return True
         except FileNotFoundError: