diff --git a/python/idsse_common/idsse/common/aws_utils.py b/python/idsse_common/idsse/common/aws_utils.py index 9e25ffc..2646fc4 100644 --- a/python/idsse_common/idsse/common/aws_utils.py +++ b/python/idsse_common/idsse/common/aws_utils.py @@ -65,19 +65,40 @@ def aws_ls(self, path: str, prepend_path: bool = True) -> Sequence[str]: return [os.path.join(path, filename.split(' ')[-1]) for filename in commands_result] return [filename.split(' ')[-1] for filename in commands_result] - def aws_cp(self, path: str, dest: str) -> bool: - """Execute an 'cp' on the AWS s3 bucket specified by path, dest + def aws_cp(self, + path: str, + dest: str, + concurrency: int | None = None, + chunk_size: int | None = None) -> bool: + """Execute an 'cp' on the AWS s3 bucket specified by path, dest. Attempts to use + [s5cmd](https://github.com/peak/s5cmd) to copy the file from S3 with parallelization, + but falls back to (slower) aws-cli if s5cmd is not installed or throws an error. Args: path (str): Relative or Absolute path to the object to be copied dest (str): The destination location + concurrency (optional, int): Number of parallel threads for s5cmd to use to copy + the file down from AWS (may be helpful to tweak for large files). + Default is None (s5cmd default). + chunk_size (optional, int): Size of chunks (in MB) for s5cmd to split up the source AWS + S3 file so it can download quicker with more threads. + Default is None (s5cmd default). Returns: bool: Returns True if copy is successful """ try: - logger.debug('First attempt with s5cmd') - commands = ['s5cmd', '--no-sign-request', 'cp', path, dest] + logger.debug('First attempt with s5cmd, concurrency: %d, chunk_size: %s', + concurrency, chunk_size) + commands = ['s5cmd', '--no-sign-request', 'cp'] + + # if concurrency and/or chunk_size options were provided, append to s5cmd before paths + if concurrency: + commands += ['--concurrency', concurrency] + if chunk_size: + commands += ['--part_size', chunk_size] + commands += [path, dest] # finish the command list with the src and destination + exec_cmd(commands) return True except FileNotFoundError: