Skip to content

Commit

Permalink
defer to s5cmd cp defaults, instead of freezing defaults in this library
Browse files Browse the repository at this point in the history
  • Loading branch information
mackenzie-grimes-noaa committed Sep 9, 2024
1 parent ab5675d commit e325731
Showing 1 changed file with 15 additions and 6 deletions.
21 changes: 15 additions & 6 deletions python/idsse_common/idsse/common/aws_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ def aws_ls(self, path: str, prepend_path: bool = True) -> Sequence[str]:
def aws_cp(self,
path: str,
dest: str,
concurrency: int = 5,
chunk_size: int = 50) -> bool:
concurrency: int | None = None,
chunk_size: int | None = None) -> bool:
"""Execute a 'cp' on the AWS s3 bucket specified by path, dest. Attempts to use
[s5cmd](https://github.com/peak/s5cmd) to copy the file from S3 with parallelization,
but falls back to (slower) aws-cli if s5cmd is not installed or throws an error.
Expand All @@ -78,18 +78,27 @@ def aws_cp(self,
path (str): Relative or Absolute path to the object to be copied
dest (str): The destination location
concurrency (optional, int): Number of parallel threads for s5cmd to use to copy
the file down from AWS (may be helpful to tweak for large files). Default is 5.
the file down from AWS (may be helpful to tweak for large files).
Default is None (s5cmd default).
chunk_size (optional, int): Size of chunks (in MB) for s5cmd to split up the source AWS
S3 file so it can download quicker with more threads. Default is 50 MB.
S3 file so it can download quicker with more threads.
Default is None (s5cmd default).
Returns:
bool: Returns True if copy is successful
"""
try:
commands = ['s5cmd', '--no-sign-request', 'cp']
logger.debug('First attempt with s5cmd, concurrency: %d, chunk_size: %s',
concurrency, chunk_size)
commands = ['s5cmd', '--no-sign-request', 'cp', '--concurrency', concurrency,
'--part_size', chunk_size, path, dest]

# if concurrency and/or chunk_size options were provided, append to s5cmd before paths
if concurrency:
commands += ['--concurrency', concurrency]
if chunk_size:
commands += ['--part_size', chunk_size]
commands += [path, dest] # finish the command list with the src and destination

exec_cmd(commands)
return True
except FileNotFoundError:
Expand Down

0 comments on commit e325731

Please sign in to comment.