Skip to content

Commit

Permalink
Add verification of copy
Browse files (browse the repository at this point in the history)
  • Loading branch information
moradology committed Jun 12, 2024
1 parent 1c9321a commit 5ee286d
Showing 1 changed file with 13 additions and 1 deletion.
14 changes: 13 additions & 1 deletion pangeo_forge_recipes/storage.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -28,12 +28,16 @@ def _copy_btw_filesystems(input_opener, output_opener, BLOCK_SIZE=10_000_000):
start = time.time()
interval = 5 # seconds
bytes_read = log_count = 0
bytes_written = 0

while True:
data = source.read(BLOCK_SIZE)
if not data:
break
target.write(data)
write_len = target.write(data)
bytes_read += len(data)
bytes_written += write_len

elapsed = time.time() - start
throughput = bytes_read / elapsed
if elapsed // interval >= log_count:
Expand All @@ -42,6 +46,14 @@ def _copy_btw_filesystems(input_opener, output_opener, BLOCK_SIZE=10_000_000):
f"avg throughput over {elapsed/60:.2f} min: {throughput/1e6:.2f} MB/sec"
)
log_count += 1

# Validate data size after copy
if bytes_read != bytes_written:
error_message = (
f"Mismatch in data sizes, read {bytes_read} bytes but wrote {bytes_written} bytes."
)
logger.error(error_message)
raise ValueError(error_message)
except Exception as e:
logger.error(f"Failed during file copy after reading {bytes_read} bytes: {e}")
raise e
Expand Down

0 comments on commit 5ee286d

Please sign in to comment.