Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatic checking of hashes for transferred files #21

Merged
merged 4 commits into from
Jan 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions creed/file_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# adapted from https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
import hashlib
import os
from ark.utils import io_utils, misc_utils

import warnings


def get_hash(filepath):
    """Computes the hash of the specified file to verify file integrity

    Args:
        filepath (str | PathLike): full path to file

    Returns:
        str: the blake2b hex digest of the file's contents
    """

    hasher = hashlib.blake2b()
    with open(filepath, "rb") as infile:
        # read in fixed-size chunks so arbitrarily large files never have to
        # fit in memory all at once; iter() stops at the b"" sentinel (EOF)
        for chunk in iter(lambda: infile.read(8192), b""):
            hasher.update(chunk)
    return hasher.hexdigest()


def compare_directories(dir_1, dir_2):
    """Compares two directories to ensure all files are present in both with the same hashes

    Args:
        dir_1: first directory to compare
        dir_2: second directory to compare

    Returns:
        list: a list of files with different hashes between the two directories"""

    # only top-level files are hashed; warn about any subfolders so the user
    # knows to run this function on them separately
    subfolder_warning = ("The following subfolders were found in the {} directory. Sub-folder "
                         "contents will not be compared for accuracy, if you want to ensure "
                         "successful copying please run this function on those subdirectories. "
                         "{}")

    for position, directory in (("first", dir_1), ("second", dir_2)):
        subfolders = io_utils.list_folders(directory)
        if subfolders:
            warnings.warn(subfolder_warning.format(position, subfolders))

    files_1 = io_utils.list_files(dir_1)
    files_2 = io_utils.list_files(dir_2)

    # raises if either directory contains files the other does not
    misc_utils.verify_same_elements(directory_1=files_1, directory_2=files_2)

    # a file is "bad" when its contents hash differently in the two directories
    bad_files = [
        name for name in files_1
        if get_hash(os.path.join(dir_1, name)) != get_hash(os.path.join(dir_2, name))
    ]
    for name in bad_files:
        print("Found a file with differing hashes: {}".format(name))

    return bad_files
61 changes: 61 additions & 0 deletions creed/file_hash_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import os
import shutil
import tempfile
import pytest

import skimage.io as io
import numpy as np

from creed import file_hash


def test_get_hash():
    """Identical files must hash identically; different files must not."""
    with tempfile.TemporaryDirectory() as temp_dir:

        # write two files with distinct, deterministic contents. get_hash reads
        # raw bytes regardless of file type, so there is no need to round-trip
        # random arrays through an image encoder (skimage float-TIFF saving is
        # warn-prone and the randomness added nothing to the test)
        for img in range(2):
            temp_file_path = os.path.join(temp_dir, 'test_file_{}.tiff'.format(img))
            with open(temp_file_path, 'wb') as temp_file:
                temp_file.write(bytes([img]) * 1024)

        # a byte-for-byte copy must produce the same hash as its source
        shutil.copy(os.path.join(temp_dir, 'test_file_0.tiff'),
                    os.path.join(temp_dir, 'test_file_0_copy.tiff'))

        hash1 = file_hash.get_hash(os.path.join(temp_dir, 'test_file_0.tiff'))
        hash1_copy = file_hash.get_hash(os.path.join(temp_dir, 'test_file_0_copy.tiff'))
        hash2 = file_hash.get_hash(os.path.join(temp_dir, 'test_file_1.tiff'))

        # different contents -> different hashes; identical contents -> same hash
        assert hash1 != hash2
        assert hash1 == hash1_copy


def test_compare_directories():
    """compare_directories should return mismatched files and warn on subfolders."""
    with tempfile.TemporaryDirectory() as top_level_dir:
        dir_1 = os.path.join(top_level_dir, 'dir_1')
        os.makedirs(dir_1)

        # make fake data for testing; deterministic bytes keep hashes stable
        for img in range(5):
            temp_file_path = os.path.join(dir_1, 'test_file_{}.tiff'.format(img))
            with open(temp_file_path, 'wb') as temp_file:
                temp_file.write(bytes([img]) * 1024)

        # copy same data into second directory
        dir_2 = os.path.join(top_level_dir, 'dir_2')
        shutil.copytree(dir_1, dir_2)

        # identical directories must report no mismatched files
        assert file_hash.compare_directories(dir_1, dir_2) == []

        # corrupt one file in the second directory and check it gets reported
        bad_file = 'test_file_3.tiff'
        with open(os.path.join(dir_2, bad_file), 'wb') as temp_file:
            temp_file.write(b'corrupted contents')
        assert file_hash.compare_directories(dir_1, dir_2) == [bad_file]

        # restore the corrupted file so the warning checks below run on clean data
        shutil.copy(os.path.join(dir_1, bad_file), os.path.join(dir_2, bad_file))

        # check that warning is raised when sub-folder is present in first directory
        sub_folder_1 = os.path.join(dir_1, 'sub_folder')
        os.makedirs(sub_folder_1)

        with pytest.warns(UserWarning, match='first directory'):
            file_hash.compare_directories(dir_1, dir_2)

        # check that warning is raised when sub-folder is present in second directory
        shutil.rmtree(sub_folder_1)
        sub_folder_2 = os.path.join(dir_2, 'sub_folder')
        os.makedirs(sub_folder_2)

        with pytest.warns(UserWarning, match='second directory'):
            file_hash.compare_directories(dir_1, dir_2)