-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Automatic checking of hashes for transferred files #21
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# adapted from https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file | ||
import hashlib | ||
import os | ||
from ark.utils import io_utils, misc_utils | ||
|
||
import warnings | ||
|
||
|
||
def get_hash(filepath, chunk_size=8192):
    """Computes the hash of the specified file to verify file integrity

    The file is opened in binary mode and fed to a BLAKE2b hasher one chunk at a
    time, so the whole file is never held in memory at once.

    Args:
        filepath (str | PathLike): full path to file
        chunk_size (int): number of bytes to read per iteration. The value only affects
            how the file is read, not the resulting hash. Defaults to 8192.

    Returns:
        string: the hexadecimal BLAKE2b digest of the file's contents

    Raises:
        ValueError: if chunk_size is not positive
    """

    # read(0) returns b"" immediately (every file would hash as empty) and a negative
    # size reads the entire file in one call, so both are rejected up front
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer, got {}".format(chunk_size))

    hasher = hashlib.blake2b()
    with open(filepath, "rb") as f:
        # an empty bytes object signals end-of-file and ends the loop
        while chunk := f.read(chunk_size):
            hasher.update(chunk)

    return hasher.hexdigest()
|
||
|
||
def compare_directories(dir_1, dir_2):
    """Checks that two directories hold the same files with matching hashes

    Sub-folders found in either directory trigger a warning and their contents are
    not compared. The files listed for the first directory are then hashed in both
    locations and the names of those that differ are collected.

    Review context (PR #21): the intended use is comparing directories on the CAC with
    their copies on an external drive, to confirm that copying a couple of TBs of data
    went smoothly.

    Args:
        dir_1: first directory to compare
        dir_2: second directory to compare

    Returns:
        list: the files whose hashes differ between the two directories"""

    folders_in_first = io_utils.list_folders(dir_1)
    folders_in_second = io_utils.list_folders(dir_2)

    if len(folders_in_first) > 0:
        first_message = ("The following subfolders were found in the first directory. Sub-folder "
                         "contents will not be compared for accuracy, if you want to ensure "
                         "successful copying please run this function on those subdirectories. "
                         "{}").format(folders_in_first)
        warnings.warn(first_message)

    if len(folders_in_second) > 0:
        second_message = ("The following subfolders were found in the second directory. Sub-folder "
                          "contents will not be compared for accuracy, if you want to ensure "
                          "successful copying please run this function on those subdirectories. "
                          "{}").format(folders_in_second)
        warnings.warn(second_message)

    files_in_first = io_utils.list_files(dir_1)
    files_in_second = io_utils.list_files(dir_2)

    # presumably raises when the two file listings differ -- confirm against misc_utils
    # NOTE(review): in the PR discussion the default error message (presumably the one
    # produced by this check) was judged okay, but its wording was still considered a
    # little confusing; rewording was deferred to a separate issue / future PR.
    misc_utils.verify_same_elements(directory_1=files_in_first, directory_2=files_in_second)

    mismatched = []
    for name in files_in_first:
        digest_first = get_hash(os.path.join(dir_1, name))
        digest_second = get_hash(os.path.join(dir_2, name))

        if digest_first == digest_second:
            continue

        print("Found a file with differing hashes: {}".format(name))
        mismatched.append(name)

    return mismatched
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import os | ||
import shutil | ||
import tempfile | ||
import pytest | ||
|
||
import skimage.io as io | ||
import numpy as np | ||
|
||
from creed import file_hash | ||
|
||
|
||
def test_get_hash():
    """Checks that get_hash matches for a copied file and differs for distinct data"""

    with tempfile.TemporaryDirectory() as temp_dir:
        first_path = os.path.join(temp_dir, 'test_file_0.tiff')
        second_path = os.path.join(temp_dir, 'test_file_1.tiff')
        copy_path = os.path.join(temp_dir, 'test_file_0_copy.tiff')

        # write two independently generated random 6x6 images
        for out_path in (first_path, second_path):
            io.imsave(out_path, np.random.rand(36).reshape((6, 6)))

        # duplicate the first image so its hash can be checked against the original
        shutil.copy(first_path, copy_path)

        digest_first = file_hash.get_hash(first_path)
        digest_copy = file_hash.get_hash(copy_path)
        digest_second = file_hash.get_hash(second_path)

        assert digest_first != digest_second
        assert digest_first == digest_copy
|
||
|
||
def test_compare_directories():
    """Checks compare_directories on matching, corrupted, and sub-folder-containing inputs"""
    with tempfile.TemporaryDirectory() as top_level_dir:
        dir_1 = os.path.join(top_level_dir, 'dir_1')
        os.makedirs(dir_1)

        # make fake data for testing
        for img in range(5):
            array = np.random.rand(36).reshape((6, 6))
            temp_file_path = os.path.join(dir_1, 'test_file_{}.tiff'.format(img))
            io.imsave(temp_file_path, array)

        # copy same data into second directory
        dir_2 = os.path.join(top_level_dir, 'dir_2')
        shutil.copytree(dir_1, dir_2)

        # identical directories must not report any mismatched files
        assert file_hash.compare_directories(dir_1, dir_2) == []

        # check that a file whose contents changed after copying is reported by name
        corrupted_name = 'test_file_0.tiff'
        corrupted_path = os.path.join(dir_2, corrupted_name)
        with open(corrupted_path, 'ab') as f:
            f.write(b'corrupted')

        assert file_hash.compare_directories(dir_1, dir_2) == [corrupted_name]

        # restore the file so the remaining checks run on matching directories
        shutil.copy(os.path.join(dir_1, corrupted_name), corrupted_path)

        # check that warning is raised when sub-folder is present in first directory
        sub_folder_1 = os.path.join(dir_1, 'sub_folder')
        os.makedirs(sub_folder_1)

        with pytest.warns(UserWarning, match='first directory'):
            file_hash.compare_directories(dir_1, dir_2)

        # check that warning is raised when sub-folder is present in second directory
        shutil.rmtree(sub_folder_1)
        sub_folder_2 = os.path.join(dir_2, 'sub_folder')
        os.makedirs(sub_folder_2)

        with pytest.warns(UserWarning, match='second directory'):
            file_hash.compare_directories(dir_1, dir_2)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh yeah baby, python 3.8 walrus operator
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🔥
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks StackOverflow!