diff --git a/creed/file_hash.py b/creed/file_hash.py
new file mode 100644
--- /dev/null
+++ b/creed/file_hash.py
@@ -0,0 +1,78 @@
+# adapted from https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
+import hashlib
+import os
+import warnings
+
+from ark.utils import io_utils, misc_utils
+
+
+def get_hash(filepath):
+    """Computes the hash of the specified file to verify file integrity
+
+    The file is read in fixed-size chunks so large files are never held in memory at once.
+
+    Args:
+        filepath (str | PathLike): full path to file
+
+    Returns:
+        string: the hexadecimal BLAKE2b digest of the file contents"""
+
+    file_hash = hashlib.blake2b()
+    with open(filepath, "rb") as f:
+        while chunk := f.read(8192):
+            file_hash.update(chunk)
+    return file_hash.hexdigest()
+
+
+def _warn_about_subfolders(folders, position):
+    """Warns that the sub-folders of a compared directory are not checked
+
+    Args:
+        folders (list): the sub-folders that were found
+        position (str): which directory they were found in, 'first' or 'second'"""
+
+    # stacklevel=3 points the warning at the caller of compare_directories
+    warnings.warn("The following subfolders were found in the {} directory. Sub-folder "
+                  "contents will not be compared for accuracy, if you want to ensure "
+                  "successful copying please run this function on those subdirectories. "
+                  "{}".format(position, folders), stacklevel=3)
+
+
+def compare_directories(dir_1, dir_2):
+    """Compares two directories to ensure all files are present in both with the same hashes
+
+    Only files directly inside each directory are hashed; a warning is issued for any
+    sub-folders that are found, since their contents are not compared.
+
+    Args:
+        dir_1: first directory to compare
+        dir_2: second directory to compare
+
+    Returns:
+        list: a list of files with different hashes between the two directories"""
+
+    dir_1_folders = io_utils.list_folders(dir_1)
+    dir_2_folders = io_utils.list_folders(dir_2)
+
+    if len(dir_1_folders) > 0:
+        _warn_about_subfolders(dir_1_folders, "first")
+
+    if len(dir_2_folders) > 0:
+        _warn_about_subfolders(dir_2_folders, "second")
+
+    dir_1_files = io_utils.list_files(dir_1)
+    dir_2_files = io_utils.list_files(dir_2)
+
+    # NOTE(review): presumably raises when the two file listings differ - confirm in ark
+    misc_utils.verify_same_elements(directory_1=dir_1_files, directory_2=dir_2_files)
+
+    bad_files = []
+    for file in dir_1_files:
+        hash1 = get_hash(os.path.join(dir_1, file))
+        hash2 = get_hash(os.path.join(dir_2, file))
+
+        if hash1 != hash2:
+            print("Found a file with differing hashes: {}".format(file))
+            bad_files.append(file)
+
+    return bad_files
diff --git a/creed/file_hash_test.py b/creed/file_hash_test.py
new file mode 100644
--- /dev/null
+++ b/creed/file_hash_test.py
@@ -0,0 +1,68 @@
+import os
+import shutil
+import tempfile
+
+import numpy as np
+import pytest
+import skimage.io as io
+
+from creed import file_hash
+
+
+def test_get_hash():
+    with tempfile.TemporaryDirectory() as temp_dir:
+        for img in range(2):
+            array = np.random.rand(36).reshape((6, 6))
+            temp_file_path = os.path.join(temp_dir, 'test_file_{}.tiff'.format(img))
+            io.imsave(temp_file_path, array)
+
+        shutil.copy(os.path.join(temp_dir, 'test_file_0.tiff'),
+                    os.path.join(temp_dir, 'test_file_0_copy.tiff'))
+
+        hash1 = file_hash.get_hash(os.path.join(temp_dir, 'test_file_0.tiff'))
+        hash1_copy = file_hash.get_hash(os.path.join(temp_dir, 'test_file_0_copy.tiff'))
+        hash2 = file_hash.get_hash(os.path.join(temp_dir, 'test_file_1.tiff'))
+
+        assert hash1 != hash2
+        assert hash1 == hash1_copy
+
+
+def test_compare_directories():
+    with tempfile.TemporaryDirectory() as top_level_dir:
+        dir_1 = os.path.join(top_level_dir, 'dir_1')
+        os.makedirs(dir_1)
+
+        # make fake data for testing
+        for img in range(5):
+            array = np.random.rand(36).reshape((6, 6))
+            temp_file_path = os.path.join(dir_1, 'test_file_{}.tiff'.format(img))
+            io.imsave(temp_file_path, array)
+
+        # copy same data into second directory
+        dir_2 = os.path.join(top_level_dir, 'dir_2')
+        shutil.copytree(dir_1, dir_2)
+
+        # identical directories should produce no mismatched files
+        assert file_hash.compare_directories(dir_1, dir_2) == []
+
+        # check that warning is raised when sub-folder is present in first directory
+        sub_folder_1 = os.path.join(dir_1, 'sub_folder')
+        os.makedirs(sub_folder_1)
+
+        with pytest.warns(UserWarning, match='first directory'):
+            file_hash.compare_directories(dir_1, dir_2)
+
+        # check that warning is raised when sub-folder is present in second directory
+        shutil.rmtree(sub_folder_1)
+        sub_folder_2 = os.path.join(dir_2, 'sub_folder')
+        os.makedirs(sub_folder_2)
+
+        with pytest.warns(UserWarning, match='second directory'):
+            file_hash.compare_directories(dir_1, dir_2)
+
+        # check that a file whose contents were altered is reported
+        shutil.rmtree(sub_folder_2)
+        with open(os.path.join(dir_2, 'test_file_0.tiff'), 'ab') as f:
+            f.write(b'corrupted')
+
+        assert file_hash.compare_directories(dir_1, dir_2) == ['test_file_0.tiff']