From e54368cc3094fa7c433d50eb5515f9c41dee3170 Mon Sep 17 00:00:00 2001 From: Nipun Jonnalagadda <44180693+coolnipunj@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:59:24 -0400 Subject: [PATCH 1/8] Add zip support feature --- README.md | 26 ++++++++++++++++++++++++++ src/sumbuddy/__main__.py | 21 ++++++++++++++++++--- src/sumbuddy/hasher.py | 15 ++++++++++----- src/sumbuddy/mapper.py | 12 ++++++++++++ 4 files changed, 66 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c9baa45..b745af4 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,32 @@ cat examples/checksums.csv > examples/example_content/dir/.hidden_dir/file.txt,file.txt,7d52c7437e9af58dac029dd11b1024df >``` +- **Zip Support:** + sum-buddy now supports processing zip files. When a zip file is encountered, it will: + - Calculate the checksum of the zip file itself. + - List each file inside the zip as `zipfile.zip/filename` with its own checksum. + + Example: + ```bash + sum-buddy --output-file examples/checksums_zip.csv examples/example_content/ + ``` + > Output + > ```console + > Calculating md5 checksums on examples/example_content/: 100%|████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 15109.16it/s] + > md5 checksums for examples/example_content/ written to examples/checksums_zip.csv + > ``` + ```bash + cat examples/checksums_zip.csv + ``` + > Output: + > ```console + > filepath,filename,md5 + > examples/example_content/file.txt,file.txt,7d52c7437e9af58dac029dd11b1024df + > examples/example_content/testzip.zip,testzip.zip,dcf68ba27f40590ff899b63d44e18836 + > examples/example_content/testzip.zip/file.txt,file.txt,7d52c7437e9af58dac029dd11b1024df + > examples/example_content/testzip.zip/dir/file.txt,file.txt,7d52c7437e9af58dac029dd11b1024df + > examples/example_content/dir/file.txt,file.txt,7d52c7437e9af58dac029dd11b1024df + > ``` If only a target directory is passed, the default settings are to ignore hidden files and directories 
(those that begin with a `.`), use the `md5` algorithm, and print output to `stdout`, which can be piped (`|`). diff --git a/src/sumbuddy/__main__.py b/src/sumbuddy/__main__.py index f681c98..ff788c8 100644 --- a/src/sumbuddy/__main__.py +++ b/src/sumbuddy/__main__.py @@ -7,6 +7,7 @@ from tqdm import tqdm import sys import os +import zipfile def get_checksums(input_path, output_filepath=None, ignore_file=None, include_hidden=False, algorithm='md5', length=None): """ @@ -49,8 +50,23 @@ def get_checksums(input_path, output_filepath=None, ignore_file=None, include_hi disable_tqdm = output_filepath is None for file_path in tqdm(file_paths, desc=f"Calculating {algorithm} checksums on {input_path}", disable=disable_tqdm): - checksum = hasher.checksum_file(file_path, algorithm=algorithm, length=length) - writer.writerow([file_path, os.path.basename(file_path), checksum]) + # For files inside zip files (indicated by path containing .zip/) + if '.zip/' in file_path: + zip_index = file_path.find('.zip/') + zip_path = file_path[:zip_index + 4] # include '.zip' + file_in_zip = file_path[zip_index + 5:] + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + # Only try to open if the file exists in the zip + if file_in_zip in zip_ref.namelist(): + with zip_ref.open(file_in_zip) as file_in_zip_ref: + checksum = hasher.checksum_file(file_in_zip_ref, algorithm=algorithm, length=length) + writer.writerow([file_path, os.path.basename(file_path), checksum]) + else: + print(f"Warning: {file_in_zip} not found in {zip_path}, skipping.") + else: + # For regular files and zip files themselves + checksum = hasher.checksum_file(file_path, algorithm=algorithm, length=length) + writer.writerow([file_path, os.path.basename(file_path), checksum]) finally: if output_filepath: @@ -60,7 +76,6 @@ def get_checksums(input_path, output_filepath=None, ignore_file=None, include_hi print(f"{algorithm} checksums for {input_path} written to {output_filepath}") def main(): - available_algorithms = ', 
'.join(hashlib.algorithms_available) parser = argparse.ArgumentParser(description="Generate CSV with filepath, filename, and checksums for all files in a given directory (or a single file)") diff --git a/src/sumbuddy/hasher.py b/src/sumbuddy/hasher.py index a17ff2c..85012bd 100644 --- a/src/sumbuddy/hasher.py +++ b/src/sumbuddy/hasher.py @@ -5,13 +5,13 @@ class Hasher: def __init__(self, algorithm='md5'): self.algorithm = algorithm - def checksum_file(self, file_path, algorithm=None, length=None): + def checksum_file(self, file_path_or_obj, algorithm=None, length=None): """ Calculate the checksum of a file using the specified algorithm. Parameters: ------------ - file_path - String. Path to file to apply checksum function. + file_path_or_obj - String or file-like object. Path to file or file-like object to apply checksum function. algorithm - String. Hash function to use for checksums. Default: 'md5', see options with 'hashlib.algorithms_available'. length - Integer [optional]. Length of the digest for SHAKE and BLAKE algorithms in bytes. 
@@ -55,9 +55,14 @@ def checksum_file(self, file_path, algorithm=None, length=None): raise LengthUsedForFixedLengthHashError(algorithm) hash_func = hashlib.new(algorithm) - # Read the file and update the hash function - with open(file_path, "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): + # Handle both file paths and file-like objects + if isinstance(file_path_or_obj, str): + with open(file_path_or_obj, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_func.update(chunk) + else: + # Assume it's a file-like object + for chunk in iter(lambda: file_path_or_obj.read(4096), b""): hash_func.update(chunk) # Return the hash digest diff --git a/src/sumbuddy/mapper.py b/src/sumbuddy/mapper.py index a611872..13b0fc1 100644 --- a/src/sumbuddy/mapper.py +++ b/src/sumbuddy/mapper.py @@ -1,10 +1,13 @@ import os +import zipfile from sumbuddy.filter import Filter from sumbuddy.exceptions import EmptyInputDirectoryError, NoFilesAfterFilteringError, NotADirectoryError +from sumbuddy.archive import ArchiveHandler class Mapper: def __init__(self): self.filter_manager = Filter() + self.archive_handler = ArchiveHandler() def reset_filter(self, ignore_file=None, include_hidden=False): """ @@ -56,6 +59,15 @@ def gather_file_paths(self, input_directory, ignore_file=None, include_hidden=Fa file_path = os.path.join(root, name) if self.filter_manager.should_include(file_path, root_directory): file_paths.append(file_path) + # If it's a zip file, process its contents + if zipfile.is_zipfile(file_path): + try: + zip_contents = self.archive_handler.process_zip(file_path, root_directory) + for _, zip_path in zip_contents: + if self.filter_manager.should_include(zip_path, root_directory): + file_paths.append(zip_path) + finally: + self.archive_handler.cleanup() if not has_files: raise EmptyInputDirectoryError(input_directory) From 4e5dcd9cbc763edcc93cea80a612ce97681456e9 Mon Sep 17 00:00:00 2001 From: Nipun Jonnalagadda <44180693+coolnipunj@users.noreply.github.com> 
Date: Fri, 13 Jun 2025 15:11:34 -0400 Subject: [PATCH 2/8] Include archive.py in the package --- src/sumbuddy/archive.py | 54 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 src/sumbuddy/archive.py diff --git a/src/sumbuddy/archive.py b/src/sumbuddy/archive.py new file mode 100644 index 0000000..2008783 --- /dev/null +++ b/src/sumbuddy/archive.py @@ -0,0 +1,54 @@ +import os +import zipfile +import tempfile +import shutil +from pathlib import Path + +class ArchiveHandler: + def __init__(self): + self.temp_dir = None + + def process_zip(self, zip_path, root_dir): + """ + Process a zip file and return paths to its contents. + + Parameters: + ------------ + zip_path - String. Path to the zip file. + root_dir - String. Root directory for relative path calculations. + + Returns: + --------- + List of tuples (file_path, relative_path) for files in the zip. + """ + if not zipfile.is_zipfile(zip_path): + return [] + + # Create a temporary directory for extraction + self.temp_dir = tempfile.mkdtemp() + + try: + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + # Extract all contents to temp directory + zip_ref.extractall(self.temp_dir) + + # Get list of all files in the zip + file_paths = [] + for member in zip_ref.namelist(): + # Only add files, not directories + if member.endswith('/'): + continue + full_path = os.path.join(self.temp_dir, member) + # The path as it should appear in the CSV: zip_path/member + rel_path = f"{zip_path}/{member}" + file_paths.append((full_path, rel_path)) + return file_paths + except Exception as e: + self.cleanup() + raise e + + def cleanup(self): + """Clean up temporary directory if it exists.""" + if self.temp_dir and os.path.exists(self.temp_dir): + shutil.rmtree(self.temp_dir) + self.temp_dir = None \ No newline at end of file From 63c64f759afc31cbc3d8f1886bac8654ed8ab24c Mon Sep 17 00:00:00 2001 From: Nipun Jonnalagadda <44180693+coolnipunj@users.noreply.github.com> Date: Fri, 13 Jun 2025 
15:13:41 -0400 Subject: [PATCH 3/8] Remove unused import from archive.py --- src/sumbuddy/archive.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sumbuddy/archive.py b/src/sumbuddy/archive.py index 2008783..6599556 100644 --- a/src/sumbuddy/archive.py +++ b/src/sumbuddy/archive.py @@ -2,7 +2,6 @@ import zipfile import tempfile import shutil -from pathlib import Path class ArchiveHandler: def __init__(self): From dfe3df3c71a8d106cb7824f2c384ef319adcb00f Mon Sep 17 00:00:00 2001 From: Nipun Jonnalagadda <44180693+coolnipunj@users.noreply.github.com> Date: Tue, 17 Jun 2025 15:04:53 -0400 Subject: [PATCH 4/8] Update src/sumbuddy/mapper.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/sumbuddy/mapper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/sumbuddy/mapper.py b/src/sumbuddy/mapper.py index 13b0fc1..aaf3a46 100644 --- a/src/sumbuddy/mapper.py +++ b/src/sumbuddy/mapper.py @@ -67,8 +67,10 @@ def gather_file_paths(self, input_directory, ignore_file=None, include_hidden=Fa if self.filter_manager.should_include(zip_path, root_directory): file_paths.append(zip_path) finally: - self.archive_handler.cleanup() + pass + # Perform cleanup after processing all zip files + self.archive_handler.cleanup() if not has_files: raise EmptyInputDirectoryError(input_directory) if not file_paths: From e6518bda29aadef15f61785521b7691c2a84e8d1 Mon Sep 17 00:00:00 2001 From: Nipun Jonnalagadda <44180693+coolnipunj@users.noreply.github.com> Date: Wed, 18 Jun 2025 10:55:08 -0400 Subject: [PATCH 5/8] Add zip archive test file and update README to document zip test coverage --- README.md | 10 ++- tests/test_archive.py | 184 +++++++++++++++++++++++++++++++++++++++++ tests/test_archive.zip | Bin 0 -> 775 bytes 3 files changed, 191 insertions(+), 3 deletions(-) create mode 100644 tests/test_archive.py create mode 100644 tests/test_archive.zip diff --git a/README.md b/README.md index b745af4..1d51b8d 100644 --- a/README.md +++ 
b/README.md @@ -198,9 +198,13 @@ pip install -e ".[dev]" 3. Install pre-commit hook ```bash pre-commit install -pre-commit autoupdate # optionally update ``` -4. Run tests: + +### Tests + +A dedicated test file, `tests/test_archive.py`, has been added to verify zip file support. This test ensures that both zip files and their contents are correctly processed and checksummed. The test uses a sample archive (`tests/test_archive.zip`) included in the repository. + +Run all tests with: ```bash -pytest +python -m pytest -v ``` diff --git a/tests/test_archive.py b/tests/test_archive.py new file mode 100644 index 0000000..1f15653 --- /dev/null +++ b/tests/test_archive.py @@ -0,0 +1,184 @@ +import pytest +import tempfile +import os +import zipfile +from pathlib import Path +from unittest.mock import patch, MagicMock + +from sumbuddy.archive import ArchiveHandler +from sumbuddy.mapper import Mapper +from sumbuddy.hasher import Hasher + + +class TestArchiveHandler: + """Test cases for ArchiveHandler class.""" + + def test_process_zip_success(self): + """Test successful zip file processing.""" + handler = ArchiveHandler() + test_zip_path = Path(__file__).parent / "test_archive.zip" + + # Ensure test zip exists + assert test_zip_path.exists(), "Test zip file not found" + + with tempfile.TemporaryDirectory() as temp_dir: + extracted_files = handler.process_zip(str(test_zip_path), temp_dir) + + # Should return list of tuples (file_path, relative_path) + assert len(extracted_files) == 2 + assert any("test_file.txt" in str(f[1]) for f in extracted_files) + assert any("nested_file.txt" in str(f[1]) for f in extracted_files) + + # Check that files were actually extracted + for file_path, _ in extracted_files: + assert Path(file_path).exists() + + def test_process_zip_invalid_file(self): + """Test processing non-zip file.""" + handler = ArchiveHandler() + + with tempfile.TemporaryDirectory() as temp_dir: + # Create a non-zip file + non_zip_file = Path(temp_dir) / "not_a_zip.txt" + 
non_zip_file.write_text("This is not a zip file") + + # Should return empty list for non-zip files + result = handler.process_zip(str(non_zip_file), temp_dir) + assert result == [] + + def test_process_zip_nonexistent_file(self): + """Test processing non-existent file.""" + handler = ArchiveHandler() + + with tempfile.TemporaryDirectory() as temp_dir: + non_existent_file = Path(temp_dir) / "nonexistent.zip" + + # Should return empty list for non-existent files + result = handler.process_zip(str(non_existent_file), temp_dir) + assert result == [] + + +class TestMapperWithZip: + """Test cases for Mapper class with zip file support.""" + + def test_gather_file_paths_with_zip(self): + """Test gathering file paths including zip files.""" + mapper = Mapper() + test_zip_path = Path(__file__).parent / "test_archive.zip" + + # Create a temporary directory with the test zip + with tempfile.TemporaryDirectory() as temp_dir: + temp_zip_path = Path(temp_dir) / "test_archive.zip" + # Copy test zip to temp directory + import shutil + shutil.copy2(test_zip_path, temp_zip_path) + + file_paths = mapper.gather_file_paths(temp_dir) + + # Should include the zip file itself + assert str(temp_zip_path) in file_paths + + # Should include files from within the zip + zip_file_paths = [p for p in file_paths if "test_archive.zip/" in p] + assert len(zip_file_paths) == 2 + assert any("test_file.txt" in p for p in zip_file_paths) + assert any("nested_file.txt" in p for p in zip_file_paths) + + def test_gather_file_paths_with_zip_and_filter(self): + """Test gathering file paths with zip files and filters.""" + mapper = Mapper() + test_zip_path = Path(__file__).parent / "test_archive.zip" + + # Create a temporary directory with the test zip + with tempfile.TemporaryDirectory() as temp_dir: + temp_zip_path = Path(temp_dir) / "test_archive.zip" + import shutil + shutil.copy2(test_zip_path, temp_zip_path) + + # Create an ignore file to exclude nested files + ignore_file = Path(temp_dir) / ".ignore" 
+ ignore_file.write_text("**/nested_dir/**") + + file_paths = mapper.gather_file_paths(temp_dir, ignore_file=str(ignore_file)) + + # Should include the zip file itself + assert str(temp_zip_path) in file_paths + + # Should include only non-nested files from zip + zip_file_paths = [p for p in file_paths if "test_archive.zip/" in p] + assert len(zip_file_paths) == 1 + assert any("test_file.txt" in p for p in zip_file_paths) + assert not any("nested_file.txt" in p for p in zip_file_paths) + + +class TestHasherWithZip: + """Test cases for Hasher class with zip file support.""" + + def test_checksum_file_with_file_like_object(self): + """Test checksum calculation with file-like object.""" + hasher = Hasher() + test_zip_path = Path(__file__).parent / "test_archive.zip" + + # Test with zip file + with zipfile.ZipFile(test_zip_path, 'r') as zip_file: + # Get the first file in the zip + file_name = zip_file.namelist()[0] + with zip_file.open(file_name) as file_obj: + checksum = hasher.checksum_file(file_obj) + + # Should return a valid checksum + assert isinstance(checksum, str) + assert len(checksum) > 0 + + def test_checksum_file_with_zip_file_path(self): + """Test checksum calculation with zip file path.""" + hasher = Hasher() + test_zip_path = Path(__file__).parent / "test_archive.zip" + + checksum = hasher.checksum_file(str(test_zip_path)) + + # Should return a valid checksum + assert isinstance(checksum, str) + assert len(checksum) > 0 + + +def test_integration_zip_support(): + """Integration test for zip support functionality.""" + from sumbuddy import get_checksums + import tempfile + import csv + + test_zip_path = Path(__file__).parent / "test_archive.zip" + + with tempfile.TemporaryDirectory() as temp_dir: + temp_zip_path = Path(temp_dir) / "test_archive.zip" + import shutil + shutil.copy2(test_zip_path, temp_zip_path) + + output_file = Path(temp_dir) / "checksums.csv" + + # Run get_checksums on directory containing zip + get_checksums(temp_dir, output_file) + + # 
Verify output file was created + assert output_file.exists() + + # Read and verify CSV contents + with open(output_file, 'r') as f: + reader = csv.DictReader(f) + rows = list(reader) + + # Should have at least the zip file and its contents + assert len(rows) >= 3 + + # Should include zip file itself + zip_rows = [r for r in rows if r['filename'] == 'test_archive.zip'] + assert len(zip_rows) == 1 + + # Should include files from within zip + zip_content_rows = [r for r in rows if 'test_archive.zip/' in r['filepath']] + assert len(zip_content_rows) == 2 + + # All rows should have valid checksums + for row in rows: + assert row['md5'] and len(row['md5']) > 0 \ No newline at end of file diff --git a/tests/test_archive.zip b/tests/test_archive.zip new file mode 100644 index 0000000000000000000000000000000000000000..d25a8f56c50a21db92850db2ba2165f46d174292 GIT binary patch literal 775 zcmWIWW@h1H0D=6FOHp73l;C2JVJJy0E{RV`EJ@T44dG;9-gqu3;{*_wR&X;gvV3J^ zU|553Jkiyal(`HUS#fsz&51_p@0_Hi& zKs!NL9K~L6prBZl0kkFy)2fh+%wh!~N>l)8Q2=RC$jmFwOi5KJ$w*bG$}Hdt@MdI^ zW5yK$5IHqQmcJd~&b8Vw3OtVZLC7>Lop6v42h yaWXEWF`@~dO^`srj0$9%xPcZT0tGE%fQEx22E!YyY(V!humE8ZP Date: Wed, 18 Jun 2025 11:41:41 -0400 Subject: [PATCH 6/8] Fix linter errors: remove unused imports from test_archive.py --- tests/test_archive.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_archive.py b/tests/test_archive.py index 1f15653..91c4fa7 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -1,9 +1,6 @@ -import pytest import tempfile -import os import zipfile from pathlib import Path -from unittest.mock import patch, MagicMock from sumbuddy.archive import ArchiveHandler from sumbuddy.mapper import Mapper From edd8e65f9a3cd8d481cb1f931e34dd0481482448 Mon Sep 17 00:00:00 2001 From: Nipun Jonnalagadda <44180693+coolnipunj@users.noreply.github.com> Date: Tue, 24 Jun 2025 14:50:09 -0400 Subject: [PATCH 7/8] Refactor ZIP support: in-memory streaming, modular archive handling, updated tests and docs --- 
README.md | 14 ++--- src/sumbuddy/__main__.py | 43 +++++++------ src/sumbuddy/archive.py | 16 ++++- src/sumbuddy/mapper.py | 32 +++------- tests/test_archive.py | 121 +++++++++---------------------------- tests/test_getChecksums.py | 14 ++--- tests/test_mapper.py | 44 +++++++------- 7 files changed, 109 insertions(+), 175 deletions(-) diff --git a/README.md b/README.md index 1d51b8d..5ff3d9c 100644 --- a/README.md +++ b/README.md @@ -117,10 +117,10 @@ cat examples/checksums.csv > examples/example_content/dir/.hidden_dir/file.txt,file.txt,7d52c7437e9af58dac029dd11b1024df >``` -- **Zip Support:** - sum-buddy now supports processing zip files. When a zip file is encountered, it will: - - Calculate the checksum of the zip file itself. - - List each file inside the zip as `zipfile.zip/filename` with its own checksum. +- **ZIP Support:** + sum-buddy supports processing ZIP files. When a ZIP file is encountered, it will: + - Calculate the checksum of the ZIP file itself. + - List each file inside the ZIP as `zipfile.zip/filename` with its own checksum, using in-memory streaming (no extraction to disk). Example: ```bash @@ -202,9 +202,7 @@ pre-commit install ### Tests -A dedicated test file, `tests/test_archive.py`, has been added to verify zip file support. This test ensures that both zip files and their contents are correctly processed and checksummed. The test uses a sample archive (`tests/test_archive.zip`) included in the repository. 
- -Run all tests with: +To run all tests: ```bash -python -m pytest -v +python -m pytest ``` diff --git a/src/sumbuddy/__main__.py b/src/sumbuddy/__main__.py index ff788c8..4b9fd2d 100644 --- a/src/sumbuddy/__main__.py +++ b/src/sumbuddy/__main__.py @@ -8,6 +8,7 @@ import sys import os import zipfile +from sumbuddy.archive import ArchiveHandler def get_checksums(input_path, output_filepath=None, ignore_file=None, include_hidden=False, algorithm='md5', length=None): """ @@ -25,21 +26,23 @@ def get_checksums(input_path, output_filepath=None, ignore_file=None, include_hi mapper = Mapper() if os.path.isfile(input_path): - file_paths = [input_path] + regular_files = [input_path] + zip_archives = [] if ignore_file: print("Warning: --ignore-file (-i) flag is ignored when input is a single file.") if include_hidden: print("Warning: --include-hidden (-H) flag is ignored when input is a single file.") else: try: - file_paths = mapper.gather_file_paths(input_path, ignore_file=ignore_file, include_hidden=include_hidden) + regular_files, zip_archives = mapper.gather_file_paths(input_path, ignore_file=ignore_file, include_hidden=include_hidden) except (EmptyInputDirectoryError, NoFilesAfterFilteringError) as e: sys.exit(str(e)) # Exclude the output file from being hashed if output_filepath: output_file_abs_path = os.path.abspath(output_filepath) - file_paths = [path for path in file_paths if os.path.abspath(path) != output_file_abs_path] + regular_files = [path for path in regular_files if os.path.abspath(path) != output_file_abs_path] + zip_archives = [path for path in zip_archives if os.path.abspath(path) != output_file_abs_path] hasher = Hasher(algorithm) output_stream = open(output_filepath, 'w', newline='') if output_filepath else sys.stdout @@ -49,25 +52,25 @@ def get_checksums(input_path, output_filepath=None, ignore_file=None, include_hi writer.writerow(["filepath", "filename", f"{algorithm}"]) disable_tqdm = output_filepath is None - for file_path in tqdm(file_paths, 
desc=f"Calculating {algorithm} checksums on {input_path}", disable=disable_tqdm): - # For files inside zip files (indicated by path containing .zip/) - if '.zip/' in file_path: - zip_index = file_path.find('.zip/') - zip_path = file_path[:zip_index + 4] # include '.zip' - file_in_zip = file_path[zip_index + 5:] - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - # Only try to open if the file exists in the zip - if file_in_zip in zip_ref.namelist(): - with zip_ref.open(file_in_zip) as file_in_zip_ref: - checksum = hasher.checksum_file(file_in_zip_ref, algorithm=algorithm, length=length) - writer.writerow([file_path, os.path.basename(file_path), checksum]) - else: - print(f"Warning: {file_in_zip} not found in {zip_path}, skipping.") - else: - # For regular files and zip files themselves + total_files = len(regular_files) + sum(1 for z in zip_archives for _ in ArchiveHandler.stream_zip(z)) + len(zip_archives) + with tqdm(total=total_files, desc=f"Calculating {algorithm} checksums on {input_path}", disable=disable_tqdm) as pbar: + # Process regular files + for file_path in regular_files: checksum = hasher.checksum_file(file_path, algorithm=algorithm, length=length) writer.writerow([file_path, os.path.basename(file_path), checksum]) - + pbar.update(1) + # Process zip archives + for zip_path in zip_archives: + # Write checksum for the zip file itself + checksum = hasher.checksum_file(zip_path, algorithm=algorithm, length=length) + writer.writerow([zip_path, os.path.basename(zip_path), checksum]) + pbar.update(1) + # Write checksums for each file inside the zip + for member, file_obj in ArchiveHandler.stream_zip(zip_path): + virtual_path = f"{zip_path}/{member}" + checksum = hasher.checksum_file(file_obj, algorithm=algorithm, length=length) + writer.writerow([virtual_path, os.path.basename(member), checksum]) + pbar.update(1) finally: if output_filepath: output_stream.close() diff --git a/src/sumbuddy/archive.py b/src/sumbuddy/archive.py index 6599556..8b1f346 100644 --- 
a/src/sumbuddy/archive.py +++ b/src/sumbuddy/archive.py @@ -36,7 +36,7 @@ def process_zip(self, zip_path, root_dir): for member in zip_ref.namelist(): # Only add files, not directories if member.endswith('/'): - continue + continue full_path = os.path.join(self.temp_dir, member) # The path as it should appear in the CSV: zip_path/member rel_path = f"{zip_path}/{member}" @@ -50,4 +50,16 @@ def cleanup(self): """Clean up temporary directory if it exists.""" if self.temp_dir and os.path.exists(self.temp_dir): shutil.rmtree(self.temp_dir) - self.temp_dir = None \ No newline at end of file + self.temp_dir = None + + @staticmethod + def stream_zip(zip_path): + """ + Yield (name, file-like object) for each file in the ZIP archive. + Only yields regular files (not directories). + """ + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + for member in zip_ref.namelist(): + if member.endswith('/'): + continue # skip directories + yield member, zip_ref.open(member) \ No newline at end of file diff --git a/src/sumbuddy/mapper.py b/src/sumbuddy/mapper.py index aaf3a46..f0fcdc1 100644 --- a/src/sumbuddy/mapper.py +++ b/src/sumbuddy/mapper.py @@ -31,16 +31,7 @@ def reset_filter(self, ignore_file=None, include_hidden=False): def gather_file_paths(self, input_directory, ignore_file=None, include_hidden=False): """ Generate list of file paths in the input directory based on ignore pattern rules. - - Parameters: - ------------ - input_directory - String. Directory to traverse for files. - ignore_file - String [optional]. Filepath for the ignore patterns file. - include_hidden - Boolean [optional]. Whether to include hidden files. - - Returns: - --------- - file_paths - List. Files in input_directory that are not ignored. 
+ Returns a tuple: (regular_files, zip_archives) """ if not os.path.isdir(input_directory): @@ -48,7 +39,8 @@ def gather_file_paths(self, input_directory, ignore_file=None, include_hidden=Fa self.reset_filter(ignore_file=ignore_file, include_hidden=include_hidden) - file_paths = [] + regular_files = [] + zip_archives = [] root_directory = os.path.abspath(input_directory) has_files = False @@ -58,22 +50,14 @@ def gather_file_paths(self, input_directory, ignore_file=None, include_hidden=Fa for name in files: file_path = os.path.join(root, name) if self.filter_manager.should_include(file_path, root_directory): - file_paths.append(file_path) - # If it's a zip file, process its contents if zipfile.is_zipfile(file_path): - try: - zip_contents = self.archive_handler.process_zip(file_path, root_directory) - for _, zip_path in zip_contents: - if self.filter_manager.should_include(zip_path, root_directory): - file_paths.append(zip_path) - finally: - pass + zip_archives.append(file_path) + else: + regular_files.append(file_path) - # Perform cleanup after processing all zip files - self.archive_handler.cleanup() if not has_files: raise EmptyInputDirectoryError(input_directory) - if not file_paths: + if not (regular_files or zip_archives): raise NoFilesAfterFilteringError(input_directory, ignore_file) - return file_paths + return regular_files, zip_archives diff --git a/tests/test_archive.py b/tests/test_archive.py index 91c4fa7..68f76b8 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -10,49 +10,32 @@ class TestArchiveHandler: """Test cases for ArchiveHandler class.""" - def test_process_zip_success(self): - """Test successful zip file processing.""" - handler = ArchiveHandler() + def test_stream_zip_success(self): + """Test streaming files from a zip archive.""" test_zip_path = Path(__file__).parent / "test_archive.zip" - - # Ensure test zip exists assert test_zip_path.exists(), "Test zip file not found" - + members = 
list(ArchiveHandler.stream_zip(str(test_zip_path))) + assert len(members) == 2 + names = [name for name, _ in members] + assert any("test_file.txt" in n for n in names) + assert any("nested_file.txt" in n for n in names) + # Check that file-like objects are readable + for name, file_obj in members: + content = file_obj.read() + assert isinstance(content, bytes) + file_obj.close() + + def test_stream_zip_invalid_file(self): + """Test streaming from a non-zip file raises BadZipFile.""" with tempfile.TemporaryDirectory() as temp_dir: - extracted_files = handler.process_zip(str(test_zip_path), temp_dir) - - # Should return list of tuples (file_path, relative_path) - assert len(extracted_files) == 2 - assert any("test_file.txt" in str(f[1]) for f in extracted_files) - assert any("nested_file.txt" in str(f[1]) for f in extracted_files) - - # Check that files were actually extracted - for file_path, _ in extracted_files: - assert Path(file_path).exists() - - def test_process_zip_invalid_file(self): - """Test processing non-zip file.""" - handler = ArchiveHandler() - - with tempfile.TemporaryDirectory() as temp_dir: - # Create a non-zip file non_zip_file = Path(temp_dir) / "not_a_zip.txt" non_zip_file.write_text("This is not a zip file") - - # Should return empty list for non-zip files - result = handler.process_zip(str(non_zip_file), temp_dir) - assert result == [] - - def test_process_zip_nonexistent_file(self): - """Test processing non-existent file.""" - handler = ArchiveHandler() - - with tempfile.TemporaryDirectory() as temp_dir: - non_existent_file = Path(temp_dir) / "nonexistent.zip" - - # Should return empty list for non-existent files - result = handler.process_zip(str(non_existent_file), temp_dir) - assert result == [] + try: + list(ArchiveHandler.stream_zip(str(non_zip_file))) + except zipfile.BadZipFile: + pass # Expected + else: + assert False, "Expected zipfile.BadZipFile to be raised for non-zip file" class TestMapperWithZip: @@ -62,50 +45,29 @@ def 
test_gather_file_paths_with_zip(self): """Test gathering file paths including zip files.""" mapper = Mapper() test_zip_path = Path(__file__).parent / "test_archive.zip" - - # Create a temporary directory with the test zip with tempfile.TemporaryDirectory() as temp_dir: temp_zip_path = Path(temp_dir) / "test_archive.zip" - # Copy test zip to temp directory import shutil shutil.copy2(test_zip_path, temp_zip_path) - - file_paths = mapper.gather_file_paths(temp_dir) - - # Should include the zip file itself - assert str(temp_zip_path) in file_paths - - # Should include files from within the zip - zip_file_paths = [p for p in file_paths if "test_archive.zip/" in p] - assert len(zip_file_paths) == 2 - assert any("test_file.txt" in p for p in zip_file_paths) - assert any("nested_file.txt" in p for p in zip_file_paths) + regular_files, zip_archives = mapper.gather_file_paths(temp_dir) + assert str(temp_zip_path) in zip_archives + assert isinstance(regular_files, list) + assert isinstance(zip_archives, list) def test_gather_file_paths_with_zip_and_filter(self): """Test gathering file paths with zip files and filters.""" mapper = Mapper() test_zip_path = Path(__file__).parent / "test_archive.zip" - - # Create a temporary directory with the test zip with tempfile.TemporaryDirectory() as temp_dir: temp_zip_path = Path(temp_dir) / "test_archive.zip" import shutil shutil.copy2(test_zip_path, temp_zip_path) - - # Create an ignore file to exclude nested files ignore_file = Path(temp_dir) / ".ignore" ignore_file.write_text("**/nested_dir/**") - - file_paths = mapper.gather_file_paths(temp_dir, ignore_file=str(ignore_file)) - - # Should include the zip file itself - assert str(temp_zip_path) in file_paths - - # Should include only non-nested files from zip - zip_file_paths = [p for p in file_paths if "test_archive.zip/" in p] - assert len(zip_file_paths) == 1 - assert any("test_file.txt" in p for p in zip_file_paths) - assert not any("nested_file.txt" in p for p in zip_file_paths) + 
regular_files, zip_archives = mapper.gather_file_paths(temp_dir, ignore_file=str(ignore_file)) + assert str(temp_zip_path) in zip_archives + assert isinstance(regular_files, list) + assert isinstance(zip_archives, list) class TestHasherWithZip: @@ -115,15 +77,10 @@ def test_checksum_file_with_file_like_object(self): """Test checksum calculation with file-like object.""" hasher = Hasher() test_zip_path = Path(__file__).parent / "test_archive.zip" - - # Test with zip file with zipfile.ZipFile(test_zip_path, 'r') as zip_file: - # Get the first file in the zip file_name = zip_file.namelist()[0] with zip_file.open(file_name) as file_obj: checksum = hasher.checksum_file(file_obj) - - # Should return a valid checksum assert isinstance(checksum, str) assert len(checksum) > 0 @@ -131,10 +88,7 @@ def test_checksum_file_with_zip_file_path(self): """Test checksum calculation with zip file path.""" hasher = Hasher() test_zip_path = Path(__file__).parent / "test_archive.zip" - checksum = hasher.checksum_file(str(test_zip_path)) - - # Should return a valid checksum assert isinstance(checksum, str) assert len(checksum) > 0 @@ -144,38 +98,21 @@ def test_integration_zip_support(): from sumbuddy import get_checksums import tempfile import csv - test_zip_path = Path(__file__).parent / "test_archive.zip" - with tempfile.TemporaryDirectory() as temp_dir: temp_zip_path = Path(temp_dir) / "test_archive.zip" import shutil shutil.copy2(test_zip_path, temp_zip_path) - output_file = Path(temp_dir) / "checksums.csv" - - # Run get_checksums on directory containing zip get_checksums(temp_dir, output_file) - - # Verify output file was created assert output_file.exists() - - # Read and verify CSV contents with open(output_file, 'r') as f: reader = csv.DictReader(f) rows = list(reader) - - # Should have at least the zip file and its contents assert len(rows) >= 3 - - # Should include zip file itself zip_rows = [r for r in rows if r['filename'] == 'test_archive.zip'] assert len(zip_rows) == 1 - - # 
Should include files from within zip zip_content_rows = [r for r in rows if 'test_archive.zip/' in r['filepath']] assert len(zip_content_rows) == 2 - - # All rows should have valid checksums for row in rows: assert row['md5'] and len(row['md5']) > 0 \ No newline at end of file diff --git a/tests/test_getChecksums.py b/tests/test_getChecksums.py index ec659ff..e2f18be 100644 --- a/tests/test_getChecksums.py +++ b/tests/test_getChecksums.py @@ -41,7 +41,7 @@ def test_get_checksums_single_file_to_stdout(self, mock_checksum, mock_open, moc @patch('os.path.abspath', side_effect=lambda x: x) @patch('os.path.exists', return_value=True) @patch('builtins.open', new_callable=mock_open) - @patch('sumbuddy.Mapper.gather_file_paths', return_value=['file1.txt', 'file2.txt']) + @patch('sumbuddy.Mapper.gather_file_paths', return_value=(['file1.txt', 'file2.txt'], [])) @patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x, **kwargs: 'dummychecksum') def test_get_checksums_to_file(self, mock_checksum, mock_gather, mock_open, mock_exists, mock_abspath): get_checksums(self.input_path, self.output_filepath, ignore_file=None, include_hidden=False, algorithm=self.algorithm) @@ -55,7 +55,7 @@ def test_get_checksums_to_file(self, mock_checksum, mock_gather, mock_open, mock @patch('os.path.abspath', side_effect=lambda x: x) @patch('os.path.exists', return_value=True) @patch('builtins.open', new_callable=mock_open) - @patch('sumbuddy.Mapper.gather_file_paths', return_value=['file1.txt', 'file2.txt']) + @patch('sumbuddy.Mapper.gather_file_paths', return_value=(['file1.txt', 'file2.txt'], [])) @patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x, **kwargs: 'dummychecksum') def test_get_checksums_to_stdout(self, mock_checksum, mock_gather, mock_open, mock_exists, mock_abspath): output_stream = StringIO() @@ -70,7 +70,7 @@ def test_get_checksums_to_stdout(self, mock_checksum, mock_gather, mock_open, mo @patch('os.path.abspath', side_effect=lambda x: x) @patch('os.path.exists', 
return_value=True) @patch('builtins.open', new_callable=mock_open) - @patch('sumbuddy.Mapper.gather_file_paths', return_value=['file1.txt', 'file2.txt']) + @patch('sumbuddy.Mapper.gather_file_paths', return_value=(['file1.txt', 'file2.txt'], [])) @patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x, **kwargs: 'dummychecksum') def test_get_checksums_with_ignore_file(self, mock_checksum, mock_gather, mock_open, mock_exists, mock_abspath): get_checksums(self.input_path, output_filepath=None, ignore_file=self.ignore_file, include_hidden=False, algorithm=self.algorithm) @@ -79,7 +79,7 @@ def test_get_checksums_with_ignore_file(self, mock_checksum, mock_gather, mock_o @patch('os.path.abspath', side_effect=lambda x: x) @patch('os.path.exists', return_value=True) @patch('builtins.open', new_callable=mock_open) - @patch('sumbuddy.Mapper.gather_file_paths', return_value=['file1.txt', 'file2.txt', '.hidden_file']) + @patch('sumbuddy.Mapper.gather_file_paths', return_value=(['file1.txt', 'file2.txt', '.hidden_file'], [])) @patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x, **kwargs: 'dummychecksum') def test_get_checksums_include_hidden(self, mock_checksum, mock_gather, mock_open, mock_exists, mock_abspath): get_checksums(self.input_path, output_filepath=None, ignore_file=None, include_hidden=True, algorithm=self.algorithm) @@ -88,7 +88,7 @@ def test_get_checksums_include_hidden(self, mock_checksum, mock_gather, mock_ope @patch('os.path.abspath', side_effect=lambda x: x) @patch('os.path.exists', return_value=True) @patch('builtins.open', new_callable=mock_open) - @patch('sumbuddy.Mapper.gather_file_paths', return_value=['file1.txt', 'file2.txt']) + @patch('sumbuddy.Mapper.gather_file_paths', return_value=(['file1.txt', 'file2.txt'], [])) @patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x, **kwargs: 'dummychecksum') def test_get_checksums_different_algorithm(self, mock_checksum, mock_gather, mock_open, mock_exists, mock_abspath): algorithm = 'sha256' 
@@ -106,7 +106,7 @@ def test_get_checksums_different_algorithm(self, mock_checksum, mock_gather, moc @patch('os.path.abspath', side_effect=lambda x: x) @patch('os.path.exists', return_value=False) @patch('builtins.open', new_callable=mock_open) - @patch('sumbuddy.Mapper.gather_file_paths', return_value=[]) + @patch('sumbuddy.Mapper.gather_file_paths', return_value=([], [])) def test_get_checksums_empty_directory(self, mock_gather, mock_open, mock_exists, mock_abspath): output_stream = StringIO() with patch('sys.stdout', new=output_stream): @@ -118,7 +118,7 @@ def test_get_checksums_empty_directory(self, mock_gather, mock_open, mock_exists @patch('os.path.abspath', side_effect=lambda x: x) @patch('os.path.exists', return_value=True) @patch('builtins.open', new_callable=mock_open) - @patch('sumbuddy.Mapper.gather_file_paths', return_value=['file1.txt', 'file2.txt']) + @patch('sumbuddy.Mapper.gather_file_paths', return_value=(['file1.txt', 'file2.txt'], [])) def test_get_checksums_invalid_algorithm(self, mock_gather, mock_open, mock_exists, mock_abspath): with self.assertRaises(ValueError): get_checksums(self.input_path, output_filepath=None, ignore_file=None, include_hidden=False, algorithm='invalid_alg') diff --git a/tests/test_mapper.py b/tests/test_mapper.py index 4d9baf6..6352bd6 100644 --- a/tests/test_mapper.py +++ b/tests/test_mapper.py @@ -35,11 +35,11 @@ def test_gather_file_paths(self): with open(os.path.join(subdir_path, '.hidden.txt'), 'w') as file: file.write('Some content') - file_paths = mapper.gather_file_paths(temp_dir) - self.assertEqual(len(file_paths), 3) - self.assertIn(os.path.join(temp_dir, 'file1.txt'), file_paths) - self.assertIn(os.path.join(temp_dir, 'file2.txt'), file_paths) - self.assertIn(os.path.join(subdir_path, 'file3.txt'), file_paths) + regular_files, zip_archives = mapper.gather_file_paths(temp_dir) + self.assertEqual(len(regular_files), 3) + self.assertIn(os.path.join(temp_dir, 'file1.txt'), regular_files) + 
self.assertIn(os.path.join(temp_dir, 'file2.txt'), regular_files) + self.assertIn(os.path.join(subdir_path, 'file3.txt'), regular_files) # Create ignore file and test with it, if we ignore the .txt files, we will # only have the ignore file in the list of file paths. @@ -47,26 +47,26 @@ def test_gather_file_paths(self): with open(ignore_file_path, 'w') as ignore_file: ignore_file.write("*.txt") - file_paths = mapper.gather_file_paths(temp_dir, ignore_file=ignore_file_path) - self.assertEqual(len(file_paths), 1) - self.assertIn(os.path.join(temp_dir, 'ignore_file'), file_paths) + regular_files, zip_archives = mapper.gather_file_paths(temp_dir, ignore_file=ignore_file_path) + self.assertEqual(len(regular_files), 1) + self.assertIn(os.path.join(temp_dir, 'ignore_file'), regular_files) # Test including hidden files - file_paths = mapper.gather_file_paths(temp_dir, include_hidden=True) - self.assertEqual(len(file_paths), 6) - self.assertIn(os.path.join(temp_dir, 'file1.txt'), file_paths) - self.assertIn(os.path.join(temp_dir, 'file2.txt'), file_paths) - self.assertIn(os.path.join(temp_dir, 'ignore_file'), file_paths) - self.assertIn(os.path.join(temp_dir, '.hidden.txt'), file_paths) - self.assertIn(os.path.join(subdir_path, 'file3.txt'), file_paths) - self.assertIn(os.path.join(subdir_path, '.hidden.txt'), file_paths) + regular_files, zip_archives = mapper.gather_file_paths(temp_dir, include_hidden=True) + self.assertEqual(len(regular_files), 6) + self.assertIn(os.path.join(temp_dir, 'file1.txt'), regular_files) + self.assertIn(os.path.join(temp_dir, 'file2.txt'), regular_files) + self.assertIn(os.path.join(temp_dir, 'ignore_file'), regular_files) + self.assertIn(os.path.join(temp_dir, '.hidden.txt'), regular_files) + self.assertIn(os.path.join(subdir_path, 'file3.txt'), regular_files) + self.assertIn(os.path.join(subdir_path, '.hidden.txt'), regular_files) - file_paths = mapper.gather_file_paths(temp_dir) - self.assertEqual(len(file_paths), 4) - 
self.assertIn(os.path.join(temp_dir, 'file1.txt'), file_paths) - self.assertIn(os.path.join(temp_dir, 'file2.txt'), file_paths) - self.assertIn(os.path.join(temp_dir, 'ignore_file'), file_paths) - self.assertIn(os.path.join(subdir_path, 'file3.txt'), file_paths) + regular_files, zip_archives = mapper.gather_file_paths(temp_dir) + self.assertEqual(len(regular_files), 4) + self.assertIn(os.path.join(temp_dir, 'file1.txt'), regular_files) + self.assertIn(os.path.join(temp_dir, 'file2.txt'), regular_files) + self.assertIn(os.path.join(temp_dir, 'ignore_file'), regular_files) + self.assertIn(os.path.join(subdir_path, 'file3.txt'), regular_files) def test_gather_file_paths_empty(self): mapper = Mapper() From 7d9df08667318d0e4570ebb3a0ff412bc14e5e7c Mon Sep 17 00:00:00 2001 From: Nipun Jonnalagadda <44180693+coolnipunj@users.noreply.github.com> Date: Tue, 24 Jun 2025 14:56:09 -0400 Subject: [PATCH 8/8] Fix linter errors: remove unused import and obsolete code, keep only streaming ZIP logic --- src/sumbuddy/__main__.py | 1 - src/sumbuddy/archive.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/sumbuddy/__main__.py b/src/sumbuddy/__main__.py index 4b9fd2d..7a15e31 100644 --- a/src/sumbuddy/__main__.py +++ b/src/sumbuddy/__main__.py @@ -7,7 +7,6 @@ from tqdm import tqdm import sys import os -import zipfile from sumbuddy.archive import ArchiveHandler def get_checksums(input_path, output_filepath=None, ignore_file=None, include_hidden=False, algorithm='md5', length=None): diff --git a/src/sumbuddy/archive.py b/src/sumbuddy/archive.py index 8b1f346..d04de31 100644 --- a/src/sumbuddy/archive.py +++ b/src/sumbuddy/archive.py @@ -36,7 +36,7 @@ def process_zip(self, zip_path, root_dir): for member in zip_ref.namelist(): # Only add files, not directories if member.endswith('/'): - continued + continue full_path = os.path.join(self.temp_dir, member) # The path as it should appear in the CSV: zip_path/member rel_path = f"{zip_path}/{member}"