From e80ebb70d961c8844df6aceb97e6b0514c27169f Mon Sep 17 00:00:00 2001 From: Emily Miller Date: Thu, 21 Oct 2021 16:36:42 -0700 Subject: [PATCH 1/8] simplify caching --- tests/test_load_video_frames.py | 123 ++++++++++++++++++++------------ zamba/data/video.py | 30 +------- zamba/models/model_manager.py | 2 +- zamba/pytorch/dataloaders.py | 12 +++- 4 files changed, 92 insertions(+), 75 deletions(-) diff --git a/tests/test_load_video_frames.py b/tests/test_load_video_frames.py index 58c8a609..8e62dafd 100644 --- a/tests/test_load_video_frames.py +++ b/tests/test_load_video_frames.py @@ -10,11 +10,12 @@ from pydantic import BaseModel, ValidationError from zamba.data.video import ( - cached_load_video_frames, + npy_cache, load_video_frames, VideoLoaderConfig, MegadetectorLiteYoloXConfig, ) +from zamba.pytorch.dataloaders import FfmpegZambaVideoDataset from conftest import ASSETS_DIR, TEST_VIDEOS_DIR @@ -384,35 +385,6 @@ def test_load_video_frames(case: Case, video_metadata: Dict[str, Any]): assert video_shape[field] == value -def test_same_filename_new_kwargs(tmp_path): - """Test that load_video_frames does not load the npz file if the params change.""" - # use first test video - test_vid = [f for f in TEST_VIDEOS_DIR.rglob("*") if f.is_file()][0] - cache = tmp_path / "test_cache" - - with mock.patch.dict(os.environ, {"VIDEO_CACHE_DIR": str(cache)}): - # confirm cache is set in environment variable - assert os.environ["VIDEO_CACHE_DIR"] == str(cache) - - first_load = cached_load_video_frames(filepath=test_vid, config=VideoLoaderConfig(fps=1)) - new_params_same_name = cached_load_video_frames( - filepath=test_vid, config=VideoLoaderConfig(fps=2) - ) - assert first_load.shape != new_params_same_name.shape - - # check no params - first_load = cached_load_video_frames(filepath=test_vid) - assert first_load.shape != new_params_same_name.shape - - # multiple params in config - c1 = VideoLoaderConfig(scene_threshold=0.2) - c2 = VideoLoaderConfig(scene_threshold=0.2, crop_bottom_pixels=2) - - first_load = cached_load_video_frames(filepath=test_vid, config=c1) - new_params_same_name = cached_load_video_frames(filepath=test_vid, config=c2) - assert first_load.shape != new_params_same_name.shape - - def test_megadetector_lite_yolox_dog(tmp_path): dog = Image.open(ASSETS_DIR / "dog.jpg") blank = Image.new("RGB", dog.size, (64, 64, 64)) @@ -506,39 +478,99 @@ def test_validate_total_frames(): assert config.total_frames == 8 -def test_caching(tmp_path, caplog): +def generate_dataset(config): + """Return loaded video from FFmpegZambaVideoDataset.""" + return FfmpegZambaVideoDataset(annotations=labels, video_loader_config=config).__getitem__( + index=0 + )[0] + + +def test_same_filename_new_kwargs(tmp_path, train_metadata): + """Test that load_video_frames does not load the npz file if the params change.""" + # use first test video + # test_vid = [f for f in TEST_VIDEOS_DIR.rglob("*") if f.is_file()][0] + cache = tmp_path / "test_cache" + + # prep labels for one video + labels = ( + train_metadata[train_metadata.split == "train"] + .set_index("filepath") + .filter(regex="species") + .head(1) + ) + + + with mock.patch.dict(os.environ, {"VIDEO_CACHE_DIR": str(cache)}): + # confirm cache is set in environment variable + assert os.environ["VIDEO_CACHE_DIR"] == str(cache) + + first_load = _generate_dataset(config=VideoLoaderConfig(fps=1)) + new_params_same_name = _generate_dataset(config=VideoLoaderConfig(fps=2)) + assert first_load.shape != new_params_same_name.shape + + # check no params + no_params_same_name = _generate_dataset(config=None) + assert first_load.shape != new_params_same_name.shape != no_params_same_name.shape + + # multiple params in config + c1 = VideoLoaderConfig(scene_threshold=0.2) + c2 = VideoLoaderConfig(scene_threshold=0.2, crop_bottom_pixels=2) + + first_load = _generate_dataset(config=c1) + + new_params_same_name = _generate_dataset(config=c2) + assert first_load.shape != new_params_same_name.shape + + +def test_caching(tmp_path, caplog, train_metadata): cache = tmp_path / "video_cache" - test_vid = [f for f in TEST_VIDEOS_DIR.rglob("*") if f.is_file()][0] + + # prep labels for one video + labels = ( + train_metadata[train_metadata.split == "train"] + .set_index("filepath") + .filter(regex="species") + .head(1) + ) # no caching by default - _ = cached_load_video_frames(filepath=test_vid, config=VideoLoaderConfig(fps=1)) + _ = FfmpegZambaVideoDataset( + annotations=labels, + ).__getitem__(index=0) assert not cache.exists() # caching can be specifed in config - _ = cached_load_video_frames( - filepath=test_vid, config=VideoLoaderConfig(fps=1, cache_dir=cache) - ) + _ = FfmpegZambaVideoDataset( + annotations=labels, video_loader_config=VideoLoaderConfig(fps=1, cache_dir=cache) + ).__getitem__(index=0) + # one file in cache assert len([f for f in cache.rglob("*") if f.is_file()]) == 1 shutil.rmtree(cache) # or caching can be specified in environment variable with mock.patch.dict(os.environ, {"VIDEO_CACHE_DIR": str(cache)}): - _ = cached_load_video_frames(filepath=test_vid) + _ = FfmpegZambaVideoDataset( + annotations=labels, + ).__getitem__(index=0) assert len([f for f in cache.rglob("*") if f.is_file()]) == 1 # changing cleanup in config does not prompt new hashing of videos with mock.patch.dict(os.environ, {"LOG_LEVEL": "DEBUG"}): - _ = cached_load_video_frames( - filepath=test_vid, config=VideoLoaderConfig(cleanup_cache=True) - ) + _ = FfmpegZambaVideoDataset( + annotations=labels, video_loader_config=VideoLoaderConfig(cleanup_cache=True) + ).__getitem__(index=0) + assert "Loading from cache" in caplog.text # if no config is passed, this is equivalent to specifying None/False in all non-cache related VLC params - no_config = cached_load_video_frames(filepath=test_vid, config=None) - config_with_nones = cached_load_video_frames( - filepath=test_vid, - config=VideoLoaderConfig( + no_config = FfmpegZambaVideoDataset(annotations=labels, video_loader_config=None).__getitem__( + index=0 + )[0] + + config_with_nones = FfmpegZambaVideoDataset( + annotations=labels, + video_loader_config=VideoLoaderConfig( crop_bottom_pixels=None, i_frames=False, scene_threshold=None, @@ -555,5 +587,6 @@ def test_caching(tmp_path, caplog): model_input_height=None, model_input_width=None, ), - ) + ).__getitem__(index=0)[0] + assert np.array_equal(no_config, config_with_nones) diff --git a/zamba/data/video.py b/zamba/data/video.py index 0daccbdb..01cb5db0 100644 --- a/zamba/data/video.py +++ b/zamba/data/video.py @@ -326,8 +326,8 @@ def validate_total_frames(cls, values): class npy_cache: - def __init__(self, path: Optional[Path] = None, cleanup: bool = False): - self.tmp_path = path + def __init__(self, tmp_path: Optional[Path] = None, cleanup: bool = False): + self.tmp_path = tmp_path self.cleanup = cleanup def __call__(self, f): @@ -390,14 +390,6 @@ def __del__(self): ) -def npy_cache_factory(path, callable, cleanup): - @npy_cache(path=path, cleanup=cleanup) - def decorated_callable(*args, **kwargs): - return callable(*args, **kwargs) - - return decorated_callable - - def load_video_frames( filepath: os.PathLike, config: Optional[VideoLoaderConfig] = None, @@ -503,21 +495,3 @@ def load_video_frames( arr = ensure_frame_number(arr, total_frames=config.total_frames) return arr - - -def cached_load_video_frames(filepath: os.PathLike, config: Optional[VideoLoaderConfig] = None): - """Loads frames from videos using fast ffmpeg commands and caches to .npy file - if config.cache_dir is not None. - - Args: - filepath (os.PathLike): Path to the video. - config (VideoLoaderConfig): Configuration for video loading. - """ - if config is None: - # get environment variable for cache if it exists - config = VideoLoaderConfig() - - decorated_load_video_frames = npy_cache_factory( - path=config.cache_dir, callable=load_video_frames, cleanup=config.cleanup_cache - ) - return decorated_load_video_frames(filepath=filepath, config=config) diff --git a/zamba/models/model_manager.py b/zamba/models/model_manager.py index dbfd5de9..35eb5b81 100644 --- a/zamba/models/model_manager.py +++ b/zamba/models/model_manager.py @@ -81,7 +81,7 @@ def instantiate_model( hparams = yaml.safe_load(f) else: - if not Path(checkpoint).exists(): + if not (model_cache_dir / checkpoint).exists(): logger.info("Downloading weights for model.") checkpoint = download_weights( filename=str(checkpoint), diff --git a/zamba/pytorch/dataloaders.py b/zamba/pytorch/dataloaders.py index fde47611..b8f041a3 100644 --- a/zamba/pytorch/dataloaders.py +++ b/zamba/pytorch/dataloaders.py @@ -11,7 +11,7 @@ from torchvision.datasets.vision import VisionDataset import torchvision.transforms.transforms -from zamba.data.video import cached_load_video_frames, VideoLoaderConfig +from zamba.data.video import npy_cache, load_video_frames, VideoLoaderConfig def get_datasets( @@ -87,6 +87,11 @@ def __init__( self.targets = annotations self.transform = transform + + # get environment variable for cache if it exists + if video_loader_config is None: + video_loader_config = VideoLoaderConfig() + self.video_loader_config = video_loader_config super().__init__(root=None, transform=transform) @@ -96,6 +101,11 @@ def __len__(self): def __getitem__(self, index: int): try: + cached_load_video_frames = npy_cache( + tmp_path=self.video_loader_config.cache_dir, + cleanup=self.video_loader_config.cleanup_cache, + )(load_video_frames) + video = cached_load_video_frames( filepath=self.video_paths[index], config=self.video_loader_config ) From a4cc03de4bdb1d67a02f608eb11d7d0d2d89e9e5 Mon Sep 17 00:00:00 2001 From: Emily Miller Date: Thu, 21 Oct 2021 16:38:48 -0700 Subject: [PATCH 2/8] nest function --- tests/test_load_video_frames.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/test_load_video_frames.py b/tests/test_load_video_frames.py index 8e62dafd..070f7eef 100644 --- a/tests/test_load_video_frames.py +++ b/tests/test_load_video_frames.py @@ -478,13 +478,6 @@ def test_validate_total_frames(): assert config.total_frames == 8 -def generate_dataset(config): - """Return loaded video from FFmpegZambaVideoDataset.""" - return FfmpegZambaVideoDataset(annotations=labels, video_loader_config=config).__getitem__( - index=0 - )[0] - - def test_same_filename_new_kwargs(tmp_path, train_metadata): """Test that load_video_frames does not load the npz file if the params change.""" # use first test video @@ -499,6 +492,11 @@ def test_same_filename_new_kwargs(tmp_path, train_metadata): .head(1) ) + def _generate_dataset(config): + """Return loaded video from FFmpegZambaVideoDataset.""" + return FfmpegZambaVideoDataset(annotations=labels, video_loader_config=config).__getitem__( + index=0 + )[0] with mock.patch.dict(os.environ, {"VIDEO_CACHE_DIR": str(cache)}): # confirm cache is set in environment variable From 435b544ecc011a503b032234fa20a1e61b912c08 Mon Sep 17 00:00:00 2001 From: Emily Miller Date: Thu, 21 Oct 2021 16:39:59 -0700 Subject: [PATCH 3/8] simplify --- tests/test_load_video_frames.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_load_video_frames.py b/tests/test_load_video_frames.py index 070f7eef..94b36b1f 100644 --- a/tests/test_load_video_frames.py +++ b/tests/test_load_video_frames.py @@ -511,12 +511,10 @@ def _generate_dataset(config): assert first_load.shape != new_params_same_name.shape != no_params_same_name.shape # multiple params in config - c1 = VideoLoaderConfig(scene_threshold=0.2) - c2 = VideoLoaderConfig(scene_threshold=0.2, crop_bottom_pixels=2) - - first_load = _generate_dataset(config=c1) - - new_params_same_name = _generate_dataset(config=c2) + first_load = _generate_dataset(config=VideoLoaderConfig(scene_threshold=0.2)) + new_params_same_name = _generate_dataset( + config=VideoLoaderConfig(scene_threshold=0.2, crop_bottom_pixels=2) + ) assert first_load.shape != new_params_same_name.shape From a100944e25fa38db42c7086348195e3c9bc88dc1 Mon Sep 17 00:00:00 2001 From: Emily Miller Date: Thu, 21 Oct 2021 16:43:19 -0700 Subject: [PATCH 4/8] move --- tests/test_load_video_frames.py | 78 ++++++++++++++++----------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/tests/test_load_video_frames.py b/tests/test_load_video_frames.py index 94b36b1f..144271a8 100644 --- a/tests/test_load_video_frames.py +++ b/tests/test_load_video_frames.py @@ -385,6 +385,44 @@ def test_load_video_frames(case: Case, video_metadata: Dict[str, Any]): assert video_shape[field] == value +def test_same_filename_new_kwargs(tmp_path, train_metadata): + """Test that load_video_frames does not load the npz file if the params change.""" + cache = tmp_path / "test_cache" + + # prep labels for one video + labels = ( + train_metadata[train_metadata.split == "train"] + .set_index("filepath") + .filter(regex="species") + .head(1) + ) + + def _generate_dataset(config): + """Return loaded video from FFmpegZambaVideoDataset.""" + return FfmpegZambaVideoDataset(annotations=labels, video_loader_config=config).__getitem__( + index=0 + )[0] + + with mock.patch.dict(os.environ, {"VIDEO_CACHE_DIR": str(cache)}): + # confirm cache is set in environment variable + assert os.environ["VIDEO_CACHE_DIR"] == str(cache) + + first_load = _generate_dataset(config=VideoLoaderConfig(fps=1)) + new_params_same_name = _generate_dataset(config=VideoLoaderConfig(fps=2)) + assert first_load.shape != new_params_same_name.shape + + # check no params + no_params_same_name = _generate_dataset(config=None) + assert first_load.shape != new_params_same_name.shape != no_params_same_name.shape + + # multiple params in config + first_load = _generate_dataset(config=VideoLoaderConfig(scene_threshold=0.2)) + new_params_same_name = _generate_dataset( + config=VideoLoaderConfig(scene_threshold=0.2, crop_bottom_pixels=2) + ) + assert first_load.shape != new_params_same_name.shape + + def test_megadetector_lite_yolox_dog(tmp_path): dog = Image.open(ASSETS_DIR / "dog.jpg") blank = Image.new("RGB", dog.size, (64, 64, 64)) @@ -478,46 +516,6 @@ def test_validate_total_frames(): assert config.total_frames == 8 -def test_same_filename_new_kwargs(tmp_path, train_metadata): - """Test that load_video_frames does not load the npz file if the params change.""" - # use first test video - # test_vid = [f for f in TEST_VIDEOS_DIR.rglob("*") if f.is_file()][0] - cache = tmp_path / "test_cache" - - # prep labels for one video - labels = ( - train_metadata[train_metadata.split == "train"] - .set_index("filepath") - .filter(regex="species") - .head(1) - ) - - def _generate_dataset(config): - """Return loaded video from FFmpegZambaVideoDataset.""" - return FfmpegZambaVideoDataset(annotations=labels, video_loader_config=config).__getitem__( - index=0 - )[0] - - with mock.patch.dict(os.environ, {"VIDEO_CACHE_DIR": str(cache)}): - # confirm cache is set in environment variable - assert os.environ["VIDEO_CACHE_DIR"] == str(cache) - - first_load = _generate_dataset(config=VideoLoaderConfig(fps=1)) - new_params_same_name = _generate_dataset(config=VideoLoaderConfig(fps=2)) - assert first_load.shape != new_params_same_name.shape - - # check no params - no_params_same_name = _generate_dataset(config=None) - assert first_load.shape != new_params_same_name.shape != no_params_same_name.shape - - # multiple params in config - first_load = _generate_dataset(config=VideoLoaderConfig(scene_threshold=0.2)) - new_params_same_name = _generate_dataset( - config=VideoLoaderConfig(scene_threshold=0.2, crop_bottom_pixels=2) - ) - assert first_load.shape != new_params_same_name.shape - - def test_caching(tmp_path, caplog, train_metadata): cache = tmp_path / "video_cache" From 7770c63e1850a533b59a434c00bfe08113e5c9c0 Mon Sep 17 00:00:00 2001 From: Emily Miller Date: Thu, 21 Oct 2021 16:46:14 -0700 Subject: [PATCH 5/8] alphabetize --- tests/test_load_video_frames.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_load_video_frames.py b/tests/test_load_video_frames.py index 144271a8..d3b6c062 100644 --- a/tests/test_load_video_frames.py +++ b/tests/test_load_video_frames.py @@ -10,10 +10,10 @@ from pydantic import BaseModel, ValidationError from zamba.data.video import ( - npy_cache, load_video_frames, - VideoLoaderConfig, MegadetectorLiteYoloXConfig, + npy_cache, + VideoLoaderConfig, ) from zamba.pytorch.dataloaders import FfmpegZambaVideoDataset From 579d43433ad077e8daf2c7643be6d4b750222893 Mon Sep 17 00:00:00 2001 From: Emily Miller Date: Thu, 21 Oct 2021 16:46:33 -0700 Subject: [PATCH 6/8] lint --- tests/test_load_video_frames.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_load_video_frames.py b/tests/test_load_video_frames.py index d3b6c062..164083b9 100644 --- a/tests/test_load_video_frames.py +++ b/tests/test_load_video_frames.py @@ -12,7 +12,6 @@ from zamba.data.video import ( load_video_frames, MegadetectorLiteYoloXConfig, - npy_cache, VideoLoaderConfig, ) from zamba.pytorch.dataloaders import FfmpegZambaVideoDataset From ecb3b979a1c38385acba684e6843c4bb3ca0c81d Mon Sep 17 00:00:00 2001 From: Emily Miller Date: Fri, 22 Oct 2021 10:10:41 -0700 Subject: [PATCH 7/8] rename to cache path --- zamba/data/video.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/zamba/data/video.py b/zamba/data/video.py index 01cb5db0..1f798fb5 100644 --- a/zamba/data/video.py +++ b/zamba/data/video.py @@ -326,8 +326,8 @@ def validate_total_frames(cls, values): class npy_cache: - def __init__(self, tmp_path: Optional[Path] = None, cleanup: bool = False): - self.tmp_path = tmp_path + def __init__(self, cache_path: Optional[Path] = None, cleanup: bool = False): + self.cache_path = cache_path self.cleanup = cleanup def __call__(self, f): @@ -358,7 +358,7 @@ def _wrapped(*args, **kwargs): if isinstance(vid_path, S3Path): vid_path = AnyPath(vid_path.key) - npy_path = self.tmp_path / hash_str / vid_path.with_suffix(".npy") + npy_path = self.cache_path / hash_str / vid_path.with_suffix(".npy") # make parent directories since we're using absolute paths npy_path.parent.mkdir(parents=True, exist_ok=True) @@ -372,21 +372,21 @@ def _wrapped(*args, **kwargs): logger.debug(f"Wrote to cache {npy_path}: size {npy_path.stat().st_size}") return loaded_video - if self.tmp_path is not None: + if self.cache_path is not None: return _wrapped else: return f def __del__(self): - if hasattr(self, "tmp_path") and self.cleanup and self.tmp_path.exists(): - if self.tmp_path.parents[0] == tempfile.gettempdir(): - logger.info(f"Deleting cache dir {self.tmp_path}.") - rmtree(self.tmp_path) + if hasattr(self, "cache_path") and self.cleanup and self.cache_path.exists(): + if self.cache_path.parents[0] == tempfile.gettempdir(): + logger.info(f"Deleting cache dir {self.cache_path}.") + rmtree(self.cache_path) else: logger.warning( "Bravely refusing to delete directory that is not a subdirectory of the " "system temp directory. If you really want to delete, do so manually using:\n " - f"rm -r {self.tmp_path}" + f"rm -r {self.cache_path}" ) From 7b6c5bec968ae5519103765c375e225aad275b96 Mon Sep 17 00:00:00 2001 From: Emily Miller Date: Fri, 22 Oct 2021 10:25:43 -0700 Subject: [PATCH 8/8] fix dataset --- zamba/pytorch/dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zamba/pytorch/dataloaders.py b/zamba/pytorch/dataloaders.py index b8f041a3..ed9008bd 100644 --- a/zamba/pytorch/dataloaders.py +++ b/zamba/pytorch/dataloaders.py @@ -102,7 +102,7 @@ def __len__(self): def __getitem__(self, index: int): try: cached_load_video_frames = npy_cache( - tmp_path=self.video_loader_config.cache_dir, + cache_path=self.video_loader_config.cache_dir, cleanup=self.video_loader_config.cleanup_cache, )(load_video_frames)