diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 8077a03b910..d7853b46314 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -416,7 +416,11 @@ def _populate_private_class_attributes(cls): continue defaults.append( - {kwarg: default for kwarg, default in zip(argspec.args[-len(argspec.defaults):], argspec.defaults)} + { + kwarg: default + for kwarg, default in zip(argspec.args[-len(argspec.defaults):], argspec.defaults) + if not kwarg.startswith("_") + } ) if not argspec.varkw: @@ -637,7 +641,7 @@ def __init__(self, *args, **kwargs): def _set_default_frames_per_clip(self, inject_fake_data): argspec = inspect.getfullargspec(self.DATASET_CLASS.__init__) - args_without_default = argspec.args[1:-len(argspec.defaults)] + args_without_default = argspec.args[1:(-len(argspec.defaults) if argspec.defaults else None)] frames_per_clip_last = args_without_default[-1] == "frames_per_clip" @functools.wraps(inject_fake_data) diff --git a/test/test_datasets.py b/test/test_datasets.py index a076b843fa8..bea2a2b80b9 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -944,6 +944,27 @@ def test_not_found_or_corrupted(self): super().test_not_found_or_corrupted() +class KineticsTestCase(datasets_utils.VideoDatasetTestCase): + DATASET_CLASS = datasets.Kinetics + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + split=("train", "val"), num_classes=("400", "600", "700") + ) + + def inject_fake_data(self, tmpdir, config): + classes = ("Abseiling", "Zumba") + num_videos_per_class = 2 + tmpdir = pathlib.Path(tmpdir) / config['split'] + digits = string.ascii_letters + string.digits + "-_" + for cls in classes: + datasets_utils.create_video_folder( + tmpdir, + cls, + lambda _: f"{datasets_utils.create_random_string(11, digits)}.mp4", + num_videos_per_class, + ) + return num_videos_per_class * len(classes) + + class Kinetics400TestCase(datasets_utils.VideoDatasetTestCase): DATASET_CLASS = datasets.Kinetics400 diff --git a/test/test_datasets_download.py b/test/test_datasets_download.py index 0066b76ccbe..8c2d575e01d 100644 --- a/test/test_datasets_download.py +++ b/test/test_datasets_download.py @@ -392,6 +392,25 @@ def widerface(): ) +def kinetics(): + return itertools.chain( + *[ + collect_download_configs( + lambda: datasets.Kinetics( + path.join(ROOT, f"Kinetics{num_classes}"), + frames_per_clip=1, + num_classes=num_classes, + split=split, + download=True, + ), + name=f"Kinetics, {num_classes}, {split}", + file="kinetics", + ) + for num_classes, split in itertools.product(("400", "600", "700"), ("train", "val")) + ] + ) + + def kitti(): return itertools.chain( *[ @@ -440,6 +459,7 @@ def make_parametrize_kwargs(download_configs): usps(), celeba(), widerface(), + kinetics(), kitti(), ) ) diff --git a/torchvision/datasets/__init__.py b/torchvision/datasets/__init__.py index b60fc7c7964..e67ba08d299 100644 --- a/torchvision/datasets/__init__.py +++ b/torchvision/datasets/__init__.py @@ -20,7 +20,7 @@ from .sbd import SBDataset from .vision import VisionDataset from .usps import USPS -from .kinetics import Kinetics400 +from .kinetics import Kinetics400, Kinetics from .hmdb51 import HMDB51 from .ucf101 import UCF101 from .places365 import Places365 @@ -34,6 +34,6 @@ 'Omniglot', 'SBU', 'Flickr8k', 'Flickr30k', 'VOCSegmentation', 'VOCDetection', 'Cityscapes', 'ImageNet', 'Caltech101', 'Caltech256', 'CelebA', 'WIDERFace', 'SBDataset', - 'VisionDataset', 'USPS', 'Kinetics400', 'HMDB51', 'UCF101', + 'VisionDataset', 'USPS', 'Kinetics400', "Kinetics", 'HMDB51', 'UCF101', 'Places365', 'Kitti', ) diff --git a/torchvision/datasets/kinetics.py b/torchvision/datasets/kinetics.py index a8986986c17..2543b6c514d 100644 --- a/torchvision/datasets/kinetics.py +++ b/torchvision/datasets/kinetics.py @@ -1,15 +1,29 @@ -from .utils import list_dir +import time +import os +import warnings + + +from os import path +import csv +from typing import Any, Callable, Dict, Optional, Tuple +from functools import partial +from multiprocessing import Pool + +from .utils import download_and_extract_archive, download_url, verify_str_arg, check_integrity from .folder import find_classes, make_dataset from .video_utils import VideoClips from .vision import VisionDataset -class Kinetics400(VisionDataset): - """ - `Kinetics-400 `_ +def _dl_wrap(tarpath, videopath, line): + download_and_extract_archive(line, tarpath, videopath) + + +class Kinetics(VisionDataset): + """` Generic Kinetics `_ dataset. - Kinetics-400 is an action recognition video dataset. + Kinetics-400/600/700 are action recognition video datasets. This dataset consider every video as a collection of video clips of fixed size, specified by ``frames_per_clip``, where the step in frames between each clip is given by ``step_between_clips``. @@ -20,44 +34,101 @@ class Kinetics400(VisionDataset): Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all frames in a video might be present. - Internally, it uses a VideoClips object to handle clip creation. - Args: - root (string): Root directory of the Kinetics-400 Dataset. Should be structured as follows: - + root (string): Root directory of the Kinetics Dataset. + Directory should be structured as follows: .. code:: root/ - ├── class1 - │ ├── clip1.avi - │ ├── clip2.avi - │ └── ... - └── class2 - ├── clipx.avi - └── ... - + ├── split + │ ├── class1 + │ │ ├── clip1.mp4 + │ │ ├── clip2.mp4 + │ │ ├── clip3.mp4 + │ │ ├── ... + │ ├── class2 + │ │ ├── clipx.mp4 + │ │ └── ... + Note: split is appended automatically using the split argument. frames_per_clip (int): number of frames in a clip + num_classes (int): select between Kinetics-400 (default), Kinetics-600, and Kinetics-700 + split (str): split of the dataset to consider; supports ``"train"`` (default) ``"val"`` + frame_rate (float): If omitted, interpolate different frame rate for each clip. step_between_clips (int): number of frames between each clip transform (callable, optional): A function/transform that takes in a TxHxWxC video and returns a transformed version. + download (bool): Download the official version of the dataset to root folder. + num_workers (int): Use multiple workers for VideoClips creation + num_download_workers (int): Use multiprocessing in order to speed up download. Returns: tuple: A 3-tuple with the following entries: - - video (Tensor[T, H, W, C]): the `T` video frames + - video (Tensor[T, C, H, W]): the `T` video frames in torch.uint8 tensor - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels - and `L` is the number of points + and `L` is the number of points in torch.float tensor - label (int): class of the video clip + + Raises: + RuntimeError: If ``download is True`` and the video archives are already extracted. """ - def __init__(self, root, frames_per_clip, step_between_clips=1, frame_rate=None, - extensions=('avi',), transform=None, _precomputed_metadata=None, - num_workers=1, _video_width=0, _video_height=0, - _video_min_dimension=0, _audio_samples=0, _audio_channels=0): - super(Kinetics400, self).__init__(root) + _TAR_URLS = { + "400": "https://s3.amazonaws.com/kinetics/400/{split}/k400_{split}_path.txt", + "600": "https://s3.amazonaws.com/kinetics/600/{split}/k600_{split}_path.txt", + "700": "https://s3.amazonaws.com/kinetics/700_2020/{split}/k700_2020_{split}_path.txt", + } + _ANNOTATION_URLS = { + "400": "https://s3.amazonaws.com/kinetics/400/annotations/{split}.csv", + "600": "https://s3.amazonaws.com/kinetics/600/annotations/{split}.txt", + "700": "https://s3.amazonaws.com/kinetics/700_2020/annotations/{split}.csv", + } + + def __init__( + self, + root: str, + frames_per_clip: int, + num_classes: str = "400", + split: str = "train", + frame_rate: Optional[float] = None, + step_between_clips: int = 1, + transform: Optional[Callable] = None, + extensions: Tuple[str, ...] = ("avi", "mp4"), + download: bool = False, + num_download_workers: int = 1, + num_workers: int = 1, + _precomputed_metadata: Optional[Dict] = None, + _video_width: int = 0, + _video_height: int = 0, + _video_min_dimension: int = 0, + _audio_samples: int = 0, + _audio_channels: int = 0, + _legacy: bool = False, + ) -> None: + + # TODO: support test + self.num_classes = verify_str_arg(num_classes, arg="num_classes", valid_values=["400", "600", "700"]) + self.extensions = extensions + self.num_download_workers = num_download_workers + + self.root = root + self._legacy = _legacy + if _legacy: + print("Using legacy structure") + self.split_folder = root + self.split = "unknown" + assert not download, "Cannot download the videos using legacy_structure." + else: + self.split_folder = path.join(root, split) + self.split = verify_str_arg(split, arg="split", valid_values=["train", "val"]) + + if download: + self.download_and_process_videos() + + super().__init__(self.root) - self.classes, class_to_idx = find_classes(self.root) - self.samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file=None) + self.classes, class_to_idx = find_classes(self.split_folder) + self.samples = make_dataset(self.split_folder, class_to_idx, extensions, is_valid_file=None) video_list = [x[0] for x in self.samples] self.video_clips = VideoClips( video_list, @@ -74,6 +145,88 @@ def __init__(self, root, frames_per_clip, step_between_clips=1, frame_rate=None, ) self.transform = transform + def download_and_process_videos(self) -> None: + """Downloads all the videos to the _root_ folder in the expected format.""" + tic = time.time() + self._download_videos() + toc = time.time() + print("Elapsed time for downloading in mins ", (toc - tic) / 60) + self._make_ds_structure() + toc2 = time.time() + print("Elapsed time for processing in mins ", (toc2 - toc) / 60) + print("Elapsed time overall in mins ", (toc2 - tic) / 60) + + def _download_videos(self) -> None: + """download tarballs containing the video to "tars" folder and extract them into the _split_ folder where + split is one of the official dataset splits. + + Raises: + RuntimeError: if download folder exists, break to prevent downloading entire dataset again. + """ + if path.exists(self.split_folder): + raise RuntimeError( + f"The directory {self.split_folder} already exists. " + f"If you want to re-download or re-extract the images, delete the directory." + ) + tar_path = path.join(self.root, "tars") + file_list_path = path.join(self.root, "files") + + split_url = self._TAR_URLS[self.num_classes].format(split=self.split) + split_url_filepath = path.join(file_list_path, path.basename(split_url)) + if not check_integrity(split_url_filepath): + download_url(split_url, file_list_path) + list_video_urls = open(split_url_filepath, "r") + + if self.num_download_workers == 1: + for line in list_video_urls.readlines(): + line = str(line).replace("\n", "") + download_and_extract_archive(line, tar_path, self.split_folder) + else: + part = partial(_dl_wrap, tar_path, self.split_folder) + lines = [str(line).replace("\n", "") for line in list_video_urls.readlines()] + poolproc = Pool(self.num_download_workers) + poolproc.map(part, lines) + + def _make_ds_structure(self): + """move videos from + split_folder/ + ├── clip1.avi + ├── clip2.avi + + to the correct format as described below: + split_folder/ + ├── class1 + │ ├── clip1.avi + + """ + annotation_path = path.join(self.root, "annotations") + if not check_integrity(path.join(annotation_path, f"{self.split}.csv")): + download_url(self._ANNOTATION_URLS[self.num_classes].format(split=self.split), annotation_path) + annotations = path.join(annotation_path, f"{self.split}.csv") + + file_fmtstr = "{ytid}_{start:06}_{end:06}.mp4" + with open(annotations) as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + f = file_fmtstr.format( + ytid=row["youtube_id"], + start=int(row["time_start"]), + end=int(row["time_end"]), + ) + label = ( + row["label"] + .replace(" ", "_") + .replace("'", "") + .replace("(", "") + .replace(")", "") + ) + os.makedirs(path.join(self.split_folder, label), exist_ok=True) + downloaded_file = path.join(self.split_folder, f) + if path.isfile(downloaded_file): + os.replace( + downloaded_file, path.join(self.split_folder, label, f), + ) + @property def metadata(self): return self.video_clips.metadata @@ -83,9 +236,86 @@ def __len__(self): def __getitem__(self, idx): video, audio, info, video_idx = self.video_clips.get_clip(idx) + if not self._legacy: + # [T,H,W,C] --> [T,C,H,W] + video = video.permute(0, 3, 1, 2) label = self.samples[video_idx][1] if self.transform is not None: video = self.transform(video) return video, audio, label + + +class Kinetics400(Kinetics): + """ + `Kinetics-400 `_ + dataset. + + Kinetics-400 is an action recognition video dataset. + This dataset consider every video as a collection of video clips of fixed size, specified + by ``frames_per_clip``, where the step in frames between each clip is given by + ``step_between_clips``. + + To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5`` + and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two + elements will come from video 1, and the next three elements from video 2. + Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all + frames in a video might be present. + + Internally, it uses a VideoClips object to handle clip creation. + + Args: + root (string): Root directory of the Kinetics-400 Dataset. Should be structured as follows: + + .. code:: + + root/ + ├── class1 + │ ├── clip1.avi + │ ├── clip2.avi + │ ├── clip3.mp4 + │ └── ... + └── class2 + ├── clipx.avi + └── ... + + frames_per_clip (int): number of frames in a clip + step_between_clips (int): number of frames between each clip + transform (callable, optional): A function/transform that takes in a TxHxWxC video + and returns a transformed version. + + Returns: + tuple: A 3-tuple with the following entries: + + - video (Tensor[T, H, W, C]): the `T` video frames + - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels + and `L` is the number of points + - label (int): class of the video clip + """ + + def __init__( + self, + root: str, + frames_per_clip: int, + num_classes: Any = None, + split: Any = None, + download: Any = None, + num_download_workers: Any = None, + **kwargs: Any + ): + warnings.warn( + "Kinetics400 is deprecated and will be removed in a future release." + "It was replaced by Kinetics(..., num_classes=\"400\").") + if any(value is not None for value in (num_classes, split, download, num_download_workers)): + raise RuntimeError( + "Usage of 'num_classes', 'split', 'download', or 'num_download_workers' is not supported in " + "Kinetics400. Please use Kinetics instead." + ) + + super(Kinetics400, self).__init__( + root=root, + frames_per_clip=frames_per_clip, + _legacy=True, + **kwargs, + )