-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex_dfdc.py
82 lines (64 loc) · 2.95 KB
/
index_dfdc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import sys
import argparse
from multiprocessing import Pool
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from isplutils.utils import extract_meta_av
def parse_args(argv):
    """Parse the command-line options of the indexing script.

    :param argv: list of argument strings, without the program name
    :return: namespace with ``source`` (Path, mandatory), ``videodataset``
        (Path, defaults to ``data/dfdc_videos.pkl``) and ``batch`` (int,
        defaults to 64)
    """
    # One (flag, add_argument keyword arguments) entry per supported option.
    option_specs = (
        ('--source', dict(type=Path, help='Source dir', required=True)),
        ('--videodataset', dict(type=Path, default='data/dfdc_videos.pkl',
                                help='Path to save the videos DataFrame')),
        ('--batch', dict(type=int, help='Batch size', default=64)),
    )

    cli = argparse.ArgumentParser()
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    namespace = cli.parse_args(argv)
    return namespace
def main(argv):
    """Load or build the video index DataFrame, then print the label counts.

    If the ``--videodataset`` file already exists it is loaded unchanged.
    Otherwise every ``metadata.json`` found recursively under ``--source`` is
    read and merged into one DataFrame, three per-video values returned by
    ``extract_meta_av`` are stored as the ``height``, ``width`` and ``frames``
    columns, and the result is pickled to ``--videodataset``.

    :param argv: list of command-line argument strings, without the program name
    :raises ValueError: if no ``metadata.json`` exists under the source dir
    """
    ## Parameters parsing
    args = parse_args(argv)
    source_dir: Path = args.source
    videodataset_path: Path = args.videodataset
    batch_size: int = args.batch

    ## DataFrame
    if videodataset_path.exists():
        print('Loading video DataFrame')
        # NOTE(review): unpickling can execute arbitrary code; only point
        # --videodataset at index files produced by this script.
        df_videos = pd.read_pickle(videodataset_path)
    else:
        print('Creating video DataFrame')

        # Create output folder
        videodataset_path.parent.mkdir(parents=True, exist_ok=True)

        # Index
        json_paths = sorted(source_dir.rglob('metadata.json'))
        if not json_paths:
            # pd.concat([]) would fail with an opaque "No objects to
            # concatenate"; report the directory that was searched instead.
            raise ValueError('No metadata.json found under {}'.format(source_dir))

        df_train_list = []
        for json_path in tqdm(json_paths, desc='Indexing'):
            df_tmp = pd.read_json(json_path, orient='index')
            # Video paths are stored relative to source_dir; the index of
            # df_tmp holds the top-level keys of the JSON file.
            rel_dir = json_path.parent.relative_to(source_dir)
            df_tmp['path'] = df_tmp.index.map(lambda x: str(rel_dir.joinpath(x)))
            # Integer after the last underscore of the parent directory name
            df_tmp['folder'] = int(str(json_path.parts[-2]).split('_')[-1])
            df_train_list.append(df_tmp)
        # verify_integrity makes duplicated index values an error
        df_videos = pd.concat(df_train_list, axis=0, verify_integrity=True)

        # Save space
        del df_videos['split']
        df_videos['label'] = df_videos['label'] == 'FAKE'
        df_videos['original'] = df_videos['original'].astype('category')
        df_videos['folder'] = df_videos['folder'].astype(np.uint8)

        # Collect metadata
        paths_arr = np.asarray(df_videos.path.map(lambda x: str(source_dir.joinpath(x))))
        height_list = []
        width_list = []
        frames_list = []
        with Pool() as pool:
            # Submit the paths in batches so the progress bar advances while
            # the pool is working.
            for batch_idx0 in tqdm(np.arange(start=0, stop=len(df_videos), step=batch_size), desc='Metadata'):
                batch_res = pool.map(extract_meta_av, paths_arr[batch_idx0:batch_idx0 + batch_size])
                for res in batch_res:
                    height_list.append(res[0])
                    width_list.append(res[1])
                    frames_list.append(res[2])

        df_videos['height'] = np.asarray(height_list, dtype=np.uint16)
        df_videos['width'] = np.asarray(width_list, dtype=np.uint16)
        df_videos['frames'] = np.asarray(frames_list, dtype=np.uint16)

        print('Saving video DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
# Script entry point: hand the command-line arguments (program name dropped) to main().
if __name__ == '__main__':
    main(argv=sys.argv[1:])