import clip
from PIL import Image
import numpy as np
from einops import rearrange
from metrics.clip_score import calculate_clip_score
from utils import load_video, load_image
from skimage.metrics import mean_squared_error, structural_similarity as ssim

model, preprocess = clip.load("path_to_dir/ViT-L-14.pt", device="cuda")


# GenVideo-RefVideo SSIM
def compute_video_video_similarity_ssim(reference_video_path,
                                        video_path,
                                        keyframes=[0, 4, 8, 12, 15]):
    global model, preprocess
    reference_video = load_video(reference_video_path)
    try:
        video_frames = load_video(video_path,
                                  size=(reference_video[0].shape[1],
                                        reference_video[0].shape[0]))[:16]
    except Exception:
        print(f'Failed to load video: {video_path}')
        return {'ssim': 0, 'state': 0}
    all_frame_scores = 0.
    for frame_idx in keyframes:
        # WebVid reference videos are read from the 10th frame onward for the
        # motion; the stride of 3 compensates for the fps difference.
        im1 = np.array(reference_video[10 + frame_idx * 3])
        im2 = np.array(video_frames[frame_idx])
        ssim_value = ssim(im1,
                          im2,
                          multichannel=True,
                          win_size=7,
                          channel_axis=-1)
        all_frame_scores += ssim_value
    all_frame_scores /= len(keyframes)
    return {'ssim': all_frame_scores, 'state': 1}


# GenVideo CLIP
def compute_temporal_consistency(video_path):
    global model, preprocess
    all_frame_scores = 0.
    video_frames = load_video(video_path)[:16]  # use the first 16 frames
    for i in range(1, len(video_frames)):
        pre_frame = Image.fromarray(video_frames[i - 1])
        next_frame = Image.fromarray(video_frames[i])
        score = calculate_clip_score(model,
                                     preprocess,
                                     first_data=pre_frame,
                                     second_data=next_frame,
                                     first_flag='img',
                                     second_flag='img').cpu().numpy()
        all_frame_scores += score
    return all_frame_scores / (len(video_frames) - 1)


# GenVideo-Text CLIP
def compute_video_text_alignment(video_path, text):
    global model, preprocess
    all_frame_scores = 0.
    video_frames = load_video(video_path)[:16]
    assert len(video_frames) != 0, f'check video_path {video_path}!'
    for frame in video_frames:
        frame = Image.fromarray(frame)
        score = calculate_clip_score(model,
                                     preprocess,
                                     first_data=text,
                                     second_data=frame,
                                     first_flag='txt',
                                     second_flag='img').cpu().numpy()
        all_frame_scores += score
    return all_frame_scores / len(video_frames)


# GenVideo-RefVideo CLIP (keyframes)
def compute_video_video_similarity(reference_video_path,
                                   video_path,
                                   keyframes=[0, 4, 8, 12, 15]):
    global model, preprocess
    reference_video = load_video(reference_video_path)
    try:
        video_frames = load_video(video_path,
                                  size=reference_video[0].shape[:2])
    except Exception:
        print(f'Failed to load video: {video_path}')
        return {'clip': 0, 'state': 0}
    all_frame_scores = 0.
    for frame_idx in keyframes:
        im1 = Image.fromarray(np.array(reference_video[10 + frame_idx * 3]))
        im2 = Image.fromarray(np.array(video_frames[frame_idx]))
        score = calculate_clip_score(model,
                                     preprocess,
                                     first_data=im1,
                                     second_data=im2,
                                     first_flag='img',
                                     second_flag='img').cpu().numpy()
        all_frame_scores += score
    all_frame_scores /= len(keyframes)
    return {'clip': all_frame_scores, 'state': 1}


# MSE (first frame) and SSIM (first frame)
def compute_image_image_similarity(init_image_path, video_path):
    global model, preprocess
    init_image = Image.fromarray(load_image(init_image_path))
    try:
        video_frames = load_video(video_path,
                                  size=init_image.size)  # e.g. (512, 904)
    except Exception:
        print(f'Failed to load video: {video_path}')
        return {'MSE': 0, 'SSIM': 0, 'state': 0}
    init_image_np = np.array(init_image)
    video_frame_np = np.array(video_frames[0])  # first frame (or the 10th)
    if init_image_np.shape == video_frame_np.shape:
        mse_value = mean_squared_error(init_image_np, video_frame_np)
        ssim_value = ssim(init_image_np,
                          video_frame_np,
                          multichannel=True,
                          win_size=7,
                          channel_axis=-1)
    else:
        print('Error: the images do not have the same dimensions.', video_path)
        return {'MSE': 0, 'SSIM': 0, 'state': 0}
    return {'MSE': mse_value, 'SSIM': ssim_value, 'state': 1}


# Image-GenVideo CLIP
def compute_video_image_similarity(video_path, image_path):
    global model, preprocess
    all_frame_scores = 0.
    image = Image.fromarray(load_image(image_path))
    try:
        video_frames = load_video(video_path, size=image.size)
    except Exception:
        print(f'Failed to load video: {video_path}')
        # video_frames is undefined here, so return a zero score directly
        return {"clip_per": 0, "state": 0}
    for frame in video_frames:
        frame = Image.fromarray(frame)
        score = calculate_clip_score(model,
                                     preprocess,
                                     first_data=image,
                                     second_data=frame,
                                     first_flag='img',
                                     second_flag='img').cpu().numpy()
        all_frame_scores += score
    return {"clip_per": all_frame_scores / len(video_frames), "state": 1}