"""
Scoper
------------------------------------------------------------------------
Fuzzy & semantic caption-based searching for YouTube videos.
"""
from collections import defaultdict
import warnings
import argparse
import re
import gensim
from gensim.utils import tokenize
from fuzzywuzzy import process
from youtube_transcript_api import YouTubeTranscriptApi
from nltk.corpus import brown
warnings.filterwarnings("ignore")
def normalize_time(timestamps):
    """
    Normalize timestamps (floating-point seconds) to an
    "hh mm ss" style format.
    @param timestamps: List of floats where each float is a timestamp
    """
    normalized_time = []
    for timestamp in timestamps:
        seconds = int(timestamp)
        minutes = seconds // 60
        hours = minutes // 60
        minutes = minutes % 60
        seconds = seconds % 60
        normalized_timestamp = ''
        if hours != 0:
            normalized_timestamp += str(hours) + 'h '
        normalized_timestamp += str(minutes) + 'm ' + str(seconds) + 's'
        normalized_time.append(normalized_timestamp)
    return normalized_time


def pretty_print(output):
    """
    Method to output the relevant captions and timestamps in a pretty manner.
    @param output: A list of pairs where each pair = (caption, timestamp)
    """
    spaces = max(len(caption) for caption, timestamp in output)
    spaces += 5
    for caption, timestamp in output:
        print(caption, end=' ' * (spaces - len(caption)))
        print(timestamp)


def parse(large_dataset_filepath):
    """
    Method meant for users to implement their own corpus-parsing
    algorithm. It should return an iterable of tokenized sentences
    (lists of words), as expected by gensim's Word2Vec and by setup().
    """
    with open(large_dataset_filepath) as _f_:
        corpus = _f_.readlines()
    return corpus
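

# A minimal sketch of what a user-supplied `parse` could look like,
# assuming a plain-text corpus with one sentence per line (the
# whitespace tokenization is an illustrative choice, not part of the
# original script):
#
#     def parse(large_dataset_filepath):
#         with open(large_dataset_filepath) as handle:
#             return [line.lower().split() for line in handle]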


def setup(pretrained_vectors=False, small_dataset=True):
    """
    Utility function required to set up the word2vec model and its
    vocabulary.
    """
    if pretrained_vectors:
        print('Support for pretrained vectors is currently in '
              'development. Feel free to modify this code to suit your '
              'needs and send a PR if you\'d like.')
    sentences = ['']
    if small_dataset:
        sentences = brown.sents()
        model = gensim.models.Word2Vec(sentences, min_count=1)
    else:
        large_dataset_filepath = input('Enter your dataset\'s filepath: ')
        sentences = parse(large_dataset_filepath)
        model = gensim.models.Word2Vec(sentences, min_count=1)
    vocabulary = defaultdict(int)
    for sentence in sentences:
        for word in sentence:
            vocabulary[word] = 1
    return model, vocabulary


def get_video_id(youtube_video_url):
    """
    Extract YouTube's video ID from the URL using a regex.
    @param youtube_video_url: String holding a YouTube link.
    Example:
        input : https://www.youtube.com/watch?v=7bD_r5u3znQ
        output : 7bD_r5u3znQ
    """
    # Regular expressions to parse YouTube links
    youtube = r'(youtu\.be\/|v\/|e\/|u\/\w+\/|embed\/|v=)'
    video_id = r'([^#\&\?]*).*'
    https = r'^.*'
    parsed_url = re.search(https + youtube + video_id, youtube_video_url)
    if parsed_url is None:
        raise ValueError('Could not extract a video ID from: ' + youtube_video_url)
    return parsed_url[2]
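

# Illustrative behaviour of get_video_id (the short-link form below is an
# assumption based on the `youtu.be` branch of the regex):
#
#     get_video_id('https://www.youtube.com/watch?v=7bD_r5u3znQ')  # '7bD_r5u3znQ'
#     get_video_id('https://youtu.be/7bD_r5u3znQ')                 # '7bD_r5u3znQ'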


def get_youtube_captions(youtube_video_url, languages=None):
    """
    Use YouTube's API to pull captions from a video.
    @param youtube_video_url: String holding the YouTube link.
    @param languages: Languages in which captions should be downloaded.
    Note: The semantic similarity method only works for English, as the model
    trained by this script is trained on the English Brown Corpus.
    The fuzzy similarity method works across all languages.
    """
    if languages is None:
        languages = ['en']
    video_id = get_video_id(youtube_video_url)
    captions_and_timestamps = dict()
    try:
        captions_and_timestamps = YouTubeTranscriptApi.get_transcript(video_id,
                                                                      languages)
    except Exception as _e_:
        # This exception arises in case of a broken link or when
        # searching in a video where captions are unavailable
        print(_e_, type(_e_))
    captions = defaultdict(float)
    for data in captions_and_timestamps:
        captions[data['text'].lower()] = data['start']
    return captions
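

# Illustrative return value of get_youtube_captions (texts and start
# times are made up): a mapping from lower-cased caption text to its
# start time in floating-point seconds, e.g.
#
#     {'hello and welcome': 0.56, 'today we look at word2vec': 3.44}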


class Scoper:
    """
    Class that consists of all methods required to perform either a
    fuzzy or a semantic search of YouTube captions.
    The main code that invokes all other utility methods is the `main`
    method of this class.
    """
    def __init__(self, pretrained_vectors=False,
                 small_dataset=True):
        """
        Class constructor.
        Defines and declares all variables used.
        """
        self._fuzzy_ = "FUZZY"
        self._semantic_ = "SEMANTIC"
        _m_, _v_ = setup(pretrained_vectors=pretrained_vectors,
                         small_dataset=small_dataset)
        self.model, self.vocabulary = _m_, _v_

    def get_captions_by_fuzzy_similarity(self, query, corpus, limit=5):
        """
        Method that uses fuzzy string matching to determine similarity
        and return the `limit` most similar strings.
        @param query: The user query
        @param corpus: The corpus or set of captions
        @param limit: The number of most similar captions to be
                      returned
        """
        similar_strings = process.extractBests(query, corpus, limit=limit)
        reqd_strings = [caption for caption, similarity in similar_strings]
        return reqd_strings

    def get_captions_by_semantic_similarity(self, query, corpus, limit=5):
        """
        Method that uses semantic matching and leverages word embeddings
        when defining similarity.
        @param query: The user query
        @param corpus: The set of captions
        @param limit: The number of most similar captions to be returned
        """
        captions_and_similarities = []
        corpus = list(corpus)
        for idx, caption in enumerate(corpus):
            similarity = self.compute_semantic_similarity(query, caption)
            captions_and_similarities.append([similarity, idx])
        # compute_semantic_similarity returns a distance-like score, so
        # the smallest values correspond to the closest captions.
        captions_and_similarities.sort()
        captions_and_similarities = captions_and_similarities[:limit]
        closest_captions = []
        for similarity, idx in captions_and_similarities:
            closest_captions.append(corpus[idx])
        return closest_captions

    def compute_semantic_similarity(self, sentence_1, sentence_2):
        """
        Method to compute the semantic similarity between two sentences
        using word2vec. Lower scores indicate closer sentences, as the
        score is built from cosine distances.
        @param sentence_1: The first phrase/sentence
        @param sentence_2: The second phrase/sentence
        Note: This function is not symmetric, as it uses a modified version of
        the WMD between sentences/documents of varying lengths.
        """
        sentence_1 = list(tokenize(sentence_1))
        sentence_2 = list(tokenize(sentence_2))
        visited_1 = [0] * len(sentence_1)
        visited_2 = [0] * len(sentence_2)
        similarity = 0
        # The algorithm used below is a modified word-mover's distance.
        # It is asymmetric: for each word in sentence_1, we greedily find
        # the closest unmapped word in sentence_2 and add that distance.
        for idx_a, word_a in enumerate(sentence_1):
            if self.vocabulary[word_a] == 1:
                visited_1[idx_a] = 1
                closest_distance = 1e18
                idx_chosen = -1
                for idx_b, word_b in enumerate(sentence_2):
                    if visited_2[idx_b] == 0 and self.vocabulary[word_b] == 1:
                        current_distance = (1 - self.model.wv.similarity(word_a, word_b))
                        if idx_chosen == -1 or current_distance < closest_distance:
                            closest_distance = min(closest_distance, current_distance)
                            idx_chosen = idx_b
                if idx_chosen != -1:
                    visited_2[idx_chosen] = 1
                    similarity += closest_distance
        return similarity / max(len(sentence_1), 1)
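
    # Worked example of the matching above (distances are illustrative,
    # since real values depend on the trained vectors): for
    # sentence_1 = 'big dog' and sentence_2 = 'large hound', 'big' is
    # paired with whichever of 'large'/'hound' has the smaller
    # (1 - cosine similarity), that word is marked visited, 'dog' is
    # paired with the remaining word, and the two distances are averaged
    # over len(sentence_1) = 2.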

    def get_timestamp(self, query, caption_to_timestamp, mode='FUZZY', limit=5):
        """
        Method to get the timestamps corresponding to the most relevant captions.
        The user query is passed, the top `limit` relevant captions are
        identified, and the timestamps of these captions are returned.
        @param query: The user query
        @param caption_to_timestamp: A dictionary where keys are captions
                                     and values are their timestamps
        @param mode: Whether the search is fuzzy or semantic
        @param limit: The number of relevant captions to be fetched
        """
        if mode == self._fuzzy_:
            get_captions = self.get_captions_by_fuzzy_similarity
        elif mode == self._semantic_:
            get_captions = self.get_captions_by_semantic_similarity
        else:
            raise ValueError('Unknown mode: ' + str(mode))
        most_similar_captions = get_captions(query,
                                             caption_to_timestamp.keys(),
                                             limit=limit)
        marked_timestamps = [caption_to_timestamp[caption] for caption
                             in most_similar_captions]
        return marked_timestamps

    def main(self, youtube_video_url, query, limit=5, languages=None, mode='FUZZY'):
        """
        The driving code of this script; this invokes all other methods as it deems fit.
        @param youtube_video_url: The YouTube video URL from which captions are pulled
        @param query: The user query
        @param limit: The number of relevant captions to find
        @param languages: The languages the captions must be extracted in
        @param mode: Whether the search is fuzzy or semantic
        """
        query = query.lower()
        captions_and_timestamps = get_youtube_captions(youtube_video_url, languages)
        timestamps = self.get_timestamp(query, captions_and_timestamps,
                                        limit=limit, mode=mode)
        # Invert the caption -> timestamp mapping so captions can be
        # looked up by their timestamps below.
        timestamps_and_captions = defaultdict(str)
        for caption in captions_and_timestamps:
            timestamps_and_captions[captions_and_timestamps[caption]] = caption
        captions_extracted = []
        for timestamp in timestamps:
            captions_extracted.append(timestamps_and_captions[timestamp])
        pretty_print(list(zip(captions_extracted, normalize_time(timestamps))))
        return list(zip(captions_extracted, normalize_time(timestamps)))
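

# A minimal programmatic usage sketch (URL and query are placeholders):
#
#     scoper = Scoper()
#     results = scoper.main('https://www.youtube.com/watch?v=7bD_r5u3znQ',
#                           query='machine learning', limit=5,
#                           mode='SEMANTIC')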


if __name__ == "__main__":
    PARSER = argparse.ArgumentParser()
    PARSER.add_argument('--video', action='append')
    PARSER.add_argument('--mode', action='append',
                        choices=['FUZZY', 'SEMANTIC', 'F', 'S'])
    PARSER.add_argument('--limit', action='append')
    PARSER.add_argument('--language', action='append')
    ARGS = PARSER.parse_args()
    if ARGS.video is None:
        print('No video URL found.')
        sys.exit()
    VIDEO = str(ARGS.video[0])
    if ARGS.mode is None:
        MODE = 'FUZZY'
    else:
        # Expand the single-letter shorthands accepted by --mode, since
        # get_timestamp only understands the full mode names.
        MODE = {'F': 'FUZZY', 'S': 'SEMANTIC'}.get(ARGS.mode[0], ARGS.mode[0])
    if ARGS.language is None:
        LANGUAGE = ['en']
    else:
        LANGUAGE = ARGS.language
    if ARGS.limit is None:
        LIMIT = 10
    else:
        LIMIT = int(ARGS.limit[0])
    QUERY = input('Enter query string: ')
    FINDER = Scoper()
    FINDER.main(VIDEO, query=QUERY, limit=LIMIT, languages=LANGUAGE, mode=MODE)