-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathNetflixAnalyzer.java
528 lines (436 loc) · 16.8 KB
/
NetflixAnalyzer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
package com.github.kilianB.launcher;
import java.io.File;
import java.io.IOException;
import java.lang.Thread.UncaughtExceptionHandler;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import org.apache.commons.text.similarity.EditDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;
import com.github.kilianB.fileHandling.in.NetflixParser;
import com.github.kilianB.fileHandling.out.CSVMovieWriter;
import com.github.kilianB.fileHandling.out.CSVShowWriter;
import com.github.kilianB.fileHandling.out.CSVWriter;
import com.github.kilianB.model.BaseEntityWrapper;
import com.github.kilianB.model.netflix.NetflixMovie;
import com.github.kilianB.model.netflix.NetflixShowEpisode;
import com.github.kilianB.model.netflix.ViewItem;
import com.uwetrottmann.trakt5.entities.Episode;
import com.uwetrottmann.trakt5.entities.Movie;
import com.uwetrottmann.trakt5.entities.Season;
import com.uwetrottmann.trakt5.entities.Show;
import com.uwetrottmann.trakt5.enums.Type;
import trakt.TraktHelper;
/**
* Download and append detailed information to a Netflix viewing history file e.g. runtime
* and genre for further evaluation.
* <p>
*
* The code is in no way optimized; it is a practice project that focuses on utilizing some of the
* Java 8-10 features:
*
* JAVA 8: method references (::), predicates
* JAVA 9: streams + lambdas
* JAVA 10: local variable type inference
*
* The input csv file will be parsed and individual items will either be classified as an
* episode (show), a movie or of type unknown. For each type a separate csv file with additional
* information will be produced
*
* @usage -> java -jar NetflixAnalyzer InputFilePath.csv TraktApiKey
* -> java -jar NetflixAnalyzer TraktApiKey
*
* @author Kilian
*
*/
public class NetflixAnalyzer {
/**
 * Logger of this class. Named after NetflixAnalyzer itself (it was previously
 * registered under the name of NetflixParser, which made this class share the
 * parser's logger).
 */
private static final Logger LOGGER = Logger.getLogger(NetflixAnalyzer.class.getName());

// Settings

/**
 * Print additional debug information
 */
private final boolean verbose = false;

/**
 * Output path of the csv file containing all items classified as movie
 */
private final String movieCsv = "MovieViewingHistory.csv";

/**
 * Output path of the csv file containing all items classified as episode
 */
private final String showCsv = "ShowViewingHistory.csv";

/**
 * Output path of the csv file containing all items which could not be resolved
 */
private final String unknownCsv = "UnknownViewingHistory.csv";

// Fields

/**
 * Trakt client used to query the movie database
 */
private TraktHelper trakt;

/**
 * Key : -> Series name as found in the netflix viewing history file
 * Value: -> Show object returned by trakt (Overview: Genre, rating, ids)
 */
private Map<String, Show> traktShows;

/**
 * Key : -> Movie item as parsed from the netflix viewing history file
 * Value: -> Movie object returned by trakt
 */
private Map<NetflixMovie, Movie> traktMovies;
/**
 * Runs the complete analysis and terminates the JVM afterwards.
 * <ol>
 * <li>Parse the Netflix viewing csv file -> (Title,Date)</li>
 * <li>Query the trakt API to attach ids to the movie/series title. Movies are
 * done at this point, episodes require more work</li>
 * <li>Download the summary of each series to get the episode ids</li>
 * <li>Use the episode ids and download granular information for each item</li>
 * <li>Match trakt and netflix titles using the levenshtein distance</li>
 * <li>Output the results to csv</li>
 * </ol>
 *
 * Note: this constructor ends with {@code System.exit(0)} and therefore never
 * returns to the caller.
 *
 * @param viewFilePath path of the Netflix viewing history csv file
 * @param traktToken   trakt key handed to the {@link TraktHelper}
 */
public NetflixAnalyzer(String viewFilePath, String traktToken) {

    // Initialize trakt api movie database
    trakt = new TraktHelper(traktToken);

    //@formatter:off
    try {
        //1. Import viewing history and attempt to classify movies and shows
        List<ViewItem> parsedHistory = NetflixParser.parseHistoryFile(viewFilePath);

        //Retrieve a single NetflixShow for every show watched
        HashSet<NetflixShowEpisode> distinctSeries = parsedHistory.stream()
                .filter(item -> item.isShow())
                .filter(distinctObjects(item -> ((NetflixShowEpisode) item).getSeries()))
                .map(show -> (NetflixShowEpisode) show)
                .collect(Collectors.toCollection(HashSet::new));

        //Retrieve unique movies e.g. if we have watched a movie twice we only should hit the api once
        var distinctMovies = parsedHistory.stream()
                .filter(i -> !i.isShow())
                .filter(distinctObjects(item -> ((NetflixMovie) item).getTitle()))
                .map(show -> (NetflixMovie) show)
                .collect(Collectors.toCollection(HashSet::new));
        //@formatter:on

        // Debug print
        System.out.println("Series: " + distinctSeries.size() + " " + distinctSeries.stream()
                .map(series -> series.getSeries()).collect(Collectors.joining(" , ", "Series [", "] parsed")));
        System.out.println("Movies: " + distinctMovies.size() + " " + distinctMovies.stream()
                .map(movie -> movie.getTitle()).collect(Collectors.joining(" , ", "Movies [", "] parsed")));

        if (verbose) {
            for (var item : parsedHistory) {
                if (item instanceof NetflixShowEpisode) {
                    NetflixShowEpisode show = (NetflixShowEpisode) item;
                    System.out.println(show);
                }
            }
        }

        /*
         * We are not allowed to search for an episode given a specified series. Only
         * searching for the episode title will give false reports, therefore download
         * the episode info for all series we have watched and search within the
         * returned results.
         * NOTE(review): the original comment states that the returned information is
         * saved in an SQL database so the API is only queried once. That caching is
         * not visible in this class - confirm in TraktHelper.
         */
        //@formatter:off
        //2. Extract trakt id for every series
        traktShows = distinctSeries.parallelStream()
                .map(trakt::searchShow)
                .flatMap(Optional<BaseEntityWrapper<NetflixShowEpisode,Show>>::stream)
                .collect(Collectors.toMap(
                        s -> s.netflixViewItem.getSeries(),  //Key by string
                        s -> s.getEntity()));                //Function.identity(); if not wrapped

        // 2. Extract trakt id for every movie
        traktMovies = distinctMovies.parallelStream()
                .map(trakt::searchMovie)
                .flatMap(Optional<BaseEntityWrapper<NetflixMovie,Movie>>::stream)
                .collect(Collectors.toMap(
                        s -> s.netflixViewItem,
                        s -> s.getEntity()));
        //@formatter:on

        if (verbose) {
            trakt.printNotFoundItems();
        }

        // Key: show object returned by trakt, Value: seasons downloaded for this show
        HashMap<Show, List<Season>> seasonList = trakt.downloadSeriesInfo(traktShows);
        Map<Type, Set<String>> notFoundOnTrakt = trakt.getNotFoundItems();

        /*
         * Generate 3 output files: shows, movies and unresolved items
         */
        CSVShowWriter showWriter = new CSVShowWriter(new File(showCsv), ";");
        CSVMovieWriter movieWriter = new CSVMovieWriter(new File(movieCsv), ";");
        CSVWriter unknowWriter = new CSVWriter(new File(unknownCsv), ";", "Date", "Type", "Title", "Series");

        var notFoundShows = notFoundOnTrakt.get(Type.SHOW);
        var notFoundMovies = notFoundOnTrakt.get(Type.MOVIE);

        trakt.printNotFoundItems();

        /*
         * Multithread calls or we will wait forever (Do we need small delays to not get
         * blacklisted at trakt?)
         */
        ExecutorService executor = Executors.newFixedThreadPool(15,
                (Runnable r) -> {
                    Thread t = new Thread(r);
                    t.setName("T-Pool:");
                    t.setUncaughtExceptionHandler(new UncaughtExceptionHandler() {
                        @Override
                        public void uncaughtException(Thread t, Throwable e) {
                            System.out.println("Uncaught exception: " + t + " " + e.toString());
                        }
                    });
                    // Hand out the configured thread. Returning a fresh Thread here
                    // would silently drop the name and the exception handler.
                    return t;
                });

        System.out.println("Retrieve runtime for shows. This may take a few seconds...");

        /*
         * Variables modified in lambdas have to be effectively final. Circumvent this
         * issue by encapsulating the int in an object. Atomic integers also give us
         * thread safety for free.
         */
        AtomicInteger knownEpisodeCount = new AtomicInteger(0);
        AtomicInteger unknownCount = new AtomicInteger(0);

        /*
         * Construct the tasks of writing the individual data to a csv file.
         * For shows runtime data still has to be downloaded on a per episode basis
         */
        for (var viewItem : parsedHistory) {
            // The loop variable is effectively final and can be captured directly
            executor.submit(() -> {
                try {
                    if (viewItem.isShow()) {
                        NetflixShowEpisode netflixShow = (NetflixShowEpisode) viewItem;
                        if (notFoundShows.contains(netflixShow.getSeries())) {
                            LOGGER.fine("Non resolved show: " + netflixShow.getTitle());
                            unknowWriter.writeLine(netflixShow.getViewDate(), Type.SHOW, netflixShow.getTitle(),
                                    netflixShow.getSeries());
                            unknownCount.incrementAndGet();
                        } else {
                            // Special case. While the show object contains a runtime
                            // it is not specific for an episode but more or less the average of a show.
                            // Query the episode list and look again
                            Show show = traktShows.get(netflixShow.getSeries());
                            int runtime = getRuntimeForEpisode(netflixShow, seasonList);
                            showWriter.push(show, netflixShow, runtime);
                            knownEpisodeCount.incrementAndGet();
                        }
                    } else {
                        // Movie
                        NetflixMovie netflixMovie = (NetflixMovie) viewItem;
                        if (notFoundMovies.contains(netflixMovie.getTitle())) {
                            unknowWriter.writeLine(netflixMovie.getViewDate(), Type.MOVIE,
                                    netflixMovie.getTitle());
                            unknownCount.incrementAndGet();
                        } else {
                            movieWriter.push(traktMovies.get(netflixMovie), netflixMovie);
                        }
                    }
                } catch (IOException ioe) {
                    LOGGER.severe("Error during output file creation: " + ioe.getMessage());
                } catch (RuntimeException rte) {
                    // Tasks handed to submit() store their exception in the returned
                    // future instead of passing it to the uncaught exception handler.
                    // Nobody inspects these futures, so report the failure here or it
                    // would be lost without a trace.
                    LOGGER.severe("Unexpected error while processing " + viewItem + ": " + rte);
                }
            });
        }

        // Wait for the submitted tasks to finish
        executor.shutdown();
        try {
            if (!executor.awaitTermination(30, TimeUnit.SECONDS)) {
                // The writers are closed below. Stop the remaining tasks so they do not
                // write to closed files and tell the user that the output is incomplete.
                LOGGER.warning("Not all items were processed within 30 seconds. "
                        + "The output files may be incomplete");
                executor.shutdownNow();
            }
        } catch (InterruptedException e) {
            executor.shutdownNow();
            // Preserve the interrupt status for code further up the stack
            Thread.currentThread().interrupt();
        }

        //Close writer //TODO move in finally?
        showWriter.close();
        movieWriter.close();
        unknowWriter.close();

        //Print some information
        // Column width = number of digits of the item count. Computed from the decimal
        // representation because log10(0) is -Infinity and breaks the format string
        // for an empty history.
        int charLength = String.valueOf(parsedHistory.size()).length();
        System.out.printf(
                "%nFinished:%n-----------------------------------------%n"
                + "%14s %"+charLength+"d%n%14s %"+charLength+"d%n%14s %"+charLength+"d%n"
                + "%14s %"+charLength+"d%n%14s %"+charLength+"d%n",
                "Items parsed:",parsedHistory.size(),"Unique shows:", traktShows.size(),
                "Episodes:", knownEpisodeCount.get(),"Movies:", traktMovies.size(),
                "Unknown:",unknownCount.get());

    } catch (IOException e) {
        e.printStackTrace();
    }

    /*
     * The trakt api depends on okhttp which releases its resources after some idle
     * time. We don't want to wait that long and trakt doesn't expose the client,
     * therefore force all threads to shut down at this point.
     */
    System.exit(0);
}
/**
 * Creates a stateful predicate which lets an item pass only the first time its
 * key is encountered. It can be used as a stream filter to drop duplicates
 * based on an arbitrary property of the item.
 *
 * The keys seen so far are remembered in a plain {@link HashSet} owned by the
 * returned predicate.
 *
 * @param func function executed to retrieve the filter key from the object
 * @return a predicate returning true for the first item with a given key and
 *         false for every later item with an equal key
 */
public Predicate<ViewItem> distinctObjects(Function<? super ViewItem, Object> func) {
    final Set<Object> seenKeys = new HashSet<>();
    return item -> {
        Object key = func.apply(item);
        // Set.add reports whether the key was new
        return seenKeys.add(key);
    };
}
/**
 * Return the runtime for an individual episode.
 *
 * The episode is located by picking the season with the matching number and,
 * within that season, the episode whose title has the smallest levenshtein
 * distance to the netflix title. A match is accepted if the distance is below
 * 5. In that case the runtime is downloaded from trakt. In every other case
 * (no season data, season not found, no usable episode titles, no close match,
 * download error) the runtime stored on the show is returned as fallback.
 *
 * @param show       the netflix episode whose runtime is requested
 * @param seasonList seasons downloaded from trakt, keyed by trakt show
 * @return the runtime of the episode or the runtime of the show as fallback
 */
private int getRuntimeForEpisode(NetflixShowEpisode show, HashMap<Show, List<Season>> seasonList) {

    String episodeTitle = show.getTitle();
    String seriesName = show.getSeries();
    int seasonNumber = show.getSeason();

    // Get the Show object as retrieved by trakt
    Show traktShow = traktShows.get(seriesName);

    List<Season> seasons = seasonList.get(traktShow);
    if (seasons == null) {
        // The map has no entry for this show
        LOGGER.warning("No season information available for " + seriesName + ". Fallback to "
                + "default show runtime");
        return traktShow.runtime;
    }

    // Get the correct Season
    Optional<Season> seasonTemp = seasons.stream().filter(s -> s.number == seasonNumber).findAny();

    if (!seasonTemp.isPresent()) {
        LOGGER.warning("Could not find season of " + seriesName + "(" + seasonNumber + "). Fallback to "
                + "defaul show runtime");
        return traktShow.runtime;
    }

    // Now choose the correct episodes
    List<Episode> episodes = seasonTemp.get().episodes;

    // Choose the episode depending on the title.
    EditDistance<Integer> levDistance = new LevenshteinDistance();
    var potentialEpisodes = new PriorityQueue<EpisodeTitleSearchResult>();

    // The distance can not be computed for missing titles. Leave the queue empty
    // in this case and take the fallback path below.
    if (episodes != null && episodeTitle != null) {
        for (var e : episodes) {
            if (e.title == null) {
                continue;
            }
            // Calculate the similarity between the title we want and the title we get since
            // trakt and netflix titles may not be 100% identical
            int distance = levDistance.apply(episodeTitle, e.title);
            potentialEpisodes.add(new EpisodeTitleSearchResult(e, distance));
        }
    }

    // Get the closest match. Null if there was no candidate at all
    EpisodeTitleSearchResult closestMatch = potentialEpisodes.poll();

    if (closestMatch == null || closestMatch.editDistance >= 5) {
        LOGGER.warning("Could not find episode of " + seriesName + "(" + episodeTitle + "). Fallback to "
                + "average show runtime");
        return traktShow.runtime;
    }

    // Episodes downloaded via season summary don't contain the runtime.
    try {
        int runtime = trakt.getEpisodeRuntime(Integer.toString(closestMatch.episode.ids.trakt));
        if (closestMatch.editDistance > 0) {
            // Close enough match
            LOGGER.warning("No exact match for episode found. Going with closest match: Query:" + episodeTitle
                    + " Target: " + closestMatch.episode.title);
        }
        return runtime;
    } catch (IOException e1) {
        LOGGER.severe("Error while downloading runtime. Fallback to average show runtime");
        e1.printStackTrace();
        return traktShow.runtime;
    }
}
/*
*
* Helper classes
*
*/
/**
* Title of episodes returned by trakt might not exactly match the titles
* provided by netflix. Therefore allow search through all episode titles of a
* season and pick the closest candidate.
*
* @author Kilian
*
*/
class EpisodeTitleSearchResult implements Comparable<EpisodeTitleSearchResult> {
int editDistance;
Episode episode;
public EpisodeTitleSearchResult(Episode e, int distance) {
this.episode = e;
this.editDistance = distance;
}
@Override
public int compareTo(EpisodeTitleSearchResult o) {
return Integer.compare(editDistance, o.editDistance);
}
}
/**
 * Command line entry point.
 *
 * Accepted invocations:
 * <ul>
 * <li>two arguments: path of the viewing history file (has to end with .csv)
 * followed by the trakt key (64 characters)</li>
 * <li>one argument: the trakt key (64 characters). The history file defaults to
 * NetflixViewingHistory.csv in the current directory</li>
 * </ul>
 * Anything else prints an error or usage message to stderr and returns without
 * running the analysis.
 *
 * @param args the command line arguments
 */
public static void main(String[] args) {

    // Set logging format. The property name is the plain key; the "-D" prefix
    // belongs to the command line syntax only and must not be part of the name,
    // otherwise the formatter never sees the setting.
    System.setProperty("java.util.logging.SimpleFormatter.format",
            "%1$tY-%1$tm-%1$td %1$tH:%1$tM:%1$tS %4$-6s %2$s %5$s%6$s%n");

    // Input validation
    if (args.length >= 2 && args[0].endsWith(".csv") && args[1].length() == 64) {
        // Trakt token is sha 256 encrypted? -> 64 characters
        new NetflixAnalyzer(args[0], args[1]);
    } else if (args.length == 1) {
        if (args[0].length() == 64) {
            System.err.println("No input file path specified. Falback to default "
                    + "NetflixViewingHistory.csv in current directory");
            new NetflixAnalyzer("NetflixViewingHistory.csv", args[0]);
        } else {
            System.err.println("The supplied trakt key does not have the correct length to be "
                    + "a valid key");
            System.err.println("Aborting");
        }
    } else {
        System.err.println("Usage:\n"
                + "\tjava -jar NetflixAnalyzer PATH_TO_VIEWHISTORYFILE.csv traktClientID\n"
                + "\tjava -jar NetflixAnalyzer traktClientID");
        System.err.println("Aborting");
    }
}
/**
 * Debug helper listing the threads which may keep the jvm alive: prints the
 * given message followed by every live thread that is not a daemon thread.
 * Collecting the stack traces of all threads is expensive, so this is not meant
 * for production use.
 *
 * @param message headline printed before the thread list
 */
@SuppressWarnings("unused")
private void dumpActiveNonDeamonThreads(String message) {
    System.out.println(message);
    Thread.getAllStackTraces().keySet().stream()
            .filter(thread -> !thread.isDaemon())
            .forEach(System.out::println);
}
}