-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtitle_basics_data.py
47 lines (40 loc) · 2.14 KB
/
title_basics_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
"""Clean non-commercial data from title.basics.tsv from IMDb site and prepare for loading into a
database."""
from imdb_data import IMDbData
class TitleBasicsData(IMDbData):
    """Cleaner for the IMDb title.basics dataset that keeps only non-adult movies.

    On construction the frame stored under ``"title_basics"`` is indexed by ``tconst``, a
    companion ``"title_genres"`` frame is derived from its ``genres`` column, and the main
    frame is then reduced to ``desired_columns``.
    """

    # title.basics.tsv.gz columns, per https://developer.imdb.com/non-commercial-datasets/
    #   * tconst (string) - alphanumeric unique identifier of the title
    #   * titleType (string) - the type/format of the title (e.g. movie, short, tvseries,
    #     tvepisode, video, etc)
    #   * primaryTitle (string) - the more popular title / the title used by the filmmakers
    #     on promotional materials at the point of release
    #   * originalTitle (string) - original title, in the original language
    #   * isAdult (boolean) - 0: non-adult title; 1: adult title
    #   * startYear (YYYY) - represents the release year of a title. In the case of a TV
    #     Series, it is the series start year
    #   * endYear (YYYY) - TV Series end year. '\N' for all other title types
    #   * runtimeMinutes - primary runtime of the title, in minutes
    #   * genres (string array) - includes up to three genres associated with the title

    def __init__(self, init_df):
        """Register the title basics frame and derive the genres frame from it.

        Args:
            init_df: Title basics data, handed unchanged to ``IMDbData.__init__`` together
                with the frame name ``"title_basics"``.
        """
        df_name = "title_basics"
        super().__init__(init_df, df_name)

        # Columns that survive in the main frame; ``tconst`` lives on as the index.
        self.desired_columns = ["primaryTitle", "originalTitle", "startYear", "runtimeMinutes"]

        # Index the stored frame by title id, in place, so the same object stays registered.
        basics = self.data_frames[df_name]
        basics.set_index("tconst", inplace=True)

        # Build the separate genres frame before ``genres`` is dropped from the main frame.
        # NOTE(review): the exact output shape is decided by ``explode_columns`` in the base
        # class - presumably one row per (tconst, genre); confirm there.
        self.data_frames["title_genres"] = self.explode_columns(basics, "genres")

        # Finally trim the main frame down to the desired columns only.
        self.data_frames[df_name] = self.data_frames[df_name][self.desired_columns]

    def clean_data(self, input_df):
        """Return the non-adult movie rows of ``input_df`` after null replacement.

        A row is kept only when its ``titleType`` equals ``"movie"`` and its ``isAdult``
        equals the string ``"0"``. No columns are dropped here; ``__init__`` is what trims
        the frame to ``desired_columns``.

        Args:
            input_df: Frame providing at least the ``titleType`` and ``isAdult`` columns.

        Returns:
            Whatever ``self.replace_null`` returns for the filtered frame.
        """
        is_movie = input_df["titleType"] == "movie"
        # NOTE(review): comparing against the string "0" assumes ``isAdult`` was loaded as
        # text rather than as an integer/boolean - confirm against the loader.
        is_not_adult = input_df["isAdult"] == "0"
        return self.replace_null(input_df[is_movie & is_not_adult])