-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmoviePlotCrawler.py
116 lines (98 loc) · 5.47 KB
/
moviePlotCrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from lxml import html
import requests
import csv
import sys
import numpy as np
# Entry page that links to every genre listing.
MAIN_URL = 'https://www.filmy.gr/filmy-presentation-movie-database'
# The main page links the documentary genre to this URL, which the original
# script treated as an error page and replaced with DOCUMENTARY_URL.
BROKEN_DOCUMENTARY_URL = 'https://www.filmy.gr/movies-database/back-to-the-top/'
DOCUMENTARY_URL = 'https://www.filmy.gr/genre-list/documentary/'
# Output file; rows are appended so an interrupted crawl keeps its progress.
OUTPUT_CSV = 'movieDB_filmy.csv'
CSV_HEADER = ['url', 'title', 'original_title', 'year', 'country', 'genre', 'plot']
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# Substrings of the info-link hrefs that identify each movie attribute.
INFO_MARKERS = {
    'year': 'amy_xronia',
    'country': 'amy_xwra',
    'genre': 'genre-list',
}


def build_page_list(genre_url, pagination_hrefs):
    """Return the URLs of every page in a genre's pagination.

    The number of pages is read from the second-to-last pagination link,
    whose second-to-last path component is expected to be the page number
    (e.g. ``.../genre-list/epic/page/3/``).  When that link is missing or
    does not carry a positive number, the genre is treated as a single page
    and ``[genre_url]`` is returned.

    Example result:
        ['https://www.filmy.gr/genre-list/epic/page/1',
         'https://www.filmy.gr/genre-list/epic/page/2']
    """
    try:
        last_page = int(pagination_hrefs[-2].split('/')[-2])
    except (IndexError, ValueError):
        return [genre_url]
    if last_page < 1:
        return [genre_url]
    # Drop trailing slashes so the result does not contain '//page/'.
    base = genre_url.rstrip('/')
    return ['{}/{}/{}'.format(base, 'page', i) for i in range(1, last_page + 1)]


def select_info(hrefs, texts, marker):
    """Join (comma-separated) the stripped *texts* whose href contains *marker*.

    *hrefs* and *texts* are parallel sequences taken from the same links;
    pairing them with ``zip`` avoids an IndexError if their lengths differ.
    """
    return ','.join(
        text.strip() for href, text in zip(hrefs, texts) if marker in href
    )


def first_text(fragments):
    """Return the first non-blank fragment, stripped, or '' if there is none."""
    for fragment in fragments:
        stripped = fragment.strip()
        if stripped:
            return stripped
    return ''


def longest_text(fragments):
    """Return the longest fragment, stripped, or '' for an empty sequence."""
    return max(fragments, key=len, default='').strip()


def fetch_tree(url):
    """Download *url* and return its content parsed as an lxml HTML tree."""
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    return html.fromstring(page.content)


def extract_plot(tree):
    """Return the plot text of a movie page.

    The plot is taken to be the longest paragraph text: from the blockquote
    paragraphs when more than three of them do not start with ':', otherwise
    from every paragraph under a 'wpb_wrapper' div.
    """
    candidates = tree.xpath('//div[contains(@class, \'wpb_wrapper\')]/blockquote/p[not(starts-with(text(), ":"))]/text()')
    if len(candidates) > 3:
        candidates = tree.xpath("//div[contains(@class, 'wpb_wrapper')]/blockquote/p/text()")
    else:
        candidates = tree.xpath("//div[contains(@class, 'wpb_wrapper')]//p/text()")
    return longest_text(candidates)


def parse_movie(movie_url):
    """Fetch a movie page and return its CSV row (same order as CSV_HEADER)."""
    tree = fetch_tree(movie_url)
    # The XPath queries return lists; keep the first real text instead of
    # writing the list's repr ("['Title']") into the CSV.
    title = first_text(tree.xpath("//div/h1/a/text()"))
    orig_title = first_text(tree.xpath("//li[1]/span/text()"))
    info_href = tree.xpath("//li/span/a/@href")
    info = tree.xpath("//li/span/a/text()")
    year = select_info(info_href, info, INFO_MARKERS['year'])
    country = select_info(info_href, info, INFO_MARKERS['country'])
    genre = select_info(info_href, info, INFO_MARKERS['genre'])
    plot = extract_plot(tree)
    fields = [movie_url, title, orig_title, year, country, genre, plot]
    return [str(field).strip() for field in fields]


def ensure_header(path, header):
    """Create the CSV at *path* if needed and write *header* only when empty.

    Writing the header unconditionally in append mode would insert a header
    row in the middle of the data on every re-run.
    """
    # newline='' is what the csv module expects; utf-8 because the scraped
    # text is Greek and the platform default encoding may not support it.
    with open(path, 'a', newline='', encoding='utf-8') as csvfile:
        if csvfile.tell() == 0:
            csv.writer(csvfile).writerow(header)


def append_row(path, row):
    """Append a single *row* to the CSV file at *path*."""
    with open(path, 'a', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerow(row)


def main():
    """Crawl www.filmy.gr genre by genre and dump movie info to OUTPUT_CSV.

    With a command-line argument, only that genre URL is crawled; otherwise
    the genre URLs are collected from the main database page.
    """
    if len(sys.argv) > 1:
        genre_urls = [sys.argv[1]]
        print('Running crawler with custom argument for genre: {}'.format(genre_urls))
    else:
        print('Connecting to {}...'.format(MAIN_URL))
        # Get urls for genre pages
        genre_urls = fetch_tree(MAIN_URL).xpath('//blockquote/p/em/strong/a/@href')

    print('Creating csv file to dumb movie info')
    ensure_header(OUTPUT_CSV, CSV_HEADER)

    # Iterate movie www.filmy.gr per genre category
    for genre_url in genre_urls:
        # Catch "documentary" genre error page
        if genre_url == BROKEN_DOCUMENTARY_URL:
            genre_url = DOCUMENTARY_URL
        print('Getting genre pagination in: {}'.format(genre_url))
        pagination_hrefs = fetch_tree(genre_url).xpath('//nav/div/a/@href')
        page_list = build_page_list(genre_url, pagination_hrefs)
        print('Genre pagination list: {}'.format(page_list))

        # Iterate through the multiple pages of each genre pagination
        for genre_page_url in page_list:
            print('Parsing genre URL: {}'.format(genre_page_url))
            movie_urls = fetch_tree(genre_page_url).xpath(
                "//*[contains(@class, 'entry-title')]/a/@href")
            for movie_url in movie_urls:
                print('Parsing movie URL: {}'.format(movie_url))
                # Written row by row so a crash keeps everything parsed so far.
                append_row(OUTPUT_CSV, parse_movie(movie_url))

    print('Finished parsing https://www.filmy.gr')


if __name__ == '__main__':
    main()