-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnameScrape.py
65 lines (50 loc) · 1.92 KB
/
nameScrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from bs4 import BeautifulSoup as bs
import urllib.request as ur
import re
# This script scrapes episode names from a Wikipedia episode-list page.
# After running it, check that every episode was scraped properly.
# Paste the appropriate Wikipedia link below. The format is
# https://en.wikipedia.org/wiki/List_of_Series_Name_episodes, which is convenient for further automation.
# Page to scrape: the Wikipedia episode list of the series.
wiki = "https://en.wikipedia.org/wiki/List_of_Modern_Family_episodes"
header = {'User-Agent': 'Mozilla/5.0'}  # Needed to prevent 403 error on Wikipedia
req = ur.Request(wiki, headers=header)
# Parse inside a context manager so the HTTP response is closed as soon as
# the document has been read, instead of being left open for the whole run.
with ur.urlopen(req) as page:
    soup = bs(page, 'html.parser')

# Maximum number of seasons to process. To be prompted for it instead, use
# n = int(input('how many seasons?')).
n = 20

# Every table carrying this class string is treated as one season's episode
# list (per the original author, the format is the same for every series).
table = soup.findAll('table', attrs={'class': "wikitable plainrowheaders wikiepisodetable"})

# Zero-based season counter; it lets the loop below collect the episode
# names of all seasons in one go.
t = 0

# Create/truncate the output file so that the appends that follow start
# from an empty file.
with open('ep_names\\Episodes.txt', 'w'):
    pass
# Characters rewritten in every episode title: double quotes, '?' and '!'
# are dropped, and ':' becomes ' -'.
_TITLE_FIXES = str.maketrans({'"': '', '?': '', '!': '', ':': ' -'})


def _clean_title(raw_title):
    """Return raw_title with '"', '?' and '!' removed and ':' turned into ' -'.

    Literal character translation is used instead of re.sub because '?' is a
    regex quantifier: re.sub('?', '', ...) raises re.error, which previously
    turned every title containing a question mark into an error marker and
    skipped its episode number.
    """
    return raw_title.translate(_TITLE_FIXES)


# Walk the episode tables in order (table[t] is treated as season t + 1) and
# append one "SxxEyy <title>" line per episode row to the output file.
with open('ep_names\\Episodes.txt', 'a+') as f:
    while t < n:
        # The page may contain fewer episode tables than n; handle that with
        # an explicit bounds check rather than swallowing an IndexError.
        if t < len(table):
            season_rows = table[t].findAll('tr', {'class': 'vevent'})
        else:
            season_rows = []

        ep_num = 1  # episode numbering restarts for every season
        for row in season_rows:
            cells = row.findAll('td')
            try:
                # The title is read from the second <td> of the row.
                name = _clean_title(cells[1].get_text())
                # Zero-pad season and episode numbers to keep names consistent.
                f.write('S{:02d}E{:02d} {}\n'.format(t + 1, ep_num, name))
                ep_num += 1
            except Exception as e:
                # Deliberate best effort: a bad row must not abort the run.
                # The failure is recorded in the output file so it is visible
                # when the result is checked, and the episode number is not
                # advanced for that row.
                f.write('\n***' + str(e) + '****\n')
        t += 1