-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearch.py
144 lines (120 loc) · 4.8 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import requests
import logging
import json
import os
import shutil
from bs4 import BeautifulSoup
from dataclasses import dataclass
from configuration import (
WIKIPEDIA_HOST_URL,
WIKIPEDIA_SEARCH_API
)
# Base name used as the prefix for every logger created in this module.
LOGGER_BASENAME = 'wikisearch'
LOGGER = logging.getLogger(LOGGER_BASENAME)
# Attach a NullHandler so the module emits nothing unless the application
# configures logging itself.
LOGGER.addHandler(logging.NullHandler())
@dataclass
class SearchResult:
    """A single search hit: the page title and the URL it points to."""

    title: str
    url: str
class LoggerMixin:
    """Mixin that equips an instance with a logger named after its class.

    The logger is stored on ``self._logger`` and is named
    ``<LOGGER_BASENAME>.<concrete class name>``.
    """

    def __init__(self) -> None:
        # Use the runtime class so subclasses get their own logger name.
        logger_name = f'{LOGGER_BASENAME}.{self.__class__.__name__}'
        self._logger = logging.getLogger(logger_name)
class WikipediaSeries(LoggerMixin):
    """Search Wikipedia for a series and extract its seasons and episodes.

    Attributes:
        search_url: Endpoint queried by ``_search``.
        seasons: ``Season`` objects filled in by
            ``parse_seasons_and_episodes_from_soup``.
        title: Series name; set by ``search_by_name`` only when a query
            returns exactly one result, otherwise left as ``None``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.search_url = WIKIPEDIA_SEARCH_API
        self.seasons = []
        self.title = None

    def __str__(self):
        return f'series seasons: {self.seasons}'

    def _get_query_map(self, name):
        """Return the search phrases to try for ``name``, keyed by query type."""
        return {
            'episode_list': f'list of {name} episodes',
            'miniseries': f'{name} miniseries',
            'name': f'{name}',
        }

    def search_by_name(self, name):
        """Run each query phrase for ``name`` until one yields results.

        Returns:
            The list of ``SearchResult`` objects from the first query that
            produced any, or ``None`` when every query came back empty or
            failed.
        """
        for query_type, query in self._get_query_map(name).items():
            self._logger.debug(f'Searching for {name} with type:{query_type}')
            result = self._search(query)
            if result:
                # Only an unambiguous (single) hit is accepted as the title.
                if len(result) == 1:
                    self.title = name
                # NOTE(review): any non-empty result is returned, not only a
                # single hit - confirm this matches the intended nesting.
                return result
        return None

    def _search(self, query):
        """Query the search endpoint and pair each title with its URL.

        Returns:
            A list of ``SearchResult`` objects, or ``None`` when the HTTP
            request did not succeed (the failure is logged).
        """
        parameters = {'action': 'opensearch',
                      'format': 'json',
                      'formatversion': '2',
                      'search': query}
        response = requests.get(self.search_url, params=parameters)
        if not response.ok:
            # requests exposes the HTTP status as ``status_code``.
            self._logger.error(
                f'Request failed with code {response.status_code} and message {response.text}')
            return None
        # Decode the body once; index 1 holds titles and index 3 the URLs.
        payload = response.json()
        return [SearchResult(title, url) for title, url in zip(payload[1], payload[3])]

    def get_soup_by_url(self, url):
        """Fetch ``url`` and parse the response body with ``html.parser``.

        The HTTP status is not checked; whatever body comes back is parsed.
        """
        html_response = requests.get(url)
        return BeautifulSoup(html_response.text, 'html.parser')

    def parse_seasons_from_soup(self, soup):
        """Collect the first child of every linked header cell in the overview table.

        Returns:
            A list with one entry per ``<th>`` that contains an ``<a>``;
            empty when the page has no ``wikitable plainrowheaders`` table.
        """
        table = soup.find("table", {"class": "wikitable plainrowheaders"})
        if table is None:
            # find() returns None when nothing matches; report no seasons.
            self._logger.warning('No season overview table found.')
            return []
        season_list = []
        for header in table.find_all("th"):
            season = header.find("a")
            if season:
                season_list.append(season.contents[0])
        return season_list

    def parse_seasons_and_episodes_from_soup(self, soup):
        """Build ``self.seasons`` from every episode table found in ``soup``.

        Each table is named after the ``mw-headline`` span of the nearest
        preceding sibling ``<h3>``. Tables without such a heading are
        skipped with a warning, since they cannot be given a season title.
        """
        season_list = []
        tables = soup.find_all("table", {"class": "wikitable plainrowheaders wikiepisodetable"})
        for table in tables:
            season_header = table.find_previous_sibling('h3')
            headline = None
            if season_header is not None:
                headline = season_header.find("span", {"class": "mw-headline"})
            if headline is None:
                self._logger.warning('Skipping episode table without a season heading.')
                continue
            season = Season(headline.get_text(strip=True))
            season.episodes = self.parse_html_table_to_json(table)
            season_list.append(season)
        self.seasons = season_list

    def parse_html_table_to_json(self, table):
        """Serialize the ``vevent`` rows of ``table`` as a JSON array of objects.

        Column names come from the ``<th scope="col">`` cells of the first
        row. Each data row becomes one object mapping column name to cell
        text (with surrounding double quotes stripped).

        Returns:
            A JSON string indented by four spaces.
        """
        header_row = table.find("tr")
        table_headers = []
        if header_row is not None:
            table_headers = [cell.text.strip() for cell in header_row("th", {"scope": "col"})]
        results_list = []
        for row in table("tr", {"class": "vevent"}):
            # Only direct <th>/<td> children are cells; whitespace text nodes
            # between them would otherwise shift values to the wrong column.
            cells = [cell.text.strip('"') for cell in row(["th", "td"], recursive=False)]
            # zip() pairs by position and ignores cells beyond the last header.
            results_list.append(dict(zip(table_headers, cells)))
        return json.dumps(results_list, indent=4)

    def write_to_file_system(self):
        """Write each season's episodes to ``./results/<title>/<season>/episodes.json``.

        An existing season directory is removed first so stale files do not
        survive a re-run.
        """
        for season in self.seasons:
            self._logger.debug(f"writing results to file sysytem for season: {season.number}")
            directory = os.path.dirname(f'./results/{self.title}/{season.number}/')
            if os.path.exists(directory):
                self._logger.warning(f"Season folder already exists {directory}, overwriting it.")
                self.delete_dir_tree(directory)
            # exist_ok covers the case where the delete above failed (and was
            # logged); the episodes file is still overwritten below.
            os.makedirs(directory, exist_ok=True)
            with open(f'{directory}/episodes.json', 'w', encoding='utf-8') as episodes_file:
                episodes_file.write(season.episodes)

    def delete_dir_tree(self, dir_path):
        """Recursively delete ``dir_path``, logging instead of raising on ``OSError``."""
        try:
            shutil.rmtree(dir_path)
        except OSError as e:
            self._logger.error(f"Error: {dir_path} : {e.strerror}")
class Season:
    """A season of a series together with its episodes.

    Attributes:
        number: Identifier supplied by the caller.
        episodes: Starts as an empty list. It may hold episode objects or
            an already-serialized JSON string assigned by a caller.
    """

    def __init__(self, number) -> None:
        super().__init__()
        self.number = number
        self.episodes = []

    def get_episodes_json(self):
        """Return the episodes as a JSON string.

        When ``episodes`` is already a string it is returned unchanged,
        because iterating it character by character cannot be serialized.
        Otherwise each episode's ``__dict__`` is dumped into a JSON array.
        """
        if isinstance(self.episodes, str):
            return self.episodes
        return json.dumps([episode.__dict__ for episode in self.episodes])
class Episode:
    """A single episode described by its title and its number."""

    def __init__(self, title, number) -> None:
        super().__init__()
        # Title is stored before number so the instance ``__dict__`` keeps
        # that key order.
        self.title, self.number = title, number

    def __str__(self):
        return f'episode:{self.number}, title:{self.title}'