reviews.py
"""Scrape and parse bot reviews from top.gg."""
from datetime import datetime, timedelta

from bs4 import BeautifulSoup
from requests import get  # synchronous requests is fine for this script

# Simple in-memory cache mapping uri -> {"content": html, "expires": datetime}.
cached = {}

DEFAULT_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-GB,en;q=0.5",
"Cache-Control": "no-cache",
"Host": "top.gg",
"DNT": "1",
}


def scrape(uri: str = "/bot/619328560141697036", headers=None):
    """Fetch a top.gg page as HTML, caching each URI for three hours."""
    # For offline testing, return a saved copy of the page instead:
    # with open("export.html", encoding="utf-8", errors="replace") as rfile:
    #     return rfile.read()
    if headers is None:
        headers = DEFAULT_HEADERS
    if uri in cached:
        if cached[uri]["expires"] >= datetime.utcnow():
            return cached[uri]["content"]
        del cached[uri]  # entry expired; refetch below
    html = get("https://top.gg" + uri, headers=headers, timeout=30).text
    cached[uri] = {
        "content": html,
        "expires": datetime.utcnow() + timedelta(hours=3),
    }
    return html


def soupify(html: str):
    """Parse raw HTML into a BeautifulSoup tree."""
    return BeautifulSoup(html, "html.parser")


def find_reviews_section(soup, string: bool = True, clean_hidden: bool = True):
    """Find the reviews wrapper <div>; fall back to the whole page if missing."""
    results = soup.find_all("div", "reviews__wrapper")
    if not results:
        # No reviews section on this page; return the full page for inspection.
        return soup.prettify()
    if not string:
        return results[0]
    found = str(results[0])
    if clean_hidden:
        # Strip the inline "opacity:0;" style (top.gg appears to fade reviews
        # in via script) so the extracted HTML renders visibly.
        found = found.replace("opacity:0;", "")
    return found


def find_review_contents(soup, limit: int = 1):
    """Return up to `limit` review-body <p class="comment-content"> tags."""
    section = find_reviews_section(soup, string=False)
    return section.find_all("p", "comment-content", limit=limit)


def find_review_authors(soup, limit: int = 1):
    """Return up to `limit` reviewer <p class="review__username"> tags."""
    section = find_reviews_section(soup, string=False)
    return section.find_all("p", "review__username", limit=limit)


def pair_reviews(soup, limit: int = 1):
    """Pair each reviewer's username tag with the matching comment tag."""
    usernames = find_review_authors(soup, limit)
    contents = find_review_contents(soup, limit)
    return list(zip(usernames, contents))


if __name__ == "__main__":
    page = soupify(scrape())
    print(find_reviews_section(page))
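    # A small usage sketch, assuming the fetched page actually contains a
    # reviews section: pair_reviews() yields (username_tag, comment_tag)
    # tuples, and get_text(strip=True) is standard BeautifulSoup for pulling
    # plain text out of a tag. The limit of 3 is an arbitrary demo choice.
    for author, content in pair_reviews(page, limit=3):
        print(f"{author.get_text(strip=True)}: {content.get_text(strip=True)}")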