Skip to content

Commit

Permalink
refactor: minor refactor, remove unused vars, add some types
Browse files (browse the repository at this point in the history)
  • Loading branch information
drendog committed Oct 18, 2020
1 parent aa9c1f4 commit 31e7dcf
Showing 1 changed file with 11 additions and 10 deletions.
21 changes: 11 additions & 10 deletions module/scraper_notices.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import re
import telegram
import hashlib
import time
import errno
from telegram import InlineKeyboardButton, InlineKeyboardMarkup
from telegram.ext import CallbackContext

Expand All @@ -30,14 +32,17 @@ def get_links(label, url):
base_url = base_url[:base_url.find(".unict.it")] + ".unict.it"

return [
{ label: link.get('href'), "content": get_content_checksum(base_url + link.get('href')) }
{
label: link.get('href'),
"content": get_content_checksum(base_url + link.get('href'))
}
for link in result if "/docenti/" not in link.get('href')
]
except Exception as e:
open("logs/errors.txt", "a+").write("{}\n".format(e))
return None

def get_content_checksum(url):
def get_content_checksum(url: str) -> str:
try:
time.sleep(1) # delay to avoid "Max retries exceeds" for too many requests
req = requests.get(url)
Expand All @@ -53,8 +58,7 @@ def get_content_checksum(url):
for row in rows:
cols = row.find_all('td')
cols = [ele.text.strip() for ele in cols]
for c in cols:
table_content += c + "\t"
table_content = "\t".join(cols)
table_content +="\n"

table.decompose() # remove table from content
Expand All @@ -72,12 +76,11 @@ def get_content_checksum(url):
md5.update(content.encode('utf-8'))
return md5.hexdigest()
return None

except Exception as e:
open("logs/errors.txt", "a+").write("{}\n".format(e))
return None

def get_content(url):
def get_content(url: str) -> [str, str]:
try:
time.sleep(1) # delay to avoid "Max retries exceeds" for too many requests
req = requests.get(url)
Expand All @@ -93,8 +96,7 @@ def get_content(url):
for row in rows:
cols = row.find_all('td')
cols = [ele.text.strip() for ele in cols]
for c in cols:
table_content += c + "\t"
table_content = "\t".join(cols)
table_content +="\n"

table.decompose() # remove table from content
Expand Down Expand Up @@ -207,7 +209,7 @@ def send_news_approve_message(context: CallbackContext, notice_p, channel_folder
if notice_message != "":
try:
# The notice disk id identifies a message that is pending approval; it is taken from the OS clock.
notice_disk_id = time.clock()
notice_disk_id = time.clock_gettime()
approving_notice_filename = "{}/{}/{}_{}.dat".format(channel_folder, pending_approval_folder, page_name, notice_disk_id)

if not os.path.exists(os.path.dirname(approving_notice_filename)):
Expand Down Expand Up @@ -237,7 +239,6 @@ def send_news_approve_message(context: CallbackContext, notice_p, channel_folder


def scrape_notices(context):
job = context.job
notices_urls_cp = copy.deepcopy(notices_urls)

for i in notices_urls_cp:
Expand Down

0 comments on commit 31e7dcf

Please sign in to comment.