Skip to content

Commit

Permalink
Update to new expressen nyhetsdygnet layout
Browse files Browse the repository at this point in the history
  • Loading branch information
pierrelefevre committed May 31, 2024
1 parent 380a249 commit 8ce6735
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 19 deletions.
37 changes: 23 additions & 14 deletions aggregator/expressen_se.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,32 @@
def scrape_story(url):
page = requests.get(url, headers={"User-Agent": helpers.chrome_user_agent})

soup = BeautifulSoup(page.content, 'html.parser')
soup = BeautifulSoup(page.content, "html.parser")

article = {"url": url, "fetched_at": helpers.get_timestamp()}
full_text = ""

try:
title = '\n'.join([p.get_text() for p in soup.find(
class_="article__header").find_all('h1')]).replace("\xa0", "")
title = "\n".join(
[p.get_text() for p in soup.find(class_="article__header").find_all("h1")]
).replace("\xa0", "")
article["title"] = title
full_text += title + "\n"
except:
pass

try:
location = soup.find(
"a", href=lambda href: href and "/tagg/location" in href).get_text()
"a", href=lambda href: href and "/tagg/location" in href
).get_text()
article["location"] = location
except:
pass

try:
preamble = '\n'.join([p.get_text() for p in soup.find(
class_="article__preamble").find_all('p')]).replace("\xa0", "")
preamble = "\n".join(
[p.get_text() for p in soup.find(class_="article__preamble").find_all("p")]
).replace("\xa0", "")

if preamble.startswith("Premium"):
return None
Expand All @@ -40,8 +43,9 @@ def scrape_story(url):
pass

try:
body = '\n'.join([p.get_text() for p in soup.find(
class_="article__body-text").find_all('p')]).replace("\xa0", "")
body = "\n".join(
[p.get_text() for p in soup.find(class_="article__body-text").find_all("p")]
).replace("\xa0", "")
article["body"] = body
full_text += body + "\n"
except:
Expand All @@ -68,15 +72,15 @@ def scrape():

# chrome user agent
page = requests.get(url, headers={"User-Agent": helpers.chrome_user_agent})
soup = BeautifulSoup(page.content, 'html.parser')
soup = BeautifulSoup(page.content, "html.parser")

# Get all articles
list_elements = soup.find_all(class_="list-page__item")
list_elements = soup.find_all("a", class_="list-page__item__link")

news_links = []

for element in list_elements:
href = element.find('a')['href']
href = element["href"]
if "expressen.se" not in href:
href = "https://www.expressen.se" + href
if "premium" in href:
Expand All @@ -101,7 +105,12 @@ def scrape():
return articles


if __name__ == "__main__":
    # Manual smoke test: scrape one known article URL and dump the parsed
    # result as JSON to stdout.
    print(
        json.dumps(
            scrape_story(
                "https://www.expressen.se/tv/nyheter/polisanmals-efter-skamtet-kommer-aka-in-/"
            )
        )
    )
    # print(json.dumps(scrape()))
12 changes: 7 additions & 5 deletions aggregator/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from dotenv import load_dotenv
import hashlib

# Spoof a current desktop Chrome user agent so target sites serve the
# normal (non-bot) page markup.
chrome_user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
# Load environment variables from a local .env file at import time.
load_dotenv()


Expand All @@ -13,7 +13,8 @@ def get_timestamp():

def log(message, level="INFO"):
    """Write a timestamped, single-line log message to stderr.

    Parameters:
        message: text to log; embedded newlines are flattened to spaces
            so each call emits exactly one line.
        level: severity label rendered in brackets (default "INFO").
    """
    processed = message.replace("\n", " ")
    print(f"{get_timestamp()} [{level}] {processed}", file=sys.stderr, end="\n")


# get raw png bytes and convert to string

Expand All @@ -22,14 +23,15 @@ def get_image(url):
import requests
from io import BytesIO
from PIL import Image
response = requests.get(url, headers={'User-Agent': chrome_user_agent})

response = requests.get(url, headers={"User-Agent": chrome_user_agent})
image = Image.open(BytesIO(response.content))
image = image.convert('RGB')
image = image.convert("RGB")
buffer = BytesIO()
image.save(buffer, format="PNG")
return buffer.getvalue()


def get_hash(story):
    """Return a content hash identifying a scraped story.

    Parameters:
        story: mapping with a "full_text" key holding the story text.

    Returns:
        The 64-character lowercase SHA-256 hex digest of the UTF-8
        encoded full text, as a str.
    """
    # hexdigest() already returns a str, so no str() wrapper is needed;
    # also avoid shadowing the builtin `hash` with a local name.
    return hashlib.sha256(story["full_text"].encode("utf-8")).hexdigest()

0 comments on commit 8ce6735

Please sign in to comment.