Skip to content

Commit

Permalink
rss: datetime fix, source info added
Browse files Browse the repository at this point in the history
Signed-off-by: Clemens Vasters <clemens@vasters.com>
  • Loading branch information
clemensv committed Sep 17, 2024
1 parent e699c3c commit a32b36d
Show file tree
Hide file tree
Showing 20 changed files with 179 additions and 146 deletions.
1 change: 1 addition & 0 deletions rss/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ version = "0.1.0"
description = "A project to fetch data from RSS feeds and publish them as CloudEvents"
authors = ["Clemens Vasters <clemensv@microsoft.com>"]


[tool.poetry.dependencies]
python = ">=3.8"
requests = ">=2.32.3"
Expand Down
29 changes: 25 additions & 4 deletions rss/rssbridge/rssbridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@

from requests import RequestException

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../rssbridge_producer")))

from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemauthor import FeedItemAuthor
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemcontent import FeedItemContent
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemsource import FeedItemSource
Expand All @@ -32,6 +34,7 @@
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemsummary import FeedItemSummary
from rssbridge_producer_kafka_producer.producer import MicrosoftOpenDataRssFeedsEventProducer
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditem import FeedItem
from rssbridge_producer_data.microsoft.opendata.rssfeeds.link import Link

# Logging configuration
if sys.gettrace() is not None:
Expand Down Expand Up @@ -195,7 +198,7 @@ def remove_feed(url: str):
save_feedstore(feed_urls)


def feeditem_from_feedparser_entry(entry) -> FeedItem:
def feeditem_from_feedparser_entry(feed, entry) -> FeedItem:
"""
Create a FeedItem instance from a feedparser entry.
Expand Down Expand Up @@ -295,12 +298,12 @@ def parse_date(parsed_date_value) -> datetime|None:
return datetime.fromisoformat(parsed_date_value).astimezone(timezone.utc)
return None

return FeedItem(
feed_item = FeedItem(
author=parse_author_detail(entry.get('author_detail')),
publisher=parse_publisher_detail(entry.get('publisher_detail')),
summary=parse_summary_detail(entry.get('summary_detail')),
title=parse_title_detail(entry.get('title_detail')),
source=parse_source_detail(entry.get('source')),
source=None,
content=parse_content_detail(entry.get('content')),
enclosures=parse_enclosure_detail(entry.get('enclosures')),
published=parse_date(entry.get('published_parsed')),
Expand All @@ -313,6 +316,24 @@ def parse_date(parsed_date_value) -> datetime|None:
contributors=[parse_author_detail(contrib) for contrib in entry.get('contributors', [])],
links=entry.get('links')
)
if not feed_item.source:
feed_item.source = FeedItemSource(
author=entry.get('author'),
author_detail=parse_author_detail(entry.get('author_detail')),
contributors=[parse_author_detail(contrib) for contrib in entry.get('contributors', [])],
icon=feed.feed.get('image').get('href') if feed.feed.get('image') else None,
id=entry.get('id'),
link=feed.feed.get('link'),
links=[],
logo=feed.feed.get('image').get('href') if feed.feed.get('image') else None,
rights=feed.feed.get('rights'),
subtitle=feed.feed.get('subtitle'),
title=feed.feed.get('title'),
updated=parse_date(feed.feed.get('updated_parsed'))
)
for link in feed.feed.get('links'):
feed_item.source.links.append(Link(title=link.get('title'), href=link.get('href'), rel=link.get('rel'), type=link.get('type')))
return feed_item


def fetch_feed(url: str, etag: Optional[str] = None) -> requests.Response:
Expand Down Expand Up @@ -391,7 +412,7 @@ async def process_feed(feed_url: str, state: dict, producer_instance: MicrosoftO
if 'published_parsed' in entry and entry.published_parsed: # won't handle entries without pub date
pub_date = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc)
if pub_date > last_checked_datetime:
item = feeditem_from_feedparser_entry(entry)
item: FeedItem = feeditem_from_feedparser_entry(feed, entry)
try:
new_items.append(item)
except Exception as e:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,16 @@
import dataclasses
import dataclasses_json
import json
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemsummary import FeedItemSummary
from datetime import datetime
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemenclosure import FeedItemEnclosure
from marshmallow import fields
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemsource import FeedItemSource
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemsummary import FeedItemSummary
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemauthor import FeedItemAuthor
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditempublisher import FeedItemPublisher
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemcontent import FeedItemContent
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemtitle import FeedItemTitle
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditempublisher import FeedItemPublisher
from rssbridge_producer_data.microsoft.opendata.rssfeeds.link import Link
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemtitle import FeedItemTitle
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemenclosure import FeedItemEnclosure
import datetime


@dataclasses_json.dataclass_json
Expand All @@ -25,40 +26,40 @@ class FeedItem:
"""
A FeedItem record.
Attributes:
author (typing.Optional[FeedItemAuthor]):
publisher (typing.Optional[FeedItemPublisher]):
summary (typing.Optional[FeedItemSummary]):
title (typing.Optional[FeedItemTitle]):
source (typing.Optional[FeedItemSource]):
content (typing.Optional[typing.List[FeedItemContent]]):
enclosures (typing.Optional[typing.List[FeedItemEnclosure]]):
published (typing.Optional[datetime]):
updated (typing.Optional[datetime]):
created (typing.Optional[datetime]):
expired (typing.Optional[datetime]):
id (typing.Optional[str]):
license (typing.Optional[str]):
comments (typing.Optional[str]):
contributors (typing.Optional[typing.List[FeedItemAuthor]]):
author (typing.Optional[FeedItemAuthor]):
publisher (typing.Optional[FeedItemPublisher]):
summary (typing.Optional[FeedItemSummary]):
title (typing.Optional[FeedItemTitle]):
source (typing.Optional[FeedItemSource]):
content (typing.Optional[typing.List[FeedItemContent]]):
enclosures (typing.Optional[typing.List[FeedItemEnclosure]]):
published (typing.Optional[datetime.datetime]):
updated (typing.Optional[datetime.datetime]):
created (typing.Optional[datetime.datetime]):
expired (typing.Optional[datetime.datetime]):
id (typing.Optional[str]):
license (typing.Optional[str]):
comments (typing.Optional[str]):
contributors (typing.Optional[typing.List[FeedItemAuthor]]):
links (typing.Optional[typing.List[Link]]): """

author: typing.Optional[FeedItemAuthor]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="author"))
publisher: typing.Optional[FeedItemPublisher]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="publisher"))
summary: typing.Optional[FeedItemSummary]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="summary"))
title: typing.Optional[FeedItemTitle]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="title"))
source: typing.Optional[FeedItemSource]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="source"))
content: typing.Optional[typing.List[FeedItemContent]]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="content"))
enclosures: typing.Optional[typing.List[FeedItemEnclosure]]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="enclosures"))
published: typing.Optional[datetime]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="published"))
updated: typing.Optional[datetime]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="updated"))
created: typing.Optional[datetime]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="created"))
expired: typing.Optional[datetime]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="expired"))
published: typing.Optional[datetime.datetime]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="published", encoder=lambda d: datetime.datetime.isoformat(d) if d else None, decoder=lambda d:datetime.datetime.fromisoformat(d) if d else None, mm_field=fields.DateTime(format='iso')))
updated: typing.Optional[datetime.datetime]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="updated", encoder=lambda d: datetime.datetime.isoformat(d) if d else None, decoder=lambda d:datetime.datetime.fromisoformat(d) if d else None, mm_field=fields.DateTime(format='iso')))
created: typing.Optional[datetime.datetime]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="created", encoder=lambda d: datetime.datetime.isoformat(d) if d else None, decoder=lambda d:datetime.datetime.fromisoformat(d) if d else None, mm_field=fields.DateTime(format='iso')))
expired: typing.Optional[datetime.datetime]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="expired", encoder=lambda d: datetime.datetime.isoformat(d) if d else None, decoder=lambda d:datetime.datetime.fromisoformat(d) if d else None, mm_field=fields.DateTime(format='iso')))
id: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="id"))
license: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="license"))
comments: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="comments"))
contributors: typing.Optional[typing.List[FeedItemAuthor]]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="contributors"))
links: typing.Optional[typing.List[Link]]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="links"))
links: typing.Optional[typing.List[Link]]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="links"))


def __post_init__(self):
""" Initializes the dataclass with the provided keyword arguments."""
Expand All @@ -83,10 +84,10 @@ def __post_init__(self):
def from_serializer_dict(cls, data: dict) -> 'FeedItem':
"""
Converts a dictionary to a dataclass instance.
Args:
data: The dictionary to convert to a dataclass.
Returns:
The dataclass representation of the dictionary.
"""
Expand All @@ -105,7 +106,7 @@ def to_serializer_dict(self) -> dict:
def _dict_resolver(self, data):
"""
Helps resolving the Enum values to their actual values and fixes the key names.
"""
"""
def _resolve_enum(v):
if isinstance(v,enum.Enum):
return v.value
Expand All @@ -117,7 +118,7 @@ def _fix_key(k):
def to_byte_array(self, content_type_string: str) -> bytes:
"""
Converts the dataclass to a byte array based on the content type string.
Args:
content_type_string: The content type string to convert the dataclass to.
Supported content types:
Expand All @@ -126,7 +127,7 @@ def to_byte_array(self, content_type_string: str) -> bytes:
'+gzip': Compresses the byte array using gzip, e.g. 'application/json+gzip'.
Returns:
The byte array representation of the dataclass.
The byte array representation of the dataclass.
"""
content_type = content_type_string.split(';')[0].strip()
result = None
Expand All @@ -148,10 +149,10 @@ def to_byte_array(self, content_type_string: str) -> bytes:
def from_data(cls, data: typing.Any, content_type_string: typing.Optional[str] = None) -> typing.Optional['FeedItem']:
"""
Converts the data to a dataclass based on the content type string.
Args:
data: The data to convert to a dataclass.
content_type_string: The content type string to convert the data to.
content_type_string: The content type string to convert the data to.
Supported content types:
'application/json': Attempts to decode the data from JSON encoded format.
Supported content type extensions:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class FeedItemAuthor:

name: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="name"))
href: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="href"))
email: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="email"))
email: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="email"))


def __post_init__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class FeedItemContent:
value: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="value"))
type: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="type"))
language: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="language"))
base: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="base"))
base: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="base"))


def __post_init__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class FeedItemEnclosure:

href: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="href"))
length: typing.Optional[int]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="length"))
type: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="type"))
type: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="type"))


def __post_init__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class FeedItemPublisher:

name: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="name"))
href: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="href"))
email: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="email"))
email: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="email"))


def __post_init__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
import dataclasses
import dataclasses_json
import json
from marshmallow import fields
from rssbridge_producer_data.microsoft.opendata.rssfeeds.link import Link
from datetime import datetime
from rssbridge_producer_data.microsoft.opendata.rssfeeds.feeditemauthor import FeedItemAuthor
import datetime


@dataclasses_json.dataclass_json
Expand All @@ -30,7 +31,7 @@ class FeedItemSource:
rights (typing.Optional[str]):
subtitle (typing.Optional[str]):
title (typing.Optional[str]):
updated (typing.Optional[datetime]): """
updated (typing.Optional[datetime.datetime]): """

author: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="author"))
author_detail: typing.Optional[FeedItemAuthor]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="author_detail"))
Expand All @@ -43,7 +44,7 @@ class FeedItemSource:
rights: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="rights"))
subtitle: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="subtitle"))
title: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="title"))
updated: typing.Optional[datetime]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="updated"))
updated: typing.Optional[datetime.datetime]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="updated", encoder=lambda d: datetime.datetime.isoformat(d) if d else None, decoder=lambda d:datetime.datetime.fromisoformat(d) if d else None, mm_field=fields.DateTime(format='iso')))


def __post_init__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class FeedItemSummary:
value: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="value"))
type: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="type"))
language: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="language"))
base: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="base"))
base: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="base"))


def __post_init__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class FeedItemTitle:
value: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="value"))
type: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="type"))
language: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="language"))
base: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="base"))
base: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="base"))


def __post_init__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class Link:
rel: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="rel"))
href: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="href"))
type: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="type"))
title: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="title"))
title: typing.Optional[str]=dataclasses.field(kw_only=True, metadata=dataclasses_json.config(field_name="title"))


def __post_init__(self):
Expand Down
Loading

0 comments on commit a32b36d

Please sign in to comment.