
Commit

Fix many ruff lints
jodal committed Jun 24, 2024
1 parent a384eba commit f1d7003
Showing 21 changed files with 77 additions and 89 deletions.
8 changes: 4 additions & 4 deletions comics/aggregator/command.py
@@ -90,7 +90,7 @@ def _get_valid_date(self, crawler, date):
elif date < crawler.history_capable:
logger.info(
"%s: Adjusting date from %s to %s because of "
+ "limited history capability",
"limited history capability",
crawler.comic.slug,
date,
crawler.history_capable,
@@ -99,7 +99,7 @@ def _get_valid_date(self, crawler, date):
elif date > crawler.current_date:
logger.info(
"%s: Adjusting date from %s to %s because the given "
+ "date is in the future in the comic's time zone",
"date is in the future in the comic's time zone",
crawler.comic.slug,
date,
crawler.current_date,
@@ -143,10 +143,10 @@ def _get_comic_by_slug(self, comic_slug):

try:
comic = Comic.objects.get(slug=comic_slug)
- except Comic.DoesNotExist:
+ except Comic.DoesNotExist as exc:
error_msg = "Comic %s not found" % comic_slug
logger.error(error_msg)
- raise ComicsError(error_msg)
+ raise ComicsError(error_msg) from exc
return comic

def set_date_interval(self, from_date, to_date):
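
The `except ... as exc` / `raise ... from exc` changes above are the pattern ruff's B904 rule (raise-without-from-inside-except) asks for: chaining keeps the original exception attached as `__cause__` instead of silently replacing it. A minimal standalone sketch of the pattern, with hypothetical names:

class ComicsError(Exception):
    pass

def get_comic(slug: str, comics: dict[str, str]) -> str:
    try:
        return comics[slug]
    except KeyError as exc:
        # "from exc" chains the KeyError as __cause__, so the traceback
        # shows both the lookup failure and the domain error (B904).
        raise ComicsError(f"Comic {slug} not found") from exc
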
59 changes: 30 additions & 29 deletions comics/aggregator/crawler.py
@@ -5,7 +5,7 @@
import time
import xml.sax
from dataclasses import dataclass, field
- from typing import Dict, List, Optional, Tuple, Union
+ from typing import TYPE_CHECKING

import httpx
import pytz
@@ -19,29 +19,31 @@
)
from comics.aggregator.feedparser import FeedParser
from comics.aggregator.lxmlparser import LxmlParser
- from comics.core.models import Comic

+ if TYPE_CHECKING:
+     from comics.core.models import Comic

# For testability
now = timezone.now
today = datetime.date.today


- RequestHeaders = Dict[str, str]
+ RequestHeaders = dict[str, str]


@dataclass
class CrawlerRelease:
comic: Comic
pub_date: datetime.date
has_rerun_releases: bool = False
- _images: List[CrawlerImage] = field(default_factory=list)
+ _images: list[CrawlerImage] = field(default_factory=list)

@property
def identifier(self) -> str:
return f"{self.comic.slug}/{self.pub_date}"

@property
- def images(self) -> List[CrawlerImage]:
+ def images(self) -> list[CrawlerImage]:
return self._images

def add_image(self, image: CrawlerImage) -> None:
@@ -52,8 +54,8 @@ def add_image(self, image: CrawlerImage) -> None:
@dataclass
class CrawlerImage:
url: str
- title: Optional[str] = None
- text: Optional[str] = None
+ title: str | None = None
+ text: str | None = None
request_headers: RequestHeaders = field(default_factory=dict)

def __post_init__(self) -> None:
@@ -68,7 +70,7 @@ def validate(self, identifier: str) -> None:
raise ImageURLNotFound(identifier)


- CrawlerResult = Union[None, CrawlerImage, List[CrawlerImage]]
+ CrawlerResult = list[CrawlerImage] | CrawlerImage | None


@dataclass
@@ -77,11 +79,11 @@ class CrawlerBase:

# ### Crawler settings
# Date of oldest release available for crawling
- history_capable_date: Optional[str] = None
+ history_capable_date: str | None = None
# Number of days a release is available for crawling
- history_capable_days: Optional[int] = None
+ history_capable_days: int | None = None
# On what weekdays the comic is published (example: "Mo,We,Fr")
- schedule: Optional[str] = None
+ schedule: str | None = None
# In approximately what time zone the comic is published
# (example: "Europe/Oslo")
time_zone: str = "UTC"
@@ -97,14 +99,14 @@ class CrawlerBase:
headers: RequestHeaders = field(default_factory=dict)

# Feed object which is reused when crawling multiple dates
- feed: Optional[FeedParser] = None
+ feed: FeedParser | None = None

# Page objects mapped against URL for use when crawling multiple dates
- pages: Dict[str, LxmlParser] = field(default_factory=dict)
+ pages: dict[str, LxmlParser] = field(default_factory=dict)

def get_crawler_release(
-     self, pub_date: Optional[datetime.date] = None
- ) -> Optional[CrawlerRelease]:
+     self, pub_date: datetime.date | None = None
+ ) -> CrawlerRelease | None:
"""Get meta data for release at pub_date, or the latest release"""

pub_date = self._get_date_to_crawl(pub_date)
@@ -115,9 +117,9 @@ def get_crawler_release(
try:
results = self.crawl(pub_date)
except (httpx.HTTPError, httpx.InvalidURL, OSError) as error:
- raise CrawlerHTTPError(release.identifier, error)
+ raise CrawlerHTTPError(release.identifier, error) from error
except xml.sax.SAXException as error:
- raise CrawlerHTTPError(release.identifier, str(error))
+ raise CrawlerHTTPError(release.identifier, str(error)) from error

if not results:
return None
@@ -132,7 +134,7 @@

return release

- def _get_date_to_crawl(self, pub_date: Optional[datetime.date]) -> datetime.date:
+ def _get_date_to_crawl(self, pub_date: datetime.date | None) -> datetime.date:
identifier = f"{self.comic.slug}/{pub_date}"

if pub_date is None:
@@ -141,9 +143,11 @@ def _get_date_to_crawl(self, pub_date: Optional[datetime.date]) -> datetime.date
if pub_date < self.history_capable:
raise NotHistoryCapable(identifier, self.history_capable)

- if self.multiple_releases_per_day is False:
-     if self.comic.release_set.filter(pub_date=pub_date).count() > 0:
-         raise ReleaseAlreadyExists(identifier)
+ if (
+     self.multiple_releases_per_day is False
+     and self.comic.release_set.filter(pub_date=pub_date).count() > 0
+ ):
+     raise ReleaseAlreadyExists(identifier)

return pub_date

@@ -192,8 +196,8 @@ def parse_page(self, page_url: str) -> LxmlParser:
self.pages[page_url] = LxmlParser(page_url, headers=self.headers)
return self.pages[page_url]

- def string_to_date(self, string: str, format: str) -> datetime.date:
-     return datetime.datetime.strptime(string, format).date()
+ def string_to_date(self, string: str, fmt: str) -> datetime.date:
+     return datetime.datetime.strptime(string, fmt).date()

def date_to_epoch(self, date: datetime.date) -> int:
"""The UNIX time of midnight at ``date`` in the comic's time zone"""
@@ -311,9 +315,9 @@ class NettserierCrawlerBase(CrawlerBase):
# In order to get older releases we need to
# loop through the pages and check the published date
time_zone = "Europe/Oslo"
- page_cache: Dict[str, Tuple[LxmlParser, datetime.date]] = {}
+ page_cache: dict[str, tuple[LxmlParser, datetime.date]] = {}

- def get_page(self, url: str) -> Tuple[LxmlParser, datetime.date]:
+ def get_page(self, url: str) -> tuple[LxmlParser, datetime.date]:
if url not in self.page_cache:
page = self.parse_page(url)
page_date = page.text('p[class="comic-pubtime"]')
@@ -339,10 +343,7 @@ def crawl_helper(self, short_name: str, pub_date: datetime.date) -> CrawlerResult
title = page.text("div.comic-text h4")
text = page.text("div.comic-text p", allow_multiple=True)

- if text[0].find("Published") > -1:
-     text = None
- else:
-     text = text[0]
+ text = None if text[0].find("Published") > -1 else text[0]

# Get comic image
url = page.src('img[src*="/_ns/files"]')
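
Most of the crawler.py churn modernizes annotations: `Dict`, `List`, `Optional`, `Tuple`, and `Union` give way to built-in generics and `X | None` unions (ruff's pyupgrade rules UP006/UP007/UP035), and the `Comic` import moves behind `TYPE_CHECKING` so it only runs during type checking. A minimal sketch of the combined pattern; the `__future__` import is an assumption here, needed so the `Comic` annotation is never evaluated at runtime:

from __future__ import annotations  # annotations stay as lazy strings

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only during type checking; avoids runtime import cycles.
    from comics.core.models import Comic

RequestHeaders = dict[str, str]  # was: Dict[str, str]

def describe(comic: Comic, title: str | None = None) -> list[str]:
    # was: title: Optional[str] and -> List[str]
    return [str(comic), title or "untitled"]
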
25 changes: 14 additions & 11 deletions comics/aggregator/downloader.py
@@ -88,20 +88,21 @@ def _download_image(self, url, request_headers):
temp_file = tempfile.NamedTemporaryFile(suffix="comics")
temp_file.write(response.content)
temp_file.seek(0)
- return temp_file
except (httpx.HTTPError, httpx.InvalidURL, OSError) as error:
- raise DownloaderHTTPError(self.identifier, error)
+ raise DownloaderHTTPError(self.identifier, error) from error
+ else:
+     return temp_file

def _get_sha256sum(self, file_handle):
original_position = file_handle.tell()
- hash = hashlib.sha256()
+ h = hashlib.sha256()
while True:
data = file_handle.read(8096)
if not data:
break
- hash.update(data)
+ h.update(data)
file_handle.seek(original_position)
- return hash.hexdigest()
+ return h.hexdigest()

def _check_if_blacklisted(self, checksum):
if checksum in settings.COMICS_IMAGE_BLACKLIST:
Expand All @@ -110,21 +111,23 @@ def _check_if_blacklisted(self, checksum):
def _get_existing_image(self, comic, has_rerun_releases, checksum):
try:
image = Image.objects.get(comic=comic, checksum=checksum)
+ except Image.DoesNotExist:
+     return None
+ else:
if image is not None and not has_rerun_releases:
raise ImageAlreadyExists(self.identifier)
return image
- except Image.DoesNotExist:
-     return None

def _validate_image(self, image_file):
try:
image = PILImage.open(image_file)
image.load()
- return image
- except IndexError:
-     raise ImageIsCorrupt(self.identifier)
+ except IndexError as error:
+     raise ImageIsCorrupt(self.identifier) from error
except OSError as error:
- raise ImageIsCorrupt(self.identifier, error)
+ raise ImageIsCorrupt(self.identifier, error) from error
+ else:
+     return image

def _get_file_extension(self, image):
if image.format not in IMAGE_FORMATS:
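
Two recurring fixes in downloader.py: the local `hash` becomes `h` so it no longer shadows the `hash()` builtin (ruff's flake8-builtins A001), and `return` statements move out of `try` bodies into `else` blocks (ruff's TRY300), so the `except` clauses visibly guard only the code that can actually fail. A minimal sketch of the `try`/`except`/`else` shape, with hypothetical names:

import tempfile

def buffer_download(content: bytes):
    try:
        temp_file = tempfile.NamedTemporaryFile(suffix="comics")
        temp_file.write(content)
        temp_file.seek(0)
    except OSError as error:
        raise RuntimeError("could not buffer download") from error
    else:
        # Reached only when the try body raised nothing (TRY300).
        return temp_file
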
2 changes: 1 addition & 1 deletion comics/api/tests/test_releases.py
@@ -44,7 +44,7 @@ def test_list_releases(self):
self.assertEqual(
image["text"],
"People into masturbatory "
+ "navel-gazing have a lot to learn about masturbation.",
"navel-gazing have a lot to learn about masturbation.",
)
self.assertEqual(image["height"], 235)
self.assertEqual(image["width"], 740)
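
This change, like the logger calls in command.py, drops the `+` between adjacent string literals: the parser concatenates them anyway, and ruff's ISC003 flags the explicit operator as redundant. The pattern in isolation:

# Before: ISC003 (explicit concatenation of adjacent literals).
text = ("People into masturbatory "
        + "navel-gazing have a lot to learn about masturbation.")

# After: adjacent literals are joined at parse time.
text = ("People into masturbatory "
        "navel-gazing have a lot to learn about masturbation.")
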
3 changes: 1 addition & 2 deletions comics/browser/views.py
@@ -1,6 +1,5 @@
import datetime
import json
- from typing import Optional

from django.conf import settings
from django.contrib.auth.decorators import login_required
@@ -143,7 +142,7 @@ def get_view_type(self):
class ReleaseDateMixin(ReleaseMixin):
"""Things common for all *date based* views"""

- date_field: Optional[str] = "pub_date"
+ date_field: str | None = "pub_date"
month_format = "%m"


2 changes: 2 additions & 0 deletions comics/comics/20px.py
@@ -1,3 +1,5 @@
+ # noqa: N999
+
from comics.aggregator.crawler import CrawlerBase, CrawlerImage
from comics.core.comic_data import ComicDataBase

2 changes: 2 additions & 0 deletions comics/comics/8bittheater.py
@@ -1,3 +1,5 @@
+ # noqa: N999
+
from comics.aggregator.crawler import CrawlerBase
from comics.core.comic_data import ComicDataBase

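
20px and 8bittheater are not valid Python identifiers, so ruff's pep8-naming rule N999 (invalid module name) flags the files; renaming them would change the comic slugs, so the commit suppresses the rule with a first-line `# noqa: N999` instead. Such modules remain loadable because the comics package resolves them dynamically, as sketched below (hypothetical usage, assuming the package is importable):

import importlib

# "import comics.comics.20px" would be a SyntaxError, but importing
# by string name works fine for digit-prefixed module names.
module = importlib.import_module("comics.comics.20px")
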
3 changes: 1 addition & 2 deletions comics/comics/__init__.py
@@ -1,9 +1,8 @@
import os
from types import ModuleType
- from typing import List


- def get_comic_module_names() -> List[str]:
+ def get_comic_module_names() -> list[str]:
module_files = os.listdir(os.path.dirname(__file__))
module_names = []
for file in module_files:
3 changes: 1 addition & 2 deletions comics/comics/adam4d.py
@@ -21,8 +21,7 @@ def crawl(self, pub_date):
urls = entry.content0.src('img[src*="/wp-content/"]', allow_multiple=True)

for url in urls:
- url = url.replace("comics-rss", "comics")
- results.append(CrawlerImage(url))
+ results.append(CrawlerImage(url.replace("comics-rss", "comics")))

if not results:
continue
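
Folding the `.replace()` into the `append()` call stops the loop from rebinding its own iteration variable, which ruff reports as PLW2901 (redefined-loop-name). Reduced to its core, with hypothetical data:

urls = ["https://example.com/comics-rss/strip.png"]

results = []
for url in urls:
    # Assigning to "url" here would trigger PLW2901; deriving the
    # new value inline leaves the loop variable untouched.
    results.append(url.replace("comics-rss", "comics"))
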
5 changes: 1 addition & 4 deletions comics/comics/crookedgremlins.py
@@ -24,9 +24,6 @@ def crawl(self, pub_date):

# Put together the text from multiple paragraphs
text_paragraphs = page.text(".post-content p", allow_multiple=True)
- if text_paragraphs is not None:
-     text = "\n\n".join(text_paragraphs)
- else:
-     text = None
+ text = "\n\n".join(text_paragraphs) if text_paragraphs is not None else None

return CrawlerImage(url, title, text)
1 change: 0 additions & 1 deletion comics/comics/ctrlaltdel.py
@@ -11,7 +11,6 @@ class ComicData(ComicDataBase):


class Crawler(CrawlerBase):
- # history_capable_date = "2002-10-23"
history_capable_days = 20
schedule = "Mo,We,Fr"
time_zone = "US/Eastern"
5 changes: 1 addition & 4 deletions comics/comics/jesusandmo.py
@@ -38,9 +38,6 @@ def crawl(self, pub_date):
return
url = url[0]
text = release_page.root.xpath('//div[@class="entry"]/p')
- if text:
-     text = text[0].text
- else:
-     text = None
+ text = text[0].text if text else None

return CrawlerImage(url, title, text)
5 changes: 1 addition & 4 deletions comics/comics/nerfnow.py
@@ -29,9 +29,6 @@ def crawl(self, pub_date):

# Put together text from multiple paragraphs
text_paragraphs = entry.content0.text("p", allow_multiple=True)
- if text_paragraphs is not None:
-     text = "\n\n".join(text_paragraphs)
- else:
-     text = None
+ text = "\n\n".join(text_paragraphs) if text_paragraphs is not None else None

return CrawlerImage(url, title, text)
2 changes: 0 additions & 2 deletions comics/comics/savagechickens.py
@@ -18,8 +18,6 @@ class Crawler(CrawlerBase):
def crawl(self, pub_date):
feed = self.parse_feed("http://www.savagechickens.com/feed")
for entry in feed.for_date(pub_date):
if "Cartoons" not in entry.tags:
print("skipping")
url = entry.content0.src('img[src*="/wp-content/"]')
title = entry.title
return CrawlerImage(url, title)
5 changes: 1 addition & 4 deletions comics/comics/vgcats.py
@@ -22,10 +22,7 @@ class Crawler(CrawlerBase):

def crawl(self, pub_date):
# FIXME: Seems like they are using gif images now and then
- if pub_date < datetime.date(2003, 5, 1):
-     file_ext = "gif"
- else:
-     file_ext = "jpg"
+ file_ext = "gif" if pub_date < datetime.date(2003, 5, 1) else "jpg"
url = "http://www.vgcats.com/comics/images/{}.{}".format(
pub_date.strftime("%y%m%d"),
file_ext,
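
The same rewrite appears in crawler.py, crookedgremlins.py, jesusandmo.py, and nerfnow.py: an if/else whose only job is to assign one variable collapses into a conditional expression, per ruff's SIM108. In isolation:

import datetime

pub_date = datetime.date(2003, 4, 1)

# Before: the four-line form that SIM108 flags.
if pub_date < datetime.date(2003, 5, 1):
    file_ext = "gif"
else:
    file_ext = "jpg"

# After: a single conditional expression.
file_ext = "gif" if pub_date < datetime.date(2003, 5, 1) else "jpg"
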