diff --git a/lncrawl/VERSION b/lncrawl/VERSION
index d5c0c9914..87ce49290 100644
--- a/lncrawl/VERSION
+++ b/lncrawl/VERSION
@@ -1 +1 @@
-3.5.1
+3.5.2
diff --git a/lncrawl/core/__init__.py b/lncrawl/core/__init__.py
index 4222ee416..0606bbcf3 100644
--- a/lncrawl/core/__init__.py
+++ b/lncrawl/core/__init__.py
@@ -11,7 +11,8 @@
 from ..assets.version import get_version
 from ..bots import run_bot
 from .arguments import get_args
-from .display import cancel_method, description, error_message, input_suppression
+from .display import (cancel_method, description, error_message,
+                      input_suppression)
 from .logconfig import configure_logging
 from .proxy import load_proxies, start_proxy_fetcher, stop_proxy_fetcher
 from .sources import load_sources
@@ -66,9 +67,10 @@ def start_app():
     try:
         bot = os.getenv("BOT", "").lower()
         run_bot(bot)
-    except Exception as e:
-        if not isinstance(e, KeyboardInterrupt):
-            error_message(*sys.exc_info())
+    except KeyboardInterrupt:
+        pass
+    except Exception:
+        error_message(*sys.exc_info())
 
     if args.auto_proxy:
         stop_proxy_fetcher()
diff --git a/lncrawl/core/crawler.py b/lncrawl/core/crawler.py
index 20c89b00b..cd313d4f4 100644
--- a/lncrawl/core/crawler.py
+++ b/lncrawl/core/crawler.py
@@ -162,11 +162,11 @@ def download_chapters(
                 chapter.body = future.result()
                 self.extract_chapter_images(chapter)
                 chapter.success = True
-            except Exception as e:
+            except KeyboardInterrupt:
+                break
+            except Exception:
                 if isinstance(chapter, Chapter):
                     chapter.body = ""
                     chapter.success = False
-                if isinstance(e, KeyboardInterrupt):
-                    break
             finally:
                 yield 1
diff --git a/lncrawl/core/novel_search.py b/lncrawl/core/novel_search.py
index 01efe047d..de4e9d6b1 100644
--- a/lncrawl/core/novel_search.py
+++ b/lncrawl/core/novel_search.py
@@ -1,137 +1,105 @@
 """
 To search for novels in selected sources
 """
+import random
 import logging
-import os
-from concurrent import futures
 from typing import Dict, List
 
-from bs4 import Tag
+from concurrent.futures import Future
 from slugify import slugify
-from tqdm import tqdm
 
-from ..core.sources import crawler_list, prepare_crawler
 from ..models import CombinedSearchResult, SearchResult
+from .sources import crawler_list, prepare_crawler
+from .taskman import TaskManager
 
 SEARCH_TIMEOUT = 60
+MAX_RESULTS = 15
 
 logger = logging.getLogger(__name__)
 
-executor = futures.ThreadPoolExecutor(20)
+taskman = TaskManager(10)
 
 
-def _perform_search(app, link, bar):
+def _perform_search(app, link):
+    from .app import App
+    assert isinstance(app, App)
     try:
         crawler = prepare_crawler(link)
         results = []
         for item in crawler.search_novel(app.user_input):
-            if not item.get("url"):
-                continue
             if not isinstance(item, SearchResult):
                 item = SearchResult(**item)
+            if not (item.url and item.title):
+                continue
             results.append(item)
-        logger.debug(results)
-        logger.info("%d results from %s", len(results), link)
+        logger.info(f"{len(results)} results from {link}")
         return results
-    except KeyboardInterrupt as e:
-        raise e
     except Exception:
         if logger.isEnabledFor(logging.DEBUG):
             logging.exception(" Search Failed! << %s >>", link)
         return []
-
-
-def _combine_results(results: List[SearchResult]) -> List[CombinedSearchResult]:
-    combined: Dict[str, List[SearchResult]] = {}
-    for item in results:
-
-        if item.title is None:
-            logger.warn(f'Title is type None in {item}')
-            continue
-        elif isinstance(item.title, Tag):
-            logger.warn(f'Title is type Tag in {item}')
-            item.title = item.title.get_text()
-
-        key = slugify(item.title)
-
-        if len(key) <= 2:
-            continue
-
-        combined.setdefault(key, [])
-        combined[key].append(item)
-
-    processed: List[CombinedSearchResult] = []
-    for key, value in combined.items():
-        value.sort(key=lambda x: x.url)
-        processed.append(
-            CombinedSearchResult(
-                id=key,
-                title=value[0].title,
-                novels=value,
-            )
-        )
-
-    processed.sort(key=lambda x: -len(x.novels))
-    return processed[:15]  # Control the number of results
+    finally:
+        app.progress += 1
 
 
 def search_novels(app):
     from .app import App
-
     assert isinstance(app, App)
     if not app.crawler_links:
         return
 
     sources = app.crawler_links.copy()
-    # random.shuffle(sources)
-
-    is_debug = os.getenv("debug_mode")
-    bar = tqdm(
-        desc="Searching",
-        total=len(sources),
-        unit="source",
-        disable=is_debug,
-    )
+    random.shuffle(sources)
 
     # Add future tasks
-    checked = {}
-    futures_to_check = []
+    checked = set()
     app.progress = 0
+    futures: List[Future] = []
    for link in sources:
         crawler = crawler_list[link]
         if crawler in checked:
-            bar.update()
             continue
-        checked[crawler] = True
-        future = executor.submit(_perform_search, app, link, bar)
-        futures_to_check.append(future)
+        checked.add(crawler)
+        f = taskman.submit_task(_perform_search, app, link)
+        futures.append(f)
 
     # Resolve all futures
-    results: List[SearchResult] = []
-    for i, f in enumerate(futures_to_check):
-        assert isinstance(f, futures.Future)
-        try:
-            f.result(SEARCH_TIMEOUT)
-        except KeyboardInterrupt:
-            break
-        except TimeoutError:
-            f.cancel()
-        except Exception as e:
-            if is_debug:
-                logger.error("Failed to complete search", e)
-        finally:
-            app.progress += 1
-            bar.update()
+    try:
+        taskman.resolve_futures(
+            futures,
+            desc="Searching",
+            unit="source",
+            timeout=SEARCH_TIMEOUT,
+        )
+    except Exception:
+        if logger.isEnabledFor(logging.DEBUG):
+            logging.exception(" Search Failed!")
 
-    # Cancel any remaining futures
-    for f in futures_to_check:
-        assert isinstance(f, futures.Future)
-        if not f.done():
-            f.cancel()
-        elif not f.cancelled():
-            results += f.result()
+    # Combine the search results
+    combined: Dict[str, List[SearchResult]] = {}
+    for f in futures:
+        if not f or not f.done() or f.cancelled():
+            continue
+        for item in f.result() or []:
+            if not item:
+                continue
+            key = slugify(item.title)
+            if len(key) <= 2:
+                continue
+            combined.setdefault(key, [])
+            combined[key].append(item)
 
     # Process combined search results
-    app.search_results = _combine_results(results)
-    bar.close()
+    processed: List[CombinedSearchResult] = []
+    for key, value in combined.items():
+        value.sort(key=lambda x: x.url)
+        processed.append(
+            CombinedSearchResult(
+                id=key,
+                title=value[0].title,
+                novels=value,
+            )
+        )
+    processed.sort(key=lambda x: -len(x.novels))
+    app.search_results = processed[:MAX_RESULTS]
diff --git a/lncrawl/core/taskman.py b/lncrawl/core/taskman.py
index 7fccb7f38..05e83b391 100644
--- a/lncrawl/core/taskman.py
+++ b/lncrawl/core/taskman.py
@@ -3,7 +3,7 @@
 from abc import ABC
 from concurrent.futures import Future, ThreadPoolExecutor
 from threading import Semaphore, Thread
-from typing import Dict, Iterable, List, Optional, TypeVar
+from typing import Any, Dict, Iterable, List, Optional
 
 from tqdm import tqdm
 
@@ -11,8 +11,6 @@
 logger = logging.getLogger(__name__)
 
-T = TypeVar("T")
-
 MAX_WORKER_COUNT = 5
 MAX_REQUESTS_PER_DOMAIN = 25
 
@@ -21,15 +19,19 @@ class TaskManager(ABC):
-    def __init__(self) -> None:
+    def __init__(
+        self,
+        workers: int = MAX_WORKER_COUNT,
+        ratelimit: Optional[float] = None,
+    ) -> None:
         """A helper class for task queueing and parallel task execution.
         It is being used as a superclass of the Crawler.
 
         Args:
-        - workers (int, optional): Number of concurrent workers to expect. Default: 10.
+        - workers (int, optional): Number of concurrent workers to expect. Default: 5.
         - ratelimit (float, optional): Number of requests per second.
         """
-        self.init_executor(MAX_WORKER_COUNT)
+        self.init_executor(workers, ratelimit)
 
     def __del__(self) -> None:
         if hasattr(self, "_executor"):
@@ -61,7 +63,8 @@ def init_executor(
         it will shutdown the current executor, and cancel all pending tasks.
 
         Args:
-        - workers (int): Number of workers to expect in the new executor.
+        - workers (int, optional): Number of concurrent workers to expect. Default: 5.
+        - ratelimit (float, optional): Number of requests per second.
         """
         self._futures: List[Future] = []
         self.__del__()  # cleanup previous initialization
@@ -78,7 +81,7 @@ def init_executor(
         )
         self._submit = self._executor.submit
-        self._executor.submit = self.submit_task
+        setattr(self._executor, 'submit', self.submit_task)
 
     def submit_task(self, fn, *args, **kwargs) -> Future:
         """Submits a callable to be executed with the given arguments.
@@ -91,6 +94,8 @@ def submit_task(self, fn, *args, **kwargs) -> Future:
         """
         if hasattr(self, "_limiter"):
             fn = self._limiter.wrap(fn)
+        if not self._submit:
+            raise Exception('No executor is available')
         future = self._submit(fn, *args, **kwargs)
         self._futures.append(future)
         return future
@@ -102,7 +107,7 @@ def progress_bar(
         self,
         total=None,
         unit=None,
         disable=False,
-        timeout: float = None,
+        timeout: float | None = None,
     ):
         if os.getenv("debug_mode"):
             disable = True
@@ -164,12 +169,12 @@ def cancel_futures(self, futures: Iterable[Future]) -> None:
     def resolve_futures(
         self,
         futures: Iterable[Future],
-        timeout: float = None,
+        timeout: float | None = None,
         disable_bar=False,
         desc=None,
         unit=None,
         fail_fast=False,
-    ) -> None:
+    ) -> List[Any]:
         """Wait for the futures to be done.
 
         Args:
@@ -182,27 +187,32 @@ def resolve_futures(
             fail_fast: Fail on first error
         """
         if not futures:
-            return
+            return []
 
+        _futures = list(futures or [])
         bar = self.progress_bar(
             desc=desc,
             unit=unit,
-            total=len(futures),
+            total=len(_futures),
             disable=disable_bar,
             timeout=timeout,
         )
 
+        _results = []
         try:
-            for future in futures:
+            for future in _futures:
                 if fail_fast:
-                    future.result(timeout)
+                    r = future.result(timeout)
+                    _results.append(r)
                     bar.update()
                     continue
                 try:
-                    future.result(timeout)
+                    r = future.result(timeout)
+                    _results.append(r)
+                except KeyboardInterrupt:
+                    break
                 except Exception as e:
-                    if isinstance(e, KeyboardInterrupt):
-                        break
+                    _results.append(None)
                     if bar.disable:
                         logger.exception("Failure to resolve future")
                     else:
@@ -210,6 +220,10 @@
                         logger.warning(f"{type(e).__name__}: {e}")
                 finally:
                     bar.update()
+        except KeyboardInterrupt:
+            pass
         finally:
             Thread(target=lambda: self.cancel_futures(futures)).start()
             bar.close()
+
+        return _results
diff --git a/lncrawl/models/search_result.py b/lncrawl/models/search_result.py
index f045afd6f..801d011d1 100644
--- a/lncrawl/models/search_result.py
+++ b/lncrawl/models/search_result.py
@@ -11,9 +11,9 @@ def __init__(
         info: str = "",
         **kwargs,
     ) -> None:
-        self.title = title
-        self.url = url
-        self.info = info
+        self.title = str(title)
+        self.url = str(url)
+        self.info = str(info)
         self.update(kwargs)
 
 
@@ -26,6 +26,6 @@ def __init__(
         **kwargs,
     ) -> None:
         self.id = id
-        self.title = title
+        self.title = str(title)
         self.novels = novels
         self.update(kwargs)
diff --git a/lncrawl/templates/browser/basic.py b/lncrawl/templates/browser/basic.py
index a43876d1c..d6827d967 100644
--- a/lncrawl/templates/browser/basic.py
+++ b/lncrawl/templates/browser/basic.py
@@ -96,7 +96,11 @@ def read_novel_info(self) -> None:
         finally:
             self.close_browser()
 
-    def download_chapters(self, chapters: List[Chapter]) -> Generator[int, None, None]:
+    def download_chapters(
+        self,
+        chapters: List[Chapter],
+        fail_fast=False,
+    ) -> Generator[int, None, None]:
         try:
             yield from super().download_chapters(chapters, fail_fast=True)
         except ScraperErrorGroup as e:
diff --git a/lncrawl/templates/madara.py b/lncrawl/templates/madara.py
index c03b063ee..74808d783 100644
--- a/lncrawl/templates/madara.py
+++ b/lncrawl/templates/madara.py
@@ -46,11 +46,12 @@ def parse_title(self, soup: BeautifulSoup) -> str:
 
     def parse_cover(self, soup: BeautifulSoup) -> str:
         tag = soup.select_one(".summary_image a img")
-        assert tag
-        if tag.has_attr("data-src"):
-            return self.absolute_url(tag["data-src"])
-        if tag.has_attr("src"):
-            return self.absolute_url(tag["src"])
+        if isinstance(tag, Tag):
+            if tag.has_attr("data-src"):
+                return self.absolute_url(tag["data-src"])
+            if tag.has_attr("src"):
+                return self.absolute_url(tag["src"])
+        return ''
 
     def parse_authors(self, soup: BeautifulSoup):
         for a in soup.select('.author-content a[href*="manga-author"]'):
@@ -66,7 +67,8 @@ def select_chapter_tags(self, soup: BeautifulSoup):
             raise Exception("No chapters on first URL")
         except Exception:
             nl_id = soup.select_one("#manga-chapters-holder[data-id]")
-            assert isinstance(nl_id, Tag)
+            if not isinstance(nl_id, Tag):
+                raise Exception('No chapter chapter id tag found')
             response = self.submit_form(
                 f"{self.home_url}wp-admin/admin-ajax.php",
                 data={