Skip to content

Commit

Permalink
Fixes the novel search feature
Browse files: browse the repository at this point in the history
  • Loading branch information
dipu-bd committed Apr 8, 2024
1 parent 8f2958e commit 498a3d2
Show file tree
Hide file tree
Showing 8 changed files with 114 additions and 124 deletions.
2 changes: 1 addition & 1 deletion lncrawl/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.5.1
3.5.2
10 changes: 6 additions & 4 deletions lncrawl/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from ..assets.version import get_version
from ..bots import run_bot
from .arguments import get_args
from .display import cancel_method, description, error_message, input_suppression
from .display import (cancel_method, description, error_message,
input_suppression)
from .logconfig import configure_logging
from .proxy import load_proxies, start_proxy_fetcher, stop_proxy_fetcher
from .sources import load_sources
Expand Down Expand Up @@ -66,9 +67,10 @@ def start_app():
try:
bot = os.getenv("BOT", "").lower()
run_bot(bot)
except Exception as e:
if not isinstance(e, KeyboardInterrupt):
error_message(*sys.exc_info())
except KeyboardInterrupt:
pass
except Exception:
error_message(*sys.exc_info())

if args.auto_proxy:
stop_proxy_fetcher()
Expand Down
6 changes: 3 additions & 3 deletions lncrawl/core/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,11 @@ def download_chapters(
chapter.body = future.result()
self.extract_chapter_images(chapter)
chapter.success = True
except Exception as e:
except KeyboardInterrupt:
break
except Exception:
if isinstance(chapter, Chapter):
chapter.body = ""
chapter.success = False
if isinstance(e, KeyboardInterrupt):
break
finally:
yield 1
142 changes: 55 additions & 87 deletions lncrawl/core/novel_search.py
Original file line number Diff line number Diff line change
@@ -1,137 +1,105 @@
"""
To search for novels in selected sources
"""
import random
import logging
import os
from concurrent import futures
from typing import Dict, List

from bs4 import Tag
from concurrent.futures import Future
from slugify import slugify
from tqdm import tqdm

from ..core.sources import crawler_list, prepare_crawler
from ..models import CombinedSearchResult, SearchResult
from .sources import crawler_list, prepare_crawler
from .taskman import TaskManager

SEARCH_TIMEOUT = 60
MAX_RESULTS = 15

logger = logging.getLogger(__name__)
executor = futures.ThreadPoolExecutor(20)
taskman = TaskManager(10)


def _perform_search(app, link, bar):
def _perform_search(app, link):
from .app import App
assert isinstance(app, App)
try:
crawler = prepare_crawler(link)
results = []
for item in crawler.search_novel(app.user_input):
if not item.get("url"):
continue
if not isinstance(item, SearchResult):
item = SearchResult(**item)
if not (item.url and item.title):
continue
results.append(item)

logger.debug(results)
logger.info("%d results from %s", len(results), link)
logger.info(f"{len(results)} results from {link}")
return results
except KeyboardInterrupt as e:
raise e
except Exception:
if logger.isEnabledFor(logging.DEBUG):
logging.exception("<!> Search Failed! << %s >>", link)
return []


def _combine_results(results: List[SearchResult]) -> List[CombinedSearchResult]:
combined: Dict[str, List[SearchResult]] = {}
for item in results:

if item.title is None:
logger.warn(f'Title is type None in {item}')
continue
elif isinstance(item.title, Tag):
logger.warn(f'Title is type Tag in {item}')
item.title = item.title.get_text()

key = slugify(item.title)

if len(key) <= 2:
continue

combined.setdefault(key, [])
combined[key].append(item)

processed: List[CombinedSearchResult] = []
for key, value in combined.items():
value.sort(key=lambda x: x.url)
processed.append(
CombinedSearchResult(
id=key,
title=value[0].title,
novels=value,
)
)

processed.sort(key=lambda x: -len(x.novels))
return processed[:15] # Control the number of results
finally:
app.progress += 1


def search_novels(app):
from .app import App

assert isinstance(app, App)

if not app.crawler_links:
return

sources = app.crawler_links.copy()
# random.shuffle(sources)

is_debug = os.getenv("debug_mode")
bar = tqdm(
desc="Searching",
total=len(sources),
unit="source",
disable=is_debug,
)
random.shuffle(sources)

# Add future tasks
checked = {}
futures_to_check = []
checked = set()
app.progress = 0
futures: List[Future] = []
for link in sources:
crawler = crawler_list[link]
if crawler in checked:
bar.update()
continue
checked[crawler] = True
future = executor.submit(_perform_search, app, link, bar)
futures_to_check.append(future)
checked.add(crawler)
f = taskman.submit_task(_perform_search, app, link)
futures.append(f)

# Resolve all futures
results: List[SearchResult] = []
for i, f in enumerate(futures_to_check):
assert isinstance(f, futures.Future)
try:
f.result(SEARCH_TIMEOUT)
except KeyboardInterrupt:
break
except TimeoutError:
f.cancel()
except Exception as e:
if is_debug:
logger.error("Failed to complete search", e)
finally:
app.progress += 1
bar.update()
try:
taskman.resolve_futures(
futures,
desc="Searching",
unit="source",
timeout=SEARCH_TIMEOUT,
)
except Exception:
if logger.isEnabledFor(logging.DEBUG):
logging.exception("<!> Search Failed!")

# Cancel any remaining futures
for f in futures_to_check:
assert isinstance(f, futures.Future)
if not f.done():
f.cancel()
elif not f.cancelled():
results += f.result()
# Combine the search results
combined: Dict[str, List[SearchResult]] = {}
for f in futures:
if not f or not f.done() or f.cancelled():
continue
for item in f.result() or []:
if not item:
continue
key = slugify(item.title)
if len(key) <= 2:
continue
combined.setdefault(key, [])
combined[key].append(item)

# Process combined search results
app.search_results = _combine_results(results)
bar.close()
processed: List[CombinedSearchResult] = []
for key, value in combined.items():
value.sort(key=lambda x: x.url)
processed.append(
CombinedSearchResult(
id=key,
title=value[0].title,
novels=value,
)
)
processed.sort(key=lambda x: -len(x.novels))
app.search_results = processed[:MAX_RESULTS]
50 changes: 32 additions & 18 deletions lncrawl/core/taskman.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,14 @@
from abc import ABC
from concurrent.futures import Future, ThreadPoolExecutor
from threading import Semaphore, Thread
from typing import Dict, Iterable, List, Optional, TypeVar
from typing import Any, Dict, Iterable, List, Optional

from tqdm import tqdm

from ..utils.ratelimit import RateLimiter

logger = logging.getLogger(__name__)

T = TypeVar("T")

MAX_WORKER_COUNT = 5
MAX_REQUESTS_PER_DOMAIN = 25

Expand All @@ -21,15 +19,19 @@


class TaskManager(ABC):
def __init__(self) -> None:
def __init__(
self,
workers: int = MAX_WORKER_COUNT,
ratelimit: Optional[float] = None,
) -> None:
"""A helper class for task queueing and parallel task execution.
It is being used as a superclass of the Crawler.
Args:
- workers (int, optional): Number of concurrent workers to expect. Default: 10.
- workers (int, optional): Number of concurrent workers to expect. Default: 5.
- ratelimit (float, optional): Number of requests per second.
"""
self.init_executor(MAX_WORKER_COUNT)
self.init_executor(workers, ratelimit)

def __del__(self) -> None:
if hasattr(self, "_executor"):
Expand Down Expand Up @@ -61,7 +63,8 @@ def init_executor(
it will shutdown the current executor, and cancel all pending tasks.
Args:
- workers (int): Number of workers to expect in the new executor.
- workers (int, optional): Number of concurrent workers to expect. Default: 5.
- ratelimit (float, optional): Number of requests per second.
"""
self._futures: List[Future] = []
self.__del__() # cleanup previous initialization
Expand All @@ -78,7 +81,7 @@ def init_executor(
)

self._submit = self._executor.submit
self._executor.submit = self.submit_task
setattr(self._executor, 'submit', self.submit_task)

def submit_task(self, fn, *args, **kwargs) -> Future:
"""Submits a callable to be executed with the given arguments.
Expand All @@ -91,6 +94,8 @@ def submit_task(self, fn, *args, **kwargs) -> Future:
"""
if hasattr(self, "_limiter"):
fn = self._limiter.wrap(fn)
if not self._submit:
raise Exception('No executor is available')
future = self._submit(fn, *args, **kwargs)
self._futures.append(future)
return future
Expand All @@ -102,7 +107,7 @@ def progress_bar(
total=None,
unit=None,
disable=False,
timeout: float = None,
timeout: float | None = None,
):
if os.getenv("debug_mode"):
disable = True
Expand Down Expand Up @@ -164,12 +169,12 @@ def cancel_futures(self, futures: Iterable[Future]) -> None:
def resolve_futures(
self,
futures: Iterable[Future],
timeout: float = None,
timeout: float | None = None,
disable_bar=False,
desc=None,
unit=None,
fail_fast=False,
) -> None:
) -> List[Any]:
"""Wait for the futures to be done.
Args:
Expand All @@ -182,34 +187,43 @@ def resolve_futures(
fail_fast: Fail on first error
"""
if not futures:
return
return []

_futures = list(futures or [])
bar = self.progress_bar(
desc=desc,
unit=unit,
total=len(futures),
total=len(_futures),
disable=disable_bar,
timeout=timeout,
)

_results = []
try:
for future in futures:
for future in _futures:
if fail_fast:
future.result(timeout)
r = future.result(timeout)
_results.append(r)
bar.update()
continue
try:
future.result(timeout)
r = future.result(timeout)
_results.append(r)
except KeyboardInterrupt:
break
except Exception as e:
if isinstance(e, KeyboardInterrupt):
break
_results.append(None)
if bar.disable:
logger.exception("Failure to resolve future")
else:
bar.clear()
logger.warning(f"{type(e).__name__}: {e}")
finally:
bar.update()
except KeyboardInterrupt:
pass
finally:
Thread(target=lambda: self.cancel_futures(futures)).start()
bar.close()

return _results
8 changes: 4 additions & 4 deletions lncrawl/models/search_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ def __init__(
info: str = "",
**kwargs,
) -> None:
self.title = title
self.url = url
self.info = info
self.title = str(title)
self.url = str(url)
self.info = str(info)
self.update(kwargs)


Expand All @@ -26,6 +26,6 @@ def __init__(
**kwargs,
) -> None:
self.id = id
self.title = title
self.title = str(title)
self.novels = novels
self.update(kwargs)
Loading

0 comments on commit 498a3d2

Please sign in to comment.