Issue #83: Use Asyncio.
Nekmo committed Aug 11, 2023
1 parent 66295f3 commit 0ba58e3
Showing 8 changed files with 53 additions and 14 deletions.
4 changes: 1 addition & 3 deletions dirhunt/colors.py → dirhunt/console.py
@@ -1,7 +1,5 @@
from colorama import Fore


def status_code_colors(status_code):
    """Return a color for a status code."""
    if 100 <= status_code < 200:
        return "white"
    elif 200 == status_code:
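A minimal usage sketch of the relocated helper, relying only on what this hunk shows: the function maps an HTTP status code to a color name, and only the 1xx branch ("white") is visible before the hunk is truncated.

    from dirhunt.console import status_code_colors  # moved from dirhunt.colors

    # Only the 1xx branch is visible above; the return values for the
    # other ranges are cut off in this diff, so they are not asserted here.
    assert status_code_colors(150) == "white"
    print(status_code_colors(404))  # some color name for client errors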
33 changes: 26 additions & 7 deletions dirhunt/crawler.py
@@ -28,6 +28,7 @@
from dirhunt.sessions import Session
from dirhunt.sources import Sources
from dirhunt.url_info import UrlsInfo
from dirhunt.utils import retry_error

"""Flags importance"""

@@ -56,6 +57,7 @@ def release(self, domain: str):

class Crawler:
    urls_info = None
    started = False

    def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop):
        """Initialize Crawler.
@@ -81,26 +83,43 @@ def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop

    async def start(self):
        """Add urls to process."""
        if self.started:
            await self.restart()
            return
        for url in self.configuration.urls:
            crawler_url = CrawlerUrl(self, url, depth=self.configuration.max_depth)
            await self.add_domain(crawler_url.url.domain)
            await self.add_crawler_url(crawler_url)
            self.add_domain_protocol(crawler_url)
        self.started = True
        await self.run_tasks()

    async def run_tasks(self) -> None:
        """Run asyncio tasks."""
        while self.tasks:
            await asyncio.wait(self.tasks)
        await self.session.close()

    async def restart(self):
        """Restart crawler."""
        await self.run_tasks()

    async def add_crawler_url(self, crawler_url: CrawlerUrl) -> Optional[asyncio.Task]:
        """Add crawler_url to tasks"""
        if crawler_url in self.crawler_urls or not self.in_domains(
            crawler_url.url.domain
        ):
            return
        # TODO: move to CrawlerUrl after retrieve the data
        self.current_processed_count += 1
        self.crawler_urls.add(crawler_url)
        await self.add_crawler_url_task(crawler_url)

    async def add_crawler_url_task(self, crawler_url) -> asyncio.Task:
        """Add crawler_url to tasks"""
        return self.add_task(
-            crawler_url.retrieve(), name=f"crawlerurl-{self.current_processed_count}"
+            retry_error(crawler_url.retrieve, KeyboardInterrupt)(),
+            name=f"crawlerurl-{self.current_processed_count}",
        )

    async def add_domain(self, domain: str):
Expand All @@ -112,6 +131,7 @@ async def add_domain(self, domain: str):

    @functools.lru_cache(maxsize=128)
    def in_domains(self, target_domain):
        """Check if target_domain is in domains."""
        if target_domain in self.domains:
            return True
        if self.configuration.not_follow_subdomains:
@@ -139,6 +159,11 @@ def add_domain_protocol(self, crawler_url: "CrawlerUrl"):
"""Add domain protocol"""
self.domain_protocols[crawler_url.url.domain].add(crawler_url.url.protocol)

    @property
    def pending_crawler_urls(self):
        """Return the crawler_urls that have not finished yet."""
        return filter(lambda x: not x.finished, self.crawler_urls)

    def add_init_urls(self, *urls):
        """Add urls to queue."""
        self.initial_urls.extend(urls)
@@ -245,12 +270,6 @@ def print_urls_info(self):
        )
        self.urls_info.start()

-    def restart(self):
-        try:
-            self.add_lock.release()
-        except (ThreadError, RuntimeError):
-            pass

    def options(self):
        return {
            "interesting_extensions": self.interesting_extensions,
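The new run_tasks() drains self.tasks by re-awaiting until the set is empty, so tasks scheduled by other tasks are picked up on the next pass. How dirhunt prunes finished tasks from the set is not shown in these hunks, so this stand-alone sketch (hypothetical drain/parent/child names, not the real bookkeeping) prunes them manually:

    import asyncio

    async def drain(tasks: set) -> None:
        # Re-check the set each pass: a running task may have added new ones.
        while tasks:
            done, _ = await asyncio.wait(tasks)
            tasks.difference_update(done)

    async def main() -> None:
        tasks: set = set()

        async def child():
            await asyncio.sleep(0.1)

        async def parent():
            # A task spawned mid-drain is awaited on the next pass.
            tasks.add(asyncio.ensure_future(child()))

        tasks.add(asyncio.ensure_future(parent()))
        await drain(tasks)  # returns only after parent *and* child finish

    asyncio.run(main())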
4 changes: 4 additions & 0 deletions dirhunt/crawler_url.py
@@ -202,6 +202,10 @@ def set_type(self, content_type):
        ):
            self.url_type = "document"

    @property
    def finished(self) -> bool:
        return self.processor is not None

    def maybe_rewrite(self):
        return self.url_type not in ["asset", "directory"]

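finished is derived state: a CrawlerUrl counts as done once a processor has been assigned to it, and the crawler's new pending_crawler_urls property filters on exactly this flag. A reduced sketch of the pairing (hypothetical Item class, not the real CrawlerUrl):

    from typing import Iterator, Optional

    class Item:
        processor: Optional[object] = None

        @property
        def finished(self) -> bool:
            # Mirrors CrawlerUrl.finished: done once a processor exists.
            return self.processor is not None

    def pending(items) -> Iterator[Item]:
        # Mirrors Crawler.pending_crawler_urls.
        return filter(lambda x: not x.finished, items)

    a, b = Item(), Item()
    a.processor = object()               # a is now "finished"
    assert list(pending([a, b])) == [b]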
9 changes: 8 additions & 1 deletion dirhunt/management.py
@@ -282,7 +282,14 @@ def hunt(**kwargs: ConfigurationDict):
        return
    loop = asyncio.get_event_loop()
    crawler = Crawler(configuration, loop)
-    loop.run_until_complete(crawler.start())
+    while True:
+        try:
+            loop.run_until_complete(crawler.start())
+        except KeyboardInterrupt:
+            click.echo("Goodbye!")
+            input()
+        else:
+            break


def main():
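With this loop, Ctrl-C no longer kills hunt outright: KeyboardInterrupt escapes run_until_complete, the user is prompted, and pressing Enter re-enters crawler.start(), which the new started flag routes to restart() instead of re-adding the initial URLs. A stand-alone sketch of the resume loop, with a hypothetical work() coroutine standing in for crawler.start() (and new_event_loop() here, where the diff calls asyncio.get_event_loop()):

    import asyncio

    import click

    async def work() -> None:
        # Stand-in for crawler.start(); sleeps so Ctrl-C has time to land.
        await asyncio.sleep(3600)

    loop = asyncio.new_event_loop()
    while True:
        try:
            loop.run_until_complete(work())
        except KeyboardInterrupt:
            click.echo("Goodbye!")  # Ctrl-C pauses the run...
            input()                 # ...Enter resumes it
        else:
            break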
2 changes: 1 addition & 1 deletion dirhunt/processors.py
@@ -16,7 +16,7 @@
from bs4 import Comment
from colorama import Fore, Back

-from dirhunt.colors import status_code_colors
+from dirhunt.console import status_code_colors
from dirhunt.crawler_url import CrawlerUrl, CrawlerUrlRequest
from dirhunt.url import Url, full_url_address
from dirhunt.url_loop import is_url_loop
2 changes: 1 addition & 1 deletion dirhunt/tests/test_colors.py
@@ -2,7 +2,7 @@

from colorama import Fore

-from dirhunt.colors import status_code_colors
+from dirhunt.console import status_code_colors


class TestColors(unittest.TestCase):
2 changes: 1 addition & 1 deletion dirhunt/url_info.py
@@ -14,7 +14,7 @@
from urllib3.exceptions import ReadTimeoutError

from dirhunt.cli import random_spinner
-from dirhunt.colors import status_code_colors
+from dirhunt.console import status_code_colors
from dirhunt.exceptions import EmptyError, RequestError
from dirhunt.pool import Pool
from dirhunt.utils import colored, remove_ansi_escape
11 changes: 11 additions & 0 deletions dirhunt/utils.py
@@ -162,3 +162,14 @@ def multiplier_arg(argument):

def multiplier_args(arguments):
    return flat_list([multiplier_arg(argument) for argument in arguments])


def retry_error(fn, exception):
    def wrap(*args, **kwargs):
        while True:
            try:
                return fn(*args, **kwargs)
            except exception:
                pass

    return wrap
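retry_error simply re-invokes fn until the given exception stops being raised. A usage sketch with a hypothetical flaky function:

    calls = {"n": 0}

    def flaky() -> str:
        calls["n"] += 1
        if calls["n"] < 3:
            raise ValueError("not yet")
        return "ok"

    safe = retry_error(flaky, ValueError)
    assert safe() == "ok"    # raised twice, retried, then succeeded
    assert calls["n"] == 3

One caveat worth noting: when fn is a coroutine function, as with crawler_url.retrieve in add_crawler_url_task above, fn(*args, **kwargs) only creates the coroutine, so exceptions raised inside it surface where the task is awaited rather than inside this while loop.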
