Commit 622c45f

Issue #83: Use Asyncio

Nekmo committed Aug 9, 2023
1 parent 9f0ed72 commit 622c45f
Showing 7 changed files with 146 additions and 169 deletions.
3 changes: 2 additions & 1 deletion dirhunt/configuration.py
@@ -38,7 +38,8 @@ class Configuration:
"""

urls: List[str] = field(default_factory=list)
threads: int = 10
threads: int = 4
concurrency: int = 10
exclude_flags: List[str] = field(default_factory=list)
include_flags: List[str] = field(default_factory=list)
interesting_extensions: List[str] = field(default_factory=list)
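For context, a minimal sketch of how the new setting could be used from code. It assumes the remaining Configuration fields also have defaults, as the visible ones do; the values shown are illustrative, not taken from the commit.

from dirhunt.configuration import Configuration

# threads keeps its lowered default of 4 worker threads, while the new
# concurrency field caps simultaneous requests per domain (see DomainSemaphore
# in dirhunt/crawler.py below).
config = Configuration(
    urls=["http://example.com/"],
    threads=4,
    concurrency=10,
)
print(config.threads, config.concurrency)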
137 changes: 51 additions & 86 deletions dirhunt/crawler.py
@@ -1,9 +1,10 @@
# -*- coding: utf-8 -*-
import asyncio
import json
import multiprocessing
import os
from asyncio import Semaphore
from hashlib import sha256
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures.thread import _python_exit
from threading import Lock, ThreadError
import datetime
@@ -15,6 +16,7 @@
from dirhunt import __version__
from dirhunt._compat import queue, Queue, unregister
from dirhunt.cli import random_spinner
from dirhunt.configuration import Configuration
from dirhunt.crawler_url import CrawlerUrl
from dirhunt.exceptions import (
EmptyError,
@@ -23,7 +25,7 @@
IncompatibleVersionError,
)
from dirhunt.json_report import JsonReportEncoder
from dirhunt.sessions import Sessions
from dirhunt.sessions import Sessions, Session
from dirhunt.sources import Sources
from dirhunt.url_info import UrlsInfo

@@ -33,62 +35,63 @@
resume_dir = os.path.expanduser("~/.cache/dirhunt/")


class Crawler(ThreadPoolExecutor):
class DomainSemaphore:
"""Asyncio Semaphore per domain."""

def __init__(self, concurrency: int):
"""Initialize DomainSemaphore."""
self.concurrency = concurrency
self.semaphores = {}

async def acquire(self, domain: str):
"""Acquire semaphore for domain."""
if domain not in self.semaphores:
self.semaphores[domain] = Semaphore(self.concurrency)
await self.semaphores[domain].acquire()

def release(self, domain: str):
"""Release semaphore for domain."""
self.semaphores[domain].release()
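
The DomainSemaphore above lazily creates one asyncio.Semaphore per domain, so each domain gets at most `concurrency` requests in flight. A self-contained sketch of the acquire/release pattern it is meant for; the fetch coroutine, domain and paths below are illustrative stand-ins, not part of this diff.

import asyncio


class DomainSemaphore:
    """Asyncio Semaphore per domain (same shape as the class in this commit)."""

    def __init__(self, concurrency: int):
        self.concurrency = concurrency
        self.semaphores = {}

    async def acquire(self, domain: str):
        if domain not in self.semaphores:
            self.semaphores[domain] = asyncio.Semaphore(self.concurrency)
        await self.semaphores[domain].acquire()

    def release(self, domain: str):
        self.semaphores[domain].release()


async def fetch(semaphore: DomainSemaphore, domain: str, path: str) -> str:
    # At most `concurrency` coroutines per domain run this section at once.
    await semaphore.acquire(domain)
    try:
        await asyncio.sleep(0.1)  # stand-in for the real HTTP request
        return domain + path
    finally:
        # Always release, even if the request raises (mirrors the
        # try/finally added to CrawlerUrl.retrieve below).
        semaphore.release(domain)


async def main():
    semaphore = DomainSemaphore(2)
    paths = ["/a", "/b", "/c", "/d", "/e"]
    results = await asyncio.gather(
        *(fetch(semaphore, "example.com", path) for path in paths)
    )
    print(results)


asyncio.run(main())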


class Crawler:
urls_info = None

def __init__(
self,
max_workers=None,
interesting_extensions=None,
interesting_files=None,
interesting_keywords=None,
std=None,
progress_enabled=True,
timeout=10,
depth=3,
not_follow_subdomains=False,
exclude_sources=(),
not_allow_redirects=False,
proxies=None,
delay=0,
limit=1000,
to_file=None,
user_agent=None,
cookies=None,
headers=None,
):
if not max_workers and not delay:
max_workers = (multiprocessing.cpu_count() or 1) * 5
elif not max_workers and delay:
max_workers = len(proxies or [None])
super(Crawler, self).__init__(max_workers)
def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop):
"""Initialize Crawler.
:param configuration: Configuration instance
:param loop: asyncio loop
"""
self.configuration = configuration
self.loop = loop
self.tasks = set()
self.session = Session()
self.domain_semaphore = DomainSemaphore(configuration.concurrency)
self.domains = set()
self.results = Queue()
self.index_of_processors = []
self.proxies = proxies
self.delay = delay
self.sessions = Sessions(proxies, delay, user_agent, cookies, headers)
self.processing = {}
self.processed = {}
self.add_lock = Lock()
self.spinner = random_spinner()
self.start_dt = datetime.datetime.now()
self.interesting_extensions = interesting_extensions or []
self.interesting_files = interesting_files or []
self.interesting_keywords = interesting_keywords or []
self.closing = False
self.std = std or None
self.progress_enabled = progress_enabled
self.timeout = timeout
self.not_follow_subdomains = not_follow_subdomains
self.depth = depth
self.exclude_sources = exclude_sources
self.sources = Sources(self.add_url, self.add_message, exclude_sources)
self.not_allow_redirects = not_allow_redirects
self.limit = limit
self.current_processed_count = 0
self.to_file = to_file
self.initial_urls = []

async def start(self):
"""Add urls to process."""
for url in self.configuration.urls:
await self.add_crawler_url(
CrawlerUrl(self, url, depth=self.configuration.max_depth)
)
await asyncio.wait(self.tasks)

async def add_crawler_url(self, crawler_url: CrawlerUrl):
"""Add crawler_url to tasks"""
if crawler_url.url.url in self.processing:
return
task = self.loop.create_task(crawler_url.retrieve())
self.tasks.add(task)
self.processing[crawler_url.url.url] = task
task.add_done_callback(lambda t: self.tasks.discard(t))
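
For reference, a self-contained sketch of this task bookkeeping: the done callback receives the finished task, so the set can discard it directly, and because retrieve() may schedule new crawler URLs while the crawl runs, the asyncio.wait in start() can be repeated until the set stops refilling. The schedule/crawl_one names below are illustrative, not dirhunt API, and this is one possible pattern rather than what later commits necessarily do.

import asyncio

tasks = set()


def schedule(depth: int) -> None:
    """Create a task and keep a strong reference until it finishes."""
    task = asyncio.get_running_loop().create_task(crawl_one(depth))
    tasks.add(task)
    # add_done_callback passes the finished task to the callback,
    # so set.discard can be used directly as the callback.
    task.add_done_callback(tasks.discard)


async def crawl_one(depth: int) -> None:
    # Stand-in for CrawlerUrl.retrieve(); it may schedule more work.
    await asyncio.sleep(0.01)
    if depth > 0:
        schedule(depth - 1)


async def main() -> None:
    schedule(depth=3)
    # A single asyncio.wait() only covers tasks that exist when it is
    # called, so keep waiting until no new tasks are being added.
    while tasks:
        await asyncio.wait(set(tasks))


asyncio.run(main())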

def add_init_urls(self, *urls):
"""Add urls to queue."""
@@ -122,44 +125,6 @@ def add_domain(self, domain):
self.domains.add(domain)
self.sources.add_domain(domain)

def add_url(self, crawler_url, force=False, lock=True):
"""Add url to queue"""
if self.closing:
return
if not isinstance(crawler_url, CrawlerUrl):
crawler_url = CrawlerUrl(
self, crawler_url, depth=self.depth, timeout=self.timeout
)
if lock:
self.add_lock.acquire()
url = crawler_url.url
if (
not url.is_valid()
or not url.only_domain
or not self.in_domains(url.only_domain)
):
if lock:
self.add_lock.release()
return
if url.url in self.processing or url.url in self.processed:
if lock:
self.add_lock.release()
return self.processing.get(url.url) or self.processed.get(url.url)

fn = reraise_with_stack(crawler_url.start)
if self.closing:
if lock:
self.add_lock.release()
return
if force:
future = ThreadPoolExecutor(max_workers=1).submit(fn)
else:
future = self.submit(fn)
self.processing[url.url] = future
if lock:
self.add_lock.release()
return future

def add_message(self, body):
from dirhunt.processors import Message

52 changes: 29 additions & 23 deletions dirhunt/crawler_url.py
@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*-
import cgi
import socket
from typing import TYPE_CHECKING

from bs4 import BeautifulSoup
from requests import RequestException
from urllib3.exceptions import ReadTimeoutError

from dirhunt.url import Url
from dirhunt.url_loop import is_url_loop

MAX_RESPONSE_SIZE = 1024 * 512
FLAGS_WEIGHT = {
@@ -17,71 +17,75 @@
}


if TYPE_CHECKING:
from dirhunt.crawler import Crawler
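
The TYPE_CHECKING guard lets this module annotate crawler: "Crawler" without importing dirhunt.crawler at runtime, which would be circular since crawler.py imports CrawlerUrl. The same pattern in isolation, with hypothetical module and class names:

# crawler_url_sketch.py (hypothetical module mirroring this import pattern)
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by type checkers, never at runtime, so the
    # circular import between the two modules is avoided.
    from crawler_sketch import Crawler


class CrawlerUrlSketch:
    def __init__(self, crawler: "Crawler") -> None:
        self.crawler = crawler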


class CrawlerUrl(object):
def __init__(
self, crawler, url, depth=3, source=None, exists=None, type=None, timeout=10
self,
crawler: "Crawler",
target_url: str,
depth=3,
source=None,
exists=None,
url_type=None,
):
"""
:type crawler: Crawler
:type depth: int Máxima recursión sin haber subido respecto esta url
:type crawler: Crawler instance
:type target_url: Uniform Resource Identifier as string
:type depth: int maximum depth to crawl with respect to the initial url
"""
self.target_url = target_url
url = Url(target_url)
self.flags = set()
self.depth = depth
if not isinstance(url, Url):
url = Url(url)
if url.is_valid():
url.query = ""
url.fragment = ""
self.url = url
self.crawler = crawler
self.source = source
self.exists = exists
self.type = type
self.timeout = timeout
self.url_type = url_type
if url.is_valid() and (not url.path or url.path == "/"):
self.type = "directory"
self.resp = None
self.processor_data = None

def add_self_directories(self, exists=None, type_=None):
async def add_self_directories(self, exists=None, url_type=None):
for url in self.url.breadcrumb():
self.crawler.add_url(
await self.crawler.add_crawler_url(
CrawlerUrl(
self.crawler,
url,
self.depth - 1,
self,
exists,
type_,
timeout=self.timeout,
url_type,
)
)
# TODO: if the url cannot be added because it has already been added, mark it as existing when `exists` is requested

def start(self):
async def retrieve(self):
from dirhunt.processors import (
get_processor,
GenericProcessor,
Error,
ProcessIndexOfRequest,
)

if self.crawler.closing:
return self
session = self.crawler.sessions.get_session()
try:
with session.get(
await self.crawler.domain_semaphore.acquire(self.url.domain)
async with self.crawler.session.get(
self.url.url,
stream=True,
verify=False,
timeout=self.timeout,
verify_ssl=False,
timeout=self.crawler.configuration.timeout,
allow_redirects=False,
) as resp:
self.set_type(resp.headers.get("Content-Type"))
self.flags.add(str(resp.status_code))
if self.crawler.closing:
return self
self.flags.add(str(resp.status))
text = ""
soup = None
processor = None
@@ -106,6 +110,8 @@ def start(self):
self.crawler.results.put(Error(self, e))
self.close()
return self
finally:
self.crawler.domain_semaphore.release(self.url.domain)

if self.must_be_downloaded(resp):
processor = get_processor(resp, text, self, soup) or GenericProcessor(
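The rewritten retrieve() uses an async with session.get(...) block and resp.status, which matches an aiohttp-style client session; the diff does not show where dirhunt.sessions.Session comes from, so treat aiohttp here as an assumption. A minimal standalone sketch of that request pattern; note that recent aiohttp releases prefer ssl=False over the deprecated verify_ssl=False spelling used above.

import asyncio

import aiohttp  # assumed client library, based on the async with / resp.status usage


async def fetch_status(url: str) -> int:
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        # ssl=False disables certificate verification (newer spelling of
        # verify_ssl=False); allow_redirects=False mirrors the crawler.
        async with session.get(url, allow_redirects=False, ssl=False) as resp:
            content_type = resp.headers.get("Content-Type")
            body = await resp.text()
            print(content_type, len(body))
            return resp.status


if __name__ == "__main__":
    print(asyncio.run(fetch_status("http://example.com/")))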
28 changes: 26 additions & 2 deletions dirhunt/management.py
@@ -2,13 +2,14 @@

from __future__ import print_function

import asyncio
import re
import click as click
import os

import sys

from click import BadOptionUsage, Path, BadParameter
from click import BadOptionUsage, Path, BadParameter, UsageError

from dirhunt.configuration import ConfigurationDict, Configuration
from dirhunt.crawler import Crawler
@@ -141,6 +142,12 @@ def flags_range(flags):
)
@click.argument("urls", nargs=-1, type=force_url)
@click.option("-t", "--threads", type=int, help="Number of threads to use.")
@click.option(
"--concurrency",
type=int,
default=Configuration.concurrency,
help="Number of concurrent requests to domains.",
)
@click.option(
"-x",
"--exclude-flags",
@@ -253,8 +260,25 @@ def flags_range(flags):
)
def hunt(**kwargs: ConfigurationDict):
"""Find web directories without bruteforce"""
# Prepare configuration
kwargs["urls"] = flat_list(kwargs["urls"])
kwargs["proxies"] = multiplier_args(kwargs["proxies"])
kwargs["exclude_flags"] = flags_range(kwargs["exclude_flags"])
kwargs["include_flags"] = flags_range(kwargs["include_flags"])
if kwargs["exclude_flags"] and kwargs["include_flags"]:
raise UsageError("--exclude-flags and --include-flags are mutually exclusive.")
configuration = Configuration(**kwargs)
pass
welcome()
if not configuration.urls:
click.echo(
"•_•) OOPS! Add urls to analyze.\nFor example: dirhunt http://domain/path\n\n"
"Need help? Then use dirhunt --help",
err=True,
)
return
loop = asyncio.get_event_loop()
crawler = Crawler(configuration, loop)
loop.run_until_complete(crawler.start())


def main():
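Two side notes on the new entry point. First, a hypothetical invocation of the new option, assuming only the flags visible in this diff: dirhunt --concurrency 5 --threads 4 http://example.com/. Second, on current Python versions asyncio.get_event_loop() is deprecated when no loop is running; a possible asyncio.run-based variant is sketched below, under the assumption that Crawler can be given the loop obtained via asyncio.get_running_loop() inside a coroutine. This is a sketch, not what the commit does.

import asyncio

from dirhunt.configuration import Configuration
from dirhunt.crawler import Crawler


async def run_hunt(configuration: Configuration) -> None:
    # Inside a coroutine the running loop is available directly.
    loop = asyncio.get_running_loop()
    crawler = Crawler(configuration, loop)
    await crawler.start()


def hunt_entry_point(configuration: Configuration) -> None:
    # asyncio.run() creates and closes its own event loop, replacing the
    # get_event_loop() / run_until_complete() pair used in hunt() above.
    asyncio.run(run_hunt(configuration))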
