Commit 622c45f

Issue #83: Use Asyncio

Nekmo committed Aug 9, 2023
1 parent 9f0ed72 commit 622c45f
Showing 7 changed files with 146 additions and 169 deletions.
3 changes: 2 additions & 1 deletion dirhunt/configuration.py
@@ -38,7 +38,8 @@ class Configuration:
"""

urls: List[str] = field(default_factory=list)
threads: int = 10
threads: int = 4
concurrency: int = 10
exclude_flags: List[str] = field(default_factory=list)
include_flags: List[str] = field(default_factory=list)
interesting_extensions: List[str] = field(default_factory=list)
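For context, a minimal sketch of how the new setting could be used from code. It assumes the remaining Configuration fields also have defaults, as the visible ones do; the values shown are illustrative, not taken from the commit.

from dirhunt.configuration import Configuration

# threads keeps its lowered default of 4 worker threads, while the new
# concurrency field caps simultaneous requests per domain (see DomainSemaphore
# in dirhunt/crawler.py below).
config = Configuration(
    urls=["http://example.com/"],
    threads=4,
    concurrency=10,
)
print(config.threads, config.concurrency)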
137 changes: 51 additions & 86 deletions dirhunt/crawler.py
@@ -1,9 +1,10 @@
# -*- coding: utf-8 -*-
import asyncio
import json
import multiprocessing
import os
from asyncio import Semaphore
from hashlib import sha256
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures.thread import _python_exit
from threading import Lock, ThreadError
import datetime
@@ -15,6 +16,7 @@
from dirhunt import __version__
from dirhunt._compat import queue, Queue, unregister
from dirhunt.cli import random_spinner
from dirhunt.configuration import Configuration
from dirhunt.crawler_url import CrawlerUrl
from dirhunt.exceptions import (
EmptyError,
@@ -23,7 +25,7 @@
IncompatibleVersionError,
)
from dirhunt.json_report import JsonReportEncoder
from dirhunt.sessions import Sessions
from dirhunt.sessions import Sessions, Session
from dirhunt.sources import Sources
from dirhunt.url_info import UrlsInfo

@@ -33,62 +35,63 @@
resume_dir = os.path.expanduser("~/.cache/dirhunt/")


class Crawler(ThreadPoolExecutor):
class DomainSemaphore:
"""Asyncio Semaphore per domain."""

def __init__(self, concurrency: int):
"""Initialize DomainSemaphore."""
self.concurrency = concurrency
self.semaphores = {}

async def acquire(self, domain: str):
"""Acquire semaphore for domain."""
if domain not in self.semaphores:
self.semaphores[domain] = Semaphore(self.concurrency)
await self.semaphores[domain].acquire()

def release(self, domain: str):
"""Release semaphore for domain."""
self.semaphores[domain].release()
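
The DomainSemaphore above lazily creates one asyncio.Semaphore per domain, so each domain gets at most `concurrency` requests in flight. A self-contained sketch of the acquire/release pattern it is meant for; the fetch coroutine, domain and paths below are illustrative stand-ins, not part of this diff.

import asyncio


class DomainSemaphore:
    """Asyncio Semaphore per domain (same shape as the class in this commit)."""

    def __init__(self, concurrency: int):
        self.concurrency = concurrency
        self.semaphores = {}

    async def acquire(self, domain: str):
        if domain not in self.semaphores:
            self.semaphores[domain] = asyncio.Semaphore(self.concurrency)
        await self.semaphores[domain].acquire()

    def release(self, domain: str):
        self.semaphores[domain].release()


async def fetch(semaphore: DomainSemaphore, domain: str, path: str) -> str:
    # At most `concurrency` coroutines per domain run this section at once.
    await semaphore.acquire(domain)
    try:
        await asyncio.sleep(0.1)  # stand-in for the real HTTP request
        return domain + path
    finally:
        # Always release, even if the request raises (mirrors the
        # try/finally added to CrawlerUrl.retrieve below).
        semaphore.release(domain)


async def main():
    semaphore = DomainSemaphore(2)
    paths = ["/a", "/b", "/c", "/d", "/e"]
    results = await asyncio.gather(
        *(fetch(semaphore, "example.com", path) for path in paths)
    )
    print(results)


asyncio.run(main())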


class Crawler:
urls_info = None

def __init__(
self,
max_workers=None,
interesting_extensions=None,
interesting_files=None,
interesting_keywords=None,
std=None,
progress_enabled=True,
timeout=10,
depth=3,
not_follow_subdomains=False,
exclude_sources=(),
not_allow_redirects=False,
proxies=None,
delay=0,
limit=1000,
to_file=None,
user_agent=None,
cookies=None,
headers=None,
):
if not max_workers and not delay:
max_workers = (multiprocessing.cpu_count() or 1) * 5
elif not max_workers and delay:
max_workers = len(proxies or [None])
super(Crawler, self).__init__(max_workers)
def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop):
"""Initialize Crawler.
:param configuration: Configuration instance
:param loop: asyncio loop
"""
self.configuration = configuration
self.loop = loop
self.tasks = set()
self.session = Session()
self.domain_semaphore = DomainSemaphore(configuration.concurrency)
self.domains = set()
self.results = Queue()
self.index_of_processors = []
self.proxies = proxies
self.delay = delay
self.sessions = Sessions(proxies, delay, user_agent, cookies, headers)
self.processing = {}
self.processed = {}
self.add_lock = Lock()
self.spinner = random_spinner()
self.start_dt = datetime.datetime.now()
self.interesting_extensions = interesting_extensions or []
self.interesting_files = interesting_files or []
self.interesting_keywords = interesting_keywords or []
self.closing = False
self.std = std or None
self.progress_enabled = progress_enabled
self.timeout = timeout
self.not_follow_subdomains = not_follow_subdomains
self.depth = depth
self.exclude_sources = exclude_sources
self.sources = Sources(self.add_url, self.add_message, exclude_sources)
self.not_allow_redirects = not_allow_redirects
self.limit = limit
self.current_processed_count = 0
self.to_file = to_file
self.initial_urls = []

async def start(self):
"""Add urls to process."""
for url in self.configuration.urls:
await self.add_crawler_url(
CrawlerUrl(self, url, depth=self.configuration.max_depth)
)
await asyncio.wait(self.tasks)

async def add_crawler_url(self, crawler_url: CrawlerUrl):
"""Add crawler_url to tasks"""
if crawler_url.url.url in self.processing:
return
task = self.loop.create_task(crawler_url.retrieve())
self.tasks.add(task)
self.processing[crawler_url.url.url] = task
task.add_done_callback(lambda t: self.tasks.discard(t))
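
For reference, a self-contained sketch of this task bookkeeping: the done callback receives the finished task, so the set can discard it directly, and because retrieve() may schedule new crawler URLs while the crawl runs, the asyncio.wait in start() can be repeated until the set stops refilling. The schedule/crawl_one names below are illustrative, not dirhunt API, and this is one possible pattern rather than what later commits necessarily do.

import asyncio

tasks = set()


def schedule(depth: int) -> None:
    """Create a task and keep a strong reference until it finishes."""
    task = asyncio.get_running_loop().create_task(crawl_one(depth))
    tasks.add(task)
    # add_done_callback passes the finished task to the callback,
    # so set.discard can be used directly as the callback.
    task.add_done_callback(tasks.discard)


async def crawl_one(depth: int) -> None:
    # Stand-in for CrawlerUrl.retrieve(); it may schedule more work.
    await asyncio.sleep(0.01)
    if depth > 0:
        schedule(depth - 1)


async def main() -> None:
    schedule(depth=3)
    # A single asyncio.wait() only covers tasks that exist when it is
    # called, so keep waiting until no new tasks are being added.
    while tasks:
        await asyncio.wait(set(tasks))


asyncio.run(main())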

def add_init_urls(self, *urls):
"""Add urls to queue."""
@@ -122,44 +125,6 @@ def add_domain(self, domain):
self.domains.add(domain)
self.sources.add_domain(domain)

def add_url(self, crawler_url, force=False, lock=True):
"""Add url to queue"""
if self.closing:
return
if not isinstance(crawler_url, CrawlerUrl):
crawler_url = CrawlerUrl(
self, crawler_url, depth=self.depth, timeout=self.timeout
)
if lock:
self.add_lock.acquire()
url = crawler_url.url
if (
not url.is_valid()
or not url.only_domain
or not self.in_domains(url.only_domain)
):
if lock:
self.add_lock.release()
return
if url.url in self.processing or url.url in self.processed:
if lock:
self.add_lock.release()
return self.processing.get(url.url) or self.processed.get(url.url)

fn = reraise_with_stack(crawler_url.start)
if self.closing:
if lock:
self.add_lock.release()
return
if force:
future = ThreadPoolExecutor(max_workers=1).submit(fn)
else:
future = self.submit(fn)
self.processing[url.url] = future
if lock:
self.add_lock.release()
return future

def add_message(self, body):
from dirhunt.processors import Message

52 changes: 29 additions & 23 deletions dirhunt/crawler_url.py
@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*-
import cgi
import socket
from typing import TYPE_CHECKING

from bs4 import BeautifulSoup
from requests import RequestException
from urllib3.exceptions import ReadTimeoutError

from dirhunt.url import Url
from dirhunt.url_loop import is_url_loop

MAX_RESPONSE_SIZE = 1024 * 512
FLAGS_WEIGHT = {
@@ -17,71 +17,75 @@
}


if TYPE_CHECKING:
from dirhunt.crawler import Crawler
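
The TYPE_CHECKING guard lets this module annotate crawler: "Crawler" without importing dirhunt.crawler at runtime, which would be circular since crawler.py imports CrawlerUrl. The same pattern in isolation, with hypothetical module and class names:

# crawler_url_sketch.py (hypothetical module mirroring this import pattern)
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by type checkers, never at runtime, so the
    # circular import between the two modules is avoided.
    from crawler_sketch import Crawler


class CrawlerUrlSketch:
    def __init__(self, crawler: "Crawler") -> None:
        self.crawler = crawler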


class CrawlerUrl(object):
def __init__(
self, crawler, url, depth=3, source=None, exists=None, type=None, timeout=10
self,
crawler: "Crawler",
target_url: str,
depth=3,
source=None,
exists=None,
url_type=None,
):
"""
:type crawler: Crawler
:type depth: int Máxima recursión sin haber subido respecto esta url
:type crawler: Crawler instance
:type target_url: Uniform Resource Identifier as string
:type depth: int maximum depth to crawl with respect to the initial url
"""
self.target_url = target_url
url = Url(target_url)
self.flags = set()
self.depth = depth
if not isinstance(url, Url):
url = Url(url)
if url.is_valid():
url.query = ""
url.fragment = ""
self.url = url
self.crawler = crawler
self.source = source
self.exists = exists
self.type = type
self.timeout = timeout
self.url_type = url_type
if url.is_valid() and (not url.path or url.path == "/"):
self.type = "directory"
self.resp = None
self.processor_data = None

def add_self_directories(self, exists=None, type_=None):
async def add_self_directories(self, exists=None, url_type=None):
for url in self.url.breadcrumb():
self.crawler.add_url(
await self.crawler.add_crawler_url(
CrawlerUrl(
self.crawler,
url,
self.depth - 1,
self,
exists,
type_,
timeout=self.timeout,
url_type,
)
)
# TODO: if the url cannot be added because it has already been added, mark it as existing when `exists` is requested

def start(self):
async def retrieve(self):
from dirhunt.processors import (
get_processor,
GenericProcessor,
Error,
ProcessIndexOfRequest,
)

if self.crawler.closing:
return self
session = self.crawler.sessions.get_session()
try:
with session.get(
await self.crawler.domain_semaphore.acquire(self.url.domain)
async with self.crawler.session.get(
self.url.url,
stream=True,
verify=False,
timeout=self.timeout,
verify_ssl=False,
timeout=self.crawler.configuration.timeout,
allow_redirects=False,
) as resp:
self.set_type(resp.headers.get("Content-Type"))
self.flags.add(str(resp.status_code))
if self.crawler.closing:
return self
self.flags.add(str(resp.status))
text = ""
soup = None
processor = None
@@ -106,6 +110,8 @@ def start(self):
self.crawler.results.put(Error(self, e))
self.close()
return self
finally:
self.crawler.domain_semaphore.release(self.url.domain)

if self.must_be_downloaded(resp):
processor = get_processor(resp, text, self, soup) or GenericProcessor(
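The rewritten retrieve() uses an async with session.get(...) block and resp.status, which matches an aiohttp-style client session; the diff does not show where dirhunt.sessions.Session comes from, so treat aiohttp here as an assumption. A minimal standalone sketch of that request pattern; note that recent aiohttp releases prefer ssl=False over the deprecated verify_ssl=False spelling used above.

import asyncio

import aiohttp  # assumed client library, based on the async with / resp.status usage


async def fetch_status(url: str) -> int:
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        # ssl=False disables certificate verification (newer spelling of
        # verify_ssl=False); allow_redirects=False mirrors the crawler.
        async with session.get(url, allow_redirects=False, ssl=False) as resp:
            content_type = resp.headers.get("Content-Type")
            body = await resp.text()
            print(content_type, len(body))
            return resp.status


if __name__ == "__main__":
    print(asyncio.run(fetch_status("http://example.com/")))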
28 changes: 26 additions & 2 deletions dirhunt/management.py
@@ -2,13 +2,14 @@

from __future__ import print_function

import asyncio
import re
import click as click
import os

import sys

from click import BadOptionUsage, Path, BadParameter
from click import BadOptionUsage, Path, BadParameter, UsageError

from dirhunt.configuration import ConfigurationDict, Configuration
from dirhunt.crawler import Crawler
@@ -141,6 +142,12 @@ def flags_range(flags):
)
@click.argument("urls", nargs=-1, type=force_url)
@click.option("-t", "--threads", type=int, help="Number of threads to use.")
@click.option(
"--concurrency",
type=int,
default=Configuration.concurrency,
help="Number of concurrent requests to domains.",
)
@click.option(
"-x",
"--exclude-flags",
@@ -253,8 +260,25 @@ def flags_range(flags):
)
def hunt(**kwargs: ConfigurationDict):
"""Find web directories without bruteforce"""
# Prepare configuration
kwargs["urls"] = flat_list(kwargs["urls"])
kwargs["proxies"] = multiplier_args(kwargs["proxies"])
kwargs["exclude_flags"] = flags_range(kwargs["exclude_flags"])
kwargs["include_flags"] = flags_range(kwargs["include_flags"])
if kwargs["exclude_flags"] and kwargs["include_flags"]:
raise UsageError("--exclude-flags and --include-flags are mutually exclusive.")
configuration = Configuration(**kwargs)
pass
welcome()
if not configuration.urls:
click.echo(
"•_•) OOPS! Add urls to analyze.\nFor example: dirhunt http://domain/path\n\n"
"Need help? Then use dirhunt --help",
err=True,
)
return
loop = asyncio.get_event_loop()
crawler = Crawler(configuration, loop)
loop.run_until_complete(crawler.start())


def main():
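Two side notes on the new entry point. First, a hypothetical invocation of the new option, assuming only the flags visible in this diff: dirhunt --concurrency 5 --threads 4 http://example.com/. Second, on current Python versions asyncio.get_event_loop() is deprecated when no loop is running; a possible asyncio.run-based variant is sketched below, under the assumption that Crawler can be given the loop obtained via asyncio.get_running_loop() inside a coroutine. This is a sketch, not what the commit does.

import asyncio

from dirhunt.configuration import Configuration
from dirhunt.crawler import Crawler


async def run_hunt(configuration: Configuration) -> None:
    # Inside a coroutine the running loop is available directly.
    loop = asyncio.get_running_loop()
    crawler = Crawler(configuration, loop)
    await crawler.start()


def hunt_entry_point(configuration: Configuration) -> None:
    # asyncio.run() creates and closes its own event loop, replacing the
    # get_event_loop() / run_until_complete() pair used in hunt() above.
    asyncio.run(run_hunt(configuration))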
