Skip to content

Commit

Permalink
Issue #83: Use Asyncio
Browse files Browse the repository at this point in the history
  • Loading branch information
Nekmo committed Aug 10, 2023
1 parent d742ffe commit fe2707d
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 53 deletions.
47 changes: 22 additions & 25 deletions dirhunt/crawler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import asyncio
import datetime
import functools
import json
import os
from asyncio import Semaphore, Task
Expand Down Expand Up @@ -90,9 +91,8 @@ async def start(self):

async def add_crawler_url(self, crawler_url: CrawlerUrl) -> Optional[asyncio.Task]:
"""Add crawler_url to tasks"""
if (
crawler_url in self.crawler_urls
or crawler_url.url.domain not in self.domains
if crawler_url in self.crawler_urls or not self.in_domains(
crawler_url.url.domain
):
return
self.current_processed_count += 1
Expand All @@ -101,6 +101,25 @@ async def add_crawler_url(self, crawler_url: CrawlerUrl) -> Optional[asyncio.Tas
crawler_url.retrieve(), name=f"crawlerurl-{self.current_processed_count}"
)

async def add_domain(self, domain: str):
    """Register *domain* and propagate it to the sources manager, once.

    Re-adding an already-known domain is a no-op, so ``sources.add_domain``
    is awaited at most one time per distinct domain.
    """
    if domain not in self.domains:
        self.domains.add(domain)
        await self.sources.add_domain(domain)

def in_domains(self, target_domain: str) -> bool:
    """Return True if *target_domain* is in scope for the crawl.

    A domain is in scope when it is already registered, or (unless
    ``not_follow_subdomains`` is configured) when it is a subdomain of a
    registered domain.  Matched subdomains are added to ``self.domains``
    so subsequent lookups hit the fast membership test.

    Fix: the previous ``@functools.lru_cache`` decorator is removed.
    Caching on an instance method keeps ``self`` alive for the cache's
    lifetime (ruff B019) and, worse, returned stale ``False`` results:
    ``self.domains`` mutates during the crawl (``add_domain``), so a
    cached miss stayed a miss forever.  The ``self.domains.add`` below
    already memoizes positive results.
    """
    if target_domain in self.domains:
        return True
    if self.configuration.not_follow_subdomains:
        return False
    for domain in self.domains:
        if target_domain.endswith(f".{domain}"):
            # Promote the matched subdomain to a first-class domain so the
            # next lookup returns on the membership test above.  We return
            # immediately, so mutating the set mid-iteration is safe.
            self.domains.add(target_domain)
            return True
    return False

def print_error(self, message: str):
"""Print error message to console."""
text = Text()
Expand All @@ -119,28 +138,6 @@ def add_init_urls(self, *urls):
self.add_domain(crawler_url.url.only_domain)
self.add_url(crawler_url, lock=False)

def in_domains(self, domain):
    """Tell whether *domain* belongs to the crawl scope.

    A domain matches when it, or any parent domain of it (down to the
    two-label suffix), is already registered.  A matching subdomain is
    itself registered through ``add_domain``.
    """
    if self.not_follow_subdomains and domain not in self.domains:
        return False
    labels = domain.split(".")
    # Candidate suffixes: the domain itself, then each parent that still
    # has at least two labels (mirrors the original strip-leftmost loop).
    for start in range(max(len(labels) - 1, 1)):
        candidate = ".".join(labels[start:])
        if candidate in self.domains:
            if candidate != domain:
                # A parent matched: remember the subdomain as well.
                self.add_domain(domain)
            return True
    return False

async def add_domain(self, domain: str):
    """Add domain to domains.

    No-op when *domain* is already registered; otherwise records it in
    ``self.domains`` and notifies the sources manager asynchronously.
    """
    if domain in self.domains:
        return
    self.domains.add(domain)
    await self.sources.add_domain(domain)

def add_task(
self, coro: Coroutine[Any, Any, Any], name: Optional[str] = None
) -> Task:
Expand Down
2 changes: 2 additions & 0 deletions dirhunt/processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,8 @@ async def process(self, crawler_url_request: "CrawlerUrlRequest") -> None:
)
]
for url in urls:
if url is None:
continue
await self.add_url(url, depth=0, url_type="asset")

@classmethod
Expand Down
2 changes: 1 addition & 1 deletion dirhunt/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
# VirusTotal,
# Google,
CommonCrawl,
# CrtSh,
CrtSh,
# CertificateSSL,
# Wayback,
]
Expand Down
5 changes: 0 additions & 5 deletions dirhunt/sources/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from pathlib import Path
from typing import List, Iterable, Optional

import aiofiles
from aiohttp import ClientError
from platformdirs import user_cache_dir
from typing_extensions import TYPE_CHECKING
Expand Down Expand Up @@ -84,7 +83,3 @@ async def add_url(self, url: str):
await self.sources.crawler.add_crawler_url(
CrawlerUrl(self.sources.crawler, url)
)

def add_error(self, message):
    """Report *message* through the error callback when one is configured."""
    callback = self.error_callback
    if callback:
        callback(message)
33 changes: 15 additions & 18 deletions dirhunt/sources/crtsh.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,18 @@


class CrtSh(SourceBase):
    """Domain source backed by the crt.sh certificate-transparency search.

    NOTE(review): this text comes from a diff; ``callback`` is the old
    synchronous (requests-style) implementation and ``search_by_domain``
    its asyncio replacement — only one presumably exists in the final file.
    Statement nesting was reconstructed from diff output; confirm against
    the repository before relying on exact block boundaries.
    """

    def callback(self, domain):
        """Query crt.sh for *domain* and push each certificate common name
        as a result URL; network errors are reported via ``add_error``,
        not raised.
        """
        session = Sessions().get_session()
        try:
            with session.get(
                CRTSH_URL,
                params={"q": domain, "output": "json"},
                stream=True,
                timeout=TIMEOUT,
                headers={"User-Agent": USER_AGENT},
            ) as response:
                response.raise_for_status()
                certs = response.json()
                # Deduplicate: many certs share a common_name.
                common_names = {cert["common_name"] for cert in certs}
                for common_name in common_names:
                    self.add_result("https://{}/".format(common_name))
        except RequestException as e:
            self.add_error("Error on Crt.sh source: {}".format(e))
            return

    async def search_by_domain(self, domain: str):
        """Return ``https://<common_name>/`` URLs for certificates issued
        to *domain*, deduplicated and excluding wildcard names.

        Uses the crawler's shared aiohttp session; HTTP errors propagate
        via ``raise_for_status`` (unlike the legacy ``callback`` above,
        no exception is swallowed here).
        """
        async with self.sources.crawler.session.get(
            CRTSH_URL,
            params={"q": domain, "output": "json"},
            timeout=TIMEOUT,
            headers={"User-Agent": USER_AGENT},
        ) as response:
            response.raise_for_status()
            certs = await response.json()
            common_names = {cert["common_name"] for cert in certs}
            # Wildcard entries ("*.example.com") are not crawlable URLs.
            return [
                f"https://{common_name}/"
                for common_name in common_names
                if "*" not in common_name
            ]
8 changes: 4 additions & 4 deletions dirhunt/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def full_url_address(address, url):
return url


class Url(object):
class Url:
_urlparsed = None

def __init__(self, address):
Expand Down Expand Up @@ -76,20 +76,20 @@ def protocol(self):

@property
def is_absolute(self):
    """Return True when the address carries a network location (a full
    URL), False when it is only a path or cannot be parsed."""
    return bool(self.urlparsed.netloc) if self.urlparsed else False

@property
def domain_port(self):
    """Domain with port

    Host (plus ``:port`` when present) of the URL, with any userinfo
    (``user:pass@``) stripped.  None when the URL has no parse result
    or the netloc is empty.
    """
    parsed = self.urlparsed
    if not parsed:
        return None
    # netloc is the second urlparse component; drop credentials before "@".
    host_and_port = parsed[1].split("@", 1)[-1]
    return host_and_port or None

@property
def only_domain(self):
    """Return domain without port (hostname only), or None when unknown."""
    host = (self.domain_port or "").partition(":")[0]
    return host or None

@property
Expand Down

0 comments on commit fe2707d

Please sign in to comment.