Skip to content

Commit

Permalink
Issue #83: Use asyncio.
Browse files Browse the repository at this point in the history
  • Loading branch information
Nekmo committed Aug 11, 2023
1 parent 9c656b7 commit ef98f5e
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 33 deletions.
2 changes: 1 addition & 1 deletion dirhunt/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
Google,
CommonCrawl,
CrtSh,
# CertificateSSL,
CertificateSSL,
# Wayback,
]

Expand Down
1 change: 1 addition & 0 deletions dirhunt/sources/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

class SourceBase:
max_cache_age = datetime.timedelta(days=7)
allow_subdomains = False
wait_locks = {}
wait_between_requests = None

Expand Down
14 changes: 3 additions & 11 deletions dirhunt/sources/commoncrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,9 @@
class CommonCrawl(SourceBase):
async def get_latest_craw_index(self):
    """Return the CDX API endpoint URL of the most recent CommonCrawl index.

    Fetches the collection listing from ``COMMONCRAWL_URL`` using the shared
    crawler session. HTTP errors propagate via ``raise_for_status()``; aiohttp
    client errors and JSON decode errors propagate to the caller as well.

    :return: the ``cdx-api`` URL of the newest index, or ``None`` when the
        listing is empty.

    NOTE(review): "craw" in the method name looks like a typo for "crawl",
    but it is part of the public interface and kept for compatibility.
    """
    url = COMMONCRAWL_URL
    async with self.sources.crawler.session.get(url, timeout=TIMEOUT) as response:
        response.raise_for_status()
        crawl_indexes = await response.json()
    # Guard against an empty listing: indexing [0] below would raise
    # IndexError otherwise (this check existed in the pre-asyncio version).
    if not crawl_indexes:
        return None
    latest_crawl_index = crawl_indexes[0]
    return latest_crawl_index["cdx-api"]

Expand Down
35 changes: 14 additions & 21 deletions dirhunt/sources/ssl.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
import socket
import ssl
import sys
from typing import Iterable

from dirhunt.exceptions import SourceError
from dirhunt.sources.base import SourceBase


if sys.version_info < (3,):
ConnectionError = socket.error


DEFAULT_SSL_PORT = 443


Expand All @@ -18,20 +15,16 @@ def get_url(protocol, domain, path):


class CertificateSSL(SourceBase):
    """Discover subdomains from the SubjectAltName entries of the target's
    TLS certificate, obtained over the shared aiohttp session."""

    async def search_by_domain(self, domain: str) -> Iterable[str]:
        """Connect to ``https://{domain}`` and build one URL per certificate
        alt name.

        :param domain: hostname to probe over HTTPS (port 443).
        :return: list of ``https://{name}/`` URLs derived from the cert.
        :raises SourceError: when the response exposes no connection object
            (so the peer certificate cannot be read).
        """
        async with self.sources.crawler.session.get(f"https://{domain}") as response:
            response.raise_for_status()
            if response.connection is None:
                raise SourceError("Connection is not available.")
            # Must be read inside the context manager, before the
            # connection is released back to the pool.
            cert = response.connection.transport.get_extra_info("peercert")
        # get_extra_info() may return None (e.g. no TLS info available).
        alt_names = (cert or {}).get("subjectAltName") or ()
        urls = []
        for alt_name in alt_names:
            alt_name_domain = alt_name[1]
            # Wildcard cert names use the "*." prefix (e.g. "*.example.com").
            # The previous ``replace(".*", "", 1)`` never matched that prefix,
            # so wildcard entries produced invalid "https://*.example.com/"
            # URLs; strip the leading "*." instead.
            if alt_name_domain.startswith("*."):
                alt_name_domain = alt_name_domain[2:]
            urls.append("https://{}/".format(alt_name_domain))
        return urls

0 comments on commit ef98f5e

Please sign in to comment.