Skip to content

Commit

Permalink
Issue #83: Use asyncio.
Browse files Browse the repository at this point in the history
  • Loading branch information
Nekmo committed Aug 11, 2023
1 parent 9c656b7 commit ef98f5e
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 33 deletions.
2 changes: 1 addition & 1 deletion dirhunt/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
Google,
CommonCrawl,
CrtSh,
# CertificateSSL,
CertificateSSL,
# Wayback,
]

Expand Down
1 change: 1 addition & 0 deletions dirhunt/sources/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

class SourceBase:
max_cache_age = datetime.timedelta(days=7)
allow_subdomains = False
wait_locks = {}
wait_between_requests = None

Expand Down
14 changes: 3 additions & 11 deletions dirhunt/sources/commoncrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,9 @@
class CommonCrawl(SourceBase):
async def get_latest_craw_index(self):
    """Return the CDX API endpoint URL of the most recent CommonCrawl index.

    Fetches the collection listing from ``COMMONCRAWL_URL`` using the shared
    crawler session. HTTP errors propagate via ``raise_for_status()``; aiohttp
    client errors and JSON decode errors propagate to the caller as well.

    :return: the ``cdx-api`` URL of the newest index, or ``None`` when the
        listing is empty.

    NOTE(review): "craw" in the method name looks like a typo for "crawl",
    but it is part of the public interface and kept for compatibility.
    """
    url = COMMONCRAWL_URL
    async with self.sources.crawler.session.get(url, timeout=TIMEOUT) as response:
        response.raise_for_status()
        crawl_indexes = await response.json()
    # Guard against an empty listing: indexing [0] below would raise
    # IndexError otherwise (this check existed in the pre-asyncio version).
    if not crawl_indexes:
        return None
    latest_crawl_index = crawl_indexes[0]
    return latest_crawl_index["cdx-api"]

Expand Down
35 changes: 14 additions & 21 deletions dirhunt/sources/ssl.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
import socket
import ssl
import sys
from typing import Iterable

from dirhunt.exceptions import SourceError
from dirhunt.sources.base import SourceBase


if sys.version_info < (3,):
ConnectionError = socket.error


DEFAULT_SSL_PORT = 443


Expand All @@ -18,20 +15,16 @@ def get_url(protocol, domain, path):


class CertificateSSL(SourceBase):
    """Discover subdomains from the SubjectAltName entries of the target's
    TLS certificate, obtained over the shared aiohttp session."""

    async def search_by_domain(self, domain: str) -> Iterable[str]:
        """Connect to ``https://{domain}`` and build one URL per certificate
        alt name.

        :param domain: hostname to probe over HTTPS (port 443).
        :return: list of ``https://{name}/`` URLs derived from the cert.
        :raises SourceError: when the response exposes no connection object
            (so the peer certificate cannot be read).
        """
        async with self.sources.crawler.session.get(f"https://{domain}") as response:
            response.raise_for_status()
            if response.connection is None:
                raise SourceError("Connection is not available.")
            # Must be read inside the context manager, before the
            # connection is released back to the pool.
            cert = response.connection.transport.get_extra_info("peercert")
        # get_extra_info() may return None (e.g. no TLS info available).
        alt_names = (cert or {}).get("subjectAltName") or ()
        urls = []
        for alt_name in alt_names:
            alt_name_domain = alt_name[1]
            # Wildcard cert names use the "*." prefix (e.g. "*.example.com").
            # The previous ``replace(".*", "", 1)`` never matched that prefix,
            # so wildcard entries produced invalid "https://*.example.com/"
            # URLs; strip the leading "*." instead.
            if alt_name_domain.startswith("*."):
                alt_name_domain = alt_name_domain[2:]
            urls.append("https://{}/".format(alt_name_domain))
        return urls

0 comments on commit ef98f5e

Please sign in to comment.