Issue #83: Use Asyncio.
Nekmo committed Aug 10, 2023
1 parent 1880a35 commit 138bcb4
Showing 6 changed files with 69 additions and 16 deletions.
18 changes: 10 additions & 8 deletions dirhunt/crawler_url.py
@@ -1,14 +1,15 @@
# -*- coding: utf-8 -*-
import asyncio
import cgi
from typing import TYPE_CHECKING, Any, Optional, Literal

from aiohttp import ClientResponse
from aiohttp import ClientResponse, ClientError
from aiohttp.web_response import Response
from bs4 import BeautifulSoup
from requests import RequestException
import charset_normalizer as chardet

from dirhunt.url import Url
from dirhunt.utils import get_message_from_exception

RESPONSE_CHUNK = 1024 * 4
MAX_RESPONSE_SIZE = 1024 * 512
@@ -42,23 +43,22 @@ async def get_content(response: "ClientResponse") -> str:


class CrawlerUrlRequest:
response = Optional[Response]
response: Optional[Response] = None
content: Optional[str] = None
_soup: Optional[BeautifulSoup] = None

def __init__(self, crawler_url: "CrawlerUrl"):
self.crawler_url = crawler_url
self.crawler = crawler_url.crawler

async def retrieve(self) -> "ProcessBase":
async def retrieve(self) -> Optional["ProcessBase"]:
from dirhunt.processors import (
get_processor,
Error,
)

processor = None
try:
await self.crawler.domain_semaphore.acquire(self.crawler_url.url.domain)
pass
async with self.crawler.session.get(
self.crawler_url.url.url,
verify_ssl=False,
@@ -73,9 +73,11 @@ async def retrieve(self) -> "ProcessBase":
self.content = await get_content(response)
if processor.has_descendants:
processor = get_processor(self)
except RequestException as e:
except (ClientError, asyncio.TimeoutError) as e:
self.crawler.current_processed_count += 1
processor = Error(self, e)
self.crawler.print_error(
f"Request error to {self.crawler_url.url}: {get_message_from_exception(e)}"
)
else:
await processor.process(self)
finally:
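Note on the hunk above: the requests-based except clause is replaced with the exceptions an aiohttp call can actually raise, ClientError and asyncio.TimeoutError, and the failure is reported through crawler.print_error instead of propagating. A minimal standalone sketch of the same pattern using plain aiohttp rather than dirhunt's Crawler (the fetch helper and URL are illustrative, not part of the project):

import asyncio
import aiohttp

async def fetch(url: str) -> str:
    """Fetch a URL, reporting aiohttp errors and timeouts instead of raising."""
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as response:
                return await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        # Mirror the commit's error reporting: exception class name plus message.
        return f"Request error to {url}: {e.__class__.__name__}: {e}"

# asyncio.run(fetch("https://example.com/"))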
4 changes: 4 additions & 0 deletions dirhunt/exceptions.py
@@ -31,6 +31,10 @@ class IncompatibleVersionError(DirHuntError):
pass


class SourceError(DirHuntError):
pass


def catch(fn):
def wrap(*args, **kwargs):
try:
4 changes: 2 additions & 2 deletions dirhunt/processors.py
@@ -441,8 +441,8 @@ def analyze_asset(self, asset) -> None:
if "wordpress" not in self.crawler_url.flags and "wp-content" in asset.path:
self.crawler_url.flags.update({"wordpress"})
# Override type always except for root path
self.crawler_url.type = (
"rewrite" if self.crawler_url.type != "directory" else "directory"
self.crawler_url.url_type = (
"rewrite" if self.crawler_url.url_type != "directory" else "directory"
)
self.crawler_url.depth -= 1

28 changes: 26 additions & 2 deletions dirhunt/sources/base.py
@@ -1,3 +1,4 @@
import asyncio
import datetime
import json
import os
@@ -11,13 +12,17 @@

from dirhunt import __version__
from dirhunt.crawler_url import CrawlerUrl
from dirhunt.exceptions import SourceError
from dirhunt.utils import get_message_from_exception

if TYPE_CHECKING:
from dirhunt.sources import Sources


class SourceBase:
max_cache_age = datetime.timedelta(days=7)
wait_locks = {}
wait_between_requests = None

def __init__(self, sources: "Sources", domain: str):
self.sources = sources
@@ -55,20 +60,27 @@ async def search_by_domain(self, domain: str) -> Iterable[str]:

async def retrieve_urls(self, domain: str):
urls = None
acquired_lock = False
if not self.is_cache_expired:
urls = self.get_from_cache()
if not urls and self.wait_between_requests:
acquired_lock = True
await self.acquire_wait_lock()
if urls is None:
try:
urls = await self.search_by_domain(domain)
except ClientError as e:
except (ClientError, SourceError, asyncio.TimeoutError) as e:
self.sources.crawler.print_error(
f"Failed to retrieve {domain} using the source {self.get_source_name()}: {e}"
f"Failed to retrieve {domain} using the source {self.get_source_name()}: "
f"{get_message_from_exception(e)}"
)
urls = []
else:
self.save_to_cache(urls)
for url in urls:
await self.add_url(url)
if acquired_lock:
await self.release_wait_lock()

def save_to_cache(self, urls: Iterable[str]) -> None:
cache_data = {
@@ -85,3 +97,15 @@ async def add_url(self, url: str):
await self.sources.crawler.add_crawler_url(
CrawlerUrl(self.sources.crawler, url)
)

async def acquire_wait_lock(self):
"""Acquire wait lock."""
if self.get_source_name() not in self.wait_locks:
self.wait_locks[self.get_source_name()] = asyncio.Lock()
await self.wait_locks[self.get_source_name()].acquire()

async def release_wait_lock(self):
"""Release wait lock."""
if self.get_source_name() in self.wait_locks:
await asyncio.sleep(self.wait_between_requests)
self.wait_locks[self.get_source_name()].release()
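The new wait-lock pair gives each source class a shared asyncio.Lock keyed by source name, and release_wait_lock sleeps for wait_between_requests before releasing, so consecutive queries to the same source are spaced out. A reduced sketch of that rate-limiting pattern outside dirhunt (the class name and delay value are illustrative):

import asyncio

class RateLimitedSource:
    # Shared across instances and keyed by source name, as in SourceBase.wait_locks.
    wait_locks = {}
    wait_between_requests = 2  # seconds between requests to this source (illustrative)

    @classmethod
    def get_source_name(cls) -> str:
        return cls.__name__.lower()

    async def acquire_wait_lock(self):
        # Create the per-source lock lazily on first use.
        lock = self.wait_locks.setdefault(self.get_source_name(), asyncio.Lock())
        await lock.acquire()

    async def release_wait_lock(self):
        # Hold the lock through the delay so the next caller waits it out.
        await asyncio.sleep(self.wait_between_requests)
        self.wait_locks[self.get_source_name()].release()

    async def query(self, domain: str):
        await self.acquire_wait_lock()
        try:
            print(f"querying source for {domain}")  # stand-in for search_by_domain()
        finally:
            await self.release_wait_lock()

async def main():
    source = RateLimitedSource()
    await asyncio.gather(source.query("example.com"), source.query("example.org"))

# asyncio.run(main())  # the second query starts roughly 2 s after the first finishes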
18 changes: 14 additions & 4 deletions dirhunt/sources/google.py
@@ -6,6 +6,9 @@
from pathlib import Path
from typing import Iterable, Optional

from bs4 import BeautifulSoup

from dirhunt.exceptions import SourceError
from dirhunt.sources.base import SourceBase

TIMEOUT = 10
@@ -15,6 +18,8 @@


class Google(SourceBase):
wait_between_requests = WAIT

@property
def google_cookies(self) -> Optional[Morsel]:
return self.sources.crawler.session.cookie_jar._cookies.get(("google.com", "/"))
@@ -62,8 +67,6 @@ def load_cookies(self):

async def search_by_domain(self, domain: str) -> Iterable[str]:
"""Search by domain in Google."""
# TODO: lock for concurrent requests.
# Load cookies from file if exists or request to Google if not.
cookies_path_exists = self.google_cookies_path.exists()
if not self.google_cookies and cookies_path_exists:
self.load_cookies()
@@ -91,5 +94,12 @@ async def search_by_domain(self, domain: str) -> Iterable[str]:
"btnG": "Google Search",
},
)
# TODO:
return []
soup = BeautifulSoup(text, "html.parser")

urls = [
a["href"].replace("/url?q=", "").split("&sa=", 1)[0]
for a in soup.find_all("a", {"class": "fuLhoc ZWRArf"})
]
if not urls:
raise SourceError("Google search not found urls")
return urls
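The list comprehension above strips Google's redirect wrapper from each result anchor: an href of the form /url?q=<target>&sa=... becomes the bare target URL. A self-contained illustration of that parsing step against a hard-coded snippet (the fuLhoc ZWRArf class comes from the diff above and is Google markup that may change at any time):

from bs4 import BeautifulSoup

html = (
    '<a class="fuLhoc ZWRArf" '
    'href="/url?q=https://example.com/admin/&amp;sa=U&amp;ved=xyz">example.com/admin</a>'
)
soup = BeautifulSoup(html, "html.parser")
urls = [
    # Drop the "/url?q=" prefix and everything from "&sa=" onwards.
    a["href"].replace("/url?q=", "").split("&sa=", 1)[0]
    for a in soup.find_all("a", {"class": "fuLhoc ZWRArf"})
]
print(urls)  # ['https://example.com/admin/']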
13 changes: 13 additions & 0 deletions dirhunt/utils.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import re
import string
from typing import Union, Type

import click
import requests
@@ -16,6 +17,18 @@
ARGUMENT_MULT = re.compile("(.+)\*(\d+)$")


def get_message_from_exception(exception: Union[Exception, Type[Exception]]) -> str:
if isinstance(exception, Exception):
exception_name = exception.__class__.__name__
else:
exception_name = exception.__name__
exception_body = str(exception)
if exception_body:
return "{}: {}".format(exception_name, exception_body)
else:
return exception_name


def lrange(start, end):
return list(range(start, end))

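get_message_from_exception accepts either an exception instance or an exception class, which is what the isinstance branch handles, and it drops the colon when the exception carries no message. Roughly, the expected behaviour:

import asyncio
from dirhunt.utils import get_message_from_exception

print(get_message_from_exception(ValueError("bad value")))  # ValueError: bad value
print(get_message_from_exception(asyncio.TimeoutError()))   # TimeoutError (no message body)
print(get_message_from_exception(ConnectionError))          # ConnectionError (a class, not an instance)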
