Issue #83: Use Asyncio (Google source).

Nekmo · Aug 10, 2023 · 1880a35 · 1880a35
1 parent fe2707d
commit 1880a35
Show file tree

Hide file tree

Showing 5 changed files with 108 additions and 23 deletions.
diff --git a/dirhunt/crawler.py b/dirhunt/crawler.py
@@ -14,7 +14,6 @@
 from click import get_terminal_size
 from rich.console import Console
 from rich.text import Text
-from rich.traceback import install
 
 from dirhunt import __version__
 from dirhunt._compat import queue, Queue, unregister
@@ -32,7 +31,6 @@
 
 
 resume_dir = os.path.expanduser("~/.cache/dirhunt/")
-install(show_locals=True)
 
 
 class DomainSemaphore:
@@ -68,7 +66,7 @@ def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop
         self.crawler_urls: Set[CrawlerUrl] = set()
         self.domains: Set[str] = set()
         self.console = Console(highlight=False)
-        self.session = Session()
+        self.session = Session(self)
         self.domain_semaphore = DomainSemaphore(configuration.concurrency)
         self.results = Queue()
         self.index_of_processors = []

diff --git a/dirhunt/sessions.py b/dirhunt/sessions.py
@@ -4,9 +4,11 @@
 import warnings
 
 from aiohttp import ClientSession
+from multidict import CIMultiDict
 from requests import Timeout
 from requests.adapters import HTTPAdapter
 from requests.exceptions import ProxyError
+from typing_extensions import TYPE_CHECKING
 
 from dirhunt._compat import Queue
 
@@ -16,8 +18,9 @@
 
 from dirhunt.agents import get_random_user_agent
 
-if sys.version_info < (3, 0):
-    ConnectionError = IOError
+
+if TYPE_CHECKING:
+    from dirhunt.crawler import Crawler
 
 
 MAX_NEGATIVE_VOTES = -3
@@ -318,7 +321,14 @@ def __getitem__(self, item):
 
 
 class Session(ClientSession):
-    pass
+    def __init__(self, crawler: "Crawler", **kwargs):
+        """Create a client session that always sends a User-Agent header.
+
+        :param crawler: crawler whose ``configuration.user_agent`` is used as
+            the User-Agent when it is set; otherwise a random agent is used.
+        :param kwargs: forwarded to ``ClientSession.__init__``. A ``headers``
+            entry, if given, is copied into a new ``CIMultiDict``.
+        """
+        # ``headers=None`` is a legitimate explicit argument for the parent
+        # class, so treat it as "no headers" instead of passing None on to
+        # CIMultiDict.
+        headers = CIMultiDict(kwargs.pop("headers", None) or {})
+        # Only fill in the User-Agent when the caller did not provide one.
+        if "User-Agent" not in headers:
+            headers["User-Agent"] = (
+                crawler.configuration.user_agent or get_random_user_agent()
+            )
+        super().__init__(headers=headers, **kwargs)
 
 
 class Sessions(object):

diff --git a/dirhunt/sources/__init__.py b/dirhunt/sources/__init__.py
@@ -19,7 +19,7 @@
 SOURCE_CLASSES: List[Type["SourceBase"]] = [
     # Robots,
     # VirusTotal,
-    # Google,
+    Google,
     CommonCrawl,
     CrtSh,
     # CertificateSSL,

diff --git a/dirhunt/sources/base.py b/dirhunt/sources/base.py
@@ -61,7 +61,9 @@ async def retrieve_urls(self, domain: str):
             try:
                 urls = await self.search_by_domain(domain)
             except ClientError as e:
-                self.sources.crawler.print_error(str(e))
+                self.sources.crawler.print_error(
+                    f"Failed to retrieve {domain} using the source {self.get_source_name()}: {e}"
+                )
                 urls = []
             else:
                 self.save_to_cache(urls)

diff --git a/dirhunt/sources/google.py b/dirhunt/sources/google.py
@@ -1,20 +1,95 @@
+import asyncio
+import datetime
+import json
+import os
+from http.cookies import Morsel, SimpleCookie
+from pathlib import Path
+from typing import Iterable, Optional
+
 from dirhunt.sources.base import SourceBase
-from dirhunt._compat import URLError
-from googlesearch import search
 
-STOP_AFTER = 20
+# Passed as the ``timeout`` argument by ``Google.request``.
+TIMEOUT = 10
+# NOTE(review): presumably the pause between consecutive requests to Google
+# -- confirm the intended unit and usage.
+WAIT = 2
+# Requested to obtain cookies when none are available yet.
+GOOGLE_INDEX_URL = "https://www.google.com/"
+# Endpoint that receives the ``site:<domain>`` query.
+GOOGLE_SEARCH_URL = "https://www.google.com/search"
 
 
 class Google(SourceBase):
-    def callback(self, domain):
-        results = search("site:{}".format(domain), stop=STOP_AFTER)
-        while True:
-            try:
-                url = next(results)
-            except (IOError, URLError) as e:
-                self.add_error("Error on Google Source: {}".format(e))
-                break
-            except StopIteration:
-                break
-            else:
-                self.add_result(url)
+    @property
+    def google_cookies(self) -> Optional[SimpleCookie]:
+        """Cookies the crawler session holds for ``google.com`` and path ``/``.
+
+        Returns ``None`` when the jar has no entry under that key.
+        """
+        # NOTE(review): ``_cookies`` is a private attribute of the cookie jar;
+        # its key layout may differ between aiohttp versions -- confirm.
+        jar = self.sources.crawler.session.cookie_jar
+        return jar._cookies.get(("google.com", "/"))
+
+    @property
+    def google_consent_cookie(self) -> Optional[Morsel]:
+        """The ``CONSENT`` cookie among the Google cookies, or ``None``.
+
+        ``None`` is returned both when there are no Google cookies and when
+        they do not include a ``CONSENT`` entry.
+        """
+        cookies = self.google_cookies
+        # google_cookies may be None; guard so callers get None back instead
+        # of an AttributeError.
+        if cookies is None:
+            return None
+        return cookies.get("CONSENT")
+
+    @property
+    def google_cookies_path(self) -> Path:
+        """Location of the file used to persist the Google cookies."""
+        cookies_filename = "google_cookies.txt"
+        return self.cache_dir / cookies_filename
+
+    async def request(self, url: str, params: Optional[dict] = None):
+        """GET ``url`` with the crawler session and return the response text.
+
+        :param url: address to request.
+        :param params: optional query-string parameters.
+        :raises: whatever ``response.raise_for_status()`` raises.
+        """
+        session = self.sources.crawler.session
+        # NOTE(review): the header value carries literal single quotes inside
+        # the string; kept as-is -- confirm whether that is intentional.
+        request_headers = {
+            "User-Agent": "'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'"
+        }
+        async with session.get(
+            url, params=params, timeout=TIMEOUT, headers=request_headers
+        ) as response:
+            response.raise_for_status()
+            body = await response.text()
+            return body
+
+    def save_cookies(self):
+        """Write the current Google cookies to the cookies file."""
+        serialized = self.google_cookies.output(header="")
+        target = self.google_cookies_path
+        # Make sure the containing directory exists before writing.
+        os.makedirs(str(target.parent), exist_ok=True)
+        with open(target, "w") as cookies_file:
+            cookies_file.write(serialized)
+
+    def load_cookies(self):
+        """Read the cookies file and install its cookies in the session jar."""
+        loaded = SimpleCookie()
+        with open(self.google_cookies_path, "r") as cookies_file:
+            # Each line of the file is parsed on its own.
+            for cookie_line in cookies_file.readlines():
+                loaded.load(cookie_line)
+        # Replace whatever the jar holds under the google.com / "/" key.
+        jar = self.sources.crawler.session.cookie_jar
+        jar._cookies[("google.com", "/")] = loaded
+
+    """Google Source class."""
+
+    async def search_by_domain(self, domain: str) -> Iterable[str]:
+        """Search by domain in Google.
+
+        Prepares the session's Google cookies (loading them from the cookies
+        file, or requesting the Google index page when there is no file),
+        replaces a pending consent cookie and issues a ``site:<domain>`` query.
+
+        :param domain: domain to search for.
+        :return: currently always an empty list; the response is not parsed yet.
+        """
+        # TODO: lock for concurrent requests.
+        # Load cookies from file if exists or request to Google if not.
+        cookies_path_exists = self.google_cookies_path.exists()
+        if not self.google_cookies:
+            if cookies_path_exists:
+                self.load_cookies()
+            else:
+                await self.request(GOOGLE_INDEX_URL)
+                await asyncio.sleep(WAIT)
+        # Set consent cookie if it is pending. The jar may still hold no Google
+        # cookies at this point, so look CONSENT up defensively instead of
+        # assuming the cookies exist.
+        cookies = self.google_cookies
+        consent_cookie = cookies.get("CONSENT") if cookies else None
+        if consent_cookie and consent_cookie.value.startswith("PENDING"):
+            now = datetime.datetime.now()
+            cookie_value = f"YES+cb.{now.year}{now.month:02}{now.day:02}-17-p0.de+F+678"
+            consent_cookie.set("CONSENT", cookie_value, cookie_value)
+        # Save cookies to file if not exists.
+        if cookies and not cookies_path_exists:
+            self.save_cookies()
+        # The response body is not used yet, so it is not bound to a name.
+        await self.request(
+            GOOGLE_SEARCH_URL,
+            params={
+                "q": f"site:{domain}",
+                "hl": "en",
+                "tbs": "0",
+                "safe": "off",
+                "cr": "",
+                "btnG": "Google Search",
+            },
+        )
+        # TODO: extract the result URLs from the response.
+        return []