From 1880a35053cdb8908e0b2016179e751b8eb9b65b Mon Sep 17 00:00:00 2001
From: Nekmo
Date: Thu, 10 Aug 2023 20:45:17 +0200
Subject: [PATCH] Issue #83: Use Asyncio (Google source).

---
 dirhunt/crawler.py          |   4 +-
 dirhunt/sessions.py         |  16 ++++--
 dirhunt/sources/__init__.py |   2 +-
 dirhunt/sources/base.py     |   4 +-
 dirhunt/sources/google.py   | 105 ++++++++++++++++++++++++++++++------
 5 files changed, 108 insertions(+), 23 deletions(-)

diff --git a/dirhunt/crawler.py b/dirhunt/crawler.py
index 66d778d..f3f542c 100644
--- a/dirhunt/crawler.py
+++ b/dirhunt/crawler.py
@@ -14,7 +14,6 @@
 from click import get_terminal_size
 from rich.console import Console
 from rich.text import Text
-from rich.traceback import install
 
 from dirhunt import __version__
 from dirhunt._compat import queue, Queue, unregister
@@ -32,7 +31,6 @@
 
 resume_dir = os.path.expanduser("~/.cache/dirhunt/")
 
-install(show_locals=True)
 
 
 class DomainSemaphore:
@@ -68,7 +66,7 @@ def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop
         self.crawler_urls: Set[CrawlerUrl] = set()
         self.domains: Set[str] = set()
         self.console = Console(highlight=False)
-        self.session = Session()
+        self.session = Session(self)
         self.domain_semaphore = DomainSemaphore(configuration.concurrency)
         self.results = Queue()
         self.index_of_processors = []
diff --git a/dirhunt/sessions.py b/dirhunt/sessions.py
index 024ae0e..0542b52 100644
--- a/dirhunt/sessions.py
+++ b/dirhunt/sessions.py
@@ -4,9 +4,11 @@
 import warnings
 
 from aiohttp import ClientSession
+from multidict import CIMultiDict
 from requests import Timeout
 from requests.adapters import HTTPAdapter
 from requests.exceptions import ProxyError
+from typing_extensions import TYPE_CHECKING
 
 from dirhunt._compat import Queue
 
@@ -16,8 +18,9 @@ from dirhunt.agents import get_random_user_agent
 
 
-if sys.version_info < (3, 0):
-    ConnectionError = IOError
+
+if TYPE_CHECKING:
+    from dirhunt.crawler import Crawler
 
 MAX_NEGATIVE_VOTES = -3
 
@@ -318,7 +321,14 @@ def __getitem__(self, item):
 
 
 class Session(ClientSession):
-    pass
+    def __init__(self, crawler: "Crawler", **kwargs):
+        headers = kwargs.pop("headers", {})
+        headers = CIMultiDict(headers)
+        if "User-Agent" not in headers:
+            headers["User-Agent"] = (
+                crawler.configuration.user_agent or get_random_user_agent()
+            )
+        super().__init__(headers=headers, **kwargs)
 
 
 class Sessions(object):
diff --git a/dirhunt/sources/__init__.py b/dirhunt/sources/__init__.py
index 7ff257f..a870c61 100644
--- a/dirhunt/sources/__init__.py
+++ b/dirhunt/sources/__init__.py
@@ -19,7 +19,7 @@
 SOURCE_CLASSES: List[Type["SourceBase"]] = [
     # Robots,
     # VirusTotal,
-    # Google,
+    Google,
     CommonCrawl,
     CrtSh,
     # CertificateSSL,
diff --git a/dirhunt/sources/base.py b/dirhunt/sources/base.py
index 92a98a4..6ee0b67 100644
--- a/dirhunt/sources/base.py
+++ b/dirhunt/sources/base.py
@@ -61,7 +61,9 @@ async def retrieve_urls(self, domain: str):
         try:
             urls = await self.search_by_domain(domain)
         except ClientError as e:
-            self.sources.crawler.print_error(str(e))
+            self.sources.crawler.print_error(
+                f"Failed to retrieve {domain} using the source {self.get_source_name()}: {e}"
+            )
             urls = []
         else:
             self.save_to_cache(urls)
diff --git a/dirhunt/sources/google.py b/dirhunt/sources/google.py
index e313f8f..1df6c76 100644
--- a/dirhunt/sources/google.py
+++ b/dirhunt/sources/google.py
@@ -1,20 +1,95 @@
+import asyncio
+import datetime
+import json
+import os
+from http.cookies import Morsel, SimpleCookie
+from pathlib import Path
+from typing import Iterable, Optional
+
 from dirhunt.sources.base import SourceBase
-from dirhunt._compat import URLError
-from googlesearch import search
 
-STOP_AFTER = 20
+TIMEOUT = 10
+WAIT = 2
+GOOGLE_INDEX_URL = "https://www.google.com/"
+GOOGLE_SEARCH_URL = "https://www.google.com/search"
 
 
 class Google(SourceBase):
-    def callback(self, domain):
-        results = search("site:{}".format(domain), stop=STOP_AFTER)
-        while True:
-            try:
-                url = next(results)
-            except (IOError, URLError) as e:
-                self.add_error("Error on Google Source: {}".format(e))
-                break
-            except StopIteration:
-                break
-            else:
-                self.add_result(url)
+    """Google Source class."""
+
+    @property
+    def google_cookies(self) -> Optional[SimpleCookie]:
+        return self.sources.crawler.session.cookie_jar._cookies.get(("google.com", "/"))
+
+    @property
+    def google_consent_cookie(self) -> Optional[Morsel]:
+        return self.google_cookies.get("CONSENT") if self.google_cookies else None
+
+    @property
+    def google_cookies_path(self) -> Path:
+        return self.cache_dir / "google_cookies.txt"
+
+    async def request(self, url: str, params: Optional[dict] = None):
+        """Request to Google."""
+        async with self.sources.crawler.session.get(
+            url,
+            params=params,
+            timeout=TIMEOUT,
+            headers={
+                "User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)"
+            },
+        ) as response:
+            response.raise_for_status()
+            return await response.text()
+
+    def save_cookies(self):
+        """Save cookies to file."""
+        data = self.google_cookies.output(header="")
+        os.makedirs(str(self.google_cookies_path.parent), exist_ok=True)
+        with open(self.google_cookies_path, "w") as f:
+            f.write(data)
+
+    def load_cookies(self):
+        """Load cookies from file."""
+        with open(self.google_cookies_path, "r") as f:
+            lines = f.readlines()
+        cookie = SimpleCookie()
+        for line in lines:
+            cookie.load(line)
+        self.sources.crawler.session.cookie_jar._cookies[
+            ("google.com", "/")
+        ] = cookie
+
+    async def search_by_domain(self, domain: str) -> Iterable[str]:
+        """Search by domain in Google."""
+        # TODO: lock for concurrent requests.
+        # Load cookies from file if it exists, or request Google if not.
+        cookies_path_exists = self.google_cookies_path.exists()
+        if not self.google_cookies and cookies_path_exists:
+            self.load_cookies()
+        if not self.google_cookies and not cookies_path_exists:
+            await self.request(GOOGLE_INDEX_URL)
+            await asyncio.sleep(WAIT)
+        # Accept the consent cookie if it is still pending.
+        if self.google_consent_cookie and self.google_consent_cookie.value.startswith(
+            "PENDING"
+        ):
+            now = datetime.datetime.now()
+            cookie_value = f"YES+cb.{now.year}{now.month:02}{now.day:02}-17-p0.de+F+678"
+            self.google_consent_cookie.set("CONSENT", cookie_value, cookie_value)
+        # Save cookies to file if it does not exist yet.
+        if self.google_cookies and not cookies_path_exists:
+            self.save_cookies()
+        text = await self.request(
+            GOOGLE_SEARCH_URL,
+            params={
+                "q": f"site:{domain}",
+                "hl": "en",
+                "tbs": "0",
+                "safe": "off",
+                "cr": "",
+                "btnG": "Google Search",
+            },
+        )
+        # TODO: parse the result URLs from the response text.
+        return []
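
The first TODO in search_by_domain ("lock for concurrent requests") points at the cookie bootstrap: when several domains are searched concurrently, every task would hit the Google index page and rewrite the cookie cache. A minimal sketch of one way to serialize that step, assuming a lazily created class-level asyncio.Lock is acceptable (the lock attribute and the ellipsis body below are hypothetical, not part of the patch):

import asyncio
from typing import Iterable, Optional

from dirhunt.sources.base import SourceBase


class Google(SourceBase):
    # Hypothetical: a single lock shared by all Google searches so that only
    # one coroutine performs the cookie bootstrap (index request, consent
    # cookie, cache file) while the remaining tasks wait for it.
    _cookies_lock: Optional[asyncio.Lock] = None

    async def search_by_domain(self, domain: str) -> Iterable[str]:
        if Google._cookies_lock is None:
            # Created lazily so the lock is bound to the running event loop.
            Google._cookies_lock = asyncio.Lock()
        async with Google._cookies_lock:
            ...  # cookie handling from the patch, executed by one task at a time
        # The search request itself can still run concurrently outside the lock.
        return []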
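
The final TODO leaves the fetched results page unparsed, so search_by_domain still returns an empty list. A stdlib-only sketch of that parsing step, assuming the non-JavaScript Google results markup that wraps links as /url?q=<target>&... (the regex and the parse_google_results helper are illustrative assumptions, since Google's markup changes frequently):

import re
from typing import Iterator
from urllib.parse import parse_qs, urlparse

# Hypothetical helper: extract result URLs for the given domain from the HTML
# returned by the search request in search_by_domain.
ANCHOR_RE = re.compile(r'href="(/url\?q=[^"]+|https?://[^"]+)"')


def parse_google_results(text: str, domain: str) -> Iterator[str]:
    for href in ANCHOR_RE.findall(text):
        if href.startswith("/url?q="):
            # Result links are wrapped as /url?q=<target>&sa=...
            href = parse_qs(urlparse(href).query).get("q", [""])[0]
        if domain in urlparse(href).netloc:
            yield href

With such a helper, the method could end with return list(parse_google_results(text, domain)) instead of return [].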