Issue #83: Use Asyncio.
Nekmo committed Aug 10, 2023
1 parent 1880a35 commit 138bcb4
Showing 6 changed files with 69 additions and 16 deletions.
18 changes: 10 additions & 8 deletions dirhunt/crawler_url.py
@@ -1,14 +1,15 @@
# -*- coding: utf-8 -*-
import asyncio
import cgi
from typing import TYPE_CHECKING, Any, Optional, Literal

from aiohttp import ClientResponse
from aiohttp import ClientResponse, ClientError
from aiohttp.web_response import Response
from bs4 import BeautifulSoup
from requests import RequestException
import charset_normalizer as chardet

from dirhunt.url import Url
from dirhunt.utils import get_message_from_exception

RESPONSE_CHUNK = 1024 * 4
MAX_RESPONSE_SIZE = 1024 * 512
@@ -42,23 +43,22 @@ async def get_content(response: "ClientResponse") -> str:


class CrawlerUrlRequest:
response = Optional[Response]
response: Optional[Response] = None
content: Optional[str] = None
_soup: Optional[BeautifulSoup] = None

def __init__(self, crawler_url: "CrawlerUrl"):
self.crawler_url = crawler_url
self.crawler = crawler_url.crawler

async def retrieve(self) -> "ProcessBase":
async def retrieve(self) -> Optional["ProcessBase"]:
from dirhunt.processors import (
get_processor,
Error,
)

processor = None
try:
await self.crawler.domain_semaphore.acquire(self.crawler_url.url.domain)
pass
async with self.crawler.session.get(
self.crawler_url.url.url,
verify_ssl=False,
@@ -73,9 +73,11 @@ async def retrieve(self) -> "ProcessBase":
self.content = await get_content(response)
if processor.has_descendants:
processor = get_processor(self)
except RequestException as e:
except (ClientError, asyncio.TimeoutError) as e:
self.crawler.current_processed_count += 1
processor = Error(self, e)
self.crawler.print_error(
f"Request error to {self.crawler_url.url}: {get_message_from_exception(e)}"
)
else:
await processor.process(self)
finally:
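Note on the hunk above: the requests-based except clause is replaced with the exceptions an aiohttp call can actually raise, ClientError and asyncio.TimeoutError, and the failure is reported through crawler.print_error instead of propagating. A minimal standalone sketch of the same pattern using plain aiohttp rather than dirhunt's Crawler (the fetch helper and URL are illustrative, not part of the project):

import asyncio
import aiohttp

async def fetch(url: str) -> str:
    """Fetch a URL, reporting aiohttp errors and timeouts instead of raising."""
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as response:
                return await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        # Mirror the commit's error reporting: exception class name plus message.
        return f"Request error to {url}: {e.__class__.__name__}: {e}"

# asyncio.run(fetch("https://example.com/"))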
4 changes: 4 additions & 0 deletions dirhunt/exceptions.py
@@ -31,6 +31,10 @@ class IncompatibleVersionError(DirHuntError):
pass


class SourceError(DirHuntError):
pass


def catch(fn):
def wrap(*args, **kwargs):
try:
4 changes: 2 additions & 2 deletions dirhunt/processors.py
@@ -441,8 +441,8 @@ def analyze_asset(self, asset) -> None:
if "wordpress" not in self.crawler_url.flags and "wp-content" in asset.path:
self.crawler_url.flags.update({"wordpress"})
# Override type always except for root path
self.crawler_url.type = (
"rewrite" if self.crawler_url.type != "directory" else "directory"
self.crawler_url.url_type = (
"rewrite" if self.crawler_url.url_type != "directory" else "directory"
)
self.crawler_url.depth -= 1

28 changes: 26 additions & 2 deletions dirhunt/sources/base.py
@@ -1,3 +1,4 @@
import asyncio
import datetime
import json
import os
@@ -11,13 +12,17 @@

from dirhunt import __version__
from dirhunt.crawler_url import CrawlerUrl
from dirhunt.exceptions import SourceError
from dirhunt.utils import get_message_from_exception

if TYPE_CHECKING:
from dirhunt.sources import Sources


class SourceBase:
max_cache_age = datetime.timedelta(days=7)
wait_locks = {}
wait_between_requests = None

def __init__(self, sources: "Sources", domain: str):
self.sources = sources
@@ -55,20 +60,27 @@ async def search_by_domain(self, domain: str) -> Iterable[str]:

async def retrieve_urls(self, domain: str):
urls = None
acquired_lock = False
if not self.is_cache_expired:
urls = self.get_from_cache()
if not urls and self.wait_between_requests:
acquired_lock = True
await self.acquire_wait_lock()
if urls is None:
try:
urls = await self.search_by_domain(domain)
except ClientError as e:
except (ClientError, SourceError, asyncio.TimeoutError) as e:
self.sources.crawler.print_error(
f"Failed to retrieve {domain} using the source {self.get_source_name()}: {e}"
f"Failed to retrieve {domain} using the source {self.get_source_name()}: "
f"{get_message_from_exception(e)}"
)
urls = []
else:
self.save_to_cache(urls)
for url in urls:
await self.add_url(url)
if acquired_lock:
await self.release_wait_lock()

def save_to_cache(self, urls: Iterable[str]) -> None:
cache_data = {
@@ -85,3 +97,15 @@ async def add_url(self, url: str):
await self.sources.crawler.add_crawler_url(
CrawlerUrl(self.sources.crawler, url)
)

async def acquire_wait_lock(self):
"""Acquire wait lock."""
if self.get_source_name() not in self.wait_locks:
self.wait_locks[self.get_source_name()] = asyncio.Lock()
await self.wait_locks[self.get_source_name()].acquire()

async def release_wait_lock(self):
"""Release wait lock."""
if self.get_source_name() in self.wait_locks:
await asyncio.sleep(self.wait_between_requests)
self.wait_locks[self.get_source_name()].release()
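The new wait-lock pair gives each source class a shared asyncio.Lock keyed by source name, and release_wait_lock sleeps for wait_between_requests before releasing, so consecutive queries to the same source are spaced out. A reduced sketch of that rate-limiting pattern outside dirhunt (the class name and delay value are illustrative):

import asyncio

class RateLimitedSource:
    # Shared across instances and keyed by source name, as in SourceBase.wait_locks.
    wait_locks = {}
    wait_between_requests = 2  # seconds between requests to this source (illustrative)

    @classmethod
    def get_source_name(cls) -> str:
        return cls.__name__.lower()

    async def acquire_wait_lock(self):
        # Create the per-source lock lazily on first use.
        lock = self.wait_locks.setdefault(self.get_source_name(), asyncio.Lock())
        await lock.acquire()

    async def release_wait_lock(self):
        # Hold the lock through the delay so the next caller waits it out.
        await asyncio.sleep(self.wait_between_requests)
        self.wait_locks[self.get_source_name()].release()

    async def query(self, domain: str):
        await self.acquire_wait_lock()
        try:
            print(f"querying source for {domain}")  # stand-in for search_by_domain()
        finally:
            await self.release_wait_lock()

async def main():
    source = RateLimitedSource()
    await asyncio.gather(source.query("example.com"), source.query("example.org"))

# asyncio.run(main())  # the second query starts roughly 2 s after the first finishes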
18 changes: 14 additions & 4 deletions dirhunt/sources/google.py
@@ -6,6 +6,9 @@
from pathlib import Path
from typing import Iterable, Optional

from bs4 import BeautifulSoup

from dirhunt.exceptions import SourceError
from dirhunt.sources.base import SourceBase

TIMEOUT = 10
@@ -15,6 +18,8 @@


class Google(SourceBase):
wait_between_requests = WAIT

@property
def google_cookies(self) -> Optional[Morsel]:
return self.sources.crawler.session.cookie_jar._cookies.get(("google.com", "/"))
@@ -62,8 +67,6 @@ def load_cookies(self):

async def search_by_domain(self, domain: str) -> Iterable[str]:
"""Search by domain in Google."""
# TODO: lock for concurrent requests.
# Load cookies from file if exists or request to Google if not.
cookies_path_exists = self.google_cookies_path.exists()
if not self.google_cookies and cookies_path_exists:
self.load_cookies()
@@ -91,5 +94,12 @@ async def search_by_domain(self, domain: str) -> Iterable[str]:
"btnG": "Google Search",
},
)
# TODO:
return []
soup = BeautifulSoup(text, "html.parser")

urls = [
a["href"].replace("/url?q=", "").split("&sa=", 1)[0]
for a in soup.find_all("a", {"class": "fuLhoc ZWRArf"})
]
if not urls:
raise SourceError("Google search not found urls")
return urls
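The list comprehension above strips Google's redirect wrapper from each result anchor: an href of the form /url?q=<target>&sa=... becomes the bare target URL. A self-contained illustration of that parsing step against a hard-coded snippet (the fuLhoc ZWRArf class comes from the diff above and is Google markup that may change at any time):

from bs4 import BeautifulSoup

html = (
    '<a class="fuLhoc ZWRArf" '
    'href="/url?q=https://example.com/admin/&amp;sa=U&amp;ved=xyz">example.com/admin</a>'
)
soup = BeautifulSoup(html, "html.parser")
urls = [
    # Drop the "/url?q=" prefix and everything from "&sa=" onwards.
    a["href"].replace("/url?q=", "").split("&sa=", 1)[0]
    for a in soup.find_all("a", {"class": "fuLhoc ZWRArf"})
]
print(urls)  # ['https://example.com/admin/']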
13 changes: 13 additions & 0 deletions dirhunt/utils.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import re
import string
from typing import Union, Type

import click
import requests
@@ -16,6 +17,18 @@
ARGUMENT_MULT = re.compile("(.+)\*(\d+)$")


def get_message_from_exception(exception: Union[Exception, Type[Exception]]) -> str:
if isinstance(exception, Exception):
exception_name = exception.__class__.__name__
else:
exception_name = exception.__name__
exception_body = str(exception)
if exception_body:
return "{}: {}".format(exception_name, exception_body)
else:
return exception_name


def lrange(start, end):
return list(range(start, end))

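get_message_from_exception accepts either an exception instance or an exception class, which is what the isinstance branch handles, and it drops the colon when the exception carries no message. Roughly, the expected behaviour:

import asyncio
from dirhunt.utils import get_message_from_exception

print(get_message_from_exception(ValueError("bad value")))  # ValueError: bad value
print(get_message_from_exception(asyncio.TimeoutError()))   # TimeoutError (no message body)
print(get_message_from_exception(ConnectionError))          # ConnectionError (a class, not an instance)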
