From ada6e874a49c8c87739bfdf658388d14133769a9 Mon Sep 17 00:00:00 2001 From: ArshansGithub <111618520+ArshansGithub@users.noreply.github.com> Date: Fri, 12 May 2023 15:12:16 -0700 Subject: [PATCH 1/3] Update __init__.py added feature for GOOGLE_ABUSE_EXEMPTION cookie which bypasses the captcha presented by google --- yagooglesearch/__init__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/yagooglesearch/__init__.py b/yagooglesearch/__init__.py index 9c7723d..ae9526e 100644 --- a/yagooglesearch/__init__.py +++ b/yagooglesearch/__init__.py @@ -86,6 +86,7 @@ def __init__( verify_ssl=True, verbosity=5, verbose_output=False, + google_exemption=None ): """ @@ -118,7 +119,8 @@ def __init__( This may need to be disabled in some HTTPS proxy instances. :param int verbosity: Logging and console output verbosity. :param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False. - + :param str google_exemption: Google cookie exemption string. This is a string that Google uses to allow certain google searches. Defaults to None + :rtype: List of str :return: List of URLs found or list of {"rank", "title", "description", "url"} """ @@ -142,6 +144,7 @@ def __init__( self.verify_ssl = verify_ssl self.verbosity = verbosity self.verbose_output = verbose_output + self.google_exemption = google_exemption # Assign log level. ROOT_LOGGER.setLevel((6 - self.verbosity) * 10) @@ -152,7 +155,10 @@ def __init__( self.num = 100 # Initialize cookies to None, will be updated with each request in get_page(). - self.cookies = None + if self.google_exemption: + self.cookies = {'GOOGLE_ABUSE_EXEMPTION': self.google_exemption} + else: + self.cookies = None # Used later to ensure there are not any URL parameter collisions. 
self.url_parameters = ( From 4be1c20e08a4038120f7f3b52e8b475330549451 Mon Sep 17 00:00:00 2001 From: opsdisk Date: Sat, 10 Jun 2023 14:38:01 -0500 Subject: [PATCH 2/3] Bumped requests version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7463f32..c51383f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ beautifulsoup4>=4.9.3 -requests>=2.26.0 +requests>=2.31.0 requests[socks] From c4cac70dc977f44494241d03218aca5fee1114bf Mon Sep 17 00:00:00 2001 From: opsdisk Date: Sat, 10 Jun 2023 14:43:50 -0500 Subject: [PATCH 3/3] Minor fixes, formatting, and documentation updates --- README.md | 6 ++++++ yagooglesearch/__init__.py | 29 ++++++++++++++--------------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 59121e8..ae582dd 100644 --- a/README.md +++ b/README.md @@ -242,6 +242,11 @@ for search_query in search_queries: proxy_rotation_index += 1 ``` +## GOOGLE_ABUSE_EXEMPTION cookie + +If you have a `GOOGLE_ABUSE_EXEMPTION` cookie value, it can be passed into `google_exemption` when instantiating the +`SearchClient` object. + ## &tbs= URL filter clarification The `&tbs=` parameter is used to specify either verbatim or time-based filters. @@ -291,3 +296,4 @@ Project Link: [https://github.com/opsdisk/yagooglesearch](https://github.com/ops ## Contributors * [KennBro](https://github.com/KennBro) - +* [ArshansGithub](https://github.com/ArshansGithub) - diff --git a/yagooglesearch/__init__.py b/yagooglesearch/__init__.py index ae9526e..3ee141a 100644 --- a/yagooglesearch/__init__.py +++ b/yagooglesearch/__init__.py @@ -12,7 +12,7 @@ # Custom Python libraries. 
-__version__ = "1.6.1" +__version__ = "1.7.0" # Logging ROOT_LOGGER = logging.getLogger("yagooglesearch") @@ -86,9 +86,8 @@ def __init__( verify_ssl=True, verbosity=5, verbose_output=False, - google_exemption=None + google_exemption=None, ): - """ SearchClient :param str query: Query string. Must NOT be url-encoded. @@ -119,8 +118,9 @@ def __init__( This may need to be disabled in some HTTPS proxy instances. :param int verbosity: Logging and console output verbosity. :param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False. - :param str google_exemption: Google cookie exemption string. This is a string that Google uses to allow certain google searches. Defaults to None - + :param str google_exemption: Google cookie exemption string. This is a string that Google uses to allow certain + google searches. Defaults to None. + :rtype: List of str :return: List of URLs found or list of {"rank", "title", "description", "url"} """ @@ -154,9 +154,10 @@ def __init__( ROOT_LOGGER.warning("The largest value allowed by Google for num is 100. Setting num to 100.") self.num = 100 - # Initialize cookies to None, will be updated with each request in get_page(). + # Populate cookies with GOOGLE_ABUSE_EXEMPTION if it is provided. Otherwise, initialize cookies to None. + # It will be updated with each request in get_page(). if self.google_exemption: - self.cookies = {'GOOGLE_ABUSE_EXEMPTION': self.google_exemption} + self.cookies = {"GOOGLE_ABUSE_EXEMPTION": self.google_exemption} else: self.cookies = None @@ -184,7 +185,6 @@ def __init__( # Update proxy_dict if a proxy is provided. if proxy: - # Standardize case since the scheme will be checked against a hard-coded list. 
self.proxy = proxy.lower() @@ -327,7 +327,12 @@ def get_page(self, url): ROOT_LOGGER.info(f"Requesting URL: {url}") response = requests.get( - url, proxies=self.proxy_dict, headers=headers, cookies=self.cookies, timeout=15, verify=self.verify_ssl + url, + proxies=self.proxy_dict, + headers=headers, + cookies=self.cookies, + timeout=15, + verify=self.verify_ssl, ) # Update the cookies. @@ -347,7 +352,6 @@ def get_page(self, url): # See https://github.com/benbusby/whoogle-search/issues/311 try: if response.cookies["CONSENT"].startswith("PENDING+"): - ROOT_LOGGER.warning( "Looks like your IP address is sourcing from a European Union location...your search results may " "vary, but I'll try and work around this by updating the cookie." @@ -387,7 +391,6 @@ def get_page(self, url): html = response.text elif http_response_code == 429: - ROOT_LOGGER.warning("Google is blocking your IP for making too many requests in a specific time period.") # Calling script does not want yagooglesearch to handle HTTP 429 cool off and retry. Just return a @@ -437,7 +440,6 @@ def search(self): # Loop until we reach the maximum result results found or there are no more search results found to reach # max_search_result_urls_to_return. while total_valid_links_found <= self.max_search_result_urls_to_return: - ROOT_LOGGER.info( f"Stats: start={self.start}, num={self.num}, total_valid_links_found={total_valid_links_found} / " f"max_search_result_urls_to_return={self.max_search_result_urls_to_return}" @@ -490,7 +492,6 @@ def search(self): # Process every anchored URL. for a in anchors: - # Get the URL from the anchor tag. try: link = a["href"] @@ -504,7 +505,6 @@ def search(self): continue if self.verbose_output: - # Extract the URL title. try: title = a.get_text() @@ -526,7 +526,6 @@ def search(self): # Check if URL has already been found. if link not in self.search_result_list: - # Increase the counters. valid_links_found_in_this_search += 1 total_valid_links_found += 1