Add GOOGLE_ABUSE_EXEMPTION cookie #21

Merged · 3 commits · Jun 10, 2023
6 changes: 6 additions & 0 deletions README.md
@@ -242,6 +242,11 @@ for search_query in search_queries:
proxy_rotation_index += 1
```

## GOOGLE_ABUSE_EXEMPTION cookie

If you have a `GOOGLE_ABUSE_EXEMPTION` cookie value, it can be passed into `google_exemption` when instantiating the
`SearchClient` object.
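
For example, a minimal sketch (the query and the cookie value are placeholders; substitute the real `GOOGLE_ABUSE_EXEMPTION` value from a browser session that has already cleared Google's abuse check):

```python
import yagooglesearch

# Placeholder cookie value; copy the real GOOGLE_ABUSE_EXEMPTION value from
# your own browser session.
client = yagooglesearch.SearchClient(
    "site:github.com yagooglesearch",
    google_exemption="ID=0123456789abcdef:TM=1686400000:C=r:IP=203.0.113.7-:S=EXAMPLE",
)
urls = client.search()
print(urls)
```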

## &tbs= URL filter clarification

The `&tbs=` parameter is used to specify either verbatim or time-based filters.
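
For illustration, a short sketch assuming the `tbs` keyword argument described elsewhere in this README; the values `li:1` (verbatim) and `qdr:d` (past day) come from Google's public URL syntax, not from this diff excerpt:

```python
import yagooglesearch

# Illustrative only: li:1 requests verbatim results, qdr:d restricts results
# to the past day. The tbs keyword argument is assumed from the README's
# description rather than shown in this diff.
verbatim_client = yagooglesearch.SearchClient("site:github.com", tbs="li:1")
past_day_client = yagooglesearch.SearchClient("site:github.com", tbs="qdr:d")
```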
@@ -291,3 +296,4 @@ Project Link: [https://github.com/opsdisk/yagooglesearch](https://github.com/opsdisk/yagooglesearch)
## Contributors

* [KennBro](https://github.com/KennBro) - <https://github.com/opsdisk/yagooglesearch/pull/9>
* [ArshansGithub](https://github.com/ArshansGithub) - <https://github.com/opsdisk/yagooglesearch/pull/21>
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,3 +1,3 @@
beautifulsoup4>=4.9.3
-requests>=2.26.0
+requests>=2.31.0
requests[socks]
29 changes: 17 additions & 12 deletions yagooglesearch/__init__.py
@@ -12,7 +12,7 @@

# Custom Python libraries.

__version__ = "1.6.1"
__version__ = "1.7.0"

# Logging
ROOT_LOGGER = logging.getLogger("yagooglesearch")
@@ -86,8 +86,8 @@ def __init__(
verify_ssl=True,
verbosity=5,
verbose_output=False,
google_exemption=None,
):

"""
SearchClient
:param str query: Query string. Must NOT be url-encoded.
@@ -118,6 +118,8 @@ def __init__(
This may need to be disabled in some HTTPS proxy instances.
:param int verbosity: Logging and console output verbosity.
:param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False.
:param str google_exemption: Google cookie exemption string. This is a string Google uses to exempt a client
from its automated abuse detection. Defaults to None.

:rtype: List of str
:return: List of URLs found or list of {"rank", "title", "description", "url"}
@@ -142,6 +144,7 @@ def __init__(
self.verify_ssl = verify_ssl
self.verbosity = verbosity
self.verbose_output = verbose_output
self.google_exemption = google_exemption

# Assign log level.
ROOT_LOGGER.setLevel((6 - self.verbosity) * 10)
@@ -151,8 +154,12 @@ def __init__(
ROOT_LOGGER.warning("The largest value allowed by Google for num is 100. Setting num to 100.")
self.num = 100

-# Initialize cookies to None, will be updated with each request in get_page().
-self.cookies = None
+# Populate cookies with GOOGLE_ABUSE_EXEMPTION if it is provided. Otherwise, initialize cookies to None.
+# It will be updated with each request in get_page().
+if self.google_exemption:
+    self.cookies = {"GOOGLE_ABUSE_EXEMPTION": self.google_exemption}
+else:
+    self.cookies = None

# Used later to ensure there are not any URL parameter collisions.
self.url_parameters = (
@@ -178,7 +185,6 @@ def __init__(

# Update proxy_dict if a proxy is provided.
if proxy:

# Standardize case since the scheme will be checked against a hard-coded list.
self.proxy = proxy.lower()

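A minimal sketch of how the proxy path above is exercised; the endpoint is hypothetical, and only the `proxy` parameter itself is taken from this code:

```python
import yagooglesearch

# Hypothetical local SOCKS proxy endpoint. SearchClient lower-cases the
# scheme before validating it, so "SOCKS5H://" also works.
client = yagooglesearch.SearchClient(
    "site:github.com",
    proxy="SOCKS5H://127.0.0.1:9050",
)
```
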
@@ -321,7 +327,12 @@ def get_page(self, url):

ROOT_LOGGER.info(f"Requesting URL: {url}")
response = requests.get(
-    url, proxies=self.proxy_dict, headers=headers, cookies=self.cookies, timeout=15, verify=self.verify_ssl
+    url,
+    proxies=self.proxy_dict,
+    headers=headers,
+    cookies=self.cookies,
+    timeout=15,
+    verify=self.verify_ssl,
)

# Update the cookies.
@@ -341,7 +352,6 @@ def get_page(self, url):
# See https://github.com/benbusby/whoogle-search/issues/311
try:
if response.cookies["CONSENT"].startswith("PENDING+"):

ROOT_LOGGER.warning(
"Looks like your IP address is sourcing from a European Union location...your search results may "
"vary, but I'll try and work around this by updating the cookie."
@@ -381,7 +391,6 @@ def get_page(self, url):
html = response.text

elif http_response_code == 429:

ROOT_LOGGER.warning("Google is blocking your IP for making too many requests in a specific time period.")

# Calling script does not want yagooglesearch to handle HTTP 429 cool off and retry. Just return a
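
For context, a sketch of the calling-script side of that choice; the `yagooglesearch_manages_http_429s` flag and the `HTTP_429_DETECTED` sentinel are assumed from the project README rather than shown in this excerpt:

```python
import yagooglesearch

# Assumed flag from the project README: hand HTTP 429 handling back to the
# calling script instead of letting yagooglesearch cool off and retry.
client = yagooglesearch.SearchClient(
    "site:github.com",
    yagooglesearch_manages_http_429s=False,
)
urls = client.search()

# Assumed sentinel string appended to the results when Google returns HTTP 429.
if "HTTP_429_DETECTED" in urls:
    print("Google blocked this IP; back off or rotate proxies before retrying.")
```
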
@@ -431,7 +440,6 @@ def search(self):
# Loop until we reach the maximum number of search results found or there are no more search results found to reach
# max_search_result_urls_to_return.
while total_valid_links_found <= self.max_search_result_urls_to_return:

ROOT_LOGGER.info(
f"Stats: start={self.start}, num={self.num}, total_valid_links_found={total_valid_links_found} / "
f"max_search_result_urls_to_return={self.max_search_result_urls_to_return}"
@@ -484,7 +492,6 @@ def search(self):

# Process every anchored URL.
for a in anchors:

# Get the URL from the anchor tag.
try:
link = a["href"]
@@ -498,7 +505,6 @@ def search(self):
continue

if self.verbose_output:

# Extract the URL title.
try:
title = a.get_text()
@@ -520,7 +526,6 @@ def search(self):

# Check if URL has already been found.
if link not in self.search_result_list:

# Increase the counters.
valid_links_found_in_this_search += 1
total_valid_links_found += 1