From 8b0776f946162758324c1a576176f52fc3727b2e Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Tue, 2 Jan 2024 16:40:57 +0200 Subject: [PATCH] Don't use requests session with multithreading --- src/oabot/main.py | 4 ++-- src/oabot/settings.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/oabot/main.py b/src/oabot/main.py index eb74559..407d2f9 100644 --- a/src/oabot/main.py +++ b/src/oabot/main.py @@ -263,7 +263,7 @@ def keep_existing_url(self, url): return True try: - r = SESSION.head(url, timeout=5, allow_redirects=True) + r = requests.head(url, timeout=(5, 1), allow_redirects=True, headers={'User-Agent': OABOT_USER_AGENT}) except requests.exceptions.RequestException: r = None # Avoid changing an URL which already clearly points to an open PDF @@ -418,7 +418,7 @@ def get_oa_link(paper, doi=None, only_unpaywall=True): for url in sort_links(candidate_urls): if url: try: - head = SESSION.head(url, timeout=10) + head = requests.head(url, timeout=(5, 1), allow_redirects=True, headers={'User-Agent': OABOT_USER_AGENT}) head.raise_for_status() if head.status_code < 400 and 'Location' in head.headers and urllib.parse.urlparse(head.headers['Location']).path == '/': # Redirects to main page: fake status code, should be not found diff --git a/src/oabot/settings.py b/src/oabot/settings.py index d2ed96a..b39a657 100644 --- a/src/oabot/settings.py +++ b/src/oabot/settings.py @@ -9,7 +9,7 @@ # Mount point is '/' OABOT_APP_MOUNT_POINT = '' -OABOT_USER_AGENT = 'OAbot/1.0 (+http://enwp.org/WP:OABOT)' +OABOT_USER_AGENT = 'OAbot/1.0 (+http://enwp.org/WP:OABOT) not Googlebot' # the bot will not make any changes to these templates excluded_templates = ['cite arxiv', 'cite web', 'cite news', 'cite book']