Skip to content

Commit

Permalink
User-Agent has a default and is used for check_url
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Oct 23, 2023
1 parent f22bb92 commit 8484df1
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 11 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed

- Scraper fails for all HTTP error codes returned when checking URL at startup (#223)
- User-Agent now has a default value, and spaces between the user-agent, its suffix and the admin email are now added explicitly (#228)
- Same User-Agent is used for check_url and browsertrix crawler (#227)

## [1.5.3] - 2023-10-02

Expand Down
35 changes: 24 additions & 11 deletions zimit.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from warc2zim.main import warc2zim
from zimscraperlib.uri import rebuild_uri

DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"

class ProgressFileWatcher:
def __init__(self, output_dir, stats_path):
Expand Down Expand Up @@ -226,14 +227,15 @@ def zimit(args=None):

parser.add_argument(
"--userAgent",
help="Override user-agent with specified",
help="Override default user-agent with specified value ; --userAgentSuffix is still applied",
default=DEFAULT_USER_AGENT
)

parser.add_argument(
"--userAgentSuffix",
help="Append suffix to existing browser user-agent "
"(ex: +MyCrawler, info@example.com)",
default="+Zimit ",
default="+Zimit",
)

parser.add_argument(
Expand Down Expand Up @@ -344,8 +346,14 @@ def zimit(args=None):

url = zimit_args.url

user_agent = zimit_args.userAgent
if zimit_args.userAgentSuffix:
user_agent += f" {zimit_args.userAgentSuffix}"
if zimit_args.adminEmail:
user_agent += f" {zimit_args.adminEmail}"

if url:
url = check_url(url, zimit_args.scopeType)
url = check_url(url, user_agent, zimit_args.scopeType)
warc2zim_args.append("--url")
warc2zim_args.append(url)

Expand Down Expand Up @@ -394,12 +402,18 @@ def cleanup():
cmd_args.append("--url")
cmd_args.append(url)

user_agent_suffix = zimit_args.userAgentSuffix
if zimit_args.adminEmail:
user_agent_suffix += zimit_args.adminEmail
if zimit_args.mobileDevice and zimit_args.userAgent != DEFAULT_USER_AGENT:
print("WARNING: --mobileDevice and --userAgent are both set ; userAgent won't be used for browsertrix crawl; only userAgentSuffix and adminEmail will be passed")

user_agent_suffix = zimit_args.userAgentSuffix
if zimit_args.adminEmail:
user_agent_suffix += f" {zimit_args.adminEmail}"

cmd_args.append("--userAgentSuffix")
cmd_args.append(user_agent_suffix)
cmd_args.append("--userAgentSuffix")
cmd_args.append(user_agent_suffix)
else:
cmd_args.append("--userAgent")
cmd_args.append(user_agent)

cmd_args.append("--cwd")
cmd_args.append(str(temp_root_dir))
Expand Down Expand Up @@ -445,11 +459,11 @@ def cleanup():
return warc2zim(warc2zim_args)


def check_url(url, scope=None):
def check_url(url, user_agent, scope=None):
url = urllib.parse.urlparse(url)
try:
resp = requests.head(
url.geturl(), stream=True, allow_redirects=True, timeout=(12.2, 27)
url.geturl(), stream=True, allow_redirects=True, timeout=(12.2, 27), headers={"User-Agent": user_agent}
)
resp.raise_for_status()
except requests.exceptions.RequestException as exc:
Expand Down Expand Up @@ -505,7 +519,6 @@ def get_node_cmd_line(args):
"allowHashUrls",
"lang",
"mobileDevice",
"userAgent",
"useSitemap",
"behaviors",
"behaviorTimeout",
Expand Down

0 comments on commit 8484df1

Please sign in to comment.