-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.py
69 lines (55 loc) · 1.89 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python3
"""
run.py - Command-line script that fetches archived URLs for a domain from the
Wayback Machine, streaming the response line by line and printing each URL to
the console as it arrives.
Usage:
1) python run.py example.com
(Replace 'example.com' with the domain you want to fetch)
2) You can also run without arguments:
python run.py
Then you’ll be prompted for a domain.
Dependencies:
pip install requests
"""
import sys
import requests
def fetch_archive_urls(domain, timeout=(10, 300)):
    """Stream archived URLs for ``domain`` from the Wayback Machine and print them.

    Sends one GET request to the CDX search endpoint with the query
    ``url=<domain>*``, ``output=text``, ``fl=original`` and
    ``collapse=urlkey``, then prints every non-empty response line as soon
    as it is received. A summary line with the number of printed URLs is
    emitted once the stream ends.

    Args:
        domain: Domain (or URL prefix) to query, e.g. ``"example.com"``.
        timeout: Forwarded to ``requests.get``. Either a number of seconds
            or a ``(connect, read)`` tuple. Because the response is
            streamed, the read value limits the wait between chunks, not
            the total download time. Pass ``None`` to wait indefinitely
            (the previous behaviour).

    Returns:
        None. If ``requests`` raises any ``RequestException`` (connection
        failure, HTTP error status, timeout, broken stream) the error is
        printed and the function returns without printing the summary.
    """
    base_url = "https://web.archive.org/cdx/search/cdx"
    params = {
        "url": f"{domain}*",
        "output": "text",
        "fl": "original",
        "collapse": "urlkey",
    }
    print(f"Fetching archive URLs for domain: {domain}\n")
    line_count = 0
    try:
        # stream=True lets us print lines as they arrive instead of buffering
        # the whole (potentially very large) listing in memory. The timeout
        # keeps a stalled connection from hanging the script forever.
        with requests.get(
            base_url, params=params, stream=True, timeout=timeout
        ) as r:
            r.raise_for_status()
            # iter_lines(decode_unicode=True) only decodes when an encoding
            # is known; without this fallback a response lacking a charset
            # would yield bytes and print as b'...'.
            if r.encoding is None:
                r.encoding = "utf-8"
            for line in r.iter_lines(decode_unicode=True):
                if line:
                    url_str = line.strip()
                    line_count += 1
                    print(url_str)
    except requests.exceptions.RequestException as err:
        print(f"Error: {err}")
        return
    print(f"\nDone. Total URLs fetched: {line_count}\n")
def main():
    """Resolve the target domain and hand it to ``fetch_archive_urls``.

    The first command-line argument is used when present; otherwise the
    user is prompted on stdin. Surrounding whitespace is stripped, and an
    empty result prints an error and exits with status 1.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        # A domain supplied on the command line takes precedence.
        raw_value = cli_args[0]
    else:
        raw_value = input("Enter domain (e.g. example.com): ")
    domain = raw_value.strip()

    if domain:
        fetch_archive_urls(domain)
        return

    # Nothing usable was provided: report it and signal failure to the shell.
    print("Error: please enter a valid domain.")
    sys.exit(1)
# Run the CLI only when this file is executed as a script, so importing the
# module never starts a fetch or prompts for input.
if __name__ == "__main__":
    main()