Update commoncrawl.py
rix4uni authored Mar 22, 2024
1 parent c5fb497 commit d2fa9b4
Showing 1 changed file with 34 additions and 27 deletions.
commoncrawl.py
@@ -6,17 +6,21 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 def get_urls(url, domain):
-    param = f"?url=*.{domain}&fl=url&output=json&filter=!=status:404"
-    index_url = url + param
+    try:
+        param = f"?url=*.{domain}&fl=url&output=json&filter=!=status:404"
+        index_url = url + param
 
-    # Send the request to the index
-    response = requests.get(index_url)
+        # Send the request to the index
+        response = requests.get(index_url)
 
-    # Extract the URLs from the response text
-    url_pattern = regex.compile(r'"url": "([^"]+)"')
-    urls = url_pattern.findall(response.text)
+        # Extract the URLs from the response text
+        url_pattern = regex.compile(r'"url": "([^"]+)"')
+        urls = url_pattern.findall(response.text)
 
-    return urls
+        return urls
+    except requests.exceptions.ConnectionError as e:
+        print(f"ConnectionError: {e}")
+        return []
 
 if __name__ == "__main__":
     # Use argparse to specify the number of threads to use
@@ -30,22 +34,25 @@ def get_urls(url, domain):
     for line in sys.stdin:
         domain = line.strip()
 
-        # Perform an HTTP GET request to the URL
-        response = requests.get("https://index.commoncrawl.org/collinfo.json")
+        try:
+            # Perform an HTTP GET request to the URL
+            response = requests.get("https://index.commoncrawl.org/collinfo.json")
 
-        # Parse the JSON data from the response
-        data = json.loads(response.text)
+            # Parse the JSON data from the response
+            data = json.loads(response.text)
 
-        # Create a ThreadPoolExecutor with the specified number of threads
-        with ThreadPoolExecutor(max_workers=num_threads) as executor:
-            # Create a list of tasks to submit to the executor
-            tasks = []
-            for item in data:
-                url = item['cdx-api']
-                task = executor.submit(get_urls, url, domain)
-                tasks.append(task)
+            # Create a ThreadPoolExecutor with the specified number of threads
+            with ThreadPoolExecutor(max_workers=num_threads) as executor:
+                # Create a list of tasks to submit to the executor
+                tasks = []
+                for item in data:
+                    url = item['cdx-api']
+                    task = executor.submit(get_urls, url, domain)
+                    tasks.append(task)
 
-            # Iterate over the completed tasks and print the results
-            for task in as_completed(tasks):
-                for url in task.result():
-                    print(url)
+                # Iterate over the completed tasks and print the results
+                for task in as_completed(tasks):
+                    for url in task.result():
+                        print(url)
+        except requests.exceptions.RequestException as e:
+            print(f"RequestException: {e}")

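A design note on why the except block sits inside the worker (my reading, not stated in the commit): Future.result() re-raises any exception that escaped the submitted function, so before this patch a single unreachable index could abort the whole as_completed loop and drop the remaining results. Having get_urls return [] on failure keeps the other tasks printing. A self-contained sketch of that behavior, using a stand-in worker instead of a live request:

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch(n):
    # Stand-in for one CDX index query; index 1 simulates a dead endpoint.
    if n == 1:
        raise ConnectionError("index unreachable")
    return [f"https://sub{n}.example.com/"]

def safe_fetch(n):
    # Mirrors the patched get_urls: report the error, return an empty list.
    try:
        return fetch(n)
    except ConnectionError as e:
        print(f"ConnectionError: {e}")
        return []

with ThreadPoolExecutor(max_workers=2) as executor:
    tasks = [executor.submit(safe_fetch, n) for n in range(3)]
    for task in as_completed(tasks):
        for url in task.result():  # no longer re-raises; the failed task yielded []
            print(url)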