Update commoncrawl.py
rix4uni authored Mar 22, 2024
1 parent c5fb497 commit d2fa9b4
Showing 1 changed file with 34 additions and 27 deletions.
commoncrawl.py
@@ -6,17 +6,21 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 def get_urls(url, domain):
-    param = f"?url=*.{domain}&fl=url&output=json&filter=!=status:404"
-    index_url = url + param
+    try:
+        param = f"?url=*.{domain}&fl=url&output=json&filter=!=status:404"
+        index_url = url + param
 
-    # Send the request to the index
-    response = requests.get(index_url)
+        # Send the request to the index
+        response = requests.get(index_url)
 
-    # Extract the URLs from the response text
-    url_pattern = regex.compile(r'"url": "([^"]+)"')
-    urls = url_pattern.findall(response.text)
+        # Extract the URLs from the response text
+        url_pattern = regex.compile(r'"url": "([^"]+)"')
+        urls = url_pattern.findall(response.text)
 
-    return urls
+        return urls
+    except requests.exceptions.ConnectionError as e:
+        print(f"ConnectionError: {e}")
+        return []
 
 if __name__ == "__main__":
     # Use argparse to specify the number of threads to use
@@ -30,22 +34,25 @@ def get_urls(url, domain):
     for line in sys.stdin:
         domain = line.strip()
 
-        # Perform an HTTP GET request to the URL
-        response = requests.get("https://index.commoncrawl.org/collinfo.json")
+        try:
+            # Perform an HTTP GET request to the URL
+            response = requests.get("https://index.commoncrawl.org/collinfo.json")
 
-        # Parse the JSON data from the response
-        data = json.loads(response.text)
+            # Parse the JSON data from the response
+            data = json.loads(response.text)
 
-        # Create a ThreadPoolExecutor with the specified number of threads
-        with ThreadPoolExecutor(max_workers=num_threads) as executor:
-            # Create a list of tasks to submit to the executor
-            tasks = []
-            for item in data:
-                url = item['cdx-api']
-                task = executor.submit(get_urls, url, domain)
-                tasks.append(task)
+            # Create a ThreadPoolExecutor with the specified number of threads
+            with ThreadPoolExecutor(max_workers=num_threads) as executor:
+                # Create a list of tasks to submit to the executor
+                tasks = []
+                for item in data:
+                    url = item['cdx-api']
+                    task = executor.submit(get_urls, url, domain)
+                    tasks.append(task)
 
-            # Iterate over the completed tasks and print the results
-            for task in as_completed(tasks):
-                for url in task.result():
-                    print(url)
+                # Iterate over the completed tasks and print the results
+                for task in as_completed(tasks):
+                    for url in task.result():
+                        print(url)
+        except requests.exceptions.RequestException as e:
+            print(f"RequestException: {e}")

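A design note on why the except block sits inside the worker (my reading, not stated in the commit): Future.result() re-raises any exception that escaped the submitted function, so before this patch a single unreachable index could abort the whole as_completed loop and drop the remaining results. Having get_urls return [] on failure keeps the other tasks printing. A self-contained sketch of that behavior, using a stand-in worker instead of a live request:

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch(n):
    # Stand-in for one CDX index query; index 1 simulates a dead endpoint.
    if n == 1:
        raise ConnectionError("index unreachable")
    return [f"https://sub{n}.example.com/"]

def safe_fetch(n):
    # Mirrors the patched get_urls: report the error, return an empty list.
    try:
        return fetch(n)
    except ConnectionError as e:
        print(f"ConnectionError: {e}")
        return []

with ThreadPoolExecutor(max_workers=2) as executor:
    tasks = [executor.submit(safe_fetch, n) for n in range(3)]
    for task in as_completed(tasks):
        for url in task.result():  # no longer re-raises; the failed task yielded []
            print(url)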