Commit

feat: Add parallelism in fetching sundhed.dk
saattrupdan committed Sep 29, 2024
1 parent 9d7579e commit 7fbc3e2
Showing 2 changed files with 72 additions and 20 deletions.
src/tts_text/sundhed_dk.py: 53 additions & 19 deletions
@@ -1,5 +1,6 @@
"""Scraping and preprocessing of the sundhed.dk text corpus."""

from functools import partial
from pathlib import Path
from unicodedata import normalize
from bs4 import Tag
@@ -8,6 +9,8 @@
 from webdriver_manager.chrome import ChromeDriverManager
 import logging
 import re
+from tqdm.contrib.concurrent import process_map
+import multiprocessing as mp
 from .utils import extract_sentences, get_soup


@@ -34,18 +37,24 @@ def build_sundhed_dk_dataset(cfg: DictConfig) -> list[str]:
     ChromeDriverManager().install()

     # Get the overall categories from the front page
-    soup = get_soup(url=BASE_URL + "/borger/patienthaandbogen/", dynamic=True)
+    soup = get_soup(
+        url=BASE_URL + "/borger/patienthaandbogen/",
+        dynamic=True,
+        xpath_to_be_present="//div[@class='main-content']",
+    )
     category_urls = [
         BASE_URL + url_suffix.a["href"]
         for url_suffix in soup.find_all("li", class_="list-group-item")
     ]

     # Extract all articles
-    all_articles: list[str] = list()
-    desc = "Extracting articles from sundhed.dk"
-    for category_url in tqdm(category_urls, desc=desc, leave=True):
-        category_articles = extract_all_category_articles(url=category_url)
-        all_articles.extend(category_articles)
+    all_articles = [
+        article
+        for category_url in tqdm(category_urls, desc="Extracting articles")
+        for article in extract_all_category_articles(
+            url=category_url, parsed_urls=list(), num_workers=mp.cpu_count()
+        )
+    ]

     # Split the articles into sentences
     dataset = extract_sentences(
@@ -68,21 +77,29 @@ def build_sundhed_dk_dataset(cfg: DictConfig) -> list[str]:
     return dataset


-def extract_all_category_articles(url: str, parsed_urls: list[str] = []) -> list[str]:
+def extract_all_category_articles(
+    url: str, parsed_urls: list[str], num_workers: int
+) -> list[str]:
     """Extract all articles from a category page.

     These pages have arbitrarily nested subcategories, so this function is called
     recursively.

     Args:
-        url: The URL of the category page.
-        parsed_urls: A list of URLs that have already been parsed.
+        url:
+            The URL of the category page.
+        parsed_urls:
+            A list of URLs that have already been parsed.
+        num_workers:
+            The number of workers to use for parallel processing.

     Returns:
         A list of articles from the category.
     """
     # Parse the URL
-    soup = get_soup(url=url, dynamic=True)
+    soup = get_soup(
+        url=url, dynamic=True, xpath_to_be_present="//div[@class='main-content']"
+    )

     # Try to get the URLs of the subcategories, if any
     subcategory_urls = [
@@ -152,16 +169,33 @@ def extract_all_category_articles(url: str, parsed_urls: list[str] = []) -> list

         return [article_str]

-    # If it wasn't and article then we recursively extract all articles from the
+    # If it wasn't an article then we recursively extract all articles from the
     # subcategories
     desc = f"Extracting articles from {url}"
-    category_articles: list[str] = list()
     parsed_urls.append(url)
-    for subcategory_url in tqdm(subcategory_urls, desc=desc, leave=False):
-        if subcategory_url in parsed_urls:
-            continue
-        subcategory_articles = extract_all_category_articles(
-            url=subcategory_url, parsed_urls=parsed_urls
+    subcategory_urls = [url for url in subcategory_urls if url not in parsed_urls]
+    if num_workers > 1:
+        subcategory_articles = process_map(
+            partial(
+                extract_all_category_articles,
+                parsed_urls=parsed_urls,
+                num_workers=1,
+            ),
+            subcategory_urls,
+            max_workers=min(num_workers, len(subcategory_urls)),
+            desc=desc,
+            leave=False,
+            position=1,
         )
-        category_articles.extend(subcategory_articles)
-    return category_articles
+    else:
+        subcategory_articles = [
+            extract_all_category_articles(
+                url=url, parsed_urls=parsed_urls, num_workers=1
+            )
+            for url in subcategory_urls
+        ]
+    return [
+        article
+        for subcategory_article in subcategory_articles
+        for article in subcategory_article
+    ]
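
A note on the pattern introduced above: tqdm.contrib.concurrent.process_map runs a function over an iterable in a process pool while rendering a progress bar, and functools.partial pins the keyword arguments that stay constant across calls, so only the subcategory URL varies between workers; each worker is called with num_workers=1 so the recursion does not spawn nested pools. A minimal, self-contained sketch of that pool-then-flatten shape (the fetch_category function and the example URLs are hypothetical stand-ins, not code from this repository):

    from functools import partial
    import multiprocessing as mp

    from tqdm.contrib.concurrent import process_map


    def fetch_category(url: str, num_workers: int) -> list[str]:
        # Hypothetical stand-in for extract_all_category_articles: returns the
        # articles found under a single category URL.
        return [f"article scraped from {url}"]


    if __name__ == "__main__":
        subcategory_urls = ["https://example.com/a", "https://example.com/b"]

        # One worker process per URL, capped at the CPU count; process_map shows
        # a progress bar for the batch and returns the results in input order.
        articles_per_category = process_map(
            partial(fetch_category, num_workers=1),
            subcategory_urls,
            max_workers=min(mp.cpu_count(), len(subcategory_urls)),
            desc="Extracting articles",
        )

        # process_map returns one list per input URL, so flatten before use.
        all_articles = [a for sub in articles_per_category for a in sub]
        print(all_articles)

This mirrors what the committed extract_all_category_articles does when num_workers > 1, with the plain list-comprehension branch as the single-process fallback.
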
src/tts_text/utils.py: 19 additions & 1 deletion
@@ -14,6 +14,9 @@
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import TimeoutException, WebDriverException
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
 import logging

@@ -138,7 +141,10 @@ def interleave_datasets(


 def get_soup(
-    url: str, dynamic: bool = False, retries: int | None = None
+    url: str,
+    dynamic: bool = False,
+    retries: int | None = None,
+    xpath_to_be_present: str | None = None,
 ) -> BeautifulSoup:
     """Get the soup of a URL.
@@ -150,6 +156,9 @@
         retries:
             The number of retries to perform if the request times out. None means
             infinite retries.
+        xpath_to_be_present:
+            The xpath to wait for before returning the soup. If None, we will wait 5
+            seconds before returning the soup.

     Returns:
         The soup of the URL.
@@ -166,6 +175,15 @@
     while retries_left > 0 and not html:
         try:
             driver.get(url=url)
+            if xpath_to_be_present:
+                wait = WebDriverWait(driver=driver, timeout=10)
+                wait.until(
+                    method=EC.presence_of_element_located(
+                        locator=(By.XPATH, xpath_to_be_present)
+                    ),
+                )
+            else:
+                time.sleep(5)
             html = driver.page_source
         except TimeoutException:
             logger.warning(f"Timed out while getting soup from {url}.")
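
A note on the utils.py change: when xpath_to_be_present is given, get_soup now uses Selenium's explicit-wait mechanism, where WebDriverWait(...).until(EC.presence_of_element_located(...)) polls the page until an element matching the XPath exists (raising TimeoutException if the timeout elapses first); otherwise it falls back to a fixed five-second sleep before reading page_source. A minimal sketch of that explicit-wait pattern outside the repository's get_soup helper, assuming Selenium 4 (so the driver binary is resolved automatically) and assuming BASE_URL points at the public https://www.sundhed.dk site:

    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait

    options = Options()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://www.sundhed.dk/borger/patienthaandbogen/")

        # Block until the main content container has been rendered by the page's
        # JavaScript, or raise TimeoutException after 10 seconds of polling.
        WebDriverWait(driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='main-content']"))
        )

        soup = BeautifulSoup(driver.page_source, "html.parser")
        print(soup.title)
    finally:
        driver.quit()

Waiting on a concrete element rather than a fixed delay lets fast pages return sooner while still giving slow, JavaScript-rendered pages time to load.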
