feat: automatic testing (#10)

This pull request introduces several key changes to the `torah-dl` project, focusing on adding extraction examples, refactoring tests, and updating the documentation. The most important changes include the addition of `ExtractionExample` to the extractors, refactoring the test cases to use a utility function, and updating the README to reflect new installation instructions.

### Addition of Extraction Examples:

* [`src/torah_dl/core/models.py`](diffhunk://#diff-bd6461e7b33d76c6137eb081e59f3d0f353d71fff6ae99517a05da4b67414888R18-R33): Introduced the `ExtractionExample` class to represent examples of extractions.
* `src/torah_dl/core/extractors/torahanytime.py` and `src/torah_dl/core/extractors/yutorah.py`: Added `EXAMPLES` to the `TorahAnytimeExtractor` and `YutorahExtractor` classes. [[1]](diffhunk://#diff-486bc73189a83c28718dbda4eba49a03447777316f24e3f32b8c66697fcca9b3R17-R43) [[2]](diffhunk://#diff-7cbf568efb6c85f5171fccc0b14a6d968e3ab3d66cbb621ab647838512e89ffbR18-R52)

### Refactoring Tests:

* `test/test_core/test_extract.py` and `test/test_core/test_extractors.py`: Refactored test cases to use a new utility function `get_all_the_tests` for generating test parameters dynamically. [[1]](diffhunk://#diff-b6ecd03048aa23fc1bdcdc6b6fd9dda5268afc96384fe3a856b5b2d6a3c4e3f2R2-R9) [[2]](diffhunk://#diff-9377c07ea99c3f7acaea9eccb35d215968f8df9a2f84bafedf9a5097d21ada3bR2-L86)
* [`test/test_core/utils.py`](diffhunk://#diff-656800ca19015571b1ea0e3ac99a5ddc024001df33b7dfce45c1c2c2573e28b4R1-R37): Added a new utility function `get_all_the_tests` to dynamically generate test parameters from extractor examples.

### Documentation Update:

* [`README.md`](diffhunk://#diff-b335630551682c19a781afebcf4d07bf978fb1f8ac04c6bf87428ed5106870f5L14-R14): Updated the installation instructions to reflect the new command-line tool usage with `uv`.
SoferAi · Dec 12, 2024 · 7880a40 · 7880a40
1 parent b388308
commit 7880a40
Show file tree

Hide file tree

Showing 7 changed files with 130 additions and 97 deletions.
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@
 Most of our generation's Torah content is locked up in websites that are not easily accessible. Sofer.Ai is building a platform to make Torah accessible to everyone, and we decided to build key parts of that platform in the open. We intend to support every website with Torah audio on our platform, and realized quickly that even finding all the sites with audio would be a full-time job. So we open-sourced `torah-dl` to make it easier for others to download Torah audio from any website, and make Torah more accessible!
 
 ## How does it work?
-`torah-dl` is a library and a set of command-line tools for downloading media from Torah websites. You can use it as a command-line tool with `uvx` (preferred), `pipx`, `pip`, `poetry`, `venv`, or any Python tool installer of your choice, simply by running `uvx torah-dl`.
+`torah-dl` is a library and a set of command-line tools for downloading media from Torah websites. You can use it as a command-line tool with `uv` (preferred), `pipx`, `pip`, `poetry`, `venv`, or any Python tool installer of your choice, simply by running `uv tool install "torah-dl[cli]"`, and then running `torah-dl`.
 
 For those who want to integrate `torah-dl` into their Python application, you can simply install it via `uv add torah-dl` or `pip install torah-dl`. You can then use the library in your code as you would any other Python library:
 

diff --git a/src/torah_dl/core/extractors/torahanytime.py b/src/torah_dl/core/extractors/torahanytime.py
@@ -4,7 +4,7 @@
 import requests
 
 from ..exceptions import ContentExtractionError, DownloadURLError, NetworkError, TitleExtractionError
-from ..models import Extraction, Extractor
+from ..models import Extraction, ExtractionExample, Extractor
 
 
 class TorahAnytimeExtractor(Extractor):
@@ -14,6 +14,33 @@ class TorahAnytimeExtractor(Extractor):
     links along with their associated titles from the page's JavaScript content.
     """
 
+    EXAMPLES = [  # noqa: RUF012
+        ExtractionExample(
+            name="main_page",
+            url="https://torahanytime.com/lectures/335042",
+            download_url="https://dl.torahanytime.com/mp3/335042--____10_04_2024__ee9743cb-5d09-4ffc-a3e3-1156e10e8944.mp4.mp3",
+            title="Aish Kodesh- Toldot, 5702, When It's Hard to Thank Hashem (2021/22 Series- Enhanced III)",
+            file_format="mp3",
+            valid=True,
+        ),
+        ExtractionExample(
+            name="short_link",
+            url="https://MyTAT.me/a335042",
+            download_url="https://dl.torahanytime.com/mp3/335042--____10_04_2024__ee9743cb-5d09-4ffc-a3e3-1156e10e8944.mp4.mp3",
+            title="Aish Kodesh- Toldot, 5702, When It's Hard to Thank Hashem (2021/22 Series- Enhanced III)",
+            file_format="mp3",
+            valid=True,
+        ),
+        ExtractionExample(
+            name="invalid_link",
+            url="https://torahanytime.com/whatever/0000000",
+            download_url="",
+            title="",
+            file_format="",
+            valid=False,
+        ),
+    ]
+
     # URL pattern for TorahAnytime.com pages
     URL_PATTERN = re.compile(r"https?://(?:www\.)?torahanytime\.com/")
     # URL pattern for MyTAT.me pages

diff --git a/src/torah_dl/core/extractors/yutorah.py b/src/torah_dl/core/extractors/yutorah.py
@@ -5,7 +5,7 @@
 from bs4 import BeautifulSoup
 
 from ..exceptions import ContentExtractionError, DownloadURLError, NetworkError, TitleExtractionError
-from ..models import Extraction, Extractor
+from ..models import Extraction, ExtractionExample, Extractor
 
 
 class YutorahExtractor(Extractor):
@@ -15,6 +15,41 @@ class YutorahExtractor(Extractor):
     links along with their associated titles from the page's JavaScript content.
     """
 
+    EXAMPLES = [  # noqa: RUF012
+        ExtractionExample(
+            name="main_page",
+            url="https://www.yutorah.org/lectures/1116616/Praying-for-Rain-and-the-International-Traveler",
+            download_url="https://download.yutorah.org/2024/986/1116616/praying-for-rain-and-the-international-traveler.mp3",
+            title="Praying for Rain and the International Traveler",
+            file_format="mp3",
+            valid=True,
+        ),
+        ExtractionExample(
+            name="short_link",
+            url="https://www.yutorah.org/lectures/1117459/",
+            download_url="https://download.yutorah.org/2024/986/1117459/davening-with-strep-throat.mp3",
+            title="Davening with Strep Throat",
+            file_format="mp3",
+            valid=True,
+        ),
+        ExtractionExample(
+            name="shiurid_link",
+            url="https://www.yutorah.org/lectures/details?shiurid=1117409",
+            download_url="https://download.yutorah.org/2024/21197/1117409/ketubot-42-dechitat-aveilut-1.mp3",
+            title="Ketubot 42: Dechitat Aveilut (1)",
+            file_format="mp3",
+            valid=True,
+        ),
+        ExtractionExample(
+            name="invalid_link",
+            url="https://www.yutorah.org/lectures/details?shiurid=0000000",
+            download_url="",
+            title="",
+            file_format="",
+            valid=False,
+        ),
+    ]
+
     # URL pattern for YUTorah.org pages
     URL_PATTERN = re.compile(r"https?://(?:www\.)?yutorah\.org/")
 

diff --git a/src/torah_dl/core/models.py b/src/torah_dl/core/models.py
@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
 from re import Pattern
+from typing import ClassVar
 
 from pydantic import BaseModel
 
@@ -14,9 +15,22 @@ class Extraction(BaseModel):
     # Add other common fields that all extractions should have
 
 
+class ExtractionExample(BaseModel):
+    """Represents an example of an extraction."""
+
+    name: str
+    url: str
+    download_url: str
+    title: str
+    file_format: str
+    valid: bool
+
+
 class Extractor(ABC):
     """Abstract base class for all extractors."""
 
+    EXAMPLES: ClassVar[list[ExtractionExample]] = []
+
     @property
     @abstractmethod
     def url_patterns(self) -> Pattern | list[Pattern]:

diff --git a/test/test_core/test_extract.py b/test/test_core/test_extract.py
@@ -1,28 +1,12 @@
 import pytest
+from utils import get_all_the_tests
 
 from torah_dl import extract
 from torah_dl.core.exceptions import ExtractorNotFoundError
 
-testdata = [
-    pytest.param(
-        "https://www.yutorah.org/lectures/1116616/Praying-for-Rain-and-the-International-Traveler",
-        "https://download.yutorah.org/2024/986/1116616/praying-for-rain-and-the-international-traveler.mp3",
-        "Praying for Rain and the International Traveler",
-        "mp3",
-        id="yutorah",
-    ),
-    pytest.param(
-        "https://torahanytime.com/lectures/335042",
-        "https://dl.torahanytime.com/mp3/335042--____10_04_2024__ee9743cb-5d09-4ffc-a3e3-1156e10e8944.mp4.mp3",
-        "Aish Kodesh- Toldot, 5702, When It's Hard to Thank Hashem (2021/22 Series- Enhanced III)",
-        "mp3",
-        id="torahanytime",
-    ),
-]
 
-
-@pytest.mark.parametrize("url, download_url, title, file_format", testdata)
-def test_extract(url: str, download_url: str, title: str, file_format: str):
+@pytest.mark.parametrize("extractor, url, download_url, title, file_format, valid", get_all_the_tests(only_valid=True))
+def test_extract(extractor, url: str, download_url: str, title: str, file_format: str, valid: bool):
     extraction = extract(url)
     assert extraction.download_url == download_url
     assert extraction.title == title

diff --git a/test/test_core/test_extractors.py b/test/test_core/test_extractors.py
@@ -1,86 +1,22 @@
 import pytest
+from utils import get_all_the_tests
 
 from torah_dl.core.exceptions import NetworkError
+from torah_dl.core.models import Extractor
 
 
-class TestYutorahExtractor:
-    from torah_dl.core.extractors import YutorahExtractor
+@pytest.mark.parametrize("extractor, url, download_url, title, file_format, valid", get_all_the_tests())
+def test_can_handle(extractor: Extractor, url: str, download_url: str, title: str, file_format: str, valid: bool):
+    assert extractor.can_handle(url)
 
-    extractor = YutorahExtractor()
 
-    testdata = [
-        pytest.param(
-            "https://www.yutorah.org/lectures/1116616/Praying-for-Rain-and-the-International-Traveler",
-            "https://download.yutorah.org/2024/986/1116616/praying-for-rain-and-the-international-traveler.mp3",
-            "Praying for Rain and the International Traveler",
-            "mp3",
-            id="main_page",
-        ),
-        pytest.param(
-            "https://www.yutorah.org/lectures/1117459/",
-            "https://download.yutorah.org/2024/986/1117459/davening-with-strep-throat.mp3",
-            "Davening with Strep Throat",
-            "mp3",
-            id="short_link",
-        ),
-        pytest.param(
-            "https://www.yutorah.org/lectures/details?shiurid=1117409",
-            "https://download.yutorah.org/2024/21197/1117409/ketubot-42-dechitat-aveilut-1.mp3",
-            "Ketubot 42: Dechitat Aveilut (1)",
-            "mp3",
-            id="shiurid_link",
-        ),
-    ]
-
-    @pytest.mark.parametrize("url, download_url, title, file_format", testdata)
-    def test_can_handle(self, url: str, download_url: str, title: str, file_format: str):
-        assert self.extractor.can_handle(url)
-
-    @pytest.mark.parametrize("url, download_url, title, file_format", testdata)
-    def test_extract(self, url: str, download_url: str, title: str, file_format: str):
-        result = self.extractor.extract(url)
-        assert result.download_url == download_url
-        assert result.title == title
-        assert result.file_format == file_format
-
-    def test_extract_invalid_link(self):
+@pytest.mark.parametrize("extractor, url, download_url, title, file_format, valid", get_all_the_tests())
+def test_extract(extractor: Extractor, url: str, download_url: str, title: str, file_format: str, valid: bool):
+    if not valid:
         with pytest.raises(NetworkError):
-            self.extractor.extract("https://www.yutorah.org/lectures/details?shiurid=0000000/")
-
-
-class TestTorahAnytimeExtractor:
-    from torah_dl.core.extractors import TorahAnytimeExtractor
-
-    extractor = TorahAnytimeExtractor()
-
-    testdata = [
-        pytest.param(
-            "https://torahanytime.com/lectures/335042",
-            "https://dl.torahanytime.com/mp3/335042--____10_04_2024__ee9743cb-5d09-4ffc-a3e3-1156e10e8944.mp4.mp3",
-            "Aish Kodesh- Toldot, 5702, When It's Hard to Thank Hashem (2021/22 Series- Enhanced III)",
-            "mp3",
-            id="main_page",
-        ),
-        pytest.param(
-            "https://MyTAT.me/a335042",
-            "https://dl.torahanytime.com/mp3/335042--____10_04_2024__ee9743cb-5d09-4ffc-a3e3-1156e10e8944.mp4.mp3",
-            "Aish Kodesh- Toldot, 5702, When It's Hard to Thank Hashem (2021/22 Series- Enhanced III)",
-            "mp3",
-            id="short_link",
-        ),
-    ]
-
-    @pytest.mark.parametrize("url, download_url, title, file_format", testdata)
-    def test_can_handle(self, url: str, download_url: str, title: str, file_format: str):
-        assert self.extractor.can_handle(url)
-
-    @pytest.mark.parametrize("url, download_url, title, file_format", testdata)
-    def test_extract(self, url: str, download_url: str, title: str, file_format: str):
-        result = self.extractor.extract(url)
+            extractor.extract(url)
+    else:
+        result = extractor.extract(url)
         assert result.download_url == download_url
         assert result.title == title
         assert result.file_format == file_format
-
-    def test_extract_invalid_link(self):
-        with pytest.raises(NetworkError):
-            self.extractor.extract("https://torahanytime.com/whatever/0000000")
diff --git a/test/test_core/utils.py b/test/test_core/utils.py
@@ -0,0 +1,37 @@
+import importlib
+import inspect
+from pathlib import Path
+
+import pytest
+
+from torah_dl.core.models import Extractor
+
+
+def get_all_the_tests(only_valid: bool = False) -> list[pytest.param]:
+    extractors_path = Path(__file__).parent.parent.parent / "src" / "torah_dl" / "core" / "extractors"
+    tests: list[pytest.param] = []
+
+    for file in extractors_path.glob("*.py"):
+        if file.stem in ["__init__", "base"]:
+            continue
+
+        module_path = f"torah_dl.core.extractors.{file.stem}"
+        module = importlib.import_module(module_path)
+
+        for _, obj in inspect.getmembers(module):
+            if inspect.isclass(obj) and issubclass(obj, Extractor) and obj != Extractor:
+                for ex in obj.EXAMPLES:
+                    if only_valid and not ex.valid:
+                        continue
+                    tests.append(
+                        pytest.param(
+                            obj(),
+                            ex.url,
+                            ex.download_url,
+                            ex.title,
+                            ex.file_format,
+                            ex.valid,
+                            id=f"{obj.__name__}.{ex.name}",
+                        )
+                    )
+    return tests