CI: Update ruff and cruft template #291

Merged · 4 commits · Dec 26, 2024
2 changes: 1 addition & 1 deletion .cruft.json
@@ -1,6 +1,6 @@
 {
   "template": "https://github.com/KennethEnevoldsen/swift-python-cookiecutter",
-  "commit": "e02068889310225ea4f65ea0b203c2949e1597a9",
+  "commit": "85413085032f305896da8bad287a83d53fb0b196",
   "checkout": null,
   "context": {
     "cookiecutter": {
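Note: .cruft.json pins the cookiecutter template repository and the commit the project was last synced against, so this bump records a re-sync with the upstream template (typically produced by running `cruft update`).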
7 changes: 7 additions & 0 deletions .github/workflows/static_type_checks.yml
@@ -1,3 +1,10 @@
+# THIS ACTION WILL:
+# 1. Install dependencies
+# 2. Run static type checker
+
+# SETUP:
+# None required except for the Makefile
+
 name: static_type_checks
 
 on:
12 changes: 7 additions & 5 deletions .github/workflows/tests.yml
@@ -1,17 +1,19 @@
-# This workflow will:
-# 1) install Python dependencies
-# 2) run make test
+# THIS ACTION WILL:
+# 1) install Python dependencies
+# 2) run make test
+
+# SETUP:
+# None required except for the Makefile
 
-name: Tests
+name: test
 on:
   push:
     branches: [main]
   pull_request:
     branches: [main]
 
 jobs:
-  pytest:
+  test:
     runs-on: ${{ matrix.os }}
     permissions:
       contents: read
16 changes: 9 additions & 7 deletions docs/evaluation/datasets.py
@@ -1,5 +1,7 @@
+from __future__ import annotations
+
 import random
-from typing import Any, Dict, List
+from typing import Any
 
 import augmenty
 import catalogue
@@ -15,7 +17,7 @@
 
 
 @datasets.register("dane")
-def dane() -> Dict[str, List[Example]]:
+def dane() -> dict[str, list[Example]]:
     from dacy.datasets import dane as _dane
 
     train, dev, test = _dane(splits=["train", "dev", "test"])  # type: ignore
@@ -34,7 +36,7 @@ def augment_dataset(
     augmenters: dict,
     n_rep: int = 20,
     split: str = "test",
-) -> List[Example]:
+) -> list[Example]:
     # ensure seed
     random.seed(42)
     np.random.seed(42)
@@ -63,25 +65,25 @@ def augment_dataset(
 
 
 @datasets.register("gender_bias_dane")
-def dane_gender_bias() -> Dict[str, List[Example]]:
+def dane_gender_bias() -> dict[str, list[Example]]:
     return {"test": augment_dataset("dane", augmenters=get_gender_bias_augmenters())}
 
 
 @datasets.register("robustness_dane")
-def dane_robustness() -> Dict[str, List[Example]]:
+def dane_robustness() -> dict[str, list[Example]]:
     return {"test": augment_dataset("dane", augmenters=get_robustness_augmenters())}
 
 
 @datasets.register("dansk")
-def dansk(**kwargs: Any) -> Dict[str, List[Example]]:
+def dansk(**kwargs: Any) -> dict[str, list[Example]]:
     splits = ["train", "dev", "test"]
 
     if not Doc.has_extension("meta"):
         Doc.set_extension("meta", default={}, force=True)
 
     nlp = spacy.blank("da")
 
-    def convert_to_doc(example: Dict) -> Doc:
+    def convert_to_doc(example: dict) -> Doc:
         doc = Doc(nlp.vocab).from_json(example)
         # set metadata
         for k in ["dagw_source", "dagw_domain", "dagw_source_full"]:
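Note: the added `from __future__ import annotations` is what lets the builtin generics above replace `typing.Dict`/`typing.List` even on interpreters older than the ones that evaluate them natively: it turns every annotation into a string that is never evaluated at runtime. A minimal, self-contained sketch of the pattern (illustrative code, not part of this PR):

from __future__ import annotations


def split_examples(examples: list[str]) -> dict[str, list[str]]:
    # Group examples by prefix (a toy stand-in for dataset splits).
    groups: dict[str, list[str]] = {}
    for ex in examples:
        groups.setdefault(ex.split("-")[0], []).append(ex)
    return groups


print(split_examples(["train-1", "train-2", "dev-1", "test-1"]))
# {'train': ['train-1', 'train-2'], 'dev': ['dev-1'], 'test': ['test-1']}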
14 changes: 7 additions & 7 deletions docs/evaluation/utils.py
@@ -1,7 +1,7 @@
 import json
 import random
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Optional
 
 import numpy as np
 import pandas as pd
@@ -15,11 +15,11 @@
 
 
 def bootstrap(
-    examples: List[Example],
+    examples: list[Example],
     n_rep: int = 100,
     n_samples: Optional[int] = None,
     getter: Optional[Callable] = None,
-) -> List[Dict[str, Any]]:
+) -> list[dict[str, Any]]:
     random.seed(42)
     scorer = Scorer()
     scores = []
@@ -35,7 +35,7 @@ def bootstrap(
     return scores
 
 
-def compute_mean_and_ci(scores: List[Dict[str, Any]]) -> Dict[str, Any]:
+def compute_mean_and_ci(scores: list[dict[str, Any]]) -> dict[str, Any]:
     ent_f = [score["ents_f"] for score in scores]
     # filter out None
     ent_f = [x for x in ent_f if x is not None]
@@ -116,7 +116,7 @@ def doc_from_json(json_obj: dict, nlp: Language) -> Doc:
 
 def predictions_to_disk(
     save_path: Path,
-    examples: List[Example],
+    examples: list[Example],
     mdl_name: str,
     time_in_seconds: float,
 ) -> dict:
@@ -199,7 +199,7 @@ def apply_models(
 
 
 def create_dataframe(
-    examples: List[Example],
+    examples: list[Example],
     mdl_name: str,
     decimals: int = 1,
     n_rep: int = 100,
@@ -212,7 +212,7 @@
         "Models": mdl_name,
     }
 
-    def score_to_string(score: Dict[str, Any], decimals: int = 1) -> str:
+    def score_to_string(score: dict[str, Any], decimals: int = 1) -> str:
        if score["mean"] == 0:
            return " "
        return f"{100*score['mean']:.{decimals}f} ({100*score['ci'][0]:.{decimals}f}, {100*score['ci'][1]:.{decimals}f})"
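Note: the `bootstrap`/`compute_mean_and_ci` pair follows the standard resample-then-summarize pattern, and `score_to_string` consumes the resulting `{"mean": ..., "ci": ...}` dict. A minimal sketch of that pattern, assuming a 95% percentile confidence interval (the function bodies are not shown in this diff, so treat the details as illustrative):

import random
from typing import Any

import numpy as np


def bootstrap_means(values: list[float], n_rep: int = 100) -> list[float]:
    # Resample with replacement and record the mean of each pseudo-sample.
    random.seed(42)
    means = []
    for _ in range(n_rep):
        sample = random.choices(values, k=len(values))
        means.append(sum(sample) / len(sample))
    return means


def mean_and_ci(scores: list[float]) -> dict[str, Any]:
    # Summarize bootstrap replicates as a mean plus a 95% percentile interval,
    # i.e. the {"mean": ..., "ci": ...} shape that score_to_string expects.
    lower, upper = np.percentile(scores, [2.5, 97.5])
    return {"mean": float(np.mean(scores)), "ci": (float(lower), float(upper))}


print(mean_and_ci(bootstrap_means([0.81, 0.84, 0.79, 0.88, 0.85])))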
6 changes: 3 additions & 3 deletions makefile
@@ -9,9 +9,9 @@ static-type-check:
 lint:
 	@echo "--- 🧹 Running linters ---"
 	ruff format .  # running ruff formatting
-	ruff src/ --fix  # running ruff linting
-	ruff tests/ --fix
-	ruff docs/conf.py --fix
+	ruff check src/ --fix  # running ruff linting
+	ruff check tests/ --fix
+	ruff check docs/conf.py --fix
 
 test:
 	@echo "--- 🧪 Running tests ---"
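Context for this hunk: newer ruff releases split formatting and linting into explicit subcommands; the bare `ruff <path>` invocation was deprecated in favor of `ruff check <path>` and has since been removed, so the `ruff==0.7.1` pin introduced in pyproject.toml requires the `ruff check` spelling used here.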
@@ -1,5 +1,5 @@
 ### pip install daluke==0.0.5
-from typing import Iterable, List
+from typing import Iterable
 
 from daluke import AutoNERDaLUKE, predict_ner
 from spacy.lang.da import Danish
@@ -18,7 +18,7 @@ def apply_daluke(
     examples: Iterable[Example],
     use_spacy: bool = True,
     batch_size: int = 16,
-) -> List[Example]:
+) -> list[Example]:
     docs_y, sentences = list(), list()
     for example in examples:
         # Tokenization using spacy or nltk
@@ -1,6 +1,6 @@
 # to download the danlp and nerda you will have to set up a certificate:
 import ssl
-from typing import Iterable, List
+from typing import Iterable
 
 from NERDA.precooked import DA_BERT_ML
 from spacy.lang.da import Danish
@@ -17,7 +17,7 @@
 nlp_da = Danish()
 
 
-def apply_nerda(examples: Iterable[Example], use_spacy: bool = True) -> List[Example]:
+def apply_nerda(examples: Iterable[Example], use_spacy: bool = True) -> list[Example]:
     sentences = []
     docs_y = []
     for example in examples:
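Note: the `ssl` import here backs the certificate workaround the top comment alludes to; the usual incantation (shown as an illustration, not code from this repo) disables certificate verification for the model downloads, so it should only be used for trusted hosts:

import ssl

# Allow unverified HTTPS downloads (use only for trusted model hosts).
ssl._create_default_https_context = ssl._create_unverified_context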
@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, List
+from typing import Callable, Iterable
 
 from spacy.tokens import Doc, Span
 from spacy.training import Example
@@ -12,12 +12,12 @@ def no_misc_getter(doc, attr):
         yield span
 
 
-def add_iob(doc: Doc, iob: List[str]) -> Doc:
+def add_iob(doc: Doc, iob: list[str]) -> Doc:
     """Add IOB tags to Doc.
 
     Args:
         doc (Doc): A SpaCy doc
-        iob (List[str]): a list of tokens in the IOB format
+        iob (list[str]): a list of tokens in the IOB format
 
     Returns:
         Doc: A doc with the spans set to the new IOB
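For readers unfamiliar with the helper: `add_iob` turns token-level IOB tags into entity spans on the doc. A sketch of what such a helper can look like with spaCy's own conversion utilities (assumed behavior; the function body is not shown in this diff):

import spacy
from spacy.tokens import Doc
from spacy.training import biluo_tags_to_spans, iob_to_biluo


def add_iob(doc: Doc, iob: list[str]) -> Doc:
    # Convert one IOB tag per token into entity spans and attach them.
    doc.ents = biluo_tags_to_spans(doc, iob_to_biluo(iob))
    return doc


nlp = spacy.blank("da")
doc = add_iob(nlp("Jens bor i København"), ["B-PER", "O", "O", "B-LOC"])
print([(ent.text, ent.label_) for ent in doc.ents])
# [('Jens', 'PER'), ('København', 'LOC')]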
81 changes: 40 additions & 41 deletions pyproject.toml
@@ -48,11 +48,7 @@ repository = "https://github.com/centre-for-humanities-computing/DaCy"
 file = "LICENSE"
 name = "Apache License 2.0"
 [project.optional-dependencies]
-dev = [
-    "cruft>=2.0.0",
-    "pyright>=1.1.339",
-    "ruff>=0.0.270",
-]
+dev = ["cruft>=2.0.0", "pyright>=1.1.339", "ruff==0.7.1"]
 tests = ["pytest>=7.1.2", "pytest-cov>=3.0.0", "pytest-instafail>=0.4.2"]
 docs = [
     "sphinx==5.3.0",
@@ -110,6 +106,40 @@ pythonPlatform = "Darwin"
 
 [tool.ruff]
 # extend-include = ["*.ipynb"]
+
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".hg",
+    ".nox",
+    ".pants.d",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "venv",
+    "__init__.py",
+    ".env",
+    "__pycache__",
+    "dev/**",
+    "training/main/**",
+    "training/ner_fine_grained/**",
+    "papers/DaCy-A-Unified-Framework-for-Danish-NLP/**",
+    "docs/performance_testing_utils/**",
+]
+target-version = "py39"
+
+[tool.ruff.lint]
 # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
 select = [
     "A",
@@ -150,54 +180,23 @@ ignore = [
     "ANN202",
     "COM812",
 ]
-ignore-init-module-imports = true
 # Allow autofix for all enabled rules (when `--fix` is provided).
 unfixable = ["ERA"]
-# Exclude a variety of commonly ignored directories.
-exclude = [
-    ".bzr",
-    ".direnv",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".nox",
-    ".pants.d",
-    ".pytype",
-    ".ruff_cache",
-    ".svn",
-    ".tox",
-    ".venv",
-    "__pypackages__",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-    "node_modules",
-    "venv",
-    "__init__.py",
-    ".venv",
-    ".env",
-    ".git",
-    "__pycache__",
-    "dev/**",
-    "training/main/**",
-    "training/ner_fine_grained/**",
-    "papers/DaCy-A-Unified-Framework-for-Danish-NLP/**",
-    "docs/performance_testing_utils/**",
-]
 
 
 # Allow unused variables when underscore-prefixed.
 dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
-target-version = "py38"
 
 
 [tool.ruff.lint.pydocstyle]
 convention = "google"
 
-[tool.ruff.flake8-annotations]
+[tool.ruff.lint.flake8-annotations]
 mypy-init-return = true
 suppress-none-returning = true
 
 
-[tool.ruff.mccabe]
+[tool.ruff.lint.mccabe]
 # Unlike Flake8, default to a complexity level of 10.
 max-complexity = 10
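The restructuring above tracks two changes in recent ruff releases: lint-specific settings (including the `flake8-annotations` and `mccabe` sections) moved from the top-level `[tool.ruff]` table into the `[tool.ruff.lint]` namespace, and `ignore-init-module-imports` was deprecated, which is presumably why it is dropped here while `__init__.py` stays in the `exclude` list. The `target-version` bump from py38 to py39 matches the builtin-generics cleanup in the Python files, and top-level settings like `exclude` and `target-version` remain directly under `[tool.ruff]`.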
10 changes: 5 additions & 5 deletions src/dacy/datasets/dane.py
@@ -5,7 +5,7 @@
 import sys
 from os import PathLike
 from pathlib import Path
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 from spacy.training.corpus import Corpus
 
@@ -15,19 +15,19 @@
 
 def dane(  # noqa
     save_path: Optional[PathLike] = None,  # type: ignore
-    splits: List[str] = ["train", "dev", "test"],  # noqa # type: ignore
+    splits: list[str] = ["train", "dev", "test"],  # noqa # type: ignore
     redownload: bool = False,
     n_sents: int = 1,
     open_unverified_connection: bool = False,
     **kwargs,  # noqa
-) -> Union[List[Corpus], Corpus]:  # type: ignore
+) -> Union[list[Corpus], Corpus]:  # type: ignore
     """Reads the DaNE dataset as a spaCy Corpus.
 
     Args:
         save_path (str, optional): Path to the DaNE dataset. If it does not contain
             the dataset, it is downloaded to the folder. Defaults to None,
             corresponding to dacy.where_is_my_dacy() in the datasets subfolder.
-        splits (List[str], optional): Which splits of the dataset should be returned.
+        splits (list[str], optional): Which splits of the dataset should be returned.
             Possible options include "train", "dev", "test", "all". Defaults to
             ["train", "dev", "test"].
         redownload (bool, optional): Should the dataset be redownloaded. Defaults to
@@ -40,7 +40,7 @@ def dane(  # noqa
             whether it already exists. Defaults to False.
 
     Returns:
-        Union[List[Corpus], Corpus]: Returns a SpaCy corpus or a list thereof.
+        Union[list[Corpus], Corpus]: Returns a SpaCy corpus or a list thereof.
 
     Example:
         >>> from dacy.datasets import dane
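One detail worth noting in this file: `Optional` and `Union` survive the cleanup while `List` does not. PEP 585 builtin generics such as `list[Corpus]` evaluate at runtime from Python 3.9 onward (matching the new `target-version = "py39"`), but the PEP 604 union spelling `Corpus | list[Corpus]` only works at runtime from Python 3.10. A small self-contained illustration (hypothetical code, not from the repository):

from typing import Optional, Union


# Fine on Python 3.9: builtin generics inside typing constructs.
def load(splits: Optional[list[str]] = None) -> Union[list[str], str]:
    splits = splits if splits is not None else ["train", "dev", "test"]
    # Mirror the dane() convention: a single split is returned unwrapped.
    return splits if len(splits) > 1 else splits[0]


print(load(["train"]))  # -> 'train'
print(load())           # -> ['train', 'dev', 'test']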