diff --git a/.cruft.json b/.cruft.json index 67f6cef3..990d73f1 100644 --- a/.cruft.json +++ b/.cruft.json @@ -1,6 +1,6 @@ { "template": "https://github.com/KennethEnevoldsen/swift-python-cookiecutter", - "commit": "e02068889310225ea4f65ea0b203c2949e1597a9", + "commit": "85413085032f305896da8bad287a83d53fb0b196", "checkout": null, "context": { "cookiecutter": { diff --git a/.github/workflows/static_type_checks.yml b/.github/workflows/static_type_checks.yml index 9975945a..019338c3 100644 --- a/.github/workflows/static_type_checks.yml +++ b/.github/workflows/static_type_checks.yml @@ -1,3 +1,10 @@ +# THIS ACTION WILL: + # 1. Install dependencies + # 2. Run static type checker + +# SETUP: + # None required except for the Makefile + name: static_type_checks on: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3a1814c5..a06f89fe 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,9 +1,11 @@ -# This workflow will: -# 1) install Python dependencies -# 2) run make test +# THIS ACTION WILL: + # 1) install Python dependencies + # 2) run make test +# SETUP: + # None required except for the Makefile -name: Tests +name: test on: push: branches: [main] @@ -11,7 +13,7 @@ on: branches: [main] jobs: - pytest: + test: runs-on: ${{ matrix.os }} permissions: contents: read diff --git a/docs/evaluation/datasets.py b/docs/evaluation/datasets.py index 418aea4c..9521da69 100644 --- a/docs/evaluation/datasets.py +++ b/docs/evaluation/datasets.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import random -from typing import Any, Dict, List +from typing import Any import augmenty import catalogue @@ -15,7 +17,7 @@ @datasets.register("dane") -def dane() -> Dict[str, List[Example]]: +def dane() -> dict[str, list[Example]]: from dacy.datasets import dane as _dane train, dev, test = _dane(splits=["train", "dev", "test"]) # type: ignore @@ -34,7 +36,7 @@ def augment_dataset( augmenters: dict, n_rep: int = 20, split: str = "test", 
-) -> List[Example]: +) -> list[Example]: # ensure seed random.seed(42) np.random.seed(42) @@ -63,17 +65,17 @@ def augment_dataset( @datasets.register("gender_bias_dane") -def dane_gender_bias() -> Dict[str, List[Example]]: +def dane_gender_bias() -> dict[str, list[Example]]: return {"test": augment_dataset("dane", augmenters=get_gender_bias_augmenters())} @datasets.register("robustness_dane") -def dane_robustness() -> Dict[str, List[Example]]: +def dane_robustness() -> dict[str, list[Example]]: return {"test": augment_dataset("dane", augmenters=get_robustness_augmenters())} @datasets.register("dansk") -def dansk(**kwargs: Any) -> Dict[str, List[Example]]: +def dansk(**kwargs: Any) -> dict[str, list[Example]]: splits = ["train", "dev", "test"] if not Doc.has_extension("meta"): @@ -81,7 +83,7 @@ def dansk(**kwargs: Any) -> Dict[str, List[Example]]: nlp = spacy.blank("da") - def convert_to_doc(example: Dict) -> Doc: + def convert_to_doc(example: dict) -> Doc: doc = Doc(nlp.vocab).from_json(example) # set metadata for k in ["dagw_source", "dagw_domain", "dagw_source_full"]: diff --git a/docs/evaluation/utils.py b/docs/evaluation/utils.py index fb78287f..e47b696a 100644 --- a/docs/evaluation/utils.py +++ b/docs/evaluation/utils.py @@ -1,7 +1,7 @@ import json import random from pathlib import Path -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Optional import numpy as np import pandas as pd @@ -15,11 +15,11 @@ def bootstrap( - examples: List[Example], + examples: list[Example], n_rep: int = 100, n_samples: Optional[int] = None, getter: Optional[Callable] = None, -) -> List[Dict[str, Any]]: +) -> list[dict[str, Any]]: random.seed(42) scorer = Scorer() scores = [] @@ -35,7 +35,7 @@ def bootstrap( return scores -def compute_mean_and_ci(scores: List[Dict[str, Any]]) -> Dict[str, Any]: +def compute_mean_and_ci(scores: list[dict[str, Any]]) -> dict[str, Any]: ent_f = [score["ents_f"] for score in scores] # filter out None ent_f = [x 
for x in ent_f if x is not None] @@ -116,7 +116,7 @@ def doc_from_json(json_obj: dict, nlp: Language) -> Doc: def predictions_to_disk( save_path: Path, - examples: List[Example], + examples: list[Example], mdl_name: str, time_in_seconds: float, ) -> dict: @@ -199,7 +199,7 @@ def apply_models( def create_dataframe( - examples: List[Example], + examples: list[Example], mdl_name: str, decimals: int = 1, n_rep: int = 100, @@ -212,7 +212,7 @@ def create_dataframe( "Models": mdl_name, } - def score_to_string(score: Dict[str, Any], decimals: int = 1) -> str: + def score_to_string(score: dict[str, Any], decimals: int = 1) -> str: if score["mean"] == 0: return " " return f"{100*score['mean']:.{decimals}f} ({100*score['ci'][0]:.{decimals}f}, {100*score['ci'][1]:.{decimals}f})" diff --git a/makefile b/makefile index 4c3c83a3..2d6286f9 100644 --- a/makefile +++ b/makefile @@ -9,9 +9,9 @@ static-type-check: lint: @echo "--- 🧹 Running linters ---" ruff format . # running ruff formatting - ruff src/ --fix # running ruff linting - ruff tests/ --fix - ruff docs/conf.py --fix + ruff check src/ --fix # running ruff linting + ruff check tests/ --fix + ruff check docs/conf.py --fix test: @echo "--- 🧪 Running tests ---" diff --git a/papers/DaCy-A-Unified-Framework-for-Danish-NLP/apply_fns/apply_fn_daluke.py b/papers/DaCy-A-Unified-Framework-for-Danish-NLP/apply_fns/apply_fn_daluke.py index 987c94df..c0654748 100644 --- a/papers/DaCy-A-Unified-Framework-for-Danish-NLP/apply_fns/apply_fn_daluke.py +++ b/papers/DaCy-A-Unified-Framework-for-Danish-NLP/apply_fns/apply_fn_daluke.py @@ -1,5 +1,5 @@ ### pip install daluke==0.0.5 -from typing import Iterable, List +from typing import Iterable from daluke import AutoNERDaLUKE, predict_ner from spacy.lang.da import Danish @@ -18,7 +18,7 @@ def apply_daluke( examples: Iterable[Example], use_spacy: bool = True, batch_size: int = 16, -) -> List[Example]: +) -> list[Example]: docs_y, sentences = list(), list() for example in examples: # Tokenization 
using spacy or nltk diff --git a/papers/DaCy-A-Unified-Framework-for-Danish-NLP/apply_fns/apply_fn_nerda.py b/papers/DaCy-A-Unified-Framework-for-Danish-NLP/apply_fns/apply_fn_nerda.py index caf2f3cb..46d26f84 100644 --- a/papers/DaCy-A-Unified-Framework-for-Danish-NLP/apply_fns/apply_fn_nerda.py +++ b/papers/DaCy-A-Unified-Framework-for-Danish-NLP/apply_fns/apply_fn_nerda.py @@ -1,6 +1,6 @@ # to download the danlp and nerda you will have to set up a certificate: import ssl -from typing import Iterable, List +from typing import Iterable from NERDA.precooked import DA_BERT_ML from spacy.lang.da import Danish @@ -17,7 +17,7 @@ nlp_da = Danish() -def apply_nerda(examples: Iterable[Example], use_spacy: bool = True) -> List[Example]: +def apply_nerda(examples: Iterable[Example], use_spacy: bool = True) -> list[Example]: sentences = [] docs_y = [] for example in examples: diff --git a/papers/DaCy-A-Unified-Framework-for-Danish-NLP/apply_fns/apply_fn_utils.py b/papers/DaCy-A-Unified-Framework-for-Danish-NLP/apply_fns/apply_fn_utils.py index d88200b8..9b6aaede 100644 --- a/papers/DaCy-A-Unified-Framework-for-Danish-NLP/apply_fns/apply_fn_utils.py +++ b/papers/DaCy-A-Unified-Framework-for-Danish-NLP/apply_fns/apply_fn_utils.py @@ -1,4 +1,4 @@ -from typing import Callable, Iterable, List +from typing import Callable, Iterable from spacy.tokens import Doc, Span from spacy.training import Example @@ -12,12 +12,12 @@ def no_misc_getter(doc, attr): yield span -def add_iob(doc: Doc, iob: List[str]) -> Doc: +def add_iob(doc: Doc, iob: list[str]) -> Doc: """Add iob tags to Doc. 
Args: doc (Doc): A SpaCy doc - iob (List[str]): a list of tokens on the IOB format + iob (list[str]): a list of tokens on the IOB format Returns: Doc: A doc with the spans to the new IOB diff --git a/pyproject.toml b/pyproject.toml index 1e0d4055..20be51b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,11 +48,7 @@ repository = "https://github.com/centre-for-humanities-computing/DaCy" file = "LICENSE" name = "Apache License 2.0" [project.optional-dependencies] -dev = [ - "cruft>=2.0.0", - "pyright>=1.1.339", - "ruff>=0.0.270", -] +dev = ["cruft>=2.0.0", "pyright>=1.1.339", "ruff==0.7.1"] tests = ["pytest>=7.1.2", "pytest-cov>=3.0.0", "pytest-instafail>=0.4.2"] docs = [ "sphinx==5.3.0", @@ -110,6 +106,40 @@ pythonPlatform = "Darwin" [tool.ruff] # extend-include = ["*.ipynb"] + +# Exclude a variety of commonly ignored directories. +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".hg", + ".nox", + ".pants.d", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "venv", + "__init__.py", + ".env", + "__pycache__", + "dev/**", + "training/main/**", + "training/ner_fine_grained/**", + "papers/DaCy-A-Unified-Framework-for-Danish-NLP/**", + "docs/performance_testing_utils/**", +] +target-version = "py39" + +[tool.ruff.lint] # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. select = [ "A", @@ -150,54 +180,23 @@ ignore = [ "ANN202", "COM812", ] -ignore-init-module-imports = true # Allow autofix for all enabled rules (when `--fix`) is provided. unfixable = ["ERA"] -# Exclude a variety of commonly ignored directories. 
-exclude = [ - ".bzr", - ".direnv", - ".eggs", - ".git", - ".hg", - ".nox", - ".pants.d", - ".pytype", - ".ruff_cache", - ".svn", - ".tox", - ".venv", - "__pypackages__", - "_build", - "buck-out", - "build", - "dist", - "node_modules", - "venv", - "__init__.py", - ".venv", - ".env", - ".git", - "__pycache__", - "dev/**", - "training/main/**", - "training/ner_fine_grained/**", - "papers/DaCy-A-Unified-Framework-for-Danish-NLP/**", - "docs/performance_testing_utils/**", -] + + # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" -target-version = "py38" + [tool.ruff.lint.pydocstyle] convention = "google" -[tool.ruff.flake8-annotations] +[tool.ruff.lint.flake8-annotations] mypy-init-return = true suppress-none-returning = true -[tool.ruff.mccabe] +[tool.ruff.lint.mccabe] # Unlike Flake8, default to a complexity level of 10. max-complexity = 10 diff --git a/src/dacy/datasets/dane.py b/src/dacy/datasets/dane.py index f851caae..088a85e1 100644 --- a/src/dacy/datasets/dane.py +++ b/src/dacy/datasets/dane.py @@ -5,7 +5,7 @@ import sys from os import PathLike from pathlib import Path -from typing import List, Optional, Union +from typing import Optional, Union from spacy.training.corpus import Corpus @@ -15,19 +15,19 @@ def dane( # noqa save_path: Optional[PathLike] = None, # type: ignore - splits: List[str] = ["train", "dev", "test"], # noqa # type: ignore + splits: list[str] = ["train", "dev", "test"], # noqa # type: ignore redownload: bool = False, n_sents: int = 1, open_unverified_connection: bool = False, **kwargs, # noqa -) -> Union[List[Corpus], Corpus]: # type: ignore +) -> Union[list[Corpus], Corpus]: # type: ignore """Reads the DaNE dataset as a spacy Corpus. Args: save_path (str, optional): Path to the DaNE dataset If it does not contain the dataset it is downloaded to the folder. Defaults to None corresponding to dacy.where_is_my_dacy() in the datasets subfolder. 
- splits (List[str], optional): Which splits of the dataset should be returned. + splits (list[str], optional): Which splits of the dataset should be returned. Possible options include "train", "dev", "test", "all". Defaults to ["train", "dev", "test"]. redownload (bool, optional): Should the dataset be redownloaded. Defaults to @@ -40,7 +40,7 @@ def dane( # noqa whether it already exists. Defaults to False. Returns: - Union[List[Corpus], Corpus]: Returns a SpaCy corpus or a list thereof. + Union[list[Corpus], Corpus]: Returns a SpaCy corpus or a list thereof. Example: >>> from dacy.datasets import dane diff --git a/src/dacy/datasets/names.py b/src/dacy/datasets/names.py index 4878fb0e..9c3a1ef2 100644 --- a/src/dacy/datasets/names.py +++ b/src/dacy/datasets/names.py @@ -1,7 +1,7 @@ """Helper functions for loading name dictionaries for person augmentation.""" import os -from typing import Dict, List, Optional +from typing import Optional import pandas as pd @@ -11,7 +11,7 @@ def load_names( ethnicity: Optional[str] = None, # type: ignore gender: Optional[str] = None, # type: ignore min_prop_gender: float = 0, -) -> Dict[str, List[str]]: # type: ignore +) -> dict[str, list[str]]: # type: ignore """Loads the names lookup table. Danish are from Danmarks statistik (2021). Muslim names are from Meldgaard (2005), https://nors.ku.dk/publikationer/webpublikationer/muslimske_fornavne/. @@ -30,7 +30,7 @@ def load_names( gender is set. Defaults to 0. Returns: - Dict[str, List[str]]: A dictionary of names containing the keys + dict[str, list[str]]: A dictionary of names containing the keys "first_name" and "last_name". """ path = os.path.join( # noqa @@ -64,11 +64,11 @@ def load_names( } -def muslim_names() -> Dict[str, List[str]]: # type: ignore +def muslim_names() -> dict[str, list[str]]: # type: ignore """Returns a dictionary of Muslim names. 
Returns: - Dict[str, List[str]]: A dictionary of Muslim names containing the keys + dict[str, list[str]]: A dictionary of Muslim names containing the keys "first_name" and "last_name". The list is derived from Meldgaard (2005), https://nors.ku.dk/publikationer/webpublikationer/muslimske_fornavne/. @@ -81,11 +81,11 @@ def muslim_names() -> Dict[str, List[str]]: # type: ignore return load_names(ethnicity="muslim") -def danish_names() -> Dict[str, List[str]]: # type: ignore +def danish_names() -> dict[str, list[str]]: # type: ignore """Returns a dictionary of Danish names. Returns: - Dict[str, List[str]]: A dictionary of Danish names containing the keys + dict[str, list[str]]: A dictionary of Danish names containing the keys "first_name" and "last_name". The list is derived from Danmarks statistik (2021). @@ -98,11 +98,11 @@ def danish_names() -> Dict[str, List[str]]: # type: ignore return load_names(ethnicity="danish") -def female_names() -> Dict[str, List[str]]: # type: ignore +def female_names() -> dict[str, list[str]]: # type: ignore """Returns a dictionary of Danish female names. Returns: - Dict[str, List[str]]: A dictionary of names containing the keys "first_name" + dict[str, list[str]]: A dictionary of names containing the keys "first_name" and "last_name". The list is derived from Danmarks statistik (2021). Example: @@ -114,11 +114,11 @@ def female_names() -> Dict[str, List[str]]: # type: ignore return load_names(ethnicity="danish", gender="female", min_prop_gender=0.5) -def male_names() -> Dict[str, List[str]]: # type: ignore +def male_names() -> dict[str, list[str]]: # type: ignore """Returns a dictionary of Danish male names. Returns: - Dict[str, List[str]]: A dictionary of names containing the keys "first_name" + dict[str, list[str]]: A dictionary of names containing the keys "first_name" and "last_name". The list is derived from Danmarks statistik (2021). 
Example: diff --git a/src/dacy/download.py b/src/dacy/download.py index 7f8a7efe..0713f2da 100644 --- a/src/dacy/download.py +++ b/src/dacy/download.py @@ -1,4 +1,5 @@ """Functions for downloading DaCy models.""" + import os from importlib.metadata import version from pathlib import Path diff --git a/src/dacy/hate_speech/wrapped_models.py b/src/dacy/hate_speech/wrapped_models.py index 86383ea0..29168aed 100644 --- a/src/dacy/hate_speech/wrapped_models.py +++ b/src/dacy/hate_speech/wrapped_models.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Optional +from typing import Callable, Optional from warnings import warn from spacy.lang.da import Danish @@ -75,12 +75,12 @@ def make_offensive_transformer( nlp: Language, name: str, - model: Model[List[Doc], FullTransformerBatch], - set_extra_annotations: Callable[[List[Doc], FullTransformerBatch], None], # type: ignore + model: Model[list[Doc], FullTransformerBatch], + set_extra_annotations: Callable[[list[Doc], FullTransformerBatch], None], # type: ignore max_batch_items: int, doc_extension_trf_data: str, doc_extension_prediction: str, - labels: List[str], # type: ignore + labels: list[str], # type: ignore ) -> SequenceClassificationTransformer: if not Doc.has_extension("is_offensive"): warn( diff --git a/src/dacy/load.py b/src/dacy/load.py index 52cedb6e..0d7e2a0d 100644 --- a/src/dacy/load.py +++ b/src/dacy/load.py @@ -1,4 +1,5 @@ """Functionality for loading and locating DaCy models.""" + import warnings from pathlib import Path from typing import Any, Union diff --git a/src/dacy/score/input_length.py b/src/dacy/score/input_length.py index 32007a70..b49568d9 100644 --- a/src/dacy/score/input_length.py +++ b/src/dacy/score/input_length.py @@ -1,6 +1,7 @@ """Contains functions for testing the performance of models on varying input length.""" -from typing import Callable, List, Union + +from typing import Callable, Union import pandas as pd from wasabi import msg @@ -10,11 +11,11 @@ def n_sents_score( - n_sents: 
Union[int, List[int]], # type: ignore + n_sents: Union[int, list[int]], # type: ignore apply_fn: Callable, # type: ignore dataset: str = "dane", split: str = "test", - score_fn: List[Union[str, Callable]] = ["token", "pos", "ents", "dep"], # noqa # type: ignore + score_fn: list[Union[str, Callable]] = ["token", "pos", "ents", "dep"], # noqa # type: ignore verbose: bool = True, **kwargs, # noqa ) -> pd.DataFrame: @@ -22,7 +23,7 @@ def n_sents_score( sentences. Args: - n_sents (Union[int, List[int]]): Number of sentences which the performance + n_sents (Union[int, list[int]]): Number of sentences which the performance should be applied to. apply_fn (Callable): A wrapper function for the model you wish to score. The model should take in a spacy Example and output a tagged version of it. @@ -30,7 +31,7 @@ def n_sents_score( options include "dane". Defaults to "dane". split (str, optional): Which splits of the dataset should be used. Possible options include "train", "dev", "test", "all". Defaults to "test". - score_fn (List[Union[str, Callable]], optional): A scoring function which takes + score_fn (list[Union[str, Callable]], optional): A scoring function which takes in a list of examples and return a dictionary of the form {"score_name": score}. Four potiential strings are valid. "ents" for measuring the performance of entity spans. "pos" for measuring the performance of diff --git a/src/dacy/score/score.py b/src/dacy/score/score.py index cfa3e320..bc960c10 100644 --- a/src/dacy/score/score.py +++ b/src/dacy/score/score.py @@ -1,10 +1,12 @@ """This includes function for scoring models applied to a SpaCy corpus.""" + from __future__ import annotations +from collections.abc import Iterable from copy import copy from functools import partial from time import time # type: ignore -from typing import Callable, Iterable +from typing import Callable import pandas as pd from spacy.language import Language @@ -63,7 +65,7 @@ def score( # noqa to score. 
The model should take in a list of spacy Examples (Iterable[Example]) and output a tagged version of it (Iterable[Example]). A SpaCy pipeline (Language) can be provided as is. - score_fn (List[Union[Callable[[Iterable[Example]], dict], str]], optional): A + score_fn (list[Union[Callable[[Iterable[Example]], dict], str]], optional): A scoring function which takes in a list of examples (Iterable[Example]) and return a dictionary of performance scores. Four potiential strings are valid. "ents" for measuring the performance of entity spans. "pos" for @@ -72,7 +74,7 @@ def score( # noqa "dep" for measuring the performance of dependency parsing. "nlp" for measuring the performance of all components in the specified nlp pipeline. Defaults to ["token", "pos", "ents", "dep"]. - augmenters (List[Callable[[Language, Example], Iterable[Example]]], optional): A + augmenters (list[Callable[[Language, Example], Iterable[Example]]], optional): A spaCy style augmenters which should be applied to the corpus or a list thereof. defaults to [], indicating no augmenters. 
k (int, optional): Number of times it should run the augmentation and test the @@ -163,7 +165,7 @@ def __score(augmenter): # noqa: ANN001 # and collapse list to dict for key in scores: # type: ignore - scores[key] = [s[key] if key in s else None for s in scores_ls] # type: ignore + scores[key] = [s.get(key, None) for s in scores_ls] # type: ignore scores["k"] = list(range(k)) # type: ignore diff --git a/src/dacy/sentiment/wrapped_models.py b/src/dacy/sentiment/wrapped_models.py index d4043404..67110532 100644 --- a/src/dacy/sentiment/wrapped_models.py +++ b/src/dacy/sentiment/wrapped_models.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Optional +from typing import Callable, Optional from warnings import warn from spacy.lang.da import Danish @@ -132,12 +132,12 @@ def make_emotion_transformer( nlp: Language, name: str, - model: Model[List[Doc], FullTransformerBatch], - set_extra_annotations: Callable[[List[Doc], FullTransformerBatch], None], # type: ignore + model: Model[list[Doc], FullTransformerBatch], + set_extra_annotations: Callable[[list[Doc], FullTransformerBatch], None], # type: ignore max_batch_items: int, doc_extension_trf_data: str, doc_extension_prediction: str, - labels: List[str], # type: ignore + labels: list[str], # type: ignore ) -> SequenceClassificationTransformer: if not Doc.has_extension("emotionally_laden"): warn( diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 63558a7d..88ca2ffd 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,9 +1,10 @@ -import dacy -from dacy.datasets import dane, female_names, male_names, muslim_names from spacy.lang.da import Danish from spacy.training import Example from spacy.training.corpus import Corpus +import dacy +from dacy.datasets import dane, female_names, male_names, muslim_names + def test_dane(): train, dev, test = dane(open_unverified_connection=True) # type: ignore diff --git a/tests/test_download.py b/tests/test_download.py index 32ae2f71..fe16ebf6 100644 --- 
a/tests/test_download.py +++ b/tests/test_download.py @@ -1,5 +1,6 @@ -import dacy import pytest + +import dacy from dacy.load import load diff --git a/tests/test_hate_speech.py b/tests/test_hate_speech.py index cec26d8c..09ca6f95 100644 --- a/tests/test_hate_speech.py +++ b/tests/test_hate_speech.py @@ -1,6 +1,7 @@ -import dacy import spacy +import dacy + def test_add_hate_speech_detection(): nlp = spacy.blank("da") diff --git a/tests/test_score.py b/tests/test_score.py index 8d1ce467..54510535 100644 --- a/tests/test_score.py +++ b/tests/test_score.py @@ -1,9 +1,10 @@ import pandas as pd -from dacy.datasets import dane -from dacy.score import n_sents_score, score from spacy.lang.da import Danish from spacy.training.augment import create_lower_casing_augmenter +from dacy.datasets import dane +from dacy.score import n_sents_score, score + def test_score(): nlp = Danish() diff --git a/tests/test_sentiment.py b/tests/test_sentiment.py index 0777c92c..23a5bfe7 100644 --- a/tests/test_sentiment.py +++ b/tests/test_sentiment.py @@ -1,6 +1,7 @@ -import dacy import spacy +import dacy + def test_add_subjectivity(): nlp = spacy.blank("da") diff --git a/training/main/scripts/add_readme_metadata.py b/training/main/scripts/add_readme_metadata.py index cf810dca..9d10a664 100644 --- a/training/main/scripts/add_readme_metadata.py +++ b/training/main/scripts/add_readme_metadata.py @@ -5,9 +5,11 @@ """ +from __future__ import annotations + import json from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional, Union import typer import yaml @@ -16,7 +18,7 @@ TEXT_CLASSIFICATION_COMPONENTS = ["textcat", "textcat_multilabel"] -def _create_model_card(repo_name: str, repo_dir: Path) -> Dict[str, Any]: +def _create_model_card(repo_name: str, repo_dir: Path) -> dict[str, Any]: meta_path = repo_dir / "meta.json" with meta_path.open("r", encoding="utf-8") as f: data = json.load(f) @@ -86,10 +88,10 @@ def _create_model_card(repo_name: 
str, repo_dir: Path) -> Dict[str, Any]: def _insert_value( - metadata: Dict[str, Any], + metadata: dict[str, Any], name: str, value: Optional[Any], -) -> Dict[str, Any]: +) -> dict[str, Any]: if value is None or value == "": return metadata metadata[name] = value @@ -97,10 +99,10 @@ def _insert_value( def _insert_values_as_list( - metadata: Dict[str, Any], + metadata: dict[str, Any], name: str, values: Optional[Any], -) -> Dict[str, List[Any]]: +) -> dict[str, list[Any]]: if values is None: return metadata if isinstance(values, str): @@ -111,7 +113,7 @@ def _insert_values_as_list( return metadata -def _create_metric(name: str, t: str, value: float) -> Dict[str, Union[str, float]]: +def _create_metric(name: str, t: str, value: float) -> dict[str, Union[str, float]]: return {"name": name, "type": t, "value": value} @@ -120,14 +122,14 @@ def _create_p_r_f_list( precision: float, recall: float, f_score: float, -) -> List[Dict[str, Union[str, float]]]: +) -> list[dict[str, Union[str, float]]]: precision = _create_metric(f"{metric_name} Precision", "precision", precision) # type: ignore recall = _create_metric(f"{metric_name} Recall", "recall", recall) # type: ignore f_score = _create_metric(f"{metric_name} F Score", "f_score", f_score) # type: ignore return [precision, recall, f_score] # type: ignore -def _create_model_index(repo_name: str, data: Dict[str, Any]) -> List[Dict[str, Any]]: +def _create_model_index(repo_name: str, data: dict[str, Any]) -> list[dict[str, Any]]: # TODO: add some more metrics here model_index = {"name": repo_name} results = [] diff --git a/training/main/scripts/combine.py b/training/main/scripts/combine.py index ed4774f2..9b46c196 100644 --- a/training/main/scripts/combine.py +++ b/training/main/scripts/combine.py @@ -7,7 +7,6 @@ import json from collections import defaultdict from pathlib import Path -from typing import Dict, List import spacy from conllu import parse @@ -142,7 +141,7 @@ def combine_docs(cdt_sentences, ddt_dane): 
sent_id_to_doc_instance[doc._.sent_id] = doc # combine documents - doc_to_be_created: Dict[str, List[str]] = {} + doc_to_be_created: dict[str, list[str]] = {} sent_id_to_sent = {} for sent in cdt_sentences: sent_id = sent.metadata["sent_id"] diff --git a/training/main/scripts/create_kb.py b/training/main/scripts/create_kb.py index b766aa01..4e734ecc 100644 --- a/training/main/scripts/create_kb.py +++ b/training/main/scripts/create_kb.py @@ -5,7 +5,6 @@ import ssl from collections import defaultdict from pathlib import Path -from typing import List import numpy as np import spacy @@ -24,7 +23,7 @@ def main( trf_name: str, save_path_kb: Path = project_path / "assets/knowledge_bases/knowledge_base.kb", - langs_to_fetch: List[str] = ["da", "en"], + langs_to_fetch: list[str] = ["da", "en"], ): """Step 1: create the Knowledge Base in spaCy and write it to file""" spacy.require_gpu() # type: ignore @@ -131,7 +130,7 @@ def _fetch_wikidata_description(qid: str): return item.description.get("da", item.description.get("en", "")) # type: ignore -def _fetch_wikidata_aliases(qid, langs: List[str] = ["da", "en"]): +def _fetch_wikidata_aliases(qid, langs: list[str] = ["da", "en"]): """ Fetch the aliases of a Wikidata item. """ @@ -148,7 +147,7 @@ def _fetch_wikidata_aliases(qid, langs: List[str] = ["da", "en"]): return aliases -def _load_ents_from_data(nlp: Language, splits: List[str] = ["dev", "train"]): +def _load_ents_from_data(nlp: Language, splits: list[str] = ["dev", "train"]): """ Load in the data from the training set. """