From 7058ec44fd01f9b175780e30912054479cb75638 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Wed, 19 Jun 2024 16:30:45 +0100 Subject: [PATCH] CU-8694vte2g 1.12 depr removal (#454) * CU-8694vte2g: Remove CDB.add_concept method * CU-8694vte2g: Remove unused import (deprecated decorator) * CU-8694vte2g: Remove CAT.get_spacy_nlp method * CU-8694vte2g: Remove CAT.train_supervised method * CU-8694vte2g: Remove CAT multiprocessing methods * CU-8694vte2g: Remove MetaCAT.train method * CU-8694vte2g: Remove medcat.utils.ner.helper.deid_text method * CU-8694vte2g: Remove use of deprecated method * CU-8694vte2g: Add back removed deprecation import --- medcat/cat.py | 85 ------------------------------------ medcat/cdb.py | 41 +---------------- medcat/meta_cat.py | 21 --------- medcat/utils/ner/__init__.py | 2 +- medcat/utils/ner/deid.py | 12 +++-- medcat/utils/ner/helpers.py | 37 ---------------- 6 files changed, 11 insertions(+), 187 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index d11a534e6..2d83ccec5 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -16,7 +16,6 @@ from datetime import date from tqdm.autonotebook import tqdm, trange from spacy.tokens import Span, Doc, Token -from spacy.language import Language import humanfriendly from medcat import __version__ @@ -37,7 +36,6 @@ from medcat.utils.meta_cat.data_utils import json_to_fake_spacy from medcat.config import Config from medcat.vocab import Vocab -from medcat.utils.decorators import deprecated from medcat.ner.transformers_ner import TransformersNER from medcat.utils.saving.serializer import SPECIALITY_NAMES, ONE2MANY from medcat.utils.saving.envsnapshot import get_environment_info, ENV_SNAPSHOT_FILE_NAME @@ -147,16 +145,6 @@ def _create_pipeline(self, config: Config): # Set max document length self.pipe.spacy_nlp.max_length = config.preprocessing.max_document_length - @deprecated(message="Replaced with cat.pipe.spacy_nlp.", - depr_version=(1, 2, 7), removal_version=(1, 12, 0)) - def get_spacy_nlp(self) 
-> Language: - """Returns the spacy pipeline with MedCAT - - Returns: - Language: The spacy Language being used. - """ - return self.pipe.spacy_nlp - def get_hash(self, force_recalc: bool = False) -> str: """Will not be a deep hash but will try to catch all the changing parts during training. @@ -772,43 +760,6 @@ def add_and_train_concept(self, for _cui in cuis: self.linker.context_model.train(cui=_cui, entity=spacy_entity, doc=spacy_doc, negative=True) # type: ignore - @deprecated(message="Use train_supervised_from_json to train based on data " - "loaded from a json file", - depr_version=(1, 8, 0), removal_version=(1, 12, 0)) - def train_supervised(self, - data_path: str, - reset_cui_count: bool = False, - nepochs: int = 1, - print_stats: int = 0, - use_filters: bool = False, - terminate_last: bool = False, - use_overlaps: bool = False, - use_cui_doc_limit: bool = False, - test_size: int = 0, - devalue_others: bool = False, - use_groups: bool = False, - never_terminate: bool = False, - train_from_false_positives: bool = False, - extra_cui_filter: Optional[Set] = None, - retain_extra_cui_filter: bool = False, - checkpoint: Optional[Checkpoint] = None, - retain_filters: bool = False, - is_resumed: bool = False) -> Tuple: - """Train supervised by reading data from a json file. - - Refer to `train_supervvised_from_json` and/or `train_supervised_raw` - for further details. 
- - # noqa: DAR101 - # noqa: DAR201 - """ - return self.train_supervised_from_json(data_path, reset_cui_count, nepochs, - print_stats, use_filters, terminate_last, - use_overlaps, use_cui_doc_limit, test_size, - devalue_others, use_groups, never_terminate, - train_from_false_positives, extra_cui_filter, - retain_extra_cui_filter, checkpoint, - retain_filters, is_resumed) def train_supervised_from_json(self, data_path: str, @@ -1274,26 +1225,6 @@ def _save_docs_to_file(self, docs: Iterable, annotated_ids: List[str], save_dir_ pickle.dump((annotated_ids, part_counter), open(annotated_ids_path, 'wb')) return part_counter - @deprecated(message="Use `multiprocessing_batch_char_size` instead", - depr_version=(1, 10, 0), removal_version=(1, 12, 0)) - def multiprocessing(self, - data: Union[List[Tuple], Iterable[Tuple]], - nproc: int = 2, - batch_size_chars: int = 5000 * 1000, - only_cui: bool = False, - addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], - separate_nn_components: bool = True, - out_split_size_chars: Optional[int] = None, - save_dir_path: str = os.path.abspath(os.getcwd()), - min_free_memory=0.1) -> Dict: - return self.multiprocessing_batch_char_size(data=data, nproc=nproc, - batch_size_chars=batch_size_chars, - only_cui=only_cui, addl_info=addl_info, - separate_nn_components=separate_nn_components, - out_split_size_chars=out_split_size_chars, - save_dir_path=save_dir_path, - min_free_memory=min_free_memory) - def multiprocessing_batch_char_size(self, data: Union[List[Tuple], Iterable[Tuple]], nproc: int = 2, @@ -1548,22 +1479,6 @@ def _multiprocessing_batch(self, return docs - @deprecated(message="Use `multiprocessing_batch_docs_size` instead", - depr_version=(1, 10, 0), removal_version=(1, 12, 0)) - def multiprocessing_pipe(self, in_data: Union[List[Tuple], Iterable[Tuple]], - nproc: Optional[int] = None, - batch_size: Optional[int] = None, - only_cui: bool = False, - addl_info: List[str] = [], - return_dict: bool = True, - 
batch_factor: int = 2) -> Union[List[Tuple], Dict]: - return self.multiprocessing_batch_docs_size(in_data=in_data, nproc=nproc, - batch_size=batch_size, - only_cui=only_cui, - addl_info=addl_info, - return_dict=return_dict, - batch_factor=batch_factor) - def multiprocessing_batch_docs_size(self, in_data: Union[List[Tuple], Iterable[Tuple]], nproc: Optional[int] = None, diff --git a/medcat/cdb.py b/medcat/cdb.py index 2047681f4..e63843364 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -12,8 +12,8 @@ from medcat.utils.hasher import Hasher from medcat.utils.matutils import unitvec from medcat.utils.ml_utils import get_lr_linking -from medcat.utils.decorators import deprecated from medcat.config import Config, workers +from medcat.utils.decorators import deprecated from medcat.utils.saving.serializer import CDBSerializer from medcat.utils.config_utils import get_and_del_weighted_average_from_config from medcat.utils.config_utils import default_weighted_average @@ -252,45 +252,6 @@ def add_names(self, cui: str, names: Dict[str, Dict], name_status: str = 'A', fu self._add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, type_ids=set(), description='', full_build=full_build) - @deprecated("Use `cdb._add_concept` as this will be removed in a future release.", - depr_version=(1, 10, 0), removal_version=(1, 12, 0)) - def add_concept(self, - cui: str, - names: Dict[str, Dict], - ontologies: Set[str], - name_status: str, - type_ids: Set[str], - description: str, - full_build: bool = False) -> None: - """ - Deprecated: Use `cdb._add_concept` as this will be removed in a future release. - - Add a concept to internal Concept Database (CDB). Depending on what you are providing - this will add a large number of properties for each concept. - - Args: - cui (str): - Concept ID or unique identifier in this database, all concepts that have - the same CUI will be merged internally. 
- names (Dict[str, Dict]): - Names for this concept, or the value that if found in free text can be linked to this concept. - Names is a dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}` - Names should be generated by helper function 'medcat.preprocessing.cleaners.prepare_name' - ontologies (Set[str]): - ontologies in which the concept exists (e.g. SNOMEDCT, HPO) - name_status (str): - One of `P`, `N`, `A` - type_ids (Set[str]): - Semantic type identifier (have a look at TUIs in UMLS or SNOMED-CT) - description (str): - Description of this concept. - full_build (bool): - If True the dictionary self.addl_info will also be populated, contains a lot of extra information - about concepts, but can be very memory consuming. This is not necessary - for normal functioning of MedCAT (Default Value `False`). - """ - self._add_concept(cui, names, ontologies, name_status, type_ids, description, full_build) - def _add_concept(self, cui: str, names: Dict[str, Dict], diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py index 7d834f177..349b848ed 100644 --- a/medcat/meta_cat.py +++ b/medcat/meta_cat.py @@ -15,7 +15,6 @@ from medcat.pipeline.pipe_runner import PipeRunner from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBase from medcat.utils.meta_cat.data_utils import Doc as FakeDoc -from medcat.utils.decorators import deprecated from peft import get_peft_model, LoraConfig, TaskType # It should be safe to do this always, as all other multiprocessing @@ -121,26 +120,6 @@ def get_hash(self) -> str: hasher.update(self.config.get_hash()) return hasher.hexdigest() - @deprecated(message="Use `train_from_json` or `train_raw` instead", - depr_version=(1, 8, 0), removal_version=(1, 12, 0)) - def train(self, json_path: Union[str, list], save_dir_path: Optional[str] = None, data_oversampled: Optional[list] = None) -> Dict: - """Train or continue training a model give a json_path containing a MedCATtrainer export. 
It will - continue training if an existing model is loaded or start new training if the model is blank/new. - - Args: - json_path (Union[str, list]): - Path/Paths to a MedCATtrainer export containing the meta_annotations we want to train for. - save_dir_path (Optional[str]): - In case we have aut_save_model (meaning during the training the best model will be saved) - we need to set a save path. Defaults to `None`. - data_oversampled (Optional[list]): - In case of oversampling being performed, the data will be passed in the parameter - - Returns: - Dict: The resulting report. - """ - return self.train_from_json(json_path, save_dir_path, data_oversampled=data_oversampled) - def train_from_json(self, json_path: Union[str, list], save_dir_path: Optional[str] = None, data_oversampled: Optional[list] = None) -> Dict: """Train or continue training a model give a json_path containing a MedCATtrainer export. It will diff --git a/medcat/utils/ner/__init__.py b/medcat/utils/ner/__init__.py index 2657c7df7..5d296dc3a 100644 --- a/medcat/utils/ner/__init__.py +++ b/medcat/utils/ner/__init__.py @@ -1,2 +1,2 @@ from .metrics import metrics -from .helpers import deid_text, make_or_update_cdb +from .helpers import make_or_update_cdb diff --git a/medcat/utils/ner/deid.py b/medcat/utils/ner/deid.py index d71b52004..688bb1ea6 100644 --- a/medcat/utils/ner/deid.py +++ b/medcat/utils/ner/deid.py @@ -40,7 +40,7 @@ from medcat.cat import CAT from medcat.utils.ner.model import NerModel -from medcat.utils.ner.helpers import _deid_text as deid_text, replace_entities_in_text +from medcat.utils.ner.helpers import replace_entities_in_text logger = logging.getLogger(__name__) @@ -69,6 +69,12 @@ def train(self, json_path: Union[str, list, None], def deid_text(self, text: str, redact: bool = False) -> str: """Deidentify text and potentially redact information. + De-identified text. + If redaction is enabled, identifiable entities will be + replaced with stars (e.g `*****`).
+ Otherwise, the replacement will be the CUI or in other words, + the type of information that was hidden (e.g [PATIENT]). + Args: text (str): The text to deidentify. redact (bool): Whether to redact the information. @@ -76,8 +82,8 @@ def deid_text(self, text: str, redact: bool = False) -> str: Returns: str: The deidentified text. """ - self.cat.get_entities - return deid_text(self.cat, text, redact=redact) + entities = self.cat.get_entities(text)['entities'] + return replace_entities_in_text(text, entities, self.cat.cdb.get_name, redact=redact) def deid_multi_texts(self, texts: Union[Iterable[str], Iterable[Tuple]], diff --git a/medcat/utils/ner/helpers.py b/medcat/utils/ner/helpers.py index c4e6d5266..bea1e45ca 100644 --- a/medcat/utils/ner/helpers.py +++ b/medcat/utils/ner/helpers.py @@ -3,35 +3,6 @@ from medcat.utils.data_utils import count_annotations from medcat.cdb import CDB -from medcat.utils.decorators import deprecated - - -# For now, we will keep this method separate from the above class -# This is so that we wouldn't need to create a thorwaway object -# when calling the method from .helpers where it used to be. -# After the deprecated method in .helpers is removed, we can -# move this to a proper class method. -def _deid_text(cat, text: str, redact: bool = False) -> str: - """De-identify text. - - De-identified text. - If redaction is enabled, identifiable entities will be - replaced with starts (e.g `*****`). - Otherwise, the replacement will be the CUI or in other words, - the type of information that was hidden (e.g [PATIENT]). - - - Args: - cat (CAT): The CAT object to use for deid. - text (str): The input document. - redact (bool): Whether to redact. Defaults to False. - - Returns: - str: The de-identified document. 
- """ - entities = cat.get_entities(text)['entities'] - return replace_entities_in_text(text, entities, cat.cdb.get_name, redact=redact) - def replace_entities_in_text(text: str, entities: Dict, @@ -45,14 +16,6 @@ def replace_entities_in_text(text: str, return new_text -@deprecated("API now allows creating a DeId model (medcat.utils.ner.deid.DeIdModel). " - "It aims to simplify the usage of DeId models. " - "The use of this model is encouraged over the use of this method.", - depr_version=(1, 8, 0), removal_version=(1, 12, 0)) -def deid_text(*args, **kwargs) -> str: - return _deid_text(*args, **kwargs) - - def make_or_update_cdb(json_path: str, cdb: Optional[CDB] = None, min_count: int = 0) -> CDB: """Creates a new CDB or updates an existing one with new