Skip to content

Commit

Permalink
CU-8694vte2g 1.12 depr removal (CogStack#454)
Browse files Browse the repository at this point in the history
* CU-8694vte2g: Remove CDB.add_concept method

* CU-8694vte2g: Remove unused import (deprecated decorator)

* CU-8694vte2g: Remove CAT.get_spacy_nlp method

* CU-8694vte2g: Remove CAT.train_supervised method

* CU-8694vte2g: Remove CAT multiprocessing methods

* CU-8694vte2g: Remove MetaCAT.train method

* CU-8694vte2g: Remove medcat.utils.ner.helper.deid_text method

* CU-8694vte2g: Remove use of deprecated method

* CU-8694vte2g: Add back removed deprecation import
  • Loading branch information
mart-r authored Jun 19, 2024
1 parent 1c3628d commit 7058ec4
Show file tree
Hide file tree
Showing 6 changed files with 11 additions and 187 deletions.
85 changes: 0 additions & 85 deletions medcat/cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from datetime import date
from tqdm.autonotebook import tqdm, trange
from spacy.tokens import Span, Doc, Token
from spacy.language import Language
import humanfriendly

from medcat import __version__
Expand All @@ -37,7 +36,6 @@
from medcat.utils.meta_cat.data_utils import json_to_fake_spacy
from medcat.config import Config
from medcat.vocab import Vocab
from medcat.utils.decorators import deprecated
from medcat.ner.transformers_ner import TransformersNER
from medcat.utils.saving.serializer import SPECIALITY_NAMES, ONE2MANY
from medcat.utils.saving.envsnapshot import get_environment_info, ENV_SNAPSHOT_FILE_NAME
Expand Down Expand Up @@ -147,16 +145,6 @@ def _create_pipeline(self, config: Config):
# Set max document length
self.pipe.spacy_nlp.max_length = config.preprocessing.max_document_length

@deprecated(message="Replaced with cat.pipe.spacy_nlp.",
depr_version=(1, 2, 7), removal_version=(1, 12, 0))
def get_spacy_nlp(self) -> Language:
"""Returns the spacy pipeline with MedCAT
Returns:
Language: The spacy Language being used.
"""
return self.pipe.spacy_nlp

def get_hash(self, force_recalc: bool = False) -> str:
"""Will not be a deep hash but will try to catch all the changing parts during training.
Expand Down Expand Up @@ -772,43 +760,6 @@ def add_and_train_concept(self,
for _cui in cuis:
self.linker.context_model.train(cui=_cui, entity=spacy_entity, doc=spacy_doc, negative=True) # type: ignore

@deprecated(message="Use train_supervised_from_json to train based on data "
"loaded from a json file",
depr_version=(1, 8, 0), removal_version=(1, 12, 0))
def train_supervised(self,
data_path: str,
reset_cui_count: bool = False,
nepochs: int = 1,
print_stats: int = 0,
use_filters: bool = False,
terminate_last: bool = False,
use_overlaps: bool = False,
use_cui_doc_limit: bool = False,
test_size: int = 0,
devalue_others: bool = False,
use_groups: bool = False,
never_terminate: bool = False,
train_from_false_positives: bool = False,
extra_cui_filter: Optional[Set] = None,
retain_extra_cui_filter: bool = False,
checkpoint: Optional[Checkpoint] = None,
retain_filters: bool = False,
is_resumed: bool = False) -> Tuple:
"""Train supervised by reading data from a json file.
Refer to `train_supervised_from_json` and/or `train_supervised_raw`
for further details.
# noqa: DAR101
# noqa: DAR201
"""
return self.train_supervised_from_json(data_path, reset_cui_count, nepochs,
print_stats, use_filters, terminate_last,
use_overlaps, use_cui_doc_limit, test_size,
devalue_others, use_groups, never_terminate,
train_from_false_positives, extra_cui_filter,
retain_extra_cui_filter, checkpoint,
retain_filters, is_resumed)

def train_supervised_from_json(self,
data_path: str,
Expand Down Expand Up @@ -1274,26 +1225,6 @@ def _save_docs_to_file(self, docs: Iterable, annotated_ids: List[str], save_dir_
pickle.dump((annotated_ids, part_counter), open(annotated_ids_path, 'wb'))
return part_counter

@deprecated(message="Use `multiprocessing_batch_char_size` instead",
depr_version=(1, 10, 0), removal_version=(1, 12, 0))
def multiprocessing(self,
data: Union[List[Tuple], Iterable[Tuple]],
nproc: int = 2,
batch_size_chars: int = 5000 * 1000,
only_cui: bool = False,
addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'],
separate_nn_components: bool = True,
out_split_size_chars: Optional[int] = None,
save_dir_path: str = os.path.abspath(os.getcwd()),
min_free_memory=0.1) -> Dict:
return self.multiprocessing_batch_char_size(data=data, nproc=nproc,
batch_size_chars=batch_size_chars,
only_cui=only_cui, addl_info=addl_info,
separate_nn_components=separate_nn_components,
out_split_size_chars=out_split_size_chars,
save_dir_path=save_dir_path,
min_free_memory=min_free_memory)

def multiprocessing_batch_char_size(self,
data: Union[List[Tuple], Iterable[Tuple]],
nproc: int = 2,
Expand Down Expand Up @@ -1548,22 +1479,6 @@ def _multiprocessing_batch(self,

return docs

@deprecated(message="Use `multiprocessing_batch_docs_size` instead",
depr_version=(1, 10, 0), removal_version=(1, 12, 0))
def multiprocessing_pipe(self, in_data: Union[List[Tuple], Iterable[Tuple]],
nproc: Optional[int] = None,
batch_size: Optional[int] = None,
only_cui: bool = False,
addl_info: List[str] = [],
return_dict: bool = True,
batch_factor: int = 2) -> Union[List[Tuple], Dict]:
return self.multiprocessing_batch_docs_size(in_data=in_data, nproc=nproc,
batch_size=batch_size,
only_cui=only_cui,
addl_info=addl_info,
return_dict=return_dict,
batch_factor=batch_factor)

def multiprocessing_batch_docs_size(self,
in_data: Union[List[Tuple], Iterable[Tuple]],
nproc: Optional[int] = None,
Expand Down
41 changes: 1 addition & 40 deletions medcat/cdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from medcat.utils.hasher import Hasher
from medcat.utils.matutils import unitvec
from medcat.utils.ml_utils import get_lr_linking
from medcat.utils.decorators import deprecated
from medcat.config import Config, workers
from medcat.utils.decorators import deprecated
from medcat.utils.saving.serializer import CDBSerializer
from medcat.utils.config_utils import get_and_del_weighted_average_from_config
from medcat.utils.config_utils import default_weighted_average
Expand Down Expand Up @@ -252,45 +252,6 @@ def add_names(self, cui: str, names: Dict[str, Dict], name_status: str = 'A', fu

self._add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, type_ids=set(), description='', full_build=full_build)

@deprecated("Use `cdb._add_concept` as this will be removed in a future release.",
depr_version=(1, 10, 0), removal_version=(1, 12, 0))
def add_concept(self,
cui: str,
names: Dict[str, Dict],
ontologies: Set[str],
name_status: str,
type_ids: Set[str],
description: str,
full_build: bool = False) -> None:
"""
Deprecated: Use `cdb._add_concept` as this will be removed in a future release.
Add a concept to internal Concept Database (CDB). Depending on what you are providing
this will add a large number of properties for each concept.
Args:
cui (str):
Concept ID or unique identifier in this database, all concepts that have
the same CUI will be merged internally.
names (Dict[str, Dict]):
Names for this concept, or the value that if found in free text can be linked to this concept.
Names is a dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}`
Names should be generated by helper function 'medcat.preprocessing.cleaners.prepare_name'
ontologies (Set[str]):
ontologies in which the concept exists (e.g. SNOMEDCT, HPO)
name_status (str):
One of `P`, `N`, `A`
type_ids (Set[str]):
Semantic type identifier (have a look at TUIs in UMLS or SNOMED-CT)
description (str):
Description of this concept.
full_build (bool):
If True, the dictionary self.addl_info will also be populated. It contains a lot of extra information
about concepts, but can be very memory consuming. This is not necessary
for normal functioning of MedCAT (Default Value `False`).
"""
self._add_concept(cui, names, ontologies, name_status, type_ids, description, full_build)

def _add_concept(self,
cui: str,
names: Dict[str, Dict],
Expand Down
21 changes: 0 additions & 21 deletions medcat/meta_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from medcat.pipeline.pipe_runner import PipeRunner
from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBase
from medcat.utils.meta_cat.data_utils import Doc as FakeDoc
from medcat.utils.decorators import deprecated
from peft import get_peft_model, LoraConfig, TaskType

# It should be safe to do this always, as all other multiprocessing
Expand Down Expand Up @@ -121,26 +120,6 @@ def get_hash(self) -> str:
hasher.update(self.config.get_hash())
return hasher.hexdigest()

@deprecated(message="Use `train_from_json` or `train_raw` instead",
depr_version=(1, 8, 0), removal_version=(1, 12, 0))
def train(self, json_path: Union[str, list], save_dir_path: Optional[str] = None, data_oversampled: Optional[list] = None) -> Dict:
"""Train or continue training a model given a json_path containing a MedCATtrainer export. It will
continue training if an existing model is loaded or start new training if the model is blank/new.
Args:
json_path (Union[str, list]):
Path/Paths to a MedCATtrainer export containing the meta_annotations we want to train for.
save_dir_path (Optional[str]):
In case we have auto_save_model enabled (meaning the best model will be saved during training)
we need to set a save path. Defaults to `None`.
data_oversampled (Optional[list]):
If oversampling has been performed, the oversampled data is passed in via this parameter.
Returns:
Dict: The resulting report.
"""
return self.train_from_json(json_path, save_dir_path, data_oversampled=data_oversampled)

def train_from_json(self, json_path: Union[str, list], save_dir_path: Optional[str] = None,
data_oversampled: Optional[list] = None) -> Dict:
"""Train or continue training a model given a json_path containing a MedCATtrainer export. It will
Expand Down
2 changes: 1 addition & 1 deletion medcat/utils/ner/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .metrics import metrics
from .helpers import deid_text, make_or_update_cdb
from .helpers import make_or_update_cdb
12 changes: 9 additions & 3 deletions medcat/utils/ner/deid.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
from medcat.cat import CAT
from medcat.utils.ner.model import NerModel

from medcat.utils.ner.helpers import _deid_text as deid_text, replace_entities_in_text
from medcat.utils.ner.helpers import replace_entities_in_text


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -69,15 +69,21 @@ def train(self, json_path: Union[str, list, None],
def deid_text(self, text: str, redact: bool = False) -> str:
"""Deidentify text and potentially redact information.
De-identified text.
If redaction is enabled, identifiable entities will be
replaced with stars (e.g. `*****`).
Otherwise, the replacement will be the CUI or, in other words,
the type of information that was hidden (e.g. [PATIENT]).
Args:
text (str): The text to deidentify.
redact (bool): Whether to redact the information.
Returns:
str: The deidentified text.
"""
self.cat.get_entities
return deid_text(self.cat, text, redact=redact)
entities = self.cat.get_entities(text)['entities']
return replace_entities_in_text(text, entities, self.cat.cdb.get_name, redact=redact)

def deid_multi_texts(self,
texts: Union[Iterable[str], Iterable[Tuple]],
Expand Down
37 changes: 0 additions & 37 deletions medcat/utils/ner/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,6 @@
from medcat.utils.data_utils import count_annotations
from medcat.cdb import CDB

from medcat.utils.decorators import deprecated


# For now, we will keep this method separate from the above class
# This is so that we wouldn't need to create a throwaway object
# when calling the method from .helpers where it used to be.
# After the deprecated method in .helpers is removed, we can
# move this to a proper class method.
def _deid_text(cat, text: str, redact: bool = False) -> str:
"""De-identify text.
De-identified text.
If redaction is enabled, identifiable entities will be
replaced with stars (e.g. `*****`).
Otherwise, the replacement will be the CUI or, in other words,
the type of information that was hidden (e.g. [PATIENT]).
Args:
cat (CAT): The CAT object to use for deid.
text (str): The input document.
redact (bool): Whether to redact. Defaults to False.
Returns:
str: The de-identified document.
"""
entities = cat.get_entities(text)['entities']
return replace_entities_in_text(text, entities, cat.cdb.get_name, redact=redact)


def replace_entities_in_text(text: str,
entities: Dict,
Expand All @@ -45,14 +16,6 @@ def replace_entities_in_text(text: str,
return new_text


@deprecated("API now allows creating a DeId model (medcat.utils.ner.deid.DeIdModel). "
"It aims to simplify the usage of DeId models. "
"The use of this model is encouraged over the use of this method.",
depr_version=(1, 8, 0), removal_version=(1, 12, 0))
def deid_text(*args, **kwargs) -> str:
return _deid_text(*args, **kwargs)


def make_or_update_cdb(json_path: str, cdb: Optional[CDB] = None,
min_count: int = 0) -> CDB:
"""Creates a new CDB or updates an existing one with new
Expand Down

0 comments on commit 7058ec4

Please sign in to comment.