From d3dd0f52d6ffc78884c05ccbb866201e18df9d25 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 7 Feb 2022 13:01:26 -0500 Subject: [PATCH 001/185] Collect EFO alternative terms/synonyms Fix default value for deprecated term exclusion --- term_collector.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/term_collector.py b/term_collector.py index cca29fa..fd96579 100644 --- a/term_collector.py +++ b/term_collector.py @@ -15,7 +15,7 @@ def __init__(self, ontology_iri): self.logger = onto_utils.get_logger(__name__, logging.INFO) self.ontology_iri = ontology_iri - def get_ontology_terms(self, base_iris=(), use_reasoning=False, exclude_deprecated=True, include_individuals=False): + def get_ontology_terms(self, base_iris=(), use_reasoning=False, exclude_deprecated=False, include_individuals=False): """ Collect the terms described in the ontology at the specified IRI :param base_iris: Limit ontology term collection to terms whose IRIs start with any IRI given in this tuple @@ -120,6 +120,8 @@ def _get_synonyms(self, ontology_term): synonyms.add(synonym) for nci_synonym in self._get_nci_synonyms(ontology_term): synonyms.add(nci_synonym) + for efo_alt_term in self._get_efo_alt_terms(ontology_term): + synonyms.add(efo_alt_term) self.logger.debug("Collected %i synonyms for %s", len(synonyms), ontology_term) return synonyms @@ -151,6 +153,15 @@ def _get_skos_pref_labels(self, ontology_term): self.logger.debug(err) return skos_labels + def _get_efo_alt_terms(self, ontology_term): + efo_alt_terms = [] + try: + for efo_alt_term in ontology_term.alternative_term: + efo_alt_terms.append(efo_alt_term) + except AttributeError as err: + self.logger.debug(err) + return efo_alt_terms + def _get_obo_exact_synonyms(self, ontology_term): """ Collect synonyms of the given term that are specified using the annotation property used by DOID, MONDO, EFO, From c26e97acbbd0d54be9514011de6d3d9298f93694 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 8 Feb 2022 12:19:45 -0500 Subject: [PATCH 002/185] Include all syns when mapping. Logging updates --- term_collector.py | 13 +++++++------ tfidf_mapper.py | 43 +++++++++++++++++++++++++------------------ 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/term_collector.py b/term_collector.py index fd96579..65b469e 100644 --- a/term_collector.py +++ b/term_collector.py @@ -32,8 +32,9 @@ def get_ontology_terms(self, base_iris=(), use_reasoning=False, exclude_deprecat ontology_terms = [] if len(base_iris) > 0: for iri in base_iris: + iri = iri.strip() query = iri + "*" - self.logger.info("...Collecting terms with IRIs starting in: " + iri) + self.logger.info("...collecting terms with IRIs starting in: " + iri) iris = list(default_world.search(iri=query)) ontology_terms.extend(self._get_ontology_terms(iris, ontology, exclude_deprecated)) else: @@ -103,10 +104,10 @@ def _get_labels(self, ontology_term): labels.add(skos_label) if len(labels) == 0: label_from_iri = onto_utils.label_from_iri(ontology_term.iri) - self.logger.info("Ontology term %s has no labels (rdfs:label or skos:prefLabel). " + self.logger.info("...ontology term %s has no labels (rdfs:label or skos:prefLabel). 
" "Using a label based on the term IRI: %s", ontology_term.iri, label_from_iri) labels.add(label_from_iri) - self.logger.debug("Collected %i labels and synonyms for %s", len(labels), ontology_term) + self.logger.debug("...collected %i labels and synonyms for %s", len(labels), ontology_term) return labels def _get_synonyms(self, ontology_term): @@ -122,7 +123,7 @@ def _get_synonyms(self, ontology_term): synonyms.add(nci_synonym) for efo_alt_term in self._get_efo_alt_terms(ontology_term): synonyms.add(efo_alt_term) - self.logger.debug("Collected %i synonyms for %s", len(synonyms), ontology_term) + self.logger.debug("...collected %i synonyms for %s", len(synonyms), ontology_term) return synonyms def _get_rdfs_labels(self, ontology_term): @@ -216,7 +217,7 @@ def _load_ontology(self, ontology_iri): ontology = get_ontology(ontology_iri).load() end = time.time() self._log_ontology_metrics(ontology) - self.logger.info("done (loading time: %.2fs)", end-start) + self.logger.info("...done (ontology loading time: %.2fs)", end-start) return ontology def _classify_ontology(self, ontology): @@ -229,7 +230,7 @@ def _classify_ontology(self, ontology): with ontology: # entailments will be added to this ontology sync_reasoner(infer_property_values=True) end = time.time() - self.logger.info("done (reasoning time: %.2fs)", end - start) + self.logger.info("...done (reasoning time: %.2fs)", end - start) def _log_ontology_metrics(self, ontology): self.logger.debug(" Ontology IRI: %s", ontology.base_iri) diff --git a/tfidf_mapper.py b/tfidf_mapper.py index c49dee3..7261c19 100644 --- a/tfidf_mapper.py +++ b/tfidf_mapper.py @@ -15,6 +15,7 @@ def __init__(self, target_ontology_terms): :param target_ontology_terms: Collection of ontology terms to be mapped against """ self.logger = onto_utils.get_logger(__name__, logging.INFO) + self.target_ontology_terms = target_ontology_terms self.target_labels, self.target_terms = self._get_target_labels_terms(target_ontology_terms) def map(self, source_terms, max_mappings=3, min_score=0.3): @@ -26,13 +27,14 @@ def map(self, source_terms, max_mappings=3, min_score=0.3): Default set to 0, so consider all candidates """ self.logger.info("Mapping %i source terms...", len(source_terms)) + self.logger.info("...against %i ontology terms (%i labels/synonyms)", len(self.target_ontology_terms), len(self.target_labels)) start = time.time() - source_terms = onto_utils.normalize_list(source_terms) - vectorizer = self._tokenize(source_terms, self.target_labels) - results_mtx = self._sparse_dot_top(vectorizer, source_terms, self.target_labels, min_score) + source_terms_norm = onto_utils.normalize_list(source_terms) + vectorizer = self._tokenize(source_terms_norm, self.target_labels) + results_mtx = self._sparse_dot_top(vectorizer, source_terms_norm, self.target_labels, min_score) results_df, term_graphs = self._get_mappings(results_mtx, max_mappings, source_terms, self.target_terms) end = time.time() - self.logger.info('done (mapping time: %.2fs seconds)', end-start) + self.logger.info("...done (mapping time: %.2fs seconds)", end-start) return results_df, term_graphs def _tokenize(self, source_terms, target_labels, analyzer='char_wb', n=3): @@ -53,30 +55,32 @@ def _tokenize(self, source_terms, target_labels, analyzer='char_wb', n=3): def _sparse_dot_top(self, vectorizer, source_terms, target_labels, min_score): src_mtx = vectorizer.fit_transform(source_terms).tocsr() tgt_mtx = vectorizer.fit_transform(target_labels).transpose().tocsr() + # 'ntop' specifies the maximum number of labels/synonyms 
that should be considered + # multiple labels/synonyms in the 'ntop' matches may be from the same ontology term return ct.awesome_cossim_topn(src_mtx, tgt_mtx, ntop=20, lower_bound=min_score) def _get_mappings(self, results_mtx, max_mappings, source_terms, target_terms): """ Build and return dataframe for mapping results along with term graphs for the obtained mappings """ coo_mtx = results_mtx.tocoo() - mapping_list = [] - mapping_graph_list = [] - last_source_string = "" - candidate_target_terms = set() + mappings = [] + mapped_term_graphs = [] + last_source_term = "" + top_mappings = set() for row, col, score in zip(coo_mtx.row, coo_mtx.col, coo_mtx.data): source_term = source_terms[row] onto_term = target_terms[col] - if source_term == last_source_string: - if len(candidate_target_terms) == max_mappings: + self.logger.debug("Source term: %s maps to %s (%f)", source_term, onto_term.label, score) + if source_term == last_source_term: + if len(top_mappings) == max_mappings: continue else: - last_source_string = source_term - candidate_target_terms.clear() - if onto_term.iri not in candidate_target_terms: - mapping = TermMapping(source_term, onto_term.label, onto_term.iri, onto_term.ontology_iri, score) - mapping_list.append(mapping) - mapping_graph_list.append(onto_term.graph().graph_dict()) - candidate_target_terms.add(onto_term.iri) - return TermMappingCollection(mapping_list).mappings_df(), mapping_graph_list + last_source_term = source_term + top_mappings.clear() + if onto_term.iri not in top_mappings: + mappings.append(TermMapping(source_term, onto_term.label, onto_term.iri, onto_term.ontology_iri, score)) + mapped_term_graphs.append(onto_term.graph().graph_dict()) + top_mappings.add(onto_term.iri) + return TermMappingCollection(mappings).mappings_df(), mapped_term_graphs def _get_target_labels_terms(self, ontology_terms): """Get lists of labels and terms to enable retrieving terms from their labels""" @@ -85,4 +89,7 @@ def _get_target_labels_terms(self, ontology_terms): for label in term.labels: target_labels.append(label) target_terms.append(term) + for synonym in term.synonyms: + target_labels.append(synonym) + target_terms.append(term) return target_labels, target_terms From f1e862e84fa6ea7fa737dfaef91d9f407d258cc4 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 9 Feb 2022 11:12:02 -0500 Subject: [PATCH 003/185] Move sources to dedicated folder --- text2term/__init__.py | 0 curie2iri.py => text2term/curie2iri.py | 13 ------------- text2term.py => text2term/main.py | 5 ++--- onto_utils.py => text2term/onto_utils.py | 0 term.py => text2term/term.py | 0 term_collector.py => text2term/term_collector.py | 0 term_graph.py => text2term/term_graph.py | 0 term_mapping.py => text2term/term_mapping.py | 0 tfidf_mapper.py => text2term/tfidf_mapper.py | 2 +- zooma_mapper.py => text2term/zooma_mapper.py | 0 10 files changed, 3 insertions(+), 17 deletions(-) create mode 100644 text2term/__init__.py rename curie2iri.py => text2term/curie2iri.py (85%) rename text2term.py => text2term/main.py (95%) rename onto_utils.py => text2term/onto_utils.py (100%) rename term.py => text2term/term.py (100%) rename term_collector.py => text2term/term_collector.py (100%) rename term_graph.py => text2term/term_graph.py (100%) rename term_mapping.py => text2term/term_mapping.py (100%) rename tfidf_mapper.py => text2term/tfidf_mapper.py (98%) rename zooma_mapper.py => text2term/zooma_mapper.py (100%) diff --git a/text2term/__init__.py b/text2term/__init__.py new file mode 100644 index 0000000..e69de29 
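As an aside (outside the patches themselves): the character-trigram TF-IDF matching that tfidf_mapper.py implements can be exercised standalone. A minimal sketch, assuming the scikit-learn and sparse_dot_topn releases pinned in requirements.txt; the source terms, target labels, and the 0.3 threshold below are invented examples:

```python
# Minimal sketch of the char-trigram TF-IDF + sparse_dot_topn matching used by
# TFIDFMapper; the term lists here are made-up examples.
from sklearn.feature_extraction.text import TfidfVectorizer
import sparse_dot_topn as ct

source_terms = ["asthma attack", "myocardial infarction"]
target_labels = ["asthma", "heart attack", "myocardial infarction"]

# Fit a single character-trigram vocabulary over both sets so the two
# matrices share columns, then keep the top matches above a cosine threshold.
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 3))
vectorizer.fit(source_terms + target_labels)
src_mtx = vectorizer.transform(source_terms).tocsr()
tgt_mtx = vectorizer.transform(target_labels).transpose().tocsr()
results_mtx = ct.awesome_cossim_topn(src_mtx, tgt_mtx, ntop=3, lower_bound=0.3)

for row, col in zip(*results_mtx.nonzero()):
    print(source_terms[row], "->", target_labels[col], round(results_mtx[row, col], 3))
```

Because TfidfVectorizer L2-normalizes its rows by default, the sparse dot product returned by awesome_cossim_topn is directly the cosine similarity that map() thresholds with min_score.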
diff --git a/curie2iri.py b/text2term/curie2iri.py similarity index 85% rename from curie2iri.py rename to text2term/curie2iri.py index 830d36f..146a000 100644 --- a/curie2iri.py +++ b/text2term/curie2iri.py @@ -1,7 +1,5 @@ -import datetime import logging import ssl -import sys import urllib.request from urllib.error import HTTPError import pandas as pd @@ -72,14 +70,3 @@ def get_iris_df_for_file(self, input_file, resolve_iri): iris_file = self.get_iris(onto_utils.parse_list_file(input_file), resolve_iri=resolve_iri) out_col_names = ['source_tag', 'target_iri', 'iri_resolves'] return pd.DataFrame(iris_file, columns=out_col_names) - - -if __name__ == "__main__": - tag2iri = OntoTag2Iri() - if len(sys.argv) > 1: - input_tag_list_file = sys.argv[1] - output_file = "tag2iri-" + datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") + ".csv" - output_df = tag2iri.get_iris_df_for_file(input_tag_list_file, resolve_iri=True) - output_df.to_csv(output_file, index=False) - else: - print("Provide input file with tags to convert to IRIs") diff --git a/text2term.py b/text2term/main.py similarity index 95% rename from text2term.py rename to text2term/main.py index 2217021..bb4933c 100644 --- a/text2term.py +++ b/text2term/main.py @@ -1,6 +1,5 @@ import argparse import datetime -import json import os import sys import onto_utils @@ -57,5 +56,5 @@ def get_arguments(): mapper = TFIDFMapper(onto_terms) mappings_df, term_graphs = mapper.map(source_terms, max_mappings=max_mappings, min_score=min_score) mappings_df.to_csv(output_file, index=False) - with open(output_file + "-term-graphs.json", 'w') as json_file: - json.dump(term_graphs, json_file, indent=2) + # with open(output_file + "-term-graphs.json", 'w') as json_file: + # json.dump(term_graphs, json_file, indent=2) diff --git a/onto_utils.py b/text2term/onto_utils.py similarity index 100% rename from onto_utils.py rename to text2term/onto_utils.py diff --git a/term.py b/text2term/term.py similarity index 100% rename from term.py rename to text2term/term.py diff --git a/term_collector.py b/text2term/term_collector.py similarity index 100% rename from term_collector.py rename to text2term/term_collector.py diff --git a/term_graph.py b/text2term/term_graph.py similarity index 100% rename from term_graph.py rename to text2term/term_graph.py diff --git a/term_mapping.py b/text2term/term_mapping.py similarity index 100% rename from term_mapping.py rename to text2term/term_mapping.py diff --git a/tfidf_mapper.py b/text2term/tfidf_mapper.py similarity index 98% rename from tfidf_mapper.py rename to text2term/tfidf_mapper.py index 7261c19..613fa32 100644 --- a/tfidf_mapper.py +++ b/text2term/tfidf_mapper.py @@ -57,7 +57,7 @@ def _sparse_dot_top(self, vectorizer, source_terms, target_labels, min_score): tgt_mtx = vectorizer.fit_transform(target_labels).transpose().tocsr() # 'ntop' specifies the maximum number of labels/synonyms that should be considered # multiple labels/synonyms in the 'ntop' matches may be from the same ontology term - return ct.awesome_cossim_topn(src_mtx, tgt_mtx, ntop=20, lower_bound=min_score) + return ct.awesome_cossim_topn(src_mtx, tgt_mtx, ntop=15, lower_bound=min_score) def _get_mappings(self, results_mtx, max_mappings, source_terms, target_terms): """ Build and return dataframe for mapping results along with term graphs for the obtained mappings """ diff --git a/zooma_mapper.py b/text2term/zooma_mapper.py similarity index 100% rename from zooma_mapper.py rename to text2term/zooma_mapper.py From bd85a34fa763436376d06550644161bf4a9fa60b 
Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 9 Feb 2022 11:13:01 -0500 Subject: [PATCH 004/185] Update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index d54e365..c39619b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2021 ccb-hms +Copyright (c) 2022 Center for Computational Biomedicine, Harvard Medical School Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 4ac0f8ae446d909928f426cc448875c0c67bca2b Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 9 Feb 2022 11:13:19 -0500 Subject: [PATCH 005/185] Update setup.py --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2bb4d93..cee2e93 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ description='A tool for mapping (uncontrolled) terms to ontology terms to facilitate semantic integration', long_description=long_description, long_description_content_type='text/markdown', + author='Center for Computational Biomedicine, Harvard Medical School', classifiers=[ 'Development Status :: 3 - Alpha', 'License :: OSI Approved :: MIT License', @@ -26,5 +27,5 @@ 'Programming Language :: Python', 'Programming Language :: Python :: 3', 'Topic :: Scientific/Engineering' - ], + ] ) From 5fcbf873a9f955b8e9ab859cc297cbcc7efb1a10 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 11 Feb 2022 12:47:51 -0500 Subject: [PATCH 006/185] Add pip install to readme. Update tool instructions --- README.md | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index f407474..5166ce4 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,17 @@ A tool for mapping (uncontrolled) terms to ontology terms to facilitate semantic ## Usage -`text2term.py -s SOURCE -t TARGET [-o OUTPUT] [-top TOP_MAPPINGS] [-min MIN_SCORE] [-iris BASE_IRIS] [-d EXCL_DEPRECATED] [-i INCL_INDIVIDUALS]` +Install package using **pip**: + +`pip install .` + +Execute the tool as follows: + +`text2term -s SOURCE -t TARGET [-o OUTPUT] [-top TOP_MAPPINGS] [-min MIN_SCORE] [-iris BASE_IRIS] [-d EXCL_DEPRECATED] [-i INCL_INDIVIDUALS]` + +To display a help message with descriptions of tool arguments do: + +`text2term -h` or `text2term --help` ### Required arguments `-s SOURCE` Input file containing list of 'source' terms to map to ontology terms (one per line). @@ -28,17 +38,17 @@ A tool for mapping (uncontrolled) terms to ontology terms to facilitate semantic ## Examples The basic use of the tool requires a `source` file containing a list of terms to map to the given `target` ontology: -`python text2term.py -s unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl` +`python text2term -s unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl` Specify an output file where the mappings should be saved using `-o`: -`python text2term.py -s unstruct_terms.txt -t efo.owl -o /Documents/my-mappings.csv` +`python text2term -s unstruct_terms.txt -t efo.owl -o /Documents/my-mappings.csv` Set the minimum acceptable similarity score for mapping each given term to an ontology term using `-min`: -`python text2term.py -s unstruct_terms.txt -t efo.owl -min 0.8` +`python text2term -s unstruct_terms.txt -t efo.owl -min 0.8` The mapped terms returned will have been determined to be 0.8 similar to their source terms in a 0-1 scale. 
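The command-line calls above can also be reproduced programmatically; a minimal sketch, assuming the package layout and intra-package imports introduced in patches 003/009 and the constructor/method signatures visible in the diffs above (the input terms and output path are placeholders):

```python
# Sketch of the programmatic equivalent of the CLI examples, using the classes
# as they appear at this point in the series; input terms and output path are
# placeholders, the EFO IRI is the one used in the README.
from text2term.term_collector import OntologyTermCollector
from text2term.tfidf_mapper import TFIDFMapper

collector = OntologyTermCollector("http://www.ebi.ac.uk/efo/efo.owl")
onto_terms = collector.get_ontology_terms(exclude_deprecated=True)

mapper = TFIDFMapper(onto_terms)
# At this point in the series map() returns (mappings dataframe, term graphs);
# patch 016 later narrows it to just the dataframe.
mappings_df, term_graphs = mapper.map(["asthma attack", "muscle weakness"],
                                      max_mappings=3, min_score=0.8)
mappings_df.to_csv("my-mappings.csv", index=False)
```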
Exclude deprecated ontology terms (declared as such via *owl:deprecated true*) using `-d`: -`python text2term.py -s unstruct_terms.txt -t efo.owl -d` +`python text2term -s unstruct_terms.txt -t efo.owl -d` Limit search to only terms whose IRIs start with any IRI given in a list specified using `-iris`: `python text2term.py -s unstruct_terms.txt -t efo.owl -iris http://www.ebi.ac.uk/efo/EFO,http://purl.obolibrary.org/obo/HP` From 78a7dfd9a0b7bb78e344f4d414a5309f6e53fe64 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 11 Feb 2022 12:48:21 -0500 Subject: [PATCH 007/185] Add Python 3.9 requirement --- setup.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index cee2e93..1fdea24 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,9 @@ 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 3', - 'Topic :: Scientific/Engineering' - ] + 'Programming Language :: Python :: 3.9', + 'Topic :: Scientific/Engineering', + 'Topic :: Scientific/Engineering :: Bio-Informatics' + ], + python_requires=">=3.9", ) From 3f8c3453ab1bb6cb44fe90571cc8824b07f3e2f5 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 11 Feb 2022 12:51:06 -0500 Subject: [PATCH 008/185] Rename main.py and remove commented lines --- text2term/{main.py => __main__.py} | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) rename text2term/{main.py => __main__.py} (95%) diff --git a/text2term/main.py b/text2term/__main__.py similarity index 95% rename from text2term/main.py rename to text2term/__main__.py index bb4933c..2217021 100644 --- a/text2term/main.py +++ b/text2term/__main__.py @@ -1,5 +1,6 @@ import argparse import datetime +import json import os import sys import onto_utils @@ -56,5 +57,5 @@ def get_arguments(): mapper = TFIDFMapper(onto_terms) mappings_df, term_graphs = mapper.map(source_terms, max_mappings=max_mappings, min_score=min_score) mappings_df.to_csv(output_file, index=False) - # with open(output_file + "-term-graphs.json", 'w') as json_file: - # json.dump(term_graphs, json_file, indent=2) + with open(output_file + "-term-graphs.json", 'w') as json_file: + json.dump(term_graphs, json_file, indent=2) From 4754975f04ccea97d6f161570d8d6179c2793068 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 17 Feb 2022 21:30:54 -0500 Subject: [PATCH 009/185] Update module imports --- text2term/term.py | 4 ++-- text2term/term_collector.py | 4 ++-- text2term/{curie2iri.py => term_tag2iri.py} | 6 +++--- text2term/tfidf_mapper.py | 6 +++--- text2term/zooma_mapper.py | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) rename text2term/{curie2iri.py => term_tag2iri.py} (98%) diff --git a/text2term/term.py b/text2term/term.py index 7257410..fe2d16a 100644 --- a/text2term/term.py +++ b/text2term/term.py @@ -1,8 +1,8 @@ """Provides OntologyTerm class""" -import onto_utils from owlready2 import Thing, ThingClass -from term_graph import OntologyTermGraph, Node, Edge +from text2term import onto_utils +from text2term.term_graph import OntologyTermGraph, Edge, Node class OntologyTerm: diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 65b469e..021948f 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -1,9 +1,9 @@ """Provides OntologyTermCollector class""" import logging -import onto_utils from owlready2 import * -from term import OntologyTerm +from text2term import onto_utils +from text2term.term import OntologyTerm class OntologyTermCollector: diff --git 
a/text2term/curie2iri.py b/text2term/term_tag2iri.py similarity index 98% rename from text2term/curie2iri.py rename to text2term/term_tag2iri.py index 146a000..7ae53b3 100644 --- a/text2term/curie2iri.py +++ b/text2term/term_tag2iri.py @@ -1,14 +1,14 @@ import logging import ssl import urllib.request -from urllib.error import HTTPError import pandas as pd -import onto_utils +from urllib.error import HTTPError +from text2term import onto_utils ssl._create_default_https_context = ssl._create_stdlib_context -class OntoTag2Iri: +class TermTag2Iri: def __init__(self): self.logger = onto_utils.get_logger(__name__, logging.INFO) diff --git a/text2term/tfidf_mapper.py b/text2term/tfidf_mapper.py index 613fa32..f85df92 100644 --- a/text2term/tfidf_mapper.py +++ b/text2term/tfidf_mapper.py @@ -2,10 +2,10 @@ import logging import time -import onto_utils import sparse_dot_topn as ct from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer -from term_mapping import TermMapping, TermMappingCollection +from text2term import onto_utils +from text2term.term_mapping import TermMapping, TermMappingCollection class TFIDFMapper: @@ -57,7 +57,7 @@ def _sparse_dot_top(self, vectorizer, source_terms, target_labels, min_score): tgt_mtx = vectorizer.fit_transform(target_labels).transpose().tocsr() # 'ntop' specifies the maximum number of labels/synonyms that should be considered # multiple labels/synonyms in the 'ntop' matches may be from the same ontology term - return ct.awesome_cossim_topn(src_mtx, tgt_mtx, ntop=15, lower_bound=min_score) + return ct.awesome_cossim_topn(src_mtx, tgt_mtx, ntop=50, lower_bound=min_score) def _get_mappings(self, results_mtx, max_mappings, source_terms, target_terms): """ Build and return dataframe for mapping results along with term graphs for the obtained mappings """ diff --git a/text2term/zooma_mapper.py b/text2term/zooma_mapper.py index 196882c..270b64a 100644 --- a/text2term/zooma_mapper.py +++ b/text2term/zooma_mapper.py @@ -4,8 +4,8 @@ import logging import time import requests -import onto_utils -from term_mapping import TermMapping, TermMappingCollection +from text2term import onto_utils +from text2term.term_mapping import TermMappingCollection, TermMapping class ZoomaMapper: From ff7425a1920075b9d0ea633e054243d78cdcc89d Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 17 Feb 2022 21:33:50 -0500 Subject: [PATCH 010/185] Add BioPortal mapper. 
Closes #6 --- text2term/bioportal_mapper.py | 119 ++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 text2term/bioportal_mapper.py diff --git a/text2term/bioportal_mapper.py b/text2term/bioportal_mapper.py new file mode 100644 index 0000000..d231d60 --- /dev/null +++ b/text2term/bioportal_mapper.py @@ -0,0 +1,119 @@ +"""Provides BioPortalAnnotatorMapper class""" + +import json +import logging +import time +import requests +from text2term.term_mapping import TermMapping, TermMappingCollection +from text2term import onto_utils + + +class BioPortalAnnotatorMapper: + + def __init__(self, bp_api_key): + self.logger = onto_utils.get_logger(__name__, logging.INFO) + self.url = "http://data.bioontology.org/annotator" + self.bp_api_key = bp_api_key + + def map(self, source_terms, ontologies, max_mappings=3): + self.logger.info("Mapping %i source terms against ontologies: %s...", len(source_terms), ontologies) + start = time.time() + mappings = [] + for term in source_terms: + mappings.extend(self._map_term(term, ontologies, max_mappings)) + self.logger.info('done (mapping time: %.2fs seconds)', time.time()-start) + return TermMappingCollection(mappings).mappings_df() + + def _map_term(self, source_term, ontologies, max_mappings): + params = { + "text": source_term, + "longest_only": "true", + "expand_mappings": "true", + "ontologies": ontologies + } + mappings = [] + self.logger.debug("Searching for ontology terms to match: " + source_term) + response = self._do_get_request(self.url, params=params) + if response is not None: + self.logger.debug("...found " + str(len(response)) + " mappings") + for mapping in response: + if len(mappings) < max_mappings: + mappings.append(self._mapping_details(source_term, mapping).as_term_mapping()) + return mappings + + def _mapping_details(self, text, annotation): + ann_class = annotation["annotatedClass"] + term_iri = ann_class["@id"] + term_link_bp = ann_class["links"]["self"] + onto_iri = ann_class["links"]["ontology"] + onto_name = onto_utils.curie_from_iri(term_iri) + bp_link = ann_class["links"]["ui"] + match_type = annotation["annotations"][0]["matchType"] + term_name, term_definition, ancestors = self.get_term_details(term_link_bp) + return BioPortalMapping(text, term_name, term_iri, term_definition, ancestors, onto_iri, onto_name, bp_link, + match_type) + + def get_term_details(self, term_iri): + response = self._do_get_request(term_iri) + term_name, term_definition = "", "" + ancestors = [] + if response is not None: + term_name = onto_utils.remove_quotes(response["prefLabel"]) + if len(response["definition"]) > 0: + term_definition = response["definition"][0] + term_definition = onto_utils.remove_quotes(term_definition) + ancestors_link = response["links"]["ancestors"] + ancestors = self._get_ancestors(ancestors_link) + return term_name, term_definition, ancestors + + def _get_ancestors(self, term_ancestors_bp_link): + response = self._do_get_request(term_ancestors_bp_link) + ancestors = [] + if response is not None: + for ancestor in response: + if ancestor is not None: + ancestor_name = ancestor["prefLabel"] + ancestors.append(ancestor_name) + ancestors = list(dict.fromkeys(ancestors)) # remove duplicate ancestors + return ancestors + + def _do_get_request(self, request_url, params=None): + headers = { + "Authorization": "apiKey token=" + self.bp_api_key, + } + response = requests.get(request_url, params=params, headers=headers, verify=True) + if response.ok: + json_resp = json.loads(response.content) + if 
len(json_resp) > 0: + return json_resp + else: + self.logger.info("Empty response for input: " + request_url + " with parameters " + str(params)) + elif response.status_code == 429: # API is throttling requests + self.logger.info(response.reason + ". Status code: " + str(response.status_code) + ". Waiting 15 seconds.") + time.sleep(15) + return self._do_get_request(request_url, params) + else: + json_resp = json.loads(response.content) + self.logger.error(response.reason + ":" + request_url + ". " + json_resp["errors"][0]) + + +class BioPortalMapping: + + def __init__(self, original_text, term_name, term_iri, term_definition, term_ancestors, ontology_iri, ontology_name, + bioportal_link, match_type): + self.original_text = original_text + self.term_name = term_name + self.term_iri = term_iri + self.term_definition = term_definition + self.term_ancestors = term_ancestors + self.ontology_iri = ontology_iri + self.ontology_name = ontology_name + self.bioportal_link = bioportal_link + self.match_type = match_type + + def as_term_mapping(self): + return TermMapping(self.original_text, self.term_name, self.term_iri, self.ontology_iri, self.mapping_score) + + @property + def mapping_score(self): + return 1 # if SYN|PREF From 1800b39d08dda97ea15fb9ec2f180f1be4062ebd Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Sat, 26 Feb 2022 16:01:08 -0500 Subject: [PATCH 011/185] Set version to 0.2.0 the tool now includes basic interfaces to Zooma and BioPortal Annotator --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1fdea24..9dc5f60 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ with open('requirements.txt') as f: requirements = f.read().splitlines() -version = '0.1' +version = '0.2.0' setup( name='text2term ontology mapper', From 83e3065450942507486ec92957e90e485dc00eea Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Sat, 26 Feb 2022 16:43:23 -0500 Subject: [PATCH 012/185] Allow custom parameters to Zooma/BP (closes #9) --- setup.py | 2 +- text2term/bioportal_mapper.py | 19 ++++++++++++++++--- text2term/zooma_mapper.py | 16 +++++++++++++--- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 9dc5f60..277016f 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ with open('requirements.txt') as f: requirements = f.read().splitlines() -version = '0.2.0' +version = '0.2.1' setup( name='text2term ontology mapper', diff --git a/text2term/bioportal_mapper.py b/text2term/bioportal_mapper.py index d231d60..4ab0f6b 100644 --- a/text2term/bioportal_mapper.py +++ b/text2term/bioportal_mapper.py @@ -11,26 +11,39 @@ class BioPortalAnnotatorMapper: def __init__(self, bp_api_key): + """ + :param bp_api_key: BioPortal API key + """ self.logger = onto_utils.get_logger(__name__, logging.INFO) self.url = "http://data.bioontology.org/annotator" self.bp_api_key = bp_api_key - def map(self, source_terms, ontologies, max_mappings=3): + def map(self, source_terms, ontologies, max_mappings=3, api_params=()): + """ + Find and return ontology mappings through the BioPortal Annotator Web service + :param source_terms: Collection of source terms to map to target ontologies + :param ontologies: String with a comma-separated list of ontology acronyms (eg "HP,EFO") + :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned + :param api_params: Additional BioPortal Annotator-specific parameters to include in the request + """ self.logger.info("Mapping %i source terms against 
ontologies: %s...", len(source_terms), ontologies) start = time.time() mappings = [] for term in source_terms: - mappings.extend(self._map_term(term, ontologies, max_mappings)) + mappings.extend(self._map_term(term, ontologies, max_mappings, api_params)) self.logger.info('done (mapping time: %.2fs seconds)', time.time()-start) return TermMappingCollection(mappings).mappings_df() - def _map_term(self, source_term, ontologies, max_mappings): + def _map_term(self, source_term, ontologies, max_mappings, api_params): params = { "text": source_term, "longest_only": "true", "expand_mappings": "true", "ontologies": ontologies } + if len(api_params) > 0: + params.update(api_params) + self.logger.debug("API parameters: " + str(params)) mappings = [] self.logger.debug("Searching for ontology terms to match: " + source_term) response = self._do_get_request(self.url, params=params) diff --git a/text2term/zooma_mapper.py b/text2term/zooma_mapper.py index 270b64a..7376211 100644 --- a/text2term/zooma_mapper.py +++ b/text2term/zooma_mapper.py @@ -14,20 +14,30 @@ def __init__(self): self.logger = onto_utils.get_logger(__name__, logging.INFO) self.url = "http://www.ebi.ac.uk/spot/zooma/v2/api/services/annotate" - def map(self, source_terms, ontologies, max_mappings=3): + def map(self, source_terms, ontologies, max_mappings=3, api_params=()): + """ + Find and return ontology mappings through the Zooma Web service + :param source_terms: Collection of source terms to map to target ontologies + :param ontologies: String with a comma-separated list of ontology acronyms (eg "HP,EFO") + :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned + :param api_params: Additional Zooma API-specific parameters to include in the request + """ self.logger.info("Mapping %i source terms against ontologies: %s", len(source_terms), ontologies) start = time.time() mappings = [] for term in source_terms: - mappings.extend(self._map_term(term, ontologies, max_mappings)) + mappings.extend(self._map_term(term, ontologies, max_mappings, api_params)) self.logger.info('done (mapping time: %.2fs seconds)', time.time()-start) return TermMappingCollection(mappings).mappings_df() - def _map_term(self, source_term, ontologies, max_mappings): + def _map_term(self, source_term, ontologies, max_mappings, api_params): params = { "propertyValue": source_term, "ontologies": ontologies } + if len(api_params) > 0: + params.update(api_params) + self.logger.debug("API parameters: " + str(params)) mappings = [] self.logger.debug("Searching for ontology terms to match: " + source_term) response = self._do_get_request(self.url, params=params) From 19a287df25b578e0bb5b75abe84d2ad9af2c2cdd Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Sat, 26 Feb 2022 19:07:32 -0500 Subject: [PATCH 013/185] Add mapper with syntactic similarity metrics --- requirements.txt | 7 +- setup.py | 2 +- text2term/similarity_metric.py | 12 +++ text2term/syntactic_mapper.py | 129 +++++++++++++++++++++++++++++++++ 4 files changed, 145 insertions(+), 5 deletions(-) create mode 100644 text2term/similarity_metric.py create mode 100644 text2term/syntactic_mapper.py diff --git a/requirements.txt b/requirements.txt index 7e27897..37b4fda 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,12 +4,11 @@ pandas~=1.2.4 numpy~=1.19.5 gensim~=4.0.1 scipy~=1.6.3 -sklearn~=0.0 scikit-learn~=0.24.2 setuptools~=47.1.0 -jellyfish~=0.8.9 requests~=2.27.1 -thefuzz~=0.19.0 tqdm~=4.62.3 sparse_dot_topn~=0.3.1 -bioregistry~=0.4.46 \ No newline 
at end of file +bioregistry~=0.4.46 +nltk~=3.5 +rapidfuzz~=2.0.5 \ No newline at end of file diff --git a/setup.py b/setup.py index 277016f..75d1909 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ with open('requirements.txt') as f: requirements = f.read().splitlines() -version = '0.2.1' +version = '0.3.0' setup( name='text2term ontology mapper', diff --git a/text2term/similarity_metric.py b/text2term/similarity_metric.py new file mode 100644 index 0000000..5a13070 --- /dev/null +++ b/text2term/similarity_metric.py @@ -0,0 +1,12 @@ +"""Provides SimilarityMetric enum""" + +from enum import Enum + + +class SimilarityMetric(Enum): + LEVENSHTEIN = 'leven' + JARO = 'jaro' + JARO_WINKLER = 'jarowinkler' + JACCARD = 'jaccard' + FUZZY = 'fuzzy' + FUZZY_WEIGHTED = 'fuzzyw' diff --git a/text2term/syntactic_mapper.py b/text2term/syntactic_mapper.py new file mode 100644 index 0000000..b29c253 --- /dev/null +++ b/text2term/syntactic_mapper.py @@ -0,0 +1,129 @@ +"""Provides SyntacticMapper class""" + +import logging +import time +import nltk +import rapidfuzz +from tqdm import tqdm +from text2term import onto_utils +from text2term.similarity_metric import SimilarityMetric +from text2term.term_mapping import TermMapping, TermMappingCollection + + +class SyntacticMapper: + + def __init__(self, target_ontology_terms): + """ + :param target_ontology_terms: Collection of ontology terms to be mapped against + """ + self.logger = onto_utils.get_logger(__name__, logging.INFO) + self.target_ontology_terms = target_ontology_terms + + def map(self, source_terms, similarity_metric=SimilarityMetric.JARO_WINKLER, max_mappings=3): + """ + :param source_terms: List of source terms to be mapped with ontology terms + :param similarity_metric: Similarity metric to be used for matching + :param max_mappings: Maximum number of (top scoring) ontology term mappings that should be returned + """ + self.logger.info("Mapping %i source terms...", len(source_terms)) + start = time.time() + mappings = [] + for input_term in tqdm(source_terms): + matches = self._map(input_term, similarity_metric, max_mappings) + mappings.extend(matches) + end = time.time() + self.logger.info('done (mapping time: %.2fs seconds)', end - start) + return TermMappingCollection(mappings).mappings_df() + + def _map(self, source_term, similarity_metric, max_matches=3): + self.logger.debug("Matching %s...", source_term) + term_matches = [] + for term in self.target_ontology_terms: + highest_similarity = 0.0 + for target_name in self._term_names(term): + similarity = self.compare(source_term, target_name, similarity_metric) + self.logger.debug("%s -> %s (%.2f)", source_term, target_name, similarity) + if similarity > highest_similarity: + highest_similarity = similarity + term_matches.append(TermMapping(source_term, term.label, term.iri, term.ontology_iri, highest_similarity)) + matches_sorted = sorted(term_matches, key=lambda x: x.mapping_score, reverse=True) + del matches_sorted[max_matches:] + return matches_sorted + + def _term_names(self, ontology_term): + lbls_syns = [] + lbls_syns.extend(ontology_term.labels) + lbls_syns.extend(ontology_term.synonyms) + return lbls_syns + + def compare(self, s1, s2, similarity_metric): + """ + Compare the given strings s1 and s2 with respect to the specified string similarity metric + :param s1: source string + :param s2: target string + :param similarity_metric: String similarity metric to be used (see supported metrics in `SimilarityMetric`) + """ + if similarity_metric == SimilarityMetric.LEVENSHTEIN: + return 
self.compare_levenshtein(s1, s2) + elif similarity_metric == SimilarityMetric.JARO: + return self.compare_jaro(s1, s2) + elif similarity_metric == SimilarityMetric.JARO_WINKLER: + return self.compare_jarowinkler(s1, s2) + elif similarity_metric == SimilarityMetric.FUZZY: + return self.compare_fuzzy(s1, s2) + elif similarity_metric == SimilarityMetric.FUZZY_WEIGHTED: + return self.compare_fuzzy_weighted(s1, s2) + elif similarity_metric == SimilarityMetric.JACCARD: + return self.compare_jaccard(s1, s2) + else: + self.logger.error("Unsupported similarity metric: %s", similarity_metric) + + def compare_levenshtein(self, s1, s2): + """ + Calculates the normalized Levenshtein distance between s1 and s2. + :return similarity between s1 and s2 as a float between 0 and 1 + """ + similarity = rapidfuzz.string_metric.normalized_levenshtein(s1, s2)/100 + return similarity + + def compare_jaro(self, s1, s2): + """ + Calculates the Jaro similarity between s1 and s2. + :return similarity between s1 and s2 as a float between 0 and 1 + """ + similarity = rapidfuzz.string_metric.jaro_similarity(s1, s2)/100 + return similarity + + def compare_jarowinkler(self, s1, s2): + """ + Calculates the Jaro-Winkler similarity between s1 and s2. + :return similarity between s1 and s2 as a float between 0 and 1 + """ + similarity = rapidfuzz.string_metric.jaro_winkler_similarity(s1, s2)/100 + return similarity + + def compare_fuzzy(self, s1, s2): + """ + Calculates the normalized Indel distance between s1 and s2. + See: https://maxbachmann.github.io/RapidFuzz/Usage/fuzz.html#ratio + :return similarity between s1 and s2 as a float between 0 and 1 + """ + similarity = rapidfuzz.fuzz.ratio(s1, s2)/100 + return similarity + + def compare_fuzzy_weighted(self, s1, s2): + """ + Calculates a weighted ratio between s1 and s2 based on rapidfuzz's fuzzy ratio algorithms. + See: https://maxbachmann.github.io/RapidFuzz/Usage/fuzz.html#wratio + :return similarity between s1 and s2 as a float between 0 and 1 + """ + similarity = rapidfuzz.fuzz.WRatio(s1, s2)/100 + return similarity + + def compare_jaccard(self, s1, s2): + """ + Calculates a Jaccard-based similarity between s1 and s2. + :return similarity between s1 and s2 as a float between 0 and 1 + """ + similarity = 1-nltk.jaccard_distance(set(s1), set(s2)) + return similarity From 8b144a1c8df7a78144c32d6117bad40c67df32cb Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Sat, 26 Feb 2022 19:22:41 -0500 Subject: [PATCH 014/185] Update dependencies --- requirements.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index 37b4fda..bfef8f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,14 @@ -Owlready2~=0.31 +Owlready2~=0.36 argparse~=1.4.0 -pandas~=1.2.4 -numpy~=1.19.5 -gensim~=4.0.1 -scipy~=1.6.3 -scikit-learn~=0.24.2 -setuptools~=47.1.0 +pandas~=1.4.1 +numpy~=1.22.2 +gensim~=4.1.2 +scipy~=1.8.0 +scikit-learn~=1.0.2 +setuptools~=60.9.3 requests~=2.27.1 tqdm~=4.62.3 sparse_dot_topn~=0.3.1 -bioregistry~=0.4.46 -nltk~=3.5 +bioregistry~=0.4.63 +nltk~=3.7 rapidfuzz~=2.0.5 \ No newline at end of file From c4355bb09dd018b104c70b1f3f2246092152f3ba Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 2 Mar 2022 18:31:40 -0500 Subject: [PATCH 015/185] Create graphs for all ontology terms. 
Closes #12 --- text2term/__main__.py | 6 +++++- text2term/term_collector.py | 6 ++++++ text2term/term_mapping.py | 18 +++++++++++------- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/text2term/__main__.py b/text2term/__main__.py index 2217021..1cc33d6 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -54,8 +54,12 @@ def get_arguments(): exclude_deprecated=excl_deprecated, include_individuals=incl_individuals) if len(onto_terms) > 0: + # Get ontology mappings mapper = TFIDFMapper(onto_terms) - mappings_df, term_graphs = mapper.map(source_terms, max_mappings=max_mappings, min_score=min_score) + mappings_df = mapper.map(source_terms, max_mappings=max_mappings, min_score=min_score) mappings_df.to_csv(output_file, index=False) + + # Get ontology term graphs + term_graphs = term_collector.get_term_graphs(onto_terms) with open(output_file + "-term-graphs.json", 'w') as json_file: json.dump(term_graphs, json_file, indent=2) diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 021948f..9d9de02 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -45,6 +45,12 @@ def get_ontology_terms(self, base_iris=(), use_reasoning=False, exclude_deprecat self.logger.info("...done: collected %i ontology terms (collection time: %.2fs)", len(ontology_terms), end-start) return ontology_terms + def get_term_graphs(self, ontology_terms): + term_graphs = [] + for term in ontology_terms: + term_graphs.append(term.graph().graph_dict()) + return term_graphs + def _get_ontology_terms(self, term_list, ontology, exclude_deprecated): ontology_terms = [] for ontology_term in term_list: diff --git a/text2term/term_mapping.py b/text2term/term_mapping.py index 0a9ace1..ec09c83 100644 --- a/text2term/term_mapping.py +++ b/text2term/term_mapping.py @@ -4,6 +4,11 @@ class TermMapping: + SRC_TERM = "Source Term" + TGT_TERM_LBL = "Mapped Term Label" + TGT_TERM_IRI = "Mapped Term IRI" + TGT_TERM_ONT_IRI = "Mapped Ontology IRI" + MAPPING_SCORE = "Mapping Score" def __init__(self, source_term, mapped_term_label, mapped_term_iri, mapped_ontology_iri, mapping_score): self._source_term = source_term @@ -34,11 +39,11 @@ def mapping_score(self): def to_dict(self): return { - 'Source Term': self.source_term, - 'Mapped Term Label': self.mapped_term_label, - 'Mapped Term IRI': self.mapped_term_iri, - 'Mapped Ontology IRI': self.mapped_ontology_iri, - 'Mapping Score': self.mapping_score + self.SRC_TERM: self.source_term, + self.TGT_TERM_LBL: self.mapped_term_label, + self.TGT_TERM_IRI: self.mapped_term_iri, + self.TGT_TERM_ONT_IRI: self.mapped_ontology_iri, + self.MAPPING_SCORE: self.mapping_score } def __eq__(self, other): @@ -47,8 +52,7 @@ def __eq__(self, other): return False def __str__(self): - return "Mapping: " + self.source_term + " -> " + self._mapped_term_label + \ - " (" + self.mapped_term_iri + ")" + return self.source_term + " -> " + self._mapped_term_label + " (" + self.mapped_term_iri + ")" class TermMappingCollection: From e925342463992bb52831af3fb8189ec29440e638 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 7 Mar 2022 10:00:45 -0500 Subject: [PATCH 016/185] Remove term graph gen. 
from mapper --- text2term/tfidf_mapper.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/text2term/tfidf_mapper.py b/text2term/tfidf_mapper.py index f85df92..5f531d8 100644 --- a/text2term/tfidf_mapper.py +++ b/text2term/tfidf_mapper.py @@ -32,10 +32,10 @@ def map(self, source_terms, max_mappings=3, min_score=0.3): source_terms_norm = onto_utils.normalize_list(source_terms) vectorizer = self._tokenize(source_terms_norm, self.target_labels) results_mtx = self._sparse_dot_top(vectorizer, source_terms_norm, self.target_labels, min_score) - results_df, term_graphs = self._get_mappings(results_mtx, max_mappings, source_terms, self.target_terms) + results_df = self._get_mappings(results_mtx, max_mappings, source_terms, self.target_terms) end = time.time() self.logger.info("...done (mapping time: %.2fs seconds)", end-start) - return results_df, term_graphs + return results_df def _tokenize(self, source_terms, target_labels, analyzer='char_wb', n=3): """ @@ -63,7 +63,6 @@ def _get_mappings(self, results_mtx, max_mappings, source_terms, target_terms): """ Build and return dataframe for mapping results along with term graphs for the obtained mappings """ coo_mtx = results_mtx.tocoo() mappings = [] - mapped_term_graphs = [] last_source_term = "" top_mappings = set() for row, col, score in zip(coo_mtx.row, coo_mtx.col, coo_mtx.data): @@ -78,9 +77,8 @@ def _get_mappings(self, results_mtx, max_mappings, source_terms, target_terms): top_mappings.clear() if onto_term.iri not in top_mappings: mappings.append(TermMapping(source_term, onto_term.label, onto_term.iri, onto_term.ontology_iri, score)) - mapped_term_graphs.append(onto_term.graph().graph_dict()) top_mappings.add(onto_term.iri) - return TermMappingCollection(mappings).mappings_df(), mapped_term_graphs + return TermMappingCollection(mappings).mappings_df() def _get_target_labels_terms(self, ontology_terms): """Get lists of labels and terms to enable retrieving terms from their labels""" From e5b2417d9555d2c04d25990edbdbd6fa7d49f4a1 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 25 Mar 2022 12:57:00 -0400 Subject: [PATCH 017/185] Update setup.py --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 75d1909..226daa2 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ version = '0.3.0' setup( - name='text2term ontology mapper', + name='text2term', version=version, install_requires=requirements, packages=find_packages(), @@ -20,6 +20,7 @@ long_description=long_description, long_description_content_type='text/markdown', author='Center for Computational Biomedicine, Harvard Medical School', + author_email='rafael_goncalves@hms.harvard.edu', classifiers=[ 'Development Status :: 3 - Alpha', 'License :: OSI Approved :: MIT License', From 31f9ccd62448c693063471eef1cdf5d86c152a79 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 25 Mar 2022 12:58:04 -0400 Subject: [PATCH 018/185] Fix ontology filter option in Zooma interface --- text2term/zooma_mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text2term/zooma_mapper.py b/text2term/zooma_mapper.py index 7376211..99e54f6 100644 --- a/text2term/zooma_mapper.py +++ b/text2term/zooma_mapper.py @@ -33,7 +33,7 @@ def map(self, source_terms, ontologies, max_mappings=3, api_params=()): def _map_term(self, source_term, ontologies, max_mappings, api_params): params = { "propertyValue": source_term, - "ontologies": ontologies + "filter": "required:[none],ontologies:[" + ontologies + "]" } if 
len(api_params) > 0: params.update(api_params) From bca4d4a4d6aef65f9b317cb12ce5b567c44b774c Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 25 Mar 2022 15:37:31 -0400 Subject: [PATCH 019/185] Add Zooma API documentation link --- text2term/zooma_mapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/text2term/zooma_mapper.py b/text2term/zooma_mapper.py index 99e54f6..aaf0787 100644 --- a/text2term/zooma_mapper.py +++ b/text2term/zooma_mapper.py @@ -31,6 +31,7 @@ def map(self, source_terms, ontologies, max_mappings=3, api_params=()): return TermMappingCollection(mappings).mappings_df() def _map_term(self, source_term, ontologies, max_mappings, api_params): + # see https://www.ebi.ac.uk/spot/zooma/docs/api for details of API parameters params = { "propertyValue": source_term, "filter": "required:[none],ontologies:[" + ontologies + "]" From 4eafd86dee018d35daf2d2f7da5911b4b4233760 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 25 Mar 2022 15:40:03 -0400 Subject: [PATCH 020/185] Refactor similarity metric and add BP and Zooma --- text2term/mapping_method.py | 15 +++++++++++++++ text2term/similarity_metric.py | 12 ------------ text2term/syntactic_mapper.py | 32 ++++++++++++++++---------------- 3 files changed, 31 insertions(+), 28 deletions(-) create mode 100644 text2term/mapping_method.py delete mode 100644 text2term/similarity_metric.py diff --git a/text2term/mapping_method.py b/text2term/mapping_method.py new file mode 100644 index 0000000..92b295e --- /dev/null +++ b/text2term/mapping_method.py @@ -0,0 +1,15 @@ +"""Provides MappingMethod enum""" + +from enum import Enum + + +class MappingMethod(Enum): + LEVENSHTEIN = 'levenshtein' + JARO = 'jaro' + JARO_WINKLER = 'jarowinkler' + JACCARD = 'jaccard' + FUZZY = 'fuzzy' + FUZZY_WEIGHTED = 'fuzzyw' + TFIDF = 'tfidf' + ZOOMA = 'zooma' + BIOPORTAL = 'bioportal' diff --git a/text2term/similarity_metric.py b/text2term/similarity_metric.py deleted file mode 100644 index 5a13070..0000000 --- a/text2term/similarity_metric.py +++ /dev/null @@ -1,12 +0,0 @@ -"""Provides SimilarityMetric enum""" - -from enum import Enum - - -class SimilarityMetric(Enum): - LEVENSHTEIN = 'leven' - JARO = 'jaro' - JARO_WINKLER = 'jarowinkler' - JACCARD = 'jaccard' - FUZZY = 'fuzzy' - FUZZY_WEIGHTED = 'fuzzyw' diff --git a/text2term/syntactic_mapper.py b/text2term/syntactic_mapper.py index b29c253..b7603f9 100644 --- a/text2term/syntactic_mapper.py +++ b/text2term/syntactic_mapper.py @@ -6,7 +6,7 @@ import rapidfuzz from tqdm import tqdm from text2term import onto_utils -from text2term.similarity_metric import SimilarityMetric +from text2term.mapping_method import MappingMethod from text2term.term_mapping import TermMapping, TermMappingCollection @@ -19,29 +19,29 @@ def __init__(self, target_ontology_terms): self.logger = onto_utils.get_logger(__name__, logging.INFO) self.target_ontology_terms = target_ontology_terms - def map(self, source_terms, similarity_metric=SimilarityMetric.JARO_WINKLER, max_mappings=3): + def map(self, source_terms, mapping_method=MappingMethod.JARO_WINKLER, max_mappings=3): """ :param source_terms: List of source terms to be mapped with ontology terms - :param similarity_metric: Similarity metric to be used for matching + :param mapping_method: Mapping method to be used for matching :param max_mappings: Maximum number of (top scoring) ontology term mappings that should be returned """ self.logger.info("Mapping %i source terms...", len(source_terms)) start = time.time() mappings = [] for input_term in tqdm(source_terms): - 
matches = self._map(input_term, similarity_metric, max_mappings) + matches = self._map(input_term, mapping_method, max_mappings) mappings.extend(matches) end = time.time() self.logger.info('done (mapping time: %.2fs seconds)', end - start) return TermMappingCollection(mappings).mappings_df() - def _map(self, source_term, similarity_metric, max_matches=3): + def _map(self, source_term, mapping_method, max_matches=3): self.logger.debug("Matching %s...", source_term) term_matches = [] for term in self.target_ontology_terms: highest_similarity = 0.0 for target_name in self._term_names(term): - similarity = self.compare(source_term, target_name, similarity_metric) + similarity = self.compare(source_term, target_name, mapping_method) self.logger.debug("%s -> %s (%.2f)", source_term, target_name, similarity) if similarity > highest_similarity: highest_similarity = similarity @@ -56,27 +56,27 @@ def _term_names(self, ontology_term): lbls_syns.extend(ontology_term.synonyms) return lbls_syns - def compare(self, s1, s2, similarity_metric): + def compare(self, s1, s2, mapping_method): """ - Compare the given strings s1 and s2 with respect to the specified string similarity metric + Compare the given strings s1 and s2 with respect to the specified mapping method :param s1: source string :param s2: target string - :param similarity_metric: String similarity metric to be used (see supported metrics in `SimilarityMetric`) + :param mapping_method: Mapping method to be used (see supported methods in `MappingMethod`) """ - if similarity_metric == SimilarityMetric.LEVENSHTEIN: + if mapping_method == MappingMethod.LEVENSHTEIN: return self.compare_levenshtein(s1, s2) - elif similarity_metric == SimilarityMetric.JARO: + elif mapping_method == MappingMethod.JARO: return self.compare_jaro(s1, s2) - elif similarity_metric == SimilarityMetric.JARO_WINKLER: + elif mapping_method == MappingMethod.JARO_WINKLER: return self.compare_jarowinkler(s1, s2) - elif similarity_metric == SimilarityMetric.FUZZY: + elif mapping_method == MappingMethod.FUZZY: return self.compare_fuzzy(s1, s2) - elif similarity_metric == SimilarityMetric.FUZZY_WEIGHTED: + elif mapping_method == MappingMethod.FUZZY_WEIGHTED: return self.compare_fuzzy_weighted(s1, s2) - elif similarity_metric == SimilarityMetric.JACCARD: + elif mapping_method == MappingMethod.JACCARD: return self.compare_jaccard(s1, s2) else: - self.logger.error("Unsupported similarity metric: %s", similarity_metric) + self.logger.error("Unsupported method: %s", mapping_method) def compare_levenshtein(self, s1, s2): """ From c25b5ca630b19b7da5685fd6623feee19821d522 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 31 Mar 2022 16:39:53 -0400 Subject: [PATCH 021/185] Use UUID-based IRIs when creating ontology from labels --- text2term/onto_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py index 50b05bd..8377a02 100644 --- a/text2term/onto_utils.py +++ b/text2term/onto_utils.py @@ -1,6 +1,8 @@ import logging import re import sys +import uuid + import bioregistry from owlready2 import * from gensim.parsing import strip_non_alphanum, strip_multiple_whitespaces @@ -65,13 +67,15 @@ def parse_list_file(file_path): def get_ontology_from_labels(term_labels): - onto = owlready2.get_ontology("http://ccb.harvard.edu/t2t/") + base_iri = "http://ccb.harvard.edu/t2t/" + onto = owlready2.get_ontology(base_iri) onto.metadata.comment.append("Created dynamically using text2term") 
onto.metadata.comment.append(datetime.datetime.now()) for term_label in term_labels: with onto: - new_class = types.new_class(term_label, (Thing,)) - new_class.label = term_label + new_term_iri = base_iri + str(uuid.uuid4()) + new_term = types.new_class(new_term_iri, (Thing,)) + new_term.label = term_label return onto From 04cbfa508afa257c9fb9e1060f0c9a9761313d3f Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 31 Mar 2022 19:57:58 -0400 Subject: [PATCH 022/185] Use a UUID in generated ontology IRI as well --- text2term/onto_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py index 8377a02..5ae10cf 100644 --- a/text2term/onto_utils.py +++ b/text2term/onto_utils.py @@ -67,13 +67,14 @@ def parse_list_file(file_path): def get_ontology_from_labels(term_labels): - base_iri = "http://ccb.harvard.edu/t2t/" - onto = owlready2.get_ontology(base_iri) + onto_base_iri = "http://ccb.harvard.edu/t2t/" + onto_iri = onto_base_iri + "Ontology-" + str(uuid.uuid4()) + onto = owlready2.get_ontology(onto_iri) onto.metadata.comment.append("Created dynamically using text2term") onto.metadata.comment.append(datetime.datetime.now()) for term_label in term_labels: with onto: - new_term_iri = base_iri + str(uuid.uuid4()) + new_term_iri = onto_base_iri + "R" + str(uuid.uuid4()) new_term = types.new_class(new_term_iri, (Thing,)) new_term.label = term_label return onto From 87f29399d2953aa327f61196defebb35f1a4b42b Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 8 Apr 2022 17:32:54 -0400 Subject: [PATCH 023/185] Drop the collection of ontology IRIs. Closes #8 --- text2term/bioportal_mapper.py | 2 +- text2term/syntactic_mapper.py | 2 +- text2term/term.py | 7 +------ text2term/term_collector.py | 4 ++-- text2term/term_mapping.py | 9 +-------- text2term/tfidf_mapper.py | 2 +- text2term/zooma_mapper.py | 4 +--- 7 files changed, 8 insertions(+), 22 deletions(-) diff --git a/text2term/bioportal_mapper.py b/text2term/bioportal_mapper.py index 4ab0f6b..3bc125e 100644 --- a/text2term/bioportal_mapper.py +++ b/text2term/bioportal_mapper.py @@ -125,7 +125,7 @@ def __init__(self, original_text, term_name, term_iri, term_definition, term_anc self.match_type = match_type def as_term_mapping(self): - return TermMapping(self.original_text, self.term_name, self.term_iri, self.ontology_iri, self.mapping_score) + return TermMapping(self.original_text, self.term_name, self.term_iri, self.mapping_score) @property def mapping_score(self): diff --git a/text2term/syntactic_mapper.py b/text2term/syntactic_mapper.py index b7603f9..7ed9ed2 100644 --- a/text2term/syntactic_mapper.py +++ b/text2term/syntactic_mapper.py @@ -45,7 +45,7 @@ def _map(self, source_term, mapping_method, max_matches=3): self.logger.debug("%s -> %s (%.2f)", source_term, target_name, similarity) if similarity > highest_similarity: highest_similarity = similarity - term_matches.append(TermMapping(source_term, term.label, term.iri, term.ontology_iri, highest_similarity)) + term_matches.append(TermMapping(source_term, term.label, term.iri, highest_similarity)) matches_sorted = sorted(term_matches, key=lambda x: x.mapping_score, reverse=True) del matches_sorted[max_matches:] return matches_sorted diff --git a/text2term/term.py b/text2term/term.py index fe2d16a..ceb4237 100644 --- a/text2term/term.py +++ b/text2term/term.py @@ -10,12 +10,11 @@ class OntologyTerm: Represents an ontology class or individual. 
In the case of an individual 'children' is always empty and 'parents' specifies the individual's types. """ - def __init__(self, iri, labels, synonyms, definition, ontology_iri, parents=(), children=(), instances=()): + def __init__(self, iri, labels, synonyms, definition, parents=(), children=(), instances=()): self._iri = iri self._labels = labels self._synonyms = synonyms self._definition = definition - self._ontology_iri = ontology_iri self._parents = parents self._children = children self._instances = instances @@ -36,10 +35,6 @@ def synonyms(self): def definition(self): return self._definition - @property - def ontology_iri(self): - return self._ontology_iri - @property def parents(self): return self._parents diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 9d9de02..77fe2cc 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -62,7 +62,7 @@ def _get_ontology_terms(self, term_list, ontology, exclude_deprecated): children = self._get_children(ontology_term, ontology) instances = self._get_instances(ontology_term, ontology) definition = self._get_definition(ontology_term) - term_details = OntologyTerm(ontology_term.iri, labels, synonyms, definition, ontology.base_iri, + term_details = OntologyTerm(ontology_term.iri, labels, synonyms, definition, parents=parents, children=children, instances=instances) ontology_terms.append(term_details) else: @@ -85,7 +85,7 @@ def _get_children(self, ontology_term, ontology): children = set() try: children = set(ontology.get_children_of(ontology_term)) - except AttributeError as err: + except TypeError and AttributeError as err: self.logger.debug(err) return children diff --git a/text2term/term_mapping.py b/text2term/term_mapping.py index ec09c83..e999308 100644 --- a/text2term/term_mapping.py +++ b/text2term/term_mapping.py @@ -7,14 +7,12 @@ class TermMapping: SRC_TERM = "Source Term" TGT_TERM_LBL = "Mapped Term Label" TGT_TERM_IRI = "Mapped Term IRI" - TGT_TERM_ONT_IRI = "Mapped Ontology IRI" MAPPING_SCORE = "Mapping Score" - def __init__(self, source_term, mapped_term_label, mapped_term_iri, mapped_ontology_iri, mapping_score): + def __init__(self, source_term, mapped_term_label, mapped_term_iri, mapping_score): self._source_term = source_term self._mapped_term_label = mapped_term_label self._mapped_term_iri = mapped_term_iri - self._mapped_ontology_iri = mapped_ontology_iri self._mapping_score = mapping_score @property @@ -29,10 +27,6 @@ def mapped_term_label(self): def mapped_term_iri(self): return self._mapped_term_iri - @property - def mapped_ontology_iri(self): - return self._mapped_ontology_iri - @property def mapping_score(self): return self._mapping_score @@ -42,7 +36,6 @@ def to_dict(self): self.SRC_TERM: self.source_term, self.TGT_TERM_LBL: self.mapped_term_label, self.TGT_TERM_IRI: self.mapped_term_iri, - self.TGT_TERM_ONT_IRI: self.mapped_ontology_iri, self.MAPPING_SCORE: self.mapping_score } diff --git a/text2term/tfidf_mapper.py b/text2term/tfidf_mapper.py index 5f531d8..5cc3d12 100644 --- a/text2term/tfidf_mapper.py +++ b/text2term/tfidf_mapper.py @@ -76,7 +76,7 @@ def _get_mappings(self, results_mtx, max_mappings, source_terms, target_terms): last_source_term = source_term top_mappings.clear() if onto_term.iri not in top_mappings: - mappings.append(TermMapping(source_term, onto_term.label, onto_term.iri, onto_term.ontology_iri, score)) + mappings.append(TermMapping(source_term, onto_term.label, onto_term.iri, score)) top_mappings.add(onto_term.iri) return 
TermMappingCollection(mappings).mappings_df() diff --git a/text2term/zooma_mapper.py b/text2term/zooma_mapper.py index aaf0787..768faa0 100644 --- a/text2term/zooma_mapper.py +++ b/text2term/zooma_mapper.py @@ -58,11 +58,9 @@ def _mapping_details(self, text, mapping_response): tags = mapping_response["semanticTags"] term_iri = tags[0] - ontology_iri = "" # TODO: Get Ontology IRI - # get mapping confidence score mapping_score = self._mapping_score(mapping_response["confidence"]) - return TermMapping(text, term_label, term_iri, ontology_iri, mapping_score) + return TermMapping(text, term_label, term_iri, mapping_score) def _mapping_score(self, confidence): """Represent numerically the mapping confidence categories returned by Zooma (high, good, medium or low)""" From 13fd30abb77e566f0e51c8429452b446baa4620e Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 9 May 2022 16:37:26 -0400 Subject: [PATCH 024/185] Drop option to include individuals in search --- README.md | 1 - text2term/term_collector.py | 8 ++------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 5166ce4..674eee7 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,6 @@ To display a help message with descriptions of tool arguments do: `-d EXCL_DEPRECATED` Exclude terms stated as deprecated via owl:deprecated. -`-i INCL_INDIVIDUALS` Include ontology individuals in addition to classes. ## Examples diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 77fe2cc..d24eb4e 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -15,13 +15,12 @@ def __init__(self, ontology_iri): self.logger = onto_utils.get_logger(__name__, logging.INFO) self.ontology_iri = ontology_iri - def get_ontology_terms(self, base_iris=(), use_reasoning=False, exclude_deprecated=False, include_individuals=False): + def get_ontology_terms(self, base_iris=(), use_reasoning=False, exclude_deprecated=False): """ Collect the terms described in the ontology at the specified IRI :param base_iris: Limit ontology term collection to terms whose IRIs start with any IRI given in this tuple - :param use_reasoning: Use a reasoner to compute inferred class hierarchy and individual types + :param use_reasoning: Use a reasoner to compute inferred class hierarchy :param exclude_deprecated: Exclude ontology terms stated as deprecated using owl:deprecated 'true' - :param include_individuals: Include OWL ontology individuals in addition to ontology classes :return: Collection of ontology terms in the specified ontology """ ontology = self._load_ontology(self.ontology_iri) @@ -39,8 +38,6 @@ def get_ontology_terms(self, base_iris=(), use_reasoning=False, exclude_deprecat ontology_terms.extend(self._get_ontology_terms(iris, ontology, exclude_deprecated)) else: ontology_terms = self._get_ontology_terms(ontology.classes(), ontology, exclude_deprecated) - if include_individuals: - ontology_terms.extend(self._get_ontology_terms(ontology.individuals(), ontology, exclude_deprecated)) end = time.time() self.logger.info("...done: collected %i ontology terms (collection time: %.2fs)", len(ontology_terms), end-start) return ontology_terms @@ -241,7 +238,6 @@ def _classify_ontology(self, ontology): def _log_ontology_metrics(self, ontology): self.logger.debug(" Ontology IRI: %s", ontology.base_iri) self.logger.debug(" Class count: %i", len(list(ontology.classes()))) - self.logger.debug(" Individual count: %i", len(list(ontology.individuals()))) self.logger.debug(" Object property count: %i", 
len(list(ontology.object_properties()))) self.logger.debug(" Data property count: %i", len(list(ontology.data_properties()))) self.logger.debug(" Annotation property count: %i", len(list(ontology.annotation_properties()))) From a19a5b6230f61039614be9430a7727beaef5b523 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 9 May 2022 17:05:41 -0400 Subject: [PATCH 025/185] Move term graph generation to own module --- text2term/term.py | 52 ++-------------------------- text2term/term_collector.py | 6 ---- text2term/term_graph.py | 6 ++-- text2term/term_graph_generator.py | 56 +++++++++++++++++++++++++++++++ 4 files changed, 61 insertions(+), 59 deletions(-) create mode 100644 text2term/term_graph_generator.py diff --git a/text2term/term.py b/text2term/term.py index ceb4237..5eef645 100644 --- a/text2term/term.py +++ b/text2term/term.py @@ -1,15 +1,8 @@ """Provides OntologyTerm class""" -from owlready2 import Thing, ThingClass -from text2term import onto_utils -from text2term.term_graph import OntologyTermGraph, Edge, Node - class OntologyTerm: - """ - Represents an ontology class or individual. In the case of an individual 'children' is always empty and - 'parents' specifies the individual's types. - """ + def __init__(self, iri, labels, synonyms, definition, parents=(), children=(), instances=()): self._iri = iri self._labels = labels @@ -47,46 +40,6 @@ def children(self): def instances(self): return self._instances - def graph(self): - """ Build and return a graph representing the neighborhood of an ontology term. """ - nodes, edges = set(), set() - nodes.add(Node(self.iri, self.label)) - self._add_superclasses(nodes, edges) - self._add_subclasses(self.children, nodes, edges) - self._add_instances(self.instances, nodes, edges) - return OntologyTermGraph(self.iri, nodes, edges) - - def _add_superclasses(self, nodes, edges): - for parent in self.parents: - self._add_node(parent, nodes) - edges.add(Edge(self.iri, parent.iri, Edge.IS_A)) - self._add_ancestors(parent, nodes, edges) - - def _add_ancestors(self, node, nodes, edges): - for ancestor in node.is_a: - if ancestor is not Thing and isinstance(ancestor, ThingClass): - self._add_node(ancestor, nodes) - edges.add(Edge(node.iri, ancestor.iri, Edge.IS_A)) - self._add_ancestors(ancestor, nodes, edges) - - def _add_children(self, term_list, edge_type, nodes, edges): - for term in term_list: - self._add_node(term, nodes) - edges.add(Edge(term.iri, self.iri, edge_type)) - - def _add_subclasses(self, subclasses, nodes, edges): - self._add_children(subclasses, Edge.IS_A, nodes, edges) - - def _add_instances(self, instances, nodes, edges): - self._add_children(instances, Edge.INSTANCE_OF, nodes, edges) - - def _add_node(self, term, term_set): - if len(term.label) == 0: - label = onto_utils.label_from_iri(term.iri) - else: - label = term.label[0] - term_set.add(Node(term.iri, label)) - @property def label(self): """Return a single label for this term""" @@ -103,5 +56,4 @@ def __hash__(self): def __str__(self): return "Ontology Term: " + self.iri + ", Labels: " + str(self.labels) + ", Synonyms: " + \ str(self.synonyms) + ", Definition: " + str(self.definition) + ", Parents: " + str(self.parents) + \ - ", Children: " + str(self.children) + ", Instances: " + str(self.instances) + ", Term graph: " + \ - str(self.graph().graph_dict()) + ", Children: " + str(self.children) + ", Instances: " + str(self.instances) diff --git a/text2term/term_collector.py b/text2term/term_collector.py index d24eb4e..23542ae 100644 --- a/text2term/term_collector.py +++ 
b/text2term/term_collector.py @@ -42,12 +42,6 @@ def get_ontology_terms(self, base_iris=(), use_reasoning=False, exclude_deprecat self.logger.info("...done: collected %i ontology terms (collection time: %.2fs)", len(ontology_terms), end-start) return ontology_terms - def get_term_graphs(self, ontology_terms): - term_graphs = [] - for term in ontology_terms: - term_graphs.append(term.graph().graph_dict()) - return term_graphs - def _get_ontology_terms(self, term_list, ontology, exclude_deprecated): ontology_terms = [] for ontology_term in term_list: diff --git a/text2term/term_graph.py b/text2term/term_graph.py index 29347be..b3f168d 100644 --- a/text2term/term_graph.py +++ b/text2term/term_graph.py @@ -1,7 +1,7 @@ -"""Provides OntologyTermGraph, Node and Edge classes""" +"""Provides TermGraph, Node and Edge classes""" -class OntologyTermGraph: +class TermGraph: """ Represents a graph of the neighborhood of an ontology term. The graph includes all (direct and indirect) superclasses and all direct subclasses. @@ -23,7 +23,7 @@ def nodes(self): def edges(self): return self._edges - def graph_dict(self): + def as_dict(self): graph = { "iri": self.term_iri, "nodes": self._nodes_dict(), diff --git a/text2term/term_graph_generator.py b/text2term/term_graph_generator.py new file mode 100644 index 0000000..b0101b5 --- /dev/null +++ b/text2term/term_graph_generator.py @@ -0,0 +1,56 @@ +import onto_utils +from term_graph import TermGraph, Node, Edge +from owlready2 import Thing, ThingClass + + +class TermGraphGenerator: + + def __init__(self): + pass + + def graph(self, term): + """ Build and return a graph representing the neighborhood of an ontology term. """ + nodes, edges = set(), set() + nodes.add(Node(term.iri, term.label)) + self._add_superclasses(term, nodes, edges) + self._add_subclasses(term, term.children, nodes, edges) + self._add_instances(term, term.instances, nodes, edges) + return TermGraph(term.iri, nodes, edges) + + def _add_superclasses(self, term, nodes, edges): + for parent in term.parents: + self._add_node(parent, nodes) + edges.add(Edge(term.iri, parent.iri, Edge.IS_A)) + self._add_ancestors(parent, nodes, edges) + + def _add_ancestors(self, node, nodes, edges): + for ancestor in node.is_a: + if ancestor is not Thing and isinstance(ancestor, ThingClass): + self._add_node(ancestor, nodes) + edges.add(Edge(node.iri, ancestor.iri, Edge.IS_A)) + self._add_ancestors(ancestor, nodes, edges) + + def _add_children(self, term, children, edge_type, nodes, edges): + for child in children: + self._add_node(child, nodes) + edges.add(Edge(child.iri, term.iri, edge_type)) + + def _add_subclasses(self, term, subclasses, nodes, edges): + self._add_children(term, subclasses, Edge.IS_A, nodes, edges) + + def _add_instances(self, term, instances, nodes, edges): + self._add_children(term, instances, Edge.INSTANCE_OF, nodes, edges) + + def _add_node(self, term, term_set): + if len(term.label) == 0: + label = onto_utils.label_from_iri(term.iri) + else: + label = term.label[0] + term_set.add(Node(term.iri, label)) + + def graphs_dicts(self, terms): + """Convenience function to get a list of all term graphs' dictionary representations""" + graph_dicts = [] + for term in terms: + graph_dicts.append(self.graph(term).as_dict()) + return graph_dicts From 289d9074e2df69e271739e87882b7fd54b7ee876 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 10 May 2022 15:16:37 -0400 Subject: [PATCH 026/185] Add input option for CSV input refactor main module --- README.md | 2 + requirements.txt | 3 +- setup.py 
| 2 +- text2term/__main__.py | 72 ++++++++++++++++++++++++++--------- text2term/bioportal_mapper.py | 72 ++++++++--------------------------- text2term/onto_utils.py | 38 +++++++++++++++--- text2term/syntactic_mapper.py | 11 +++--- text2term/term_mapping.py | 9 ++++- text2term/tfidf_mapper.py | 11 +++--- text2term/zooma_mapper.py | 18 ++++----- 10 files changed, 135 insertions(+), 103 deletions(-) diff --git a/README.md b/README.md index 674eee7..f6ef3f7 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,8 @@ To display a help message with descriptions of tool arguments do: `-o OUTPUT` Path to desired output file for the mappings. +`-csv CSV_INPUT` Specifies that the input is a CSV file—followed by the name of the column that contains the terms to map, optionally followed by the name of the column that contains identifiers for the terms (e.g., "my terms,my term ids") + `-top TOP_MAPPINGS` Maximum number of top-ranked mappings returned per source term. `-min MIN_SCORE` Minimum score [0,1] for the mappings (0=dissimilar, 1=exact match). diff --git a/requirements.txt b/requirements.txt index bfef8f6..709b8be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ tqdm~=4.62.3 sparse_dot_topn~=0.3.1 bioregistry~=0.4.63 nltk~=3.7 -rapidfuzz~=2.0.5 \ No newline at end of file +rapidfuzz~=2.0.5 +shortuuid~=1.0.9 \ No newline at end of file diff --git a/setup.py b/setup.py index 226daa2..ebe7574 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ with open('requirements.txt') as f: requirements = f.read().splitlines() -version = '0.3.0' +version = '0.4.0' setup( name='text2term', diff --git a/text2term/__main__.py b/text2term/__main__.py index 1cc33d6..e2d0d41 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -5,12 +5,13 @@ import sys import onto_utils from term_collector import OntologyTermCollector +from term_graph_generator import TermGraphGenerator from tfidf_mapper import TFIDFMapper def get_arguments(): timestamp = datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") - output_file_name = "t2t-out-" + timestamp + ".csv" + output_file_name = "t2t-mappings-" + timestamp + ".csv" parser = argparse.ArgumentParser(description="A tool to map unstructured terms to ontology terms") parser.add_argument("-s", "--source", required=True, type=str, help="Input file containing list of 'source' terms to map to ontology terms (one per line)") @@ -18,6 +19,10 @@ def get_arguments(): help="Path or URL of 'target' ontology to map the source terms to") parser.add_argument("-o", "--output", required=False, type=str, default=output_file_name, help="Path to desired output file for the mappings (default=current working directory)") + parser.add_argument("-csv", "--csv_input", required=False, type=str, default=(), + help="Specifies that the input is a CSV file—This should be followed by the name of the column " + "that contains the terms to map, optionally followed by the name of the column that " + "contains identifiers for the terms (eg 'my_terms,my_term_ids')") parser.add_argument("-top", "--top_mappings", required=False, type=int, default=3, help="Maximum number of top-ranked mappings returned per source term (default=3)") parser.add_argument("-min", "--min_score", required=False, type=float, default=0.5, @@ -26,8 +31,8 @@ def get_arguments(): help="Map only to terms whose IRIs start with any IRI given in this comma-separated list") parser.add_argument("-d", "--excl_deprecated", required=False, default=False, action="store_true", help="Exclude terms stated as deprecated via 
owl:deprecated") - parser.add_argument("-i", "--incl_individuals", required=False, default=False, action="store_true", - help="Include ontology individuals in addition to classes") + parser.add_argument("-g", "--save_term_graphs", required=False, default=False, action="store_true", + help="Save the graphs representing the neighborhood of each ontology term") arguments = parser.parse_args() source_file, target_file, out_file = arguments.source, arguments.target, arguments.output @@ -42,24 +47,53 @@ def get_arguments(): iris = arguments.base_iris if len(iris) > 0: iris = tuple(iris.split(',')) - return source_file, target_file, out_file, arguments.top_mappings, arguments.min_score, iris, \ - arguments.excl_deprecated, arguments.incl_individuals + csv_column_names = arguments.csv_input + if len(csv_column_names) > 0: + csv_column_names = tuple(csv_column_names.split(',')) -if __name__ == "__main__": - input_file, target_ontology, output_file, max_mappings, min_score, base_iris, excl_deprecated, incl_individuals = get_arguments() - source_terms = onto_utils.parse_list_file(input_file) - term_collector = OntologyTermCollector(target_ontology) - onto_terms = term_collector.get_ontology_terms(base_iris=base_iris, - exclude_deprecated=excl_deprecated, - include_individuals=incl_individuals) - if len(onto_terms) > 0: - # Get ontology mappings - mapper = TFIDFMapper(onto_terms) - mappings_df = mapper.map(source_terms, max_mappings=max_mappings, min_score=min_score) - mappings_df.to_csv(output_file, index=False) + return source_file, target_file, out_file, arguments.top_mappings, arguments.min_score, iris, csv_column_names, \ + arguments.excl_deprecated, arguments.save_term_graphs + + +def process_source_file(input_file_path, csv_column_names): + if len(csv_column_names) >= 1: + term_id_col_name = "" + if len(csv_column_names) == 2: + term_id_col_name = csv_column_names[1] + terms, term_ids = onto_utils.parse_csv_file(input_file_path, + term_column_name=csv_column_names[0], + term_id_column_name=term_id_col_name) + else: + terms = onto_utils.parse_list_file(input_file_path) + term_ids = onto_utils.generate_iris(len(terms)) + return terms, term_ids - # Get ontology term graphs - term_graphs = term_collector.get_term_graphs(onto_terms) + +def process_target_ontology(ontology, iris, exclude_deprecated, save_term_graphs): + term_collector = OntologyTermCollector(ontology) + onto_terms = term_collector.get_ontology_terms(base_iris=iris, exclude_deprecated=exclude_deprecated) + if len(onto_terms) == 0: + raise RuntimeError("Could not find any terms in the given ontology.") + if save_term_graphs: + term_graphs = TermGraphGenerator().graphs_dicts(onto_terms) with open(output_file + "-term-graphs.json", 'w') as json_file: json.dump(term_graphs, json_file, indent=2) + return onto_terms + + +def do_mapping(source_input_terms, source_input_term_ids, ontology_terms, max_mappings_per_term, + min_mapping_score, mappings_output_file): + mapper = TFIDFMapper(ontology_terms) + mappings_df = mapper.map(source_input_terms, source_input_term_ids, + max_mappings=max_mappings_per_term, + min_score=min_mapping_score) + mappings_df.to_csv(mappings_output_file, index=False) + + +if __name__ == "__main__": + input_file, target_ontology, output_file, max_mappings, min_score, base_iris, \ + csv_columns, excl_deprecated, save_graphs = get_arguments() + source_terms, source_term_ids = process_source_file(input_file, csv_columns) + target_terms = process_target_ontology(target_ontology, base_iris, excl_deprecated, save_graphs) + 
do_mapping(source_terms, source_term_ids, target_terms, max_mappings, min_score, output_file) diff --git a/text2term/bioportal_mapper.py b/text2term/bioportal_mapper.py index 3bc125e..457fdff 100644 --- a/text2term/bioportal_mapper.py +++ b/text2term/bioportal_mapper.py @@ -18,23 +18,24 @@ def __init__(self, bp_api_key): self.url = "http://data.bioontology.org/annotator" self.bp_api_key = bp_api_key - def map(self, source_terms, ontologies, max_mappings=3, api_params=()): + def map(self, source_terms, source_terms_ids, ontologies='', max_mappings=3, api_params=()): """ Find and return ontology mappings through the BioPortal Annotator Web service :param source_terms: Collection of source terms to map to target ontologies - :param ontologies: String with a comma-separated list of ontology acronyms (eg "HP,EFO") + :param source_terms_ids: List of identifiers for the given source terms + :param ontologies: String with comma-separated list of ontology acronyms (eg 'HP,EFO'). Default: all ontologies ('') :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned :param api_params: Additional BioPortal Annotator-specific parameters to include in the request """ self.logger.info("Mapping %i source terms against ontologies: %s...", len(source_terms), ontologies) start = time.time() mappings = [] - for term in source_terms: - mappings.extend(self._map_term(term, ontologies, max_mappings, api_params)) + for term, term_id in zip(source_terms, source_terms_ids): + mappings.extend(self._map_term(term, term_id, ontologies, max_mappings, api_params)) self.logger.info('done (mapping time: %.2fs seconds)', time.time()-start) return TermMappingCollection(mappings).mappings_df() - def _map_term(self, source_term, ontologies, max_mappings, api_params): + def _map_term(self, source_term, source_term_id, ontologies, max_mappings, api_params): params = { "text": source_term, "longest_only": "true", @@ -51,44 +52,23 @@ def _map_term(self, source_term, ontologies, max_mappings, api_params): self.logger.debug("...found " + str(len(response)) + " mappings") for mapping in response: if len(mappings) < max_mappings: - mappings.append(self._mapping_details(source_term, mapping).as_term_mapping()) + mappings.append(self._mapping_details(source_term, source_term_id, mapping)) return mappings - def _mapping_details(self, text, annotation): - ann_class = annotation["annotatedClass"] + def _mapping_details(self, source_term, source_term_id, mapping): + ann_class = mapping["annotatedClass"] term_iri = ann_class["@id"] term_link_bp = ann_class["links"]["self"] - onto_iri = ann_class["links"]["ontology"] - onto_name = onto_utils.curie_from_iri(term_iri) - bp_link = ann_class["links"]["ui"] - match_type = annotation["annotations"][0]["matchType"] - term_name, term_definition, ancestors = self.get_term_details(term_link_bp) - return BioPortalMapping(text, term_name, term_iri, term_definition, ancestors, onto_iri, onto_name, bp_link, - match_type) + match_type = mapping["annotations"][0]["matchType"] + term_label = self.get_term_details(term_link_bp) + return TermMapping(source_term, source_term_id, term_label, term_iri, 1) def get_term_details(self, term_iri): response = self._do_get_request(term_iri) - term_name, term_definition = "", "" - ancestors = [] + term_label = "" if response is not None: - term_name = onto_utils.remove_quotes(response["prefLabel"]) - if len(response["definition"]) > 0: - term_definition = response["definition"][0] - term_definition = 
onto_utils.remove_quotes(term_definition) - ancestors_link = response["links"]["ancestors"] - ancestors = self._get_ancestors(ancestors_link) - return term_name, term_definition, ancestors - - def _get_ancestors(self, term_ancestors_bp_link): - response = self._do_get_request(term_ancestors_bp_link) - ancestors = [] - if response is not None: - for ancestor in response: - if ancestor is not None: - ancestor_name = ancestor["prefLabel"] - ancestors.append(ancestor_name) - ancestors = list(dict.fromkeys(ancestors)) # remove duplicate ancestors - return ancestors + term_label = onto_utils.remove_quotes(response["prefLabel"]) + return term_label def _do_get_request(self, request_url, params=None): headers = { @@ -108,25 +88,3 @@ def _do_get_request(self, request_url, params=None): else: json_resp = json.loads(response.content) self.logger.error(response.reason + ":" + request_url + ". " + json_resp["errors"][0]) - - -class BioPortalMapping: - - def __init__(self, original_text, term_name, term_iri, term_definition, term_ancestors, ontology_iri, ontology_name, - bioportal_link, match_type): - self.original_text = original_text - self.term_name = term_name - self.term_iri = term_iri - self.term_definition = term_definition - self.term_ancestors = term_ancestors - self.ontology_iri = ontology_iri - self.ontology_name = ontology_name - self.bioportal_link = bioportal_link - self.match_type = match_type - - def as_term_mapping(self): - return TermMapping(self.original_text, self.term_name, self.term_iri, self.mapping_score) - - @property - def mapping_score(self): - return 1 # if SYN|PREF diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py index 5ae10cf..1ec45c0 100644 --- a/text2term/onto_utils.py +++ b/text2term/onto_utils.py @@ -1,15 +1,17 @@ import logging import re import sys -import uuid - +import pandas as pd import bioregistry +import shortuuid from owlready2 import * from gensim.parsing import strip_non_alphanum, strip_multiple_whitespaces STOP_WORDS = {'in', 'the', 'any', 'all', 'for', 'and', 'or', 'dx', 'on', 'fh', 'tx', 'only', 'qnorm', 'w', 'iqb', 'ds', 'rd', 'rdgwas', 'average', 'weekly', 'monthly', 'daily'} +BASE_IRI = "http://ccb.hms.harvard.edu/t2t/" + def normalize_list(token_list): normalized_token_list = [] @@ -63,23 +65,49 @@ def get_logger(name, level): def parse_list_file(file_path): file = open(file_path) lines = file.read().splitlines() + file.close() return lines +def parse_csv_file(file_path, term_column_name, term_id_column_name, separator=','): + data = pd.read_csv(file_path, sep=separator) + if term_column_name not in data.columns: + raise ValueError("Could not find specified column name for input terms: " + term_column_name) + terms = data[term_column_name].values + if term_id_column_name not in data.columns: + term_ids = generate_iris(len(terms)) + elif data[term_id_column_name].isnull().values.all(): + term_ids = generate_iris(len(terms)) + else: + term_ids = data[term_id_column_name].values + return terms, term_ids + + def get_ontology_from_labels(term_labels): - onto_base_iri = "http://ccb.harvard.edu/t2t/" - onto_iri = onto_base_iri + "Ontology-" + str(uuid.uuid4()) + onto_iri = BASE_IRI + "Ontology-" + generate_uuid() onto = owlready2.get_ontology(onto_iri) onto.metadata.comment.append("Created dynamically using text2term") onto.metadata.comment.append(datetime.datetime.now()) for term_label in term_labels: with onto: - new_term_iri = onto_base_iri + "R" + str(uuid.uuid4()) + new_term_iri = generate_iri() new_term = types.new_class(new_term_iri, 
(Thing,)) new_term.label = term_label return onto +def generate_uuid(): + return str(shortuuid.ShortUUID().random(length=10)) + + +def generate_iri(): + return BASE_IRI + "R" + generate_uuid() + + +def generate_iris(quantity): + return [generate_iri() for _ in range(quantity)] + + OBO_BASE_IRI = "http://purl.obolibrary.org/obo/" BIOPORTAL_BASE_IRI = "http://purl.bioontology.org/ontology/" ORPHANET_IRI = "http://www.orpha.net/ORDO/" diff --git a/text2term/syntactic_mapper.py b/text2term/syntactic_mapper.py index 7ed9ed2..e04f0f6 100644 --- a/text2term/syntactic_mapper.py +++ b/text2term/syntactic_mapper.py @@ -19,23 +19,24 @@ def __init__(self, target_ontology_terms): self.logger = onto_utils.get_logger(__name__, logging.INFO) self.target_ontology_terms = target_ontology_terms - def map(self, source_terms, mapping_method=MappingMethod.JARO_WINKLER, max_mappings=3): + def map(self, source_terms, source_terms_ids, mapping_method=MappingMethod.JARO_WINKLER, max_mappings=3): """ :param source_terms: List of source terms to be mapped with ontology terms + :param source_terms_ids: List of identifiers for the given source terms :param mapping_method: Mapping method to be used for matching :param max_mappings: Maximum number of (top scoring) ontology term mappings that should be returned """ self.logger.info("Mapping %i source terms...", len(source_terms)) start = time.time() mappings = [] - for input_term in tqdm(source_terms): - matches = self._map(input_term, mapping_method, max_mappings) + for term, term_id in tqdm(zip(source_terms, source_terms_ids)): + matches = self._map(term, term_id, mapping_method, max_mappings) mappings.extend(matches) end = time.time() self.logger.info('done (mapping time: %.2fs seconds)', end - start) return TermMappingCollection(mappings).mappings_df() - def _map(self, source_term, mapping_method, max_matches=3): + def _map(self, source_term, source_term_id, mapping_method, max_matches=3): self.logger.debug("Matching %s...", source_term) term_matches = [] for term in self.target_ontology_terms: @@ -45,7 +46,7 @@ def _map(self, source_term, mapping_method, max_matches=3): self.logger.debug("%s -> %s (%.2f)", source_term, target_name, similarity) if similarity > highest_similarity: highest_similarity = similarity - term_matches.append(TermMapping(source_term, term.label, term.iri, highest_similarity)) + term_matches.append(TermMapping(source_term, source_term_id, term.label, term.iri, highest_similarity)) matches_sorted = sorted(term_matches, key=lambda x: x.mapping_score, reverse=True) del matches_sorted[max_matches:] return matches_sorted diff --git a/text2term/term_mapping.py b/text2term/term_mapping.py index e999308..243af5d 100644 --- a/text2term/term_mapping.py +++ b/text2term/term_mapping.py @@ -5,12 +5,14 @@ class TermMapping: SRC_TERM = "Source Term" + SRC_TERM_ID = "Source Term Id" TGT_TERM_LBL = "Mapped Term Label" TGT_TERM_IRI = "Mapped Term IRI" MAPPING_SCORE = "Mapping Score" - def __init__(self, source_term, mapped_term_label, mapped_term_iri, mapping_score): + def __init__(self, source_term, source_term_id, mapped_term_label, mapped_term_iri, mapping_score): self._source_term = source_term + self._source_term_id = source_term_id self._mapped_term_label = mapped_term_label self._mapped_term_iri = mapped_term_iri self._mapping_score = mapping_score @@ -19,6 +21,10 @@ def __init__(self, source_term, mapped_term_label, mapped_term_iri, mapping_scor def source_term(self): return self._source_term + @property + def source_term_id(self): + return 
self._source_term_id + @property def mapped_term_label(self): return self._mapped_term_label @@ -33,6 +39,7 @@ def mapping_score(self): def to_dict(self): return { + self.SRC_TERM_ID: self.source_term_id, self.SRC_TERM: self.source_term, self.TGT_TERM_LBL: self.mapped_term_label, self.TGT_TERM_IRI: self.mapped_term_iri, diff --git a/text2term/tfidf_mapper.py b/text2term/tfidf_mapper.py index 5cc3d12..99817bf 100644 --- a/text2term/tfidf_mapper.py +++ b/text2term/tfidf_mapper.py @@ -18,10 +18,11 @@ def __init__(self, target_ontology_terms): self.target_ontology_terms = target_ontology_terms self.target_labels, self.target_terms = self._get_target_labels_terms(target_ontology_terms) - def map(self, source_terms, max_mappings=3, min_score=0.3): + def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3): """ Main mapping function. Default settings return only the top candidate for every source string. :param source_terms: List of source terms to be mapped with ontology terms + :param source_terms_ids: List of identifiers for the given source terms :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned :param min_score: The lower-bound threshold for keeping a candidate term mapping, between 0-1. Default set to 0, so consider all candidates @@ -32,7 +33,7 @@ def map(self, source_terms, max_mappings=3, min_score=0.3): source_terms_norm = onto_utils.normalize_list(source_terms) vectorizer = self._tokenize(source_terms_norm, self.target_labels) results_mtx = self._sparse_dot_top(vectorizer, source_terms_norm, self.target_labels, min_score) - results_df = self._get_mappings(results_mtx, max_mappings, source_terms, self.target_terms) + results_df = self._get_mappings(results_mtx, max_mappings, source_terms, source_terms_ids, self.target_terms) end = time.time() self.logger.info("...done (mapping time: %.2fs seconds)", end-start) return results_df @@ -49,7 +50,6 @@ def _tokenize(self, source_terms, target_labels, analyzer='char_wb', n=3): # Create count vectorizer and fit it on both lists to get vocabulary count_vectorizer = CountVectorizer(analyzer=analyzer, ngram_range=(n, n)) vocabulary = count_vectorizer.fit(source_terms + target_labels).vocabulary_ - # Create tf-idf vectorizer return TfidfVectorizer(vocabulary=vocabulary, analyzer=analyzer, ngram_range=(n, n)) def _sparse_dot_top(self, vectorizer, source_terms, target_labels, min_score): @@ -59,7 +59,7 @@ def _sparse_dot_top(self, vectorizer, source_terms, target_labels, min_score): # multiple labels/synonyms in the 'ntop' matches may be from the same ontology term return ct.awesome_cossim_topn(src_mtx, tgt_mtx, ntop=50, lower_bound=min_score) - def _get_mappings(self, results_mtx, max_mappings, source_terms, target_terms): + def _get_mappings(self, results_mtx, max_mappings, source_terms, source_terms_ids, target_terms): """ Build and return dataframe for mapping results along with term graphs for the obtained mappings """ coo_mtx = results_mtx.tocoo() mappings = [] @@ -67,6 +67,7 @@ def _get_mappings(self, results_mtx, max_mappings, source_terms, target_terms): top_mappings = set() for row, col, score in zip(coo_mtx.row, coo_mtx.col, coo_mtx.data): source_term = source_terms[row] + source_term_id = source_terms_ids[row] onto_term = target_terms[col] self.logger.debug("Source term: %s maps to %s (%f)", source_term, onto_term.label, score) if source_term == last_source_term: @@ -76,7 +77,7 @@ def _get_mappings(self, results_mtx, max_mappings, source_terms, target_terms): 
last_source_term = source_term top_mappings.clear() if onto_term.iri not in top_mappings: - mappings.append(TermMapping(source_term, onto_term.label, onto_term.iri, score)) + mappings.append(TermMapping(source_term, source_term_id, onto_term.label, onto_term.iri, score)) top_mappings.add(onto_term.iri) return TermMappingCollection(mappings).mappings_df() diff --git a/text2term/zooma_mapper.py b/text2term/zooma_mapper.py index 768faa0..beb63e6 100644 --- a/text2term/zooma_mapper.py +++ b/text2term/zooma_mapper.py @@ -14,23 +14,24 @@ def __init__(self): self.logger = onto_utils.get_logger(__name__, logging.INFO) self.url = "http://www.ebi.ac.uk/spot/zooma/v2/api/services/annotate" - def map(self, source_terms, ontologies, max_mappings=3, api_params=()): + def map(self, source_terms, source_terms_ids, ontologies='', max_mappings=3, api_params=()): """ Find and return ontology mappings through the Zooma Web service :param source_terms: Collection of source terms to map to target ontologies - :param ontologies: String with a comma-separated list of ontology acronyms (eg "HP,EFO") + :param source_terms_ids: List of identifiers for the given source terms + :param ontologies: String with comma-separated list of ontology acronyms (eg 'HP,EFO'). Default: all ontologies ('') :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned :param api_params: Additional Zooma API-specific parameters to include in the request """ self.logger.info("Mapping %i source terms against ontologies: %s", len(source_terms), ontologies) start = time.time() mappings = [] - for term in source_terms: - mappings.extend(self._map_term(term, ontologies, max_mappings, api_params)) + for term, term_id in zip(source_terms, source_terms_ids): + mappings.extend(self._map_term(term, term_id, ontologies, max_mappings, api_params)) self.logger.info('done (mapping time: %.2fs seconds)', time.time()-start) return TermMappingCollection(mappings).mappings_df() - def _map_term(self, source_term, ontologies, max_mappings, api_params): + def _map_term(self, source_term, source_term_id, ontologies, max_mappings, api_params): # see https://www.ebi.ac.uk/spot/zooma/docs/api for details of API parameters params = { "propertyValue": source_term, @@ -46,10 +47,10 @@ def _map_term(self, source_term, ontologies, max_mappings, api_params): self.logger.debug("...found " + str(len(response)) + " mappings") for mapping in response: if len(mappings) < max_mappings: - mappings.append(self._mapping_details(source_term, mapping)) + mappings.append(self._mapping_details(source_term, source_term_id, mapping)) return mappings - def _mapping_details(self, text, mapping_response): + def _mapping_details(self, source_term, source_term_id, mapping_response): # get ontology term label ann_class = mapping_response["annotatedProperty"] term_label = ann_class["propertyValue"] @@ -58,9 +59,8 @@ def _mapping_details(self, text, mapping_response): tags = mapping_response["semanticTags"] term_iri = tags[0] - # get mapping confidence score mapping_score = self._mapping_score(mapping_response["confidence"]) - return TermMapping(text, term_label, term_iri, mapping_score) + return TermMapping(source_term, source_term_id, term_label, term_iri, mapping_score) def _mapping_score(self, confidence): """Represent numerically the mapping confidence categories returned by Zooma (high, good, medium or low)""" From 88ef9495b2600db125b175bb7aefad1c5a6110f3 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 11 May 2022 12:14:04 -0400 
Subject: [PATCH 027/185] Move tag2iri utility to onto_utils --- text2term/onto_utils.py | 41 +++++++++++++++++++--- text2term/term_tag2iri.py | 72 --------------------------------------- 2 files changed, 37 insertions(+), 76 deletions(-) delete mode 100644 text2term/term_tag2iri.py diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py index 1ec45c0..b153783 100644 --- a/text2term/onto_utils.py +++ b/text2term/onto_utils.py @@ -34,6 +34,16 @@ def normalize(token): return token +def remove_quotes(string): + string = string.replace("\"", "") + string = string.replace("\'", "") + return string + + +def remove_whitespace(string): + return string.replace(' ', '') + + def curie_from_iri(iri): return bioregistry.curie_from_iri(iri) @@ -45,10 +55,32 @@ def label_from_iri(iri): return iri.rsplit('/', 1)[1] -def remove_quotes(text): - text = text.replace("\"", "") - text = text.replace("\'", "") - return text +def iri_from_tag(source_tag): + iri = source_tag + if len(source_tag) > 0 and source_tag != "NA": + iri = remove_whitespace(iri) + if ":" in source_tag: + onto_name = iri.split(":")[0] + term_name = iri.replace(":", "_") + full_iri = _get_iri(onto_name, term_name) + iri = full_iri if len(full_iri) > 0 else iri + elif "_" in source_tag: + onto_name = iri.split("_")[0] + full_iri = _get_iri(onto_name, iri) + iri = full_iri if len(full_iri) > 0 else iri + return iri + + +def _get_iri(ont_name, term_name): + iri = '' + if ont_name in ONTOLOGY_IRIS: + if ont_name == 'ORPHA': + iri = ONTOLOGY_IRIS.get(ont_name) + term_name.replace('ORPHA_', 'Orphanet_') + elif ont_name == 'SNOMED' or ont_name == 'OMIM': + iri = ONTOLOGY_IRIS.get(ont_name) + term_name.replace(ont_name + '_', '') + else: + iri = ONTOLOGY_IRIS.get(ont_name) + term_name + return iri def get_logger(name, level): @@ -114,6 +146,7 @@ def generate_iris(quantity): ONTOLOGY_IRIS = {"EFO": "http://www.ebi.ac.uk/efo/", "Orphanet": ORPHANET_IRI, "ORPHA": ORPHANET_IRI, + "CL": OBO_BASE_IRI, "MONDO": OBO_BASE_IRI, "HP": OBO_BASE_IRI, "UBERON": OBO_BASE_IRI, diff --git a/text2term/term_tag2iri.py b/text2term/term_tag2iri.py deleted file mode 100644 index 7ae53b3..0000000 --- a/text2term/term_tag2iri.py +++ /dev/null @@ -1,72 +0,0 @@ -import logging -import ssl -import urllib.request -import pandas as pd -from urllib.error import HTTPError -from text2term import onto_utils - -ssl._create_default_https_context = ssl._create_stdlib_context - - -class TermTag2Iri: - - def __init__(self): - self.logger = onto_utils.get_logger(__name__, logging.INFO) - - def get_iris(self, source_tags, resolve_iri): - iri_mappings = [] - for source_tag in source_tags: - source_tag, iri, iri_resolves = self.get_iri(source_tag, resolve_iri) - iri_mappings.append((source_tag, iri, iri_resolves)) - return iri_mappings - - def get_iri(self, source_tag, resolve_iri): - iri = source_tag - iri_resolves = False - if len(source_tag) > 0 and source_tag != "NA": - if ":" in source_tag: - iri = self.remove_whitespace(iri) - onto_name = iri.split(":")[0] - term_name = iri.replace(":", "_") - full_iri = self._get_iri(onto_name, term_name) - iri = full_iri if len(full_iri) > 0 else iri - elif "_" in source_tag: - iri = self.remove_whitespace(iri) - ont_name = iri.split("_")[0] - full_iri = self._get_iri(ont_name, iri) - iri = full_iri if len(full_iri) > 0 else iri - if source_tag != iri: - iri_resolves = self.resolves(iri) if resolve_iri else iri_resolves - else: - self.logger.info("Unable to find suitable IRI for: %s", source_tag) - return source_tag, iri, iri_resolves - - def 
_get_iri(self, ont_name, term_name): - iri = '' - if ont_name in onto_utils.ONTOLOGY_IRIS: - if ont_name == 'ORPHA': - iri = onto_utils.ONTOLOGY_IRIS.get(ont_name) + term_name.replace('ORPHA_', 'Orphanet_') - elif ont_name == 'SNOMED' or ont_name == 'OMIM': - iri = onto_utils.ONTOLOGY_IRIS.get(ont_name) + term_name.replace(ont_name + '_', '') - else: - iri = onto_utils.ONTOLOGY_IRIS.get(ont_name) + term_name - return iri - - def remove_whitespace(self, string): - return string.replace(' ', '') - - def resolves(self, iri): - resolves = False - try: - status_code = urllib.request.urlopen(iri).getcode() - resolves = status_code == 200 - except HTTPError as err: - self.logger.debug(err) - if not resolves: - self.logger.info("IRI does not resolve: %s", iri) - return resolves - - def get_iris_df_for_file(self, input_file, resolve_iri): - iris_file = self.get_iris(onto_utils.parse_list_file(input_file), resolve_iri=resolve_iri) - out_col_names = ['source_tag', 'target_iri', 'iri_resolves'] - return pd.DataFrame(iris_file, columns=out_col_names) From 1abca9f5452913c5a98bca3981145419d802501d Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 12 May 2022 20:29:53 -0400 Subject: [PATCH 028/185] Add support to specify mapper. Closes #14 --- text2term/__main__.py | 94 +++++------------ text2term/bioportal_mapper.py | 4 +- text2term/{mapping_method.py => mapper.py} | 9 +- text2term/syntactic_mapper.py | 30 +++--- text2term/t2t.py | 114 +++++++++++++++++++++ text2term/zooma_mapper.py | 4 +- 6 files changed, 164 insertions(+), 91 deletions(-) rename text2term/{mapping_method.py => mapper.py} (52%) create mode 100644 text2term/t2t.py diff --git a/text2term/__main__.py b/text2term/__main__.py index e2d0d41..b454f5d 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -1,24 +1,25 @@ import argparse import datetime -import json import os import sys -import onto_utils -from term_collector import OntologyTermCollector -from term_graph_generator import TermGraphGenerator -from tfidf_mapper import TFIDFMapper +from t2t import Text2Term +from mapper import Mapper -def get_arguments(): +if __name__ == "__main__": timestamp = datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") output_file_name = "t2t-mappings-" + timestamp + ".csv" parser = argparse.ArgumentParser(description="A tool to map unstructured terms to ontology terms") parser.add_argument("-s", "--source", required=True, type=str, - help="Input file containing list of 'source' terms to map to ontology terms (one per line)") + help="Input file containing 'source' terms to map to ontology terms") parser.add_argument("-t", "--target", required=True, type=str, - help="Path or URL of 'target' ontology to map the source terms to") + help="Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is " + "BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write " + "'all' to search all ontologies in BioPortal or OLS, respectively") parser.add_argument("-o", "--output", required=False, type=str, default=output_file_name, help="Path to desired output file for the mappings (default=current working directory)") + parser.add_argument("-m", "--mapper", required=False, type=str, default=Mapper.TFIDF, + help="Mapping method to use. 
One of: " + str(Mapper.list()) + " (default=tfidf)") parser.add_argument("-csv", "--csv_input", required=False, type=str, default=(), help="Specifies that the input is a CSV file—This should be followed by the name of the column " "that contains the terms to map, optionally followed by the name of the column that " @@ -26,74 +27,27 @@ def get_arguments(): parser.add_argument("-top", "--top_mappings", required=False, type=int, default=3, help="Maximum number of top-ranked mappings returned per source term (default=3)") parser.add_argument("-min", "--min_score", required=False, type=float, default=0.5, - help="Minimum score [0,1] for the mappings (0=dissimilar, 1=exact match; default=0.5)") + help="Minimum similarity score [0,1] for the mappings (1=exact match; default=0.5)") parser.add_argument("-iris", "--base_iris", required=False, type=str, default=(), - help="Map only to terms whose IRIs start with any IRI given in this comma-separated list") + help="Map only to ontology terms whose IRIs start with a value given in this comma-separated list") parser.add_argument("-d", "--excl_deprecated", required=False, default=False, action="store_true", - help="Exclude terms stated as deprecated via owl:deprecated") + help="Exclude ontology terms stated as deprecated via `owl:deprecated true` (default=False)") parser.add_argument("-g", "--save_term_graphs", required=False, default=False, action="store_true", - help="Save the graphs representing the neighborhood of each ontology term") + help="Save the graphs representing the neighborhood of each ontology term (default=False)") arguments = parser.parse_args() - source_file, target_file, out_file = arguments.source, arguments.target, arguments.output - if not os.path.exists(source_file): - parser.error("The file '{}' does not exist".format(source_file)) + if not os.path.exists(arguments.source): + parser.error("The file '{}' does not exist".format(arguments.source)) sys.exit(1) - - # create output directories if needed - if os.path.dirname(out_file): - os.makedirs(os.path.dirname(out_file), exist_ok=True) - + mapper = Mapper(arguments.mapper) iris = arguments.base_iris if len(iris) > 0: iris = tuple(iris.split(',')) - - csv_column_names = arguments.csv_input - if len(csv_column_names) > 0: - csv_column_names = tuple(csv_column_names.split(',')) - - return source_file, target_file, out_file, arguments.top_mappings, arguments.min_score, iris, csv_column_names, \ - arguments.excl_deprecated, arguments.save_term_graphs - - -def process_source_file(input_file_path, csv_column_names): - if len(csv_column_names) >= 1: - term_id_col_name = "" - if len(csv_column_names) == 2: - term_id_col_name = csv_column_names[1] - terms, term_ids = onto_utils.parse_csv_file(input_file_path, - term_column_name=csv_column_names[0], - term_id_column_name=term_id_col_name) - else: - terms = onto_utils.parse_list_file(input_file_path) - term_ids = onto_utils.generate_iris(len(terms)) - return terms, term_ids - - -def process_target_ontology(ontology, iris, exclude_deprecated, save_term_graphs): - term_collector = OntologyTermCollector(ontology) - onto_terms = term_collector.get_ontology_terms(base_iris=iris, exclude_deprecated=exclude_deprecated) - if len(onto_terms) == 0: - raise RuntimeError("Could not find any terms in the given ontology.") - if save_term_graphs: - term_graphs = TermGraphGenerator().graphs_dicts(onto_terms) - with open(output_file + "-term-graphs.json", 'w') as json_file: - json.dump(term_graphs, json_file, indent=2) - return onto_terms - - -def 
do_mapping(source_input_terms, source_input_term_ids, ontology_terms, max_mappings_per_term, - min_mapping_score, mappings_output_file): - mapper = TFIDFMapper(ontology_terms) - mappings_df = mapper.map(source_input_terms, source_input_term_ids, - max_mappings=max_mappings_per_term, - min_score=min_mapping_score) - mappings_df.to_csv(mappings_output_file, index=False) - - -if __name__ == "__main__": - input_file, target_ontology, output_file, max_mappings, min_score, base_iris, \ - csv_columns, excl_deprecated, save_graphs = get_arguments() - source_terms, source_term_ids = process_source_file(input_file, csv_columns) - target_terms = process_target_ontology(target_ontology, base_iris, excl_deprecated, save_graphs) - do_mapping(source_terms, source_term_ids, target_terms, max_mappings, min_score, output_file) + csv_columns = arguments.csv_input + if len(csv_columns) > 0: + csv_columns = tuple(csv_columns.split(',')) + + Text2Term().map(arguments.source, arguments.target, output_file=arguments.output, csv_columns=csv_columns, + excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings, + min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs, + save_mappings=True) diff --git a/text2term/bioportal_mapper.py b/text2term/bioportal_mapper.py index 457fdff..26072c2 100644 --- a/text2term/bioportal_mapper.py +++ b/text2term/bioportal_mapper.py @@ -18,12 +18,12 @@ def __init__(self, bp_api_key): self.url = "http://data.bioontology.org/annotator" self.bp_api_key = bp_api_key - def map(self, source_terms, source_terms_ids, ontologies='', max_mappings=3, api_params=()): + def map(self, source_terms, source_terms_ids, ontologies, max_mappings=3, api_params=()): """ Find and return ontology mappings through the BioPortal Annotator Web service :param source_terms: Collection of source terms to map to target ontologies :param source_terms_ids: List of identifiers for the given source terms - :param ontologies: String with comma-separated list of ontology acronyms (eg 'HP,EFO'). 
Default: all ontologies ('') + :param ontologies: Comma-separated list of ontology acronyms (eg 'HP,EFO') or 'all' to search all ontologies :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned :param api_params: Additional BioPortal Annotator-specific parameters to include in the request """ diff --git a/text2term/mapping_method.py b/text2term/mapper.py similarity index 52% rename from text2term/mapping_method.py rename to text2term/mapper.py index 92b295e..2c42f01 100644 --- a/text2term/mapping_method.py +++ b/text2term/mapper.py @@ -1,9 +1,10 @@ -"""Provides MappingMethod enum""" +"""Provides Mapper enum""" from enum import Enum -class MappingMethod(Enum): +class Mapper(str, Enum): + """ Enumeration of "mappers" (ie string similarity metrics and Web APIs) available """ LEVENSHTEIN = 'levenshtein' JARO = 'jaro' JARO_WINKLER = 'jarowinkler' @@ -13,3 +14,7 @@ class MappingMethod(Enum): TFIDF = 'tfidf' ZOOMA = 'zooma' BIOPORTAL = 'bioportal' + + @classmethod + def list(cls): + return list(map(lambda c: c.value, cls)) diff --git a/text2term/syntactic_mapper.py b/text2term/syntactic_mapper.py index e04f0f6..b4385c4 100644 --- a/text2term/syntactic_mapper.py +++ b/text2term/syntactic_mapper.py @@ -6,7 +6,7 @@ import rapidfuzz from tqdm import tqdm from text2term import onto_utils -from text2term.mapping_method import MappingMethod +from text2term.mapper import Mapper from text2term.term_mapping import TermMapping, TermMappingCollection @@ -19,30 +19,30 @@ def __init__(self, target_ontology_terms): self.logger = onto_utils.get_logger(__name__, logging.INFO) self.target_ontology_terms = target_ontology_terms - def map(self, source_terms, source_terms_ids, mapping_method=MappingMethod.JARO_WINKLER, max_mappings=3): + def map(self, source_terms, source_terms_ids, mapper=Mapper.JARO_WINKLER, max_mappings=3): """ :param source_terms: List of source terms to be mapped with ontology terms :param source_terms_ids: List of identifiers for the given source terms - :param mapping_method: Mapping method to be used for matching + :param mapper: Mapping method to be used for matching :param max_mappings: Maximum number of (top scoring) ontology term mappings that should be returned """ self.logger.info("Mapping %i source terms...", len(source_terms)) start = time.time() mappings = [] for term, term_id in tqdm(zip(source_terms, source_terms_ids)): - matches = self._map(term, term_id, mapping_method, max_mappings) + matches = self._map(term, term_id, mapper, max_mappings) mappings.extend(matches) end = time.time() self.logger.info('done (mapping time: %.2fs seconds)', end - start) return TermMappingCollection(mappings).mappings_df() - def _map(self, source_term, source_term_id, mapping_method, max_matches=3): + def _map(self, source_term, source_term_id, mapper, max_matches=3): self.logger.debug("Matching %s...", source_term) term_matches = [] for term in self.target_ontology_terms: highest_similarity = 0.0 for target_name in self._term_names(term): - similarity = self.compare(source_term, target_name, mapping_method) + similarity = self.compare(source_term, target_name, mapper) self.logger.debug("%s -> %s (%.2f)", source_term, target_name, similarity) if similarity > highest_similarity: highest_similarity = similarity @@ -57,27 +57,27 @@ def _term_names(self, ontology_term): lbls_syns.extend(ontology_term.synonyms) return lbls_syns - def compare(self, s1, s2, mapping_method): + def compare(self, s1, s2, mapper): """ Compare the given strings s1 and s2 with 
respect to the specified mapping method :param s1: source string :param s2: target string - :param mapping_method: Mapping method to be used (see supported methods in `MappingMethod`) + :param mapper: Mapping method to be used """ - if mapping_method == MappingMethod.LEVENSHTEIN: + if mapper == Mapper.LEVENSHTEIN: return self.compare_levenshtein(s1, s2) - elif mapping_method == MappingMethod.JARO: + elif mapper == Mapper.JARO: return self.compare_jaro(s1, s2) - elif mapping_method == MappingMethod.JARO_WINKLER: + elif mapper == Mapper.JARO_WINKLER: return self.compare_jarowinkler(s1, s2) - elif mapping_method == MappingMethod.FUZZY: + elif mapper == Mapper.FUZZY: return self.compare_fuzzy(s1, s2) - elif mapping_method == MappingMethod.FUZZY_WEIGHTED: + elif mapper == Mapper.FUZZY_WEIGHTED: return self.compare_fuzzy_weighted(s1, s2) - elif mapping_method == MappingMethod.JACCARD: + elif mapper == Mapper.JACCARD: return self.compare_jaccard(s1, s2) else: - self.logger.error("Unsupported method: %s", mapping_method) + raise ValueError("Unsupported mapping method: " + str(mapper)) def compare_levenshtein(self, s1, s2): """ diff --git a/text2term/t2t.py b/text2term/t2t.py new file mode 100644 index 0000000..db39501 --- /dev/null +++ b/text2term/t2t.py @@ -0,0 +1,114 @@ +"""Provides Text2Term class""" + +import os +import json +import onto_utils +from mapper import Mapper +from term_collector import OntologyTermCollector +from term_graph_generator import TermGraphGenerator +from bioportal_mapper import BioPortalAnnotatorMapper +from syntactic_mapper import SyntacticMapper +from tfidf_mapper import TFIDFMapper +from zooma_mapper import ZoomaMapper + + +class Text2Term: + """ Main class in text2term package """ + + def __init__(self): + pass + + def map(self, input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, + mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False): + """ + Main function to map terms in the input file to the specified target ontology. + + Parameters + ---------- + input_file : str + Input file containing 'source' terms to map to ontology terms + target_ontology : str + Path or URL of 'target' ontology to map the source terms to. 
When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') + base_iris : tuple + Map only to ontology terms whose IRIs start with a value given in this comma-separated list + csv_columns : tuple + Name of the column that contains the terms to map, optionally followed by the name of the column that + contains identifiers for the terms (eg 'my_terms,my_term_ids') + excl_deprecated : bool + Exclude ontology terms stated as deprecated via `owl:deprecated true` (default=False) + mapper : Mapper + Mapping method to use (eg tfidf, levenshtein, zooma) (default=tfidf) + max_mappings : int + Maximum number of top-ranked mappings returned per source term (default=3) + min_score : float + Minimum similarity score [0,1] for the mappings (1=exact match; default=0.5) + output_file : str + Path to desired output file for the mappings (default=current working directory) + save_graphs : bool + Save the graphs representing the neighborhood of each ontology term (default=False) + save_mappings : bool + Save the generated mappings to the specified output file (default=False) + + Returns + ---------- + df + Data frame containing the generated ontology mappings + """ + source_terms, source_term_ids = self._load_data(input_file, csv_columns) + if mapper in {Mapper.ZOOMA, Mapper.BIOPORTAL}: + target_terms = '' if target_ontology.lower() == 'all' else target_ontology + else: + target_terms = self._load_ontology(target_ontology, base_iris, excl_deprecated) + mappings_df = self._do_mapping(source_terms, source_term_ids, target_terms, mapper, max_mappings, min_score) + if save_mappings: + self._save_mappings(mappings_df, output_file) + if save_graphs: + self._save_graphs(target_terms, output_file) + return mappings_df + + def _load_data(self, input_file_path, csv_column_names): + if len(csv_column_names) >= 1: + term_id_col_name = "" + if len(csv_column_names) == 2: + term_id_col_name = csv_column_names[1] + terms, term_ids = onto_utils.parse_csv_file(input_file_path, + term_column_name=csv_column_names[0], + term_id_column_name=term_id_col_name) + else: + terms = onto_utils.parse_list_file(input_file_path) + term_ids = onto_utils.generate_iris(len(terms)) + return terms, term_ids + + def _load_ontology(self, ontology, iris, exclude_deprecated): + term_collector = OntologyTermCollector(ontology) + onto_terms = term_collector.get_ontology_terms(base_iris=iris, exclude_deprecated=exclude_deprecated) + if len(onto_terms) == 0: + raise RuntimeError("Could not find any terms in the given ontology.") + return onto_terms + + def _do_mapping(self, source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score): + if mapper == Mapper.TFIDF: + term_mapper = TFIDFMapper(ontology_terms) + return term_mapper.map(source_terms, source_term_ids, max_mappings=max_mappings, min_score=min_score) + elif mapper == Mapper.ZOOMA: + term_mapper = ZoomaMapper() + return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + elif mapper == Mapper.BIOPORTAL: + term_mapper = BioPortalAnnotatorMapper("8f0cbe43-2906-431a-9572-8600d3f4266e") + return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + elif mapper in {Mapper.LEVENSHTEIN, Mapper.JARO, Mapper.JARO_WINKLER, Mapper.FUZZY, Mapper.FUZZY_WEIGHTED, Mapper.JACCARD}: + term_mapper = SyntacticMapper(ontology_terms) + return term_mapper.map(source_terms, source_term_ids, mapper, max_mappings=max_mappings) + else: + raise 
ValueError("Unsupported mapper: " + mapper) + + def _save_mappings(self, mappings, output_file): + if os.path.dirname(output_file): # create output directories if needed + os.makedirs(os.path.dirname(output_file), exist_ok=True) + mappings.to_csv(output_file, index=False) + + def _save_graphs(self, terms, output_file): + term_graphs = TermGraphGenerator().graphs_dicts(terms) + with open(output_file + "-term-graphs.json", 'w') as json_file: + json.dump(term_graphs, json_file, indent=2) diff --git a/text2term/zooma_mapper.py b/text2term/zooma_mapper.py index beb63e6..672fb55 100644 --- a/text2term/zooma_mapper.py +++ b/text2term/zooma_mapper.py @@ -14,12 +14,12 @@ def __init__(self): self.logger = onto_utils.get_logger(__name__, logging.INFO) self.url = "http://www.ebi.ac.uk/spot/zooma/v2/api/services/annotate" - def map(self, source_terms, source_terms_ids, ontologies='', max_mappings=3, api_params=()): + def map(self, source_terms, source_terms_ids, ontologies, max_mappings=3, api_params=()): """ Find and return ontology mappings through the Zooma Web service :param source_terms: Collection of source terms to map to target ontologies :param source_terms_ids: List of identifiers for the given source terms - :param ontologies: String with comma-separated list of ontology acronyms (eg 'HP,EFO'). Default: all ontologies ('') + :param ontologies: Comma-separated list of ontology acronyms (eg 'HP,EFO') or 'all' to search all ontologies :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned :param api_params: Additional Zooma API-specific parameters to include in the request """ From 2666134f365ad5e48f2d06729d451e8f46669bcb Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 12 May 2022 21:07:14 -0400 Subject: [PATCH 029/185] Update README.md --- README.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index f6ef3f7..d5838f1 100644 --- a/README.md +++ b/README.md @@ -17,23 +17,29 @@ To display a help message with descriptions of tool arguments do: `text2term -h` or `text2term --help` ### Required arguments -`-s SOURCE` Input file containing list of 'source' terms to map to ontology terms (one per line). +`-s SOURCE` Input file containing 'source' terms to map to ontology terms (list of terms or CSV file). -`-t TARGET` Path or URL of 'target' ontology to map the source terms to. +`-t TARGET` Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is " + "BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write " + "'all' to search all ontologies in BioPortal or OLS, respectively. ### Optional arguments `-o OUTPUT` Path to desired output file for the mappings. +`-m MAPPER` Mapping method to use. One of: [levenshtein,jaro,jarowinkler,jaccard,fuzzy,tfidf,zooma,bioportal] (default=tfidf) + `-csv CSV_INPUT` Specifies that the input is a CSV file—followed by the name of the column that contains the terms to map, optionally followed by the name of the column that contains identifiers for the terms (e.g., "my terms,my term ids") `-top TOP_MAPPINGS` Maximum number of top-ranked mappings returned per source term. -`-min MIN_SCORE` Minimum score [0,1] for the mappings (0=dissimilar, 1=exact match). +`-min MIN_SCORE` Minimum similarity score [0,1] for the mappings (1=exact match; default=0.5). + +`-iris BASE_IRIS` Map only to ontology terms whose IRIs start with a value given in this comma-separated list. 
-`-iris BASE_IRIS` Map only to terms whose IRIs start with any IRI given in this comma-separated list. +`-d EXCL_DEPRECATED` Exclude ontology terms stated as deprecated via `owl:deprecated true` (default=False). -`-d EXCL_DEPRECATED` Exclude terms stated as deprecated via owl:deprecated. +`-g SAVE_TERM_GRAPHS` Save the graphs representing the neighborhood of each ontology term (default=False). ## Examples From 5cd38f88bfd55792141e0b4f9ac7f38ddda33908 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 13 May 2022 09:50:10 -0400 Subject: [PATCH 030/185] Update README.md --- README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d5838f1..1676b58 100644 --- a/README.md +++ b/README.md @@ -19,17 +19,15 @@ To display a help message with descriptions of tool arguments do: ### Required arguments `-s SOURCE` Input file containing 'source' terms to map to ontology terms (list of terms or CSV file). -`-t TARGET` Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is " - "BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write " - "'all' to search all ontologies in BioPortal or OLS, respectively. +`-t TARGET` Path or URL of 'target' ontology to map source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write `'all'` to search all ontologies in BioPortal or OLS. ### Optional arguments `-o OUTPUT` Path to desired output file for the mappings. -`-m MAPPER` Mapping method to use. One of: [levenshtein,jaro,jarowinkler,jaccard,fuzzy,tfidf,zooma,bioportal] (default=tfidf) +`-m MAPPER` Mapping method to use. One of: [levenshtein,jaro,jarowinkler,jaccard,fuzzy,tfidf,zooma,bioportal] -`-csv CSV_INPUT` Specifies that the input is a CSV file—followed by the name of the column that contains the terms to map, optionally followed by the name of the column that contains identifiers for the terms (e.g., "my terms,my term ids") +`-csv CSV_INPUT` Indicates a CSV format input—follow with the name of the column containing terms to map, optionally followed by the name of the column containing identifiers for the terms (eg 'my terms,my term ids') `-top TOP_MAPPINGS` Maximum number of top-ranked mappings returned per source term. From a7034164ce5ea4182cbc7724e2dc15ce0b551b60 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 13 May 2022 09:52:18 -0400 Subject: [PATCH 031/185] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 1676b58..d68f3c9 100644 --- a/README.md +++ b/README.md @@ -19,25 +19,25 @@ To display a help message with descriptions of tool arguments do: ### Required arguments `-s SOURCE` Input file containing 'source' terms to map to ontology terms (list of terms or CSV file). -`-t TARGET` Path or URL of 'target' ontology to map source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write `'all'` to search all ontologies in BioPortal or OLS. +`-t TARGET` Path or URL of 'target' ontology to map source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write `'all'` to search all ontologies. ### Optional arguments `-o OUTPUT` Path to desired output file for the mappings. -`-m MAPPER` Mapping method to use. 
One of: [levenshtein,jaro,jarowinkler,jaccard,fuzzy,tfidf,zooma,bioportal] +`-m MAPPER` Mapping method to use. One of: `[levenshtein,jaro,jarowinkler,jaccard,fuzzy,tfidf,zooma,bioportal]` `-csv CSV_INPUT` Indicates a CSV format input—follow with the name of the column containing terms to map, optionally followed by the name of the column containing identifiers for the terms (eg 'my terms,my term ids') `-top TOP_MAPPINGS` Maximum number of top-ranked mappings returned per source term. -`-min MIN_SCORE` Minimum similarity score [0,1] for the mappings (1=exact match; default=0.5). +`-min MIN_SCORE` Minimum similarity score [0,1] for the mappings (1=exact match). `-iris BASE_IRIS` Map only to ontology terms whose IRIs start with a value given in this comma-separated list. -`-d EXCL_DEPRECATED` Exclude ontology terms stated as deprecated via `owl:deprecated true` (default=False). +`-d EXCL_DEPRECATED` Exclude ontology terms stated as deprecated via `owl:deprecated true`. -`-g SAVE_TERM_GRAPHS` Save the graphs representing the neighborhood of each ontology term (default=False). +`-g SAVE_TERM_GRAPHS` Save the graphs representing the neighborhood of each ontology term. ## Examples From d1bf47af849b00c18650695d2792f846683ccbd4 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 13 May 2022 09:53:42 -0400 Subject: [PATCH 032/185] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d68f3c9..65d5b25 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ To display a help message with descriptions of tool arguments do: `-o OUTPUT` Path to desired output file for the mappings. -`-m MAPPER` Mapping method to use. One of: `[levenshtein,jaro,jarowinkler,jaccard,fuzzy,tfidf,zooma,bioportal]` +`-m MAPPER` Mapping method to use. One of: `levenshtein,jaro,jarowinkler,jaccard,fuzzy,tfidf,zooma,bioportal` `-csv CSV_INPUT` Indicates a CSV format input—follow with the name of the column containing terms to map, optionally followed by the name of the column containing identifiers for the terms (eg 'my terms,my term ids') From 4414123e15523f77e8a807d187e8ac49a170e957 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 13 May 2022 09:55:48 -0400 Subject: [PATCH 033/185] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 65d5b25..fde76b3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# text2term Ontology Mapper +# text2term ontology mapper A tool for mapping (uncontrolled) terms to ontology terms to facilitate semantic integration. @@ -25,7 +25,7 @@ To display a help message with descriptions of tool arguments do: `-o OUTPUT` Path to desired output file for the mappings. -`-m MAPPER` Mapping method to use. One of: `levenshtein,jaro,jarowinkler,jaccard,fuzzy,tfidf,zooma,bioportal` +`-m MAPPER` Mapping method to use. One of: *levenshtein,jaro,jarowinkler,jaccard,fuzzy,tfidf,zooma,bioportal* `-csv CSV_INPUT` Indicates a CSV format input—follow with the name of the column containing terms to map, optionally followed by the name of the column containing identifiers for the terms (eg 'my terms,my term ids') From e6c34791c70dad3ea684a4ebfacd733697618d85 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Sat, 14 May 2022 14:11:21 -0400 Subject: [PATCH 034/185] Update docs. 
Add map function for list inputs --- README.md | 8 ++--- text2term/__main__.py | 30 +++++++--------- text2term/t2t.py | 82 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 83 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index fde76b3..f4cc02e 100644 --- a/README.md +++ b/README.md @@ -25,19 +25,19 @@ To display a help message with descriptions of tool arguments do: `-o OUTPUT` Path to desired output file for the mappings. -`-m MAPPER` Mapping method to use. One of: *levenshtein,jaro,jarowinkler,jaccard,fuzzy,tfidf,zooma,bioportal* +`-m MAPPER` Method used to compare source terms with ontology terms. One of: *levenshtein, jaro, jarowinkler, jaccard, fuzzy, tfidf, zooma, bioportal*. -`-csv CSV_INPUT` Indicates a CSV format input—follow with the name of the column containing terms to map, optionally followed by the name of the column containing identifiers for the terms (eg 'my terms,my term ids') +`-csv CSV_INPUT` Indicates a CSV format input—follow with the name of the column containing terms to map, optionally followed by the name of the column containing identifiers for the terms (eg 'my terms,my term ids'). `-top TOP_MAPPINGS` Maximum number of top-ranked mappings returned per source term. `-min MIN_SCORE` Minimum similarity score [0,1] for the mappings (1=exact match). -`-iris BASE_IRIS` Map only to ontology terms whose IRIs start with a value given in this comma-separated list. +`-iris BASE_IRIS` Map only to ontology terms whose IRIs start with a value given in this comma-separated list (eg 'http://www.ebi.ac.uk/efo,http://purl.obolibrary.org/obo/HP)'). `-d EXCL_DEPRECATED` Exclude ontology terms stated as deprecated via `owl:deprecated true`. -`-g SAVE_TERM_GRAPHS` Save the graphs representing the neighborhood of each ontology term. +`-g SAVE_TERM_GRAPHS` Save [vis.js](https://visjs.org) graphs representing the neighborhood of each ontology term. ## Examples diff --git a/text2term/__main__.py b/text2term/__main__.py index b454f5d..4d61caf 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -1,25 +1,22 @@ import argparse -import datetime import os import sys from t2t import Text2Term from mapper import Mapper - if __name__ == "__main__": - timestamp = datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") - output_file_name = "t2t-mappings-" + timestamp + ".csv" parser = argparse.ArgumentParser(description="A tool to map unstructured terms to ontology terms") parser.add_argument("-s", "--source", required=True, type=str, - help="Input file containing 'source' terms to map to ontology terms") + help="Input file containing 'source' terms to map to ontology terms (list of terms or CSV file)") parser.add_argument("-t", "--target", required=True, type=str, - help="Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is " + help="Path or URL of 'target' ontology to map source terms to. When the chosen mapper is " "BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write " - "'all' to search all ontologies in BioPortal or OLS, respectively") - parser.add_argument("-o", "--output", required=False, type=str, default=output_file_name, + "'all' to search all ontologies") + parser.add_argument("-o", "--output", required=False, type=str, default="", help="Path to desired output file for the mappings (default=current working directory)") parser.add_argument("-m", "--mapper", required=False, type=str, default=Mapper.TFIDF, - help="Mapping method to use. 
One of: " + str(Mapper.list()) + " (default=tfidf)") + help="Method used to compare source terms with ontology terms. One of: " + str(Mapper.list()) + + " (default=tfidf)") parser.add_argument("-csv", "--csv_input", required=False, type=str, default=(), help="Specifies that the input is a CSV file—This should be followed by the name of the column " "that contains the terms to map, optionally followed by the name of the column that " @@ -29,13 +26,13 @@ parser.add_argument("-min", "--min_score", required=False, type=float, default=0.5, help="Minimum similarity score [0,1] for the mappings (1=exact match; default=0.5)") parser.add_argument("-iris", "--base_iris", required=False, type=str, default=(), - help="Map only to ontology terms whose IRIs start with a value given in this comma-separated list") + help="Map only to ontology terms whose IRIs start with a value given in this comma-separated " + "list (eg 'http://www.ebi.ac.uk/efo,http://purl.obolibrary.org/obo/HP)')") parser.add_argument("-d", "--excl_deprecated", required=False, default=False, action="store_true", help="Exclude ontology terms stated as deprecated via `owl:deprecated true` (default=False)") parser.add_argument("-g", "--save_term_graphs", required=False, default=False, action="store_true", - help="Save the graphs representing the neighborhood of each ontology term (default=False)") + help="Save vis.js graphs representing the neighborhood of each ontology term (default=False)") arguments = parser.parse_args() - if not os.path.exists(arguments.source): parser.error("The file '{}' does not exist".format(arguments.source)) sys.exit(1) @@ -46,8 +43,7 @@ csv_columns = arguments.csv_input if len(csv_columns) > 0: csv_columns = tuple(csv_columns.split(',')) - - Text2Term().map(arguments.source, arguments.target, output_file=arguments.output, csv_columns=csv_columns, - excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings, - min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs, - save_mappings=True) + Text2Term().map_file(arguments.source, arguments.target, output_file=arguments.output, csv_columns=csv_columns, + excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings, + min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs, + save_mappings=True) diff --git a/text2term/t2t.py b/text2term/t2t.py index db39501..71dc463 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -2,6 +2,7 @@ import os import json +import datetime import onto_utils from mapper import Mapper from term_collector import OntologyTermCollector @@ -18,49 +19,98 @@ class Text2Term: def __init__(self): pass - def map(self, input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, - mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False): + def map_file(self, input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, + mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False): """ - Main function to map terms in the input file to the specified target ontology. + Map the terms in the given input file to the specified target ontology. 
Parameters ---------- input_file : str - Input file containing 'source' terms to map to ontology terms + Path to input file containing 'source' terms to map to ontology terms (list of terms or CSV file) target_ontology : str Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies base_iris : tuple - Map only to ontology terms whose IRIs start with a value given in this comma-separated list + Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: + ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') csv_columns : tuple Name of the column that contains the terms to map, optionally followed by the name of the column that contains identifiers for the terms (eg 'my_terms,my_term_ids') excl_deprecated : bool - Exclude ontology terms stated as deprecated via `owl:deprecated true` (default=False) - mapper : Mapper - Mapping method to use (eg tfidf, levenshtein, zooma) (default=tfidf) + Exclude ontology terms stated as deprecated via `owl:deprecated true` + mapper : mapper.Mapper + Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, + fuzzy, tfidf, zooma, bioportal max_mappings : int - Maximum number of top-ranked mappings returned per source term (default=3) + Maximum number of top-ranked mappings returned per source term min_score : float - Minimum similarity score [0,1] for the mappings (1=exact match; default=0.5) + Minimum similarity score [0,1] for the mappings (1=exact match) output_file : str - Path to desired output file for the mappings (default=current working directory) + Path to desired output file for the mappings save_graphs : bool - Save the graphs representing the neighborhood of each ontology term (default=False) + Save vis.js graphs representing the neighborhood of each ontology term save_mappings : bool - Save the generated mappings to the specified output file (default=False) + Save the generated mappings to a file (specified by `output_file`) Returns ---------- df Data frame containing the generated ontology mappings """ - source_terms, source_term_ids = self._load_data(input_file, csv_columns) + source_terms, source_terms_ids = self._load_data(input_file, csv_columns) + return self.map(source_terms, target_ontology, source_terms_ids=source_terms_ids, base_iris=base_iris, + excl_deprecated=excl_deprecated, max_mappings=max_mappings, mapper=mapper, min_score=min_score, + output_file=output_file, save_graphs=save_graphs, save_mappings=save_mappings) + + def map(self, source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, + mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=()): + """ + Map the terms in the given list to the specified target ontology. + + Parameters + ---------- + source_terms : list + List of 'source' terms to map to ontology terms + target_ontology : str + Path or URL of 'target' ontology to map the source terms to. 
When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies + base_iris : tuple + Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: + ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') + source_terms_ids : tuple + Collection of identifiers for the given source terms + excl_deprecated : bool + Exclude ontology terms stated as deprecated via `owl:deprecated true` + mapper : mapper.Mapper + Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, + fuzzy, tfidf, zooma, bioportal + max_mappings : int + Maximum number of top-ranked mappings returned per source term + min_score : float + Minimum similarity score [0,1] for the mappings (1=exact match) + output_file : str + Path to desired output file for the mappings + save_graphs : bool + Save vis.js graphs representing the neighborhood of each ontology term + save_mappings : bool + Save the generated mappings to a file (specified by `output_file`) + + Returns + ---------- + df + Data frame containing the generated ontology mappings + """ + if len(source_terms_ids) != len(source_terms): + source_terms_ids = onto_utils.generate_iris(len(source_terms)) + if output_file == '': + timestamp = datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") + output_file = "t2t-mappings-" + timestamp + ".csv" if mapper in {Mapper.ZOOMA, Mapper.BIOPORTAL}: target_terms = '' if target_ontology.lower() == 'all' else target_ontology else: target_terms = self._load_ontology(target_ontology, base_iris, excl_deprecated) - mappings_df = self._do_mapping(source_terms, source_term_ids, target_terms, mapper, max_mappings, min_score) + mappings_df = self._do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score) if save_mappings: self._save_mappings(mappings_df, output_file) if save_graphs: From abd8aa0fd1eb79768bbcf54dc550d67b8b4c941e Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 20 May 2022 17:50:49 -0400 Subject: [PATCH 035/185] Minor fixes and comments --- text2term/__main__.py | 2 +- text2term/bioportal_mapper.py | 7 +++++-- text2term/syntactic_mapper.py | 2 +- text2term/term_collector.py | 2 +- text2term/zooma_mapper.py | 4 ++-- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/text2term/__main__.py b/text2term/__main__.py index 4d61caf..e5bd523 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -14,7 +14,7 @@ "'all' to search all ontologies") parser.add_argument("-o", "--output", required=False, type=str, default="", help="Path to desired output file for the mappings (default=current working directory)") - parser.add_argument("-m", "--mapper", required=False, type=str, default=Mapper.TFIDF, + parser.add_argument("-m", "--mapper", required=False, type=str, default="tfidf", help="Method used to compare source terms with ontology terms. 
One of: " + str(Mapper.list()) + " (default=tfidf)") parser.add_argument("-csv", "--csv_input", required=False, type=str, default=(), diff --git a/text2term/bioportal_mapper.py b/text2term/bioportal_mapper.py index 26072c2..d0a3d52 100644 --- a/text2term/bioportal_mapper.py +++ b/text2term/bioportal_mapper.py @@ -23,7 +23,10 @@ def map(self, source_terms, source_terms_ids, ontologies, max_mappings=3, api_pa Find and return ontology mappings through the BioPortal Annotator Web service :param source_terms: Collection of source terms to map to target ontologies :param source_terms_ids: List of identifiers for the given source terms - :param ontologies: Comma-separated list of ontology acronyms (eg 'HP,EFO') or 'all' to search all ontologies + :param ontologies: Comma-separated list of ontology acronyms (eg 'HP,EFO') or 'all' to search all ontologies. + The ontology names accepted must match the names used in BioPortal. Here are some known ontologies: + GO, UBERON, "CL" for Cell Ontology, MESH, SNOMEDCT, FMA, NCIT, EFO, DOID, MONDO, "PR" for Protein Ontology, + "HP" for Human Phenotype Ontology :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned :param api_params: Additional BioPortal Annotator-specific parameters to include in the request """ @@ -37,7 +40,7 @@ def map(self, source_terms, source_terms_ids, ontologies, max_mappings=3, api_pa def _map_term(self, source_term, source_term_id, ontologies, max_mappings, api_params): params = { - "text": source_term, + "text": onto_utils.normalize(source_term), "longest_only": "true", "expand_mappings": "true", "ontologies": ontologies diff --git a/text2term/syntactic_mapper.py b/text2term/syntactic_mapper.py index b4385c4..9ef5e9e 100644 --- a/text2term/syntactic_mapper.py +++ b/text2term/syntactic_mapper.py @@ -29,7 +29,7 @@ def map(self, source_terms, source_terms_ids, mapper=Mapper.JARO_WINKLER, max_ma self.logger.info("Mapping %i source terms...", len(source_terms)) start = time.time() mappings = [] - for term, term_id in tqdm(zip(source_terms, source_terms_ids)): + for term, term_id in tqdm(zip(source_terms, source_terms_ids), total=len(source_terms)): matches = self._map(term, term_id, mapper, max_mappings) mappings.extend(matches) end = time.time() diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 23542ae..63aa313 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -76,7 +76,7 @@ def _get_children(self, ontology_term, ontology): children = set() try: children = set(ontology.get_children_of(ontology_term)) - except TypeError and AttributeError as err: + except (TypeError, AttributeError) as err: self.logger.debug(err) return children diff --git a/text2term/zooma_mapper.py b/text2term/zooma_mapper.py index 672fb55..53e341b 100644 --- a/text2term/zooma_mapper.py +++ b/text2term/zooma_mapper.py @@ -23,7 +23,7 @@ def map(self, source_terms, source_terms_ids, ontologies, max_mappings=3, api_pa :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned :param api_params: Additional Zooma API-specific parameters to include in the request """ - self.logger.info("Mapping %i source terms against ontologies: %s", len(source_terms), ontologies) + self.logger.info("Mapping %i source terms against ontologies: %s...", len(source_terms), ontologies) start = time.time() mappings = [] for term, term_id in zip(source_terms, source_terms_ids): @@ -34,7 +34,7 @@ def map(self, source_terms, source_terms_ids, 
ontologies, max_mappings=3, api_pa def _map_term(self, source_term, source_term_id, ontologies, max_mappings, api_params): # see https://www.ebi.ac.uk/spot/zooma/docs/api for details of API parameters params = { - "propertyValue": source_term, + "propertyValue": onto_utils.normalize(source_term), "filter": "required:[none],ontologies:[" + ontologies + "]" } if len(api_params) > 0: From dc64136b86c383b23261a9193b46a9c118d5ef4d Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 6 Jun 2022 14:23:58 -0400 Subject: [PATCH 036/185] Include DB mapping sources in Zooma search --- text2term/zooma_mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text2term/zooma_mapper.py b/text2term/zooma_mapper.py index 53e341b..cb1d030 100644 --- a/text2term/zooma_mapper.py +++ b/text2term/zooma_mapper.py @@ -35,7 +35,7 @@ def _map_term(self, source_term, source_term_id, ontologies, max_mappings, api_p # see https://www.ebi.ac.uk/spot/zooma/docs/api for details of API parameters params = { "propertyValue": onto_utils.normalize(source_term), - "filter": "required:[none],ontologies:[" + ontologies + "]" + "filter": "required:[gwas,cttv,atlas],ontologies:[" + ontologies + "]" } if len(api_params) > 0: params.update(api_params) From 688853659f4fa77213e83edf1f9c119018d875e5 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 6 Jun 2022 14:24:36 -0400 Subject: [PATCH 037/185] Remove redundant line --- text2term/bioportal_mapper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/text2term/bioportal_mapper.py b/text2term/bioportal_mapper.py index d0a3d52..dedcb3e 100644 --- a/text2term/bioportal_mapper.py +++ b/text2term/bioportal_mapper.py @@ -62,7 +62,6 @@ def _mapping_details(self, source_term, source_term_id, mapping): ann_class = mapping["annotatedClass"] term_iri = ann_class["@id"] term_link_bp = ann_class["links"]["self"] - match_type = mapping["annotations"][0]["matchType"] term_label = self.get_term_details(term_link_bp) return TermMapping(source_term, source_term_id, term_label, term_iri, 1) From f307245960a14939c9d95a08616571f4a239d41a Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 28 Jun 2022 16:16:05 -0400 Subject: [PATCH 038/185] Include curated data sources in Zooma search --- text2term/zooma_mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text2term/zooma_mapper.py b/text2term/zooma_mapper.py index cb1d030..1a6b245 100644 --- a/text2term/zooma_mapper.py +++ b/text2term/zooma_mapper.py @@ -35,7 +35,7 @@ def _map_term(self, source_term, source_term_id, ontologies, max_mappings, api_p # see https://www.ebi.ac.uk/spot/zooma/docs/api for details of API parameters params = { "propertyValue": onto_utils.normalize(source_term), - "filter": "required:[gwas,cttv,atlas],ontologies:[" + ontologies + "]" + "filter": "required:[gwas,cttv,atlas,eva-clinvar,sysmicro],ontologies:[" + ontologies + "]" } if len(api_params) > 0: params.update(api_params) From 38b6d33e7fc2b46fd11a4365d0d5717d0efcc89a Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 11 Jul 2022 13:02:47 -0400 Subject: [PATCH 039/185] Update stop-words list and logger handling starting support for parsing TSV files --- text2term/onto_utils.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py index b153783..63fe747 100644 --- a/text2term/onto_utils.py +++ b/text2term/onto_utils.py @@ -7,11 +7,23 @@ from owlready2 import * from gensim.parsing import strip_non_alphanum, 
strip_multiple_whitespaces -STOP_WORDS = {'in', 'the', 'any', 'all', 'for', 'and', 'or', 'dx', 'on', 'fh', 'tx', 'only', 'qnorm', 'w', 'iqb', - 'ds', 'rd', 'rdgwas', 'average', 'weekly', 'monthly', 'daily'} BASE_IRI = "http://ccb.hms.harvard.edu/t2t/" +STOP_WORDS = {'in', 'the', 'any', 'all', 'for', 'and', 'or', 'dx', 'on', 'fh', 'tx', 'only', 'qnorm', 'w', 'iqb', 's', + 'ds', 'rd', 'rdgwas', 'ICD', 'excluded', 'excluding', 'unspecified', 'certain', 'also', 'undefined', + 'ordinary', 'least', 'squares', 'FINNGEN', 'elsewhere', 'more', 'excluded', 'classified', 'classifeid', + 'unspcified', 'unspesified', 'specified', 'acquired', 'combined', 'unspeficied', 'elsewhere', 'not', 'by', + 'strict', 'wide', 'definition', 'definitions', 'confirmed', 'chapter', 'chapters', 'controls', + 'characterized', 'main', 'diagnosis', 'hospital', 'admissions', 'other', 'resulting', 'from'} + +TEMPORAL_WORDS = {'age', 'time', 'times', 'date', 'initiation', 'cessation', 'progression', 'duration', 'early', 'late', + 'later', 'trimester'} + +QUANTITY_WORDS = {'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'frequently', 'per', 'hour', 'day', 'week', 'month', + 'year', 'years', 'total', 'quantity', 'amount', 'level', 'levels', 'volume', 'count', 'counts', 'percentage', + 'abundance', 'proportion', 'content', 'average', 'prevalence', 'mean', 'ratio'} + def normalize_list(token_list): normalized_token_list = [] @@ -26,7 +38,6 @@ def normalize(token): :param token: Text to be normalized :return: Normalized string """ - token = re.sub(r"[\(\[].*?[\)\]]", "", token) # remove text within parenthesis/brackets token = strip_non_alphanum(token).lower() token = token.replace("_", " ") token = " ".join(w for w in token.split() if w not in STOP_WORDS) @@ -89,7 +100,8 @@ def get_logger(name, level): logger.setLevel(level=level) console_handler = logging.StreamHandler(sys.stdout) console_handler.setFormatter(formatter) - logger.addHandler(console_handler) + if not logger.hasHandlers(): + logger.addHandler(console_handler) logger.propagate = False return logger @@ -115,6 +127,10 @@ def parse_csv_file(file_path, term_column_name, term_id_column_name, separator=' return terms, term_ids +def parse_tsv_file(file_path, term_column_name, term_id_column_name): + return parse_csv_file(file_path, term_column_name, term_id_column_name, separator="\t") + + def get_ontology_from_labels(term_labels): onto_iri = BASE_IRI + "Ontology-" + generate_uuid() onto = owlready2.get_ontology(onto_iri) From 51c0e8f28790afd78b400b0c7a527609c276a35f Mon Sep 17 00:00:00 2001 From: Jason Payne <100445045+paynejason@users.noreply.github.com> Date: Mon, 22 Aug 2022 12:56:27 -0400 Subject: [PATCH 040/185] Change requirements.txt to be more specific Allows the UI module to function on new Apple chip computers. 
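As a minimal sketch (not part of this patch), an environment can be checked against the exact pins below; the package names are those listed in requirements.txt.

```
# Illustrative only: compare a few installed distributions against the pinned
# versions in requirements.txt.
from importlib.metadata import version, PackageNotFoundError

for pkg in ("Owlready2", "gensim", "rapidfuzz", "scikit-learn"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "is not installed")
```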
--- requirements.txt | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/requirements.txt b/requirements.txt index 709b8be..7e7fef1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,15 @@ -Owlready2~=0.36 -argparse~=1.4.0 -pandas~=1.4.1 -numpy~=1.22.2 -gensim~=4.1.2 -scipy~=1.8.0 -scikit-learn~=1.0.2 -setuptools~=60.9.3 -requests~=2.27.1 -tqdm~=4.62.3 -sparse_dot_topn~=0.3.1 -bioregistry~=0.4.63 -nltk~=3.7 -rapidfuzz~=2.0.5 -shortuuid~=1.0.9 \ No newline at end of file +Owlready2==0.36 +argparse==1.4.0 +pandas==1.4.1 +numpy==1.23.2 +gensim==4.1.2 +scipy==1.8.0 +scikit-learn==1.0.2 +setuptools==60.9.3 +requests==2.27.1 +tqdm==4.62.3 +sparse_dot_topn==0.3.1 +bioregistry==0.4.63 +nltk==3.7 +rapidfuzz==2.0.5 +shortuuid==1.0.9 From 237896c2acce366a0a3754eaa89d7eda2aa24e57 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 23 Aug 2022 19:01:11 -0400 Subject: [PATCH 041/185] Renamed 'fuzzy' mapper to Indel which the pkg computes Bumped rapidfuzz version to 2.6.0 and tool version to 0.4.1 --- README.md | 4 ++-- requirements.txt | 2 +- setup.py | 4 ++-- text2term/mapper.py | 2 +- text2term/syntactic_mapper.py | 10 +++++----- text2term/t2t.py | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index f4cc02e..f672c6e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # text2term ontology mapper -A tool for mapping (uncontrolled) terms to ontology terms to facilitate semantic integration. +A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology. ## Usage @@ -25,7 +25,7 @@ To display a help message with descriptions of tool arguments do: `-o OUTPUT` Path to desired output file for the mappings. -`-m MAPPER` Method used to compare source terms with ontology terms. One of: *levenshtein, jaro, jarowinkler, jaccard, fuzzy, tfidf, zooma, bioportal*. +`-m MAPPER` Method used to compare source terms with ontology terms. One of: *levenshtein, jaro, jarowinkler, jaccard, indel, fuzzy, tfidf, zooma, bioportal*. `-csv CSV_INPUT` Indicates a CSV format input—follow with the name of the column containing terms to map, optionally followed by the name of the column containing identifiers for the terms (eg 'my terms,my term ids'). 
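For illustration, the renamed mapper can be selected programmatically through the `Mapper` enum (the `INDEL` member is added to mapper.py later in this patch; the snippet is a sketch, not part of the change):

```
# Illustrative sketch: 'indel' replaces the old Indel-based 'fuzzy' comparison.
from text2term.mapper import Mapper

print(Mapper.INDEL.value)                # 'indel'
print(Mapper("indel") is Mapper.INDEL)   # str-valued enum, so lookup by value works
print(Mapper.list())                     # all supported mapper names
```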
diff --git a/requirements.txt b/requirements.txt index 7e7fef1..e43c523 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,5 +11,5 @@ tqdm==4.62.3 sparse_dot_topn==0.3.1 bioregistry==0.4.63 nltk==3.7 -rapidfuzz==2.0.5 +rapidfuzz==2.6.0 shortuuid==1.0.9 diff --git a/setup.py b/setup.py index ebe7574..d159c5a 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ with open('requirements.txt') as f: requirements = f.read().splitlines() -version = '0.4.0' +version = '0.4.1' setup( name='text2term', @@ -16,7 +16,7 @@ include_package_data=True, url='https://github.com/ccb-hms/ontology-mapper', license='MIT', - description='A tool for mapping (uncontrolled) terms to ontology terms to facilitate semantic integration', + description='A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology', long_description=long_description, long_description_content_type='text/markdown', author='Center for Computational Biomedicine, Harvard Medical School', diff --git a/text2term/mapper.py b/text2term/mapper.py index 2c42f01..dd92d57 100644 --- a/text2term/mapper.py +++ b/text2term/mapper.py @@ -9,8 +9,8 @@ class Mapper(str, Enum): JARO = 'jaro' JARO_WINKLER = 'jarowinkler' JACCARD = 'jaccard' + INDEL = 'indel' FUZZY = 'fuzzy' - FUZZY_WEIGHTED = 'fuzzyw' TFIDF = 'tfidf' ZOOMA = 'zooma' BIOPORTAL = 'bioportal' diff --git a/text2term/syntactic_mapper.py b/text2term/syntactic_mapper.py index 9ef5e9e..c85dbbe 100644 --- a/text2term/syntactic_mapper.py +++ b/text2term/syntactic_mapper.py @@ -70,10 +70,10 @@ def compare(self, s1, s2, mapper): return self.compare_jaro(s1, s2) elif mapper == Mapper.JARO_WINKLER: return self.compare_jarowinkler(s1, s2) + elif mapper == Mapper.INDEL: + return self.compare_indel(s1, s2) elif mapper == Mapper.FUZZY: - return self.compare_fuzzy(s1, s2) - elif mapper == Mapper.FUZZY_WEIGHTED: - return self.compare_fuzzy_weighted(s1, s2) + return self.compare_fuzzy_ratio(s1, s2) elif mapper == Mapper.JACCARD: return self.compare_jaccard(s1, s2) else: @@ -103,7 +103,7 @@ def compare_jarowinkler(self, s1, s2): similarity = rapidfuzz.string_metric.jaro_winkler_similarity(s1, s2)/100 return similarity - def compare_fuzzy(self, s1, s2): + def compare_indel(self, s1, s2): """ Calculates the normalized Indel distance between s1 and s2. See: https://maxbachmann.github.io/RapidFuzz/Usage/fuzz.html#ratio @@ -112,7 +112,7 @@ def compare_fuzzy(self, s1, s2): similarity = rapidfuzz.fuzz.ratio(s1, s2)/100 return similarity - def compare_fuzzy_weighted(self, s1, s2): + def compare_fuzzy_ratio(self, s1, s2): """ Calculates a weighted ratio between s1 and s2 based on rapidfuzz's fuzzy ratio algorithms. 
See: https://maxbachmann.github.io/RapidFuzz/Usage/fuzz.html#wratio diff --git a/text2term/t2t.py b/text2term/t2t.py index 71dc463..5cae94c 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -147,7 +147,7 @@ def _do_mapping(self, source_terms, source_term_ids, ontology_terms, mapper, max elif mapper == Mapper.BIOPORTAL: term_mapper = BioPortalAnnotatorMapper("8f0cbe43-2906-431a-9572-8600d3f4266e") return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) - elif mapper in {Mapper.LEVENSHTEIN, Mapper.JARO, Mapper.JARO_WINKLER, Mapper.FUZZY, Mapper.FUZZY_WEIGHTED, Mapper.JACCARD}: + elif mapper in {Mapper.LEVENSHTEIN, Mapper.JARO, Mapper.JARO_WINKLER, Mapper.INDEL, Mapper.FUZZY, Mapper.JACCARD}: term_mapper = SyntacticMapper(ontology_terms) return term_mapper.map(source_terms, source_term_ids, mapper, max_mappings=max_mappings) else: From f5ffd26ceef31656e98bcc529de247ee58eabc4f Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 23 Aug 2022 19:23:49 -0400 Subject: [PATCH 042/185] Add option to specify a cell separator to allow parsing eg TSVs --- text2term/__main__.py | 4 +++- text2term/t2t.py | 11 +++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/text2term/__main__.py b/text2term/__main__.py index e5bd523..cf82c6d 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -21,6 +21,8 @@ help="Specifies that the input is a CSV file—This should be followed by the name of the column " "that contains the terms to map, optionally followed by the name of the column that " "contains identifiers for the terms (eg 'my_terms,my_term_ids')") + parser.add_argument("-sep", "--separator", required=False, type=str, default=',', + help="Specifies the cell separator to be used when reading a non-comma-separated tabular file") parser.add_argument("-top", "--top_mappings", required=False, type=int, default=3, help="Maximum number of top-ranked mappings returned per source term (default=3)") parser.add_argument("-min", "--min_score", required=False, type=float, default=0.5, @@ -46,4 +48,4 @@ Text2Term().map_file(arguments.source, arguments.target, output_file=arguments.output, csv_columns=csv_columns, excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings, min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs, - save_mappings=True) + save_mappings=True, separator=arguments.separator) diff --git a/text2term/t2t.py b/text2term/t2t.py index 5cae94c..a81ad90 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -20,7 +20,8 @@ def __init__(self): pass def map_file(self, input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, - mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False): + mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False, + separator=','): """ Map the terms in the given input file to the specified target ontology. 
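A hedged sketch of the new `separator` option for tab-separated input; the file name and column names are invented, and the import path assumes the package is installed.

```
# Illustrative only: map terms from a TSV file by passing a tab separator along
# with the column names; 'my_terms.tsv' and its columns are hypothetical.
from text2term.t2t import Text2Term

mappings_df = Text2Term().map_file("my_terms.tsv", "http://www.ebi.ac.uk/efo/efo.owl",
                                   csv_columns=("term", "term_id"), separator="\t")
```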
@@ -37,6 +38,8 @@ def map_file(self, input_file, target_ontology, base_iris=(), csv_columns=(), ex csv_columns : tuple Name of the column that contains the terms to map, optionally followed by the name of the column that contains identifiers for the terms (eg 'my_terms,my_term_ids') + separator : str + Specifies the cell separator to be used when reading a non-comma-separated tabular file excl_deprecated : bool Exclude ontology terms stated as deprecated via `owl:deprecated true` mapper : mapper.Mapper @@ -58,7 +61,7 @@ def map_file(self, input_file, target_ontology, base_iris=(), csv_columns=(), ex df Data frame containing the generated ontology mappings """ - source_terms, source_terms_ids = self._load_data(input_file, csv_columns) + source_terms, source_terms_ids = self._load_data(input_file, csv_columns, separator) return self.map(source_terms, target_ontology, source_terms_ids=source_terms_ids, base_iris=base_iris, excl_deprecated=excl_deprecated, max_mappings=max_mappings, mapper=mapper, min_score=min_score, output_file=output_file, save_graphs=save_graphs, save_mappings=save_mappings) @@ -117,12 +120,12 @@ def map(self, source_terms, target_ontology, base_iris=(), excl_deprecated=False self._save_graphs(target_terms, output_file) return mappings_df - def _load_data(self, input_file_path, csv_column_names): + def _load_data(self, input_file_path, csv_column_names, separator): if len(csv_column_names) >= 1: term_id_col_name = "" if len(csv_column_names) == 2: term_id_col_name = csv_column_names[1] - terms, term_ids = onto_utils.parse_csv_file(input_file_path, + terms, term_ids = onto_utils.parse_csv_file(input_file_path, separator=separator, term_column_name=csv_column_names[0], term_id_column_name=term_id_col_name) else: From 89cc7ef852a70a121427dd41653884e5d3c6fcde Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 29 Aug 2022 19:01:57 -0400 Subject: [PATCH 043/185] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f672c6e..67d74eb 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Install package using **pip**: Execute the tool as follows: -`text2term -s SOURCE -t TARGET [-o OUTPUT] [-top TOP_MAPPINGS] [-min MIN_SCORE] [-iris BASE_IRIS] [-d EXCL_DEPRECATED] [-i INCL_INDIVIDUALS]` +`text2term -s SOURCE -t TARGET [-o OUTPUT] [-m MAPPER] [-csv CSV_INPUT] [-top TOP_MAPPINGS] [-min MIN_SCORE] [-iris BASE_IRIS] [-d EXCL_DEPRECATED] [-s SAVE_TERM_GRAPHS]` To display a help message with descriptions of tool arguments do: From 9110f80076ce6a307755638b4d6cefc22354129d Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 21 Sep 2022 18:35:39 -0400 Subject: [PATCH 044/185] Minor changes to tool description --- setup.py | 7 +++---- text2term/__main__.py | 3 ++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index d159c5a..13f7b24 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,12 @@ from setuptools import setup, find_packages - +version = '0.4.1' +description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology' long_description = open('README.md').read() with open('requirements.txt') as f: requirements = f.read().splitlines() -version = '0.4.1' - setup( name='text2term', version=version, @@ -16,7 +15,7 @@ include_package_data=True, url='https://github.com/ccb-hms/ontology-mapper', license='MIT', - description='A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology', + 
description=description, long_description=long_description, long_description_content_type='text/markdown', author='Center for Computational Biomedicine, Harvard Medical School', diff --git a/text2term/__main__.py b/text2term/__main__.py index cf82c6d..b74bb3b 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -5,7 +5,8 @@ from mapper import Mapper if __name__ == "__main__": - parser = argparse.ArgumentParser(description="A tool to map unstructured terms to ontology terms") + parser = argparse.ArgumentParser(description='A tool for mapping free-text descriptions of (biomedical) ' + 'entities to controlled terms in an ontology') parser.add_argument("-s", "--source", required=True, type=str, help="Input file containing 'source' terms to map to ontology terms (list of terms or CSV file)") parser.add_argument("-t", "--target", required=True, type=str, From 1be4ed908b0fb68f3363a9d79fb4f314543def78 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Mon, 17 Oct 2022 10:27:36 -0400 Subject: [PATCH 045/185] Edits for PyPi Makes various edits in preparation for introduction to PyPi. Specifically, flattens the Text2Term module into functions, changes some "import" syntax, and updates the README with programmatic usage documentation. --- README.md | 62 ++++++- text2term/__init__.py | 3 + text2term/__main__.py | 4 +- text2term/t2t.py | 295 +++++++++++++++--------------- text2term/term_graph_generator.py | 4 +- 5 files changed, 210 insertions(+), 158 deletions(-) diff --git a/README.md b/README.md index 67d74eb..9ec7421 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,54 @@ A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology. -## Usage +## Programmatic Usage +Install package using **pip**: + +`pip install text2term` + +The tool can be executed in Python with either of the two following functions: +`text2term.map_files(input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, mapper=Mapper.TFIDF,min_score=0.3, output_file='', save_graphs=False, save_mappings=False, separator=',')` + +or + +`map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=())` + +### Arguments +For `map_files`, the first argument 'input_file' specifies a path to a file containing the names of every term that needs to be mapped. For `map_terms`, The first argument 'source_terms' takes in a list of the terms to be mapped. + +All other arguments are the same, and have the same functionality: + +`target_ontology` : str + Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies +`base_iris` : tuple + Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: + ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') +`source_terms_ids` : tuple + Collection of identifiers for the given source terms +`excl_deprecated` : bool + Exclude ontology terms stated as deprecated via `owl:deprecated true` +`mapper` : mapper.Mapper + Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, fuzzy, tfidf, zooma, bioportal + These can be initialized by invoking mapper.Mapper e.g. 
`mapper.Mapper.TFIDF` +`max_mappings` : int + Maximum number of top-ranked mappings returned per source term +`min_score` : float + Minimum similarity score [0,1] for the mappings (1=exact match) +`output_file` : str + Path to desired output file for the mappings +`save_graphs` : bool + Save vis.js graphs representing the neighborhood of each ontology term +`save_mappings` : bool + Save the generated mappings to a file (specified by `output_file`) + +All default values, if they exist, can be seen above. + +### Return Value +Both functions return the same value: +`df` : Data frame containing the generated ontology mappings + +## Command Line Usage Install package using **pip**: @@ -10,11 +57,11 @@ Install package using **pip**: Execute the tool as follows: -`text2term -s SOURCE -t TARGET [-o OUTPUT] [-m MAPPER] [-csv CSV_INPUT] [-top TOP_MAPPINGS] [-min MIN_SCORE] [-iris BASE_IRIS] [-d EXCL_DEPRECATED] [-s SAVE_TERM_GRAPHS]` +`python text2term -s SOURCE -t TARGET [-o OUTPUT] [-m MAPPER] [-csv CSV_INPUT] [-top TOP_MAPPINGS] [-min MIN_SCORE] [-iris BASE_IRIS] [-d EXCL_DEPRECATED] [-g SAVE_TERM_GRAPHS]` To display a help message with descriptions of tool arguments do: -`text2term -h` or `text2term --help` +`python text2term -h` or `python text2term --help` ### Required arguments `-s SOURCE` Input file containing 'source' terms to map to ontology terms (list of terms or CSV file). @@ -41,7 +88,16 @@ To display a help message with descriptions of tool arguments do: ## Examples +### Programmatic +``` +import text2term +import pandas + +df1 = text2term.map_file(unstruct_terms.txt, http://www.ebi.ac.uk/efo/efo.owl) +df2 = text2term.map_terms(["asthma", "colon cancer"], http://www.ebi.ac.uk/efo/efo.owl) +``` +### Command Line The basic use of the tool requires a `source` file containing a list of terms to map to the given `target` ontology: `python text2term -s unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl` diff --git a/text2term/__init__.py b/text2term/__init__.py index e69de29..4287821 100644 --- a/text2term/__init__.py +++ b/text2term/__init__.py @@ -0,0 +1,3 @@ +from .t2t import map_terms +from .t2t import map_file +from .mapper import Mapper \ No newline at end of file diff --git a/text2term/__main__.py b/text2term/__main__.py index b74bb3b..3cdd907 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -1,7 +1,7 @@ import argparse import os import sys -from t2t import Text2Term +from t2t import map_file from mapper import Mapper if __name__ == "__main__": @@ -46,7 +46,7 @@ csv_columns = arguments.csv_input if len(csv_columns) > 0: csv_columns = tuple(csv_columns.split(',')) - Text2Term().map_file(arguments.source, arguments.target, output_file=arguments.output, csv_columns=csv_columns, + map_file(arguments.source, arguments.target, output_file=arguments.output, csv_columns=csv_columns, excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings, min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs, save_mappings=True, separator=arguments.separator) diff --git a/text2term/t2t.py b/text2term/t2t.py index a81ad90..6d5bee9 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -3,165 +3,158 @@ import os import json import datetime -import onto_utils -from mapper import Mapper -from term_collector import OntologyTermCollector -from term_graph_generator import TermGraphGenerator -from bioportal_mapper import BioPortalAnnotatorMapper -from syntactic_mapper import SyntacticMapper -from tfidf_mapper import 
TFIDFMapper -from zooma_mapper import ZoomaMapper +from text2term import onto_utils +from text2term.mapper import Mapper +from text2term.term_collector import OntologyTermCollector +from text2term.term_graph_generator import TermGraphGenerator +from text2term.bioportal_mapper import BioPortalAnnotatorMapper +from text2term.syntactic_mapper import SyntacticMapper +from text2term.tfidf_mapper import TFIDFMapper +from text2term.zooma_mapper import ZoomaMapper +def map_file(input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, + mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False, + separator=','): + """ + Maps the terms in the given input file to the specified target ontology. -class Text2Term: - """ Main class in text2term package """ + Parameters + ---------- + input_file : str + Path to input file containing 'source' terms to map to ontology terms (list of terms or CSV file) + target_ontology : str + Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies + base_iris : tuple + Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: + ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') + csv_columns : tuple + Name of the column that contains the terms to map, optionally followed by the name of the column that + contains identifiers for the terms (eg 'my_terms,my_term_ids') + separator : str + Specifies the cell separator to be used when reading a non-comma-separated tabular file + excl_deprecated : bool + Exclude ontology terms stated as deprecated via `owl:deprecated true` + mapper : mapper.Mapper + Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, + fuzzy, tfidf, zooma, bioportal + max_mappings : int + Maximum number of top-ranked mappings returned per source term + min_score : float + Minimum similarity score [0,1] for the mappings (1=exact match) + output_file : str + Path to desired output file for the mappings + save_graphs : bool + Save vis.js graphs representing the neighborhood of each ontology term + save_mappings : bool + Save the generated mappings to a file (specified by `output_file`) - def __init__(self): - pass + Returns + ---------- + df + Data frame containing the generated ontology mappings + """ + source_terms, source_terms_ids = _load_data(input_file, csv_columns, separator) + return map_terms(source_terms, target_ontology, source_terms_ids=source_terms_ids, base_iris=base_iris, + excl_deprecated=excl_deprecated, max_mappings=max_mappings, mapper=mapper, min_score=min_score, + output_file=output_file, save_graphs=save_graphs, save_mappings=save_mappings) - def map_file(self, input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, - mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False, - separator=','): - """ - Map the terms in the given input file to the specified target ontology. +def map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, + mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=()): + """ + Maps the terms in the given list to the specified target ontology. 
- Parameters - ---------- - input_file : str - Path to input file containing 'source' terms to map to ontology terms (list of terms or CSV file) - target_ontology : str - Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies - base_iris : tuple - Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: - ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') - csv_columns : tuple - Name of the column that contains the terms to map, optionally followed by the name of the column that - contains identifiers for the terms (eg 'my_terms,my_term_ids') - separator : str - Specifies the cell separator to be used when reading a non-comma-separated tabular file - excl_deprecated : bool - Exclude ontology terms stated as deprecated via `owl:deprecated true` - mapper : mapper.Mapper - Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, - fuzzy, tfidf, zooma, bioportal - max_mappings : int - Maximum number of top-ranked mappings returned per source term - min_score : float - Minimum similarity score [0,1] for the mappings (1=exact match) - output_file : str - Path to desired output file for the mappings - save_graphs : bool - Save vis.js graphs representing the neighborhood of each ontology term - save_mappings : bool - Save the generated mappings to a file (specified by `output_file`) + Parameters + ---------- + source_terms : list + List of 'source' terms to map to ontology terms + target_ontology : str + Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies + base_iris : tuple + Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: + ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') + source_terms_ids : tuple + Collection of identifiers for the given source terms + excl_deprecated : bool + Exclude ontology terms stated as deprecated via `owl:deprecated true` + mapper : mapper.Mapper + Method used to compare source terms with ontology terms. 
One of: levenshtein, jaro, jarowinkler, jaccard, + fuzzy, tfidf, zooma, bioportal + max_mappings : int + Maximum number of top-ranked mappings returned per source term + min_score : float + Minimum similarity score [0,1] for the mappings (1=exact match) + output_file : str + Path to desired output file for the mappings + save_graphs : bool + Save vis.js graphs representing the neighborhood of each ontology term + save_mappings : bool + Save the generated mappings to a file (specified by `output_file`) - Returns - ---------- - df - Data frame containing the generated ontology mappings - """ - source_terms, source_terms_ids = self._load_data(input_file, csv_columns, separator) - return self.map(source_terms, target_ontology, source_terms_ids=source_terms_ids, base_iris=base_iris, - excl_deprecated=excl_deprecated, max_mappings=max_mappings, mapper=mapper, min_score=min_score, - output_file=output_file, save_graphs=save_graphs, save_mappings=save_mappings) + Returns + ---------- + df + Data frame containing the generated ontology mappings + """ + if len(source_terms_ids) != len(source_terms): + source_terms_ids = onto_utils.generate_iris(len(source_terms)) + if output_file == '': + timestamp = datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") + output_file = "t2t-mappings-" + timestamp + ".csv" + if mapper in {Mapper.ZOOMA, Mapper.BIOPORTAL}: + target_terms = '' if target_ontology.lower() == 'all' else target_ontology + else: + target_terms = _load_ontology(target_ontology, base_iris, excl_deprecated) + mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score) + if save_mappings: + _save_mappings(mappings_df, output_file) + if save_graphs: + _save_graphs(target_terms, output_file) + return mappings_df - def map(self, source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, - mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=()): - """ - Map the terms in the given list to the specified target ontology. +def _load_data(input_file_path, csv_column_names, separator): + if len(csv_column_names) >= 1: + term_id_col_name = "" + if len(csv_column_names) == 2: + term_id_col_name = csv_column_names[1] + terms, term_ids = onto_utils.parse_csv_file(input_file_path, separator=separator, + term_column_name=csv_column_names[0], + term_id_column_name=term_id_col_name) + else: + terms = onto_utils.parse_list_file(input_file_path) + term_ids = onto_utils.generate_iris(len(terms)) + return terms, term_ids - Parameters - ---------- - source_terms : list - List of 'source' terms to map to ontology terms - target_ontology : str - Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies - base_iris : tuple - Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: - ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') - source_terms_ids : tuple - Collection of identifiers for the given source terms - excl_deprecated : bool - Exclude ontology terms stated as deprecated via `owl:deprecated true` - mapper : mapper.Mapper - Method used to compare source terms with ontology terms. 
One of: levenshtein, jaro, jarowinkler, jaccard, - fuzzy, tfidf, zooma, bioportal - max_mappings : int - Maximum number of top-ranked mappings returned per source term - min_score : float - Minimum similarity score [0,1] for the mappings (1=exact match) - output_file : str - Path to desired output file for the mappings - save_graphs : bool - Save vis.js graphs representing the neighborhood of each ontology term - save_mappings : bool - Save the generated mappings to a file (specified by `output_file`) +def _load_ontology(ontology, iris, exclude_deprecated): + term_collector = OntologyTermCollector(ontology) + onto_terms = term_collector.get_ontology_terms(base_iris=iris, exclude_deprecated=exclude_deprecated) + if len(onto_terms) == 0: + raise RuntimeError("Could not find any terms in the given ontology.") + return onto_terms - Returns - ---------- - df - Data frame containing the generated ontology mappings - """ - if len(source_terms_ids) != len(source_terms): - source_terms_ids = onto_utils.generate_iris(len(source_terms)) - if output_file == '': - timestamp = datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") - output_file = "t2t-mappings-" + timestamp + ".csv" - if mapper in {Mapper.ZOOMA, Mapper.BIOPORTAL}: - target_terms = '' if target_ontology.lower() == 'all' else target_ontology - else: - target_terms = self._load_ontology(target_ontology, base_iris, excl_deprecated) - mappings_df = self._do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score) - if save_mappings: - self._save_mappings(mappings_df, output_file) - if save_graphs: - self._save_graphs(target_terms, output_file) - return mappings_df +def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score): + if mapper == Mapper.TFIDF: + term_mapper = TFIDFMapper(ontology_terms) + return term_mapper.map(source_terms, source_term_ids, max_mappings=max_mappings, min_score=min_score) + elif mapper == Mapper.ZOOMA: + term_mapper = ZoomaMapper() + return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + elif mapper == Mapper.BIOPORTAL: + term_mapper = BioPortalAnnotatorMapper("8f0cbe43-2906-431a-9572-8600d3f4266e") + return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + elif mapper in {Mapper.LEVENSHTEIN, Mapper.JARO, Mapper.JARO_WINKLER, Mapper.INDEL, Mapper.FUZZY, Mapper.JACCARD}: + term_mapper = SyntacticMapper(ontology_terms) + return term_mapper.map(source_terms, source_term_ids, mapper, max_mappings=max_mappings) + else: + raise ValueError("Unsupported mapper: " + mapper) - def _load_data(self, input_file_path, csv_column_names, separator): - if len(csv_column_names) >= 1: - term_id_col_name = "" - if len(csv_column_names) == 2: - term_id_col_name = csv_column_names[1] - terms, term_ids = onto_utils.parse_csv_file(input_file_path, separator=separator, - term_column_name=csv_column_names[0], - term_id_column_name=term_id_col_name) - else: - terms = onto_utils.parse_list_file(input_file_path) - term_ids = onto_utils.generate_iris(len(terms)) - return terms, term_ids +def _save_mappings(mappings, output_file): + if os.path.dirname(output_file): # create output directories if needed + os.makedirs(os.path.dirname(output_file), exist_ok=True) + mappings.to_csv(output_file, index=False) - def _load_ontology(self, ontology, iris, exclude_deprecated): - term_collector = OntologyTermCollector(ontology) - onto_terms = 
term_collector.get_ontology_terms(base_iris=iris, exclude_deprecated=exclude_deprecated) - if len(onto_terms) == 0: - raise RuntimeError("Could not find any terms in the given ontology.") - return onto_terms - - def _do_mapping(self, source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score): - if mapper == Mapper.TFIDF: - term_mapper = TFIDFMapper(ontology_terms) - return term_mapper.map(source_terms, source_term_ids, max_mappings=max_mappings, min_score=min_score) - elif mapper == Mapper.ZOOMA: - term_mapper = ZoomaMapper() - return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) - elif mapper == Mapper.BIOPORTAL: - term_mapper = BioPortalAnnotatorMapper("8f0cbe43-2906-431a-9572-8600d3f4266e") - return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) - elif mapper in {Mapper.LEVENSHTEIN, Mapper.JARO, Mapper.JARO_WINKLER, Mapper.INDEL, Mapper.FUZZY, Mapper.JACCARD}: - term_mapper = SyntacticMapper(ontology_terms) - return term_mapper.map(source_terms, source_term_ids, mapper, max_mappings=max_mappings) - else: - raise ValueError("Unsupported mapper: " + mapper) - - def _save_mappings(self, mappings, output_file): - if os.path.dirname(output_file): # create output directories if needed - os.makedirs(os.path.dirname(output_file), exist_ok=True) - mappings.to_csv(output_file, index=False) - - def _save_graphs(self, terms, output_file): - term_graphs = TermGraphGenerator().graphs_dicts(terms) - with open(output_file + "-term-graphs.json", 'w') as json_file: - json.dump(term_graphs, json_file, indent=2) +def _save_graphs(terms, output_file): + term_graphs = TermGraphGenerator().graphs_dicts(terms) + with open(output_file + "-term-graphs.json", 'w') as json_file: + json.dump(term_graphs, json_file, indent=2) diff --git a/text2term/term_graph_generator.py b/text2term/term_graph_generator.py index b0101b5..2dea645 100644 --- a/text2term/term_graph_generator.py +++ b/text2term/term_graph_generator.py @@ -1,5 +1,5 @@ -import onto_utils -from term_graph import TermGraph, Node, Edge +from text2term import onto_utils +from text2term.term_graph import TermGraph, Node, Edge from owlready2 import Thing, ThingClass From fc2ae218f912f12ad1ede9ceec3301fb0b0f5830 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 17 Oct 2022 14:11:20 -0400 Subject: [PATCH 046/185] Initial work toward ontology caching --- text2term/__main__.py | 8 +++ text2term/resources/ontologies.csv | 11 ++++ text2term/t2t.py | 43 +++++++++++++- text2term/term.py | 20 +++++-- text2term/term_collector.py | 95 ++++++++++++++++++++---------- text2term/term_encoder.py | 8 +++ text2term/term_graph_generator.py | 57 ++++++++++-------- 7 files changed, 179 insertions(+), 63 deletions(-) create mode 100644 text2term/resources/ontologies.csv create mode 100644 text2term/term_encoder.py diff --git a/text2term/__main__.py b/text2term/__main__.py index b74bb3b..c1e33af 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -35,6 +35,14 @@ help="Exclude ontology terms stated as deprecated via `owl:deprecated true` (default=False)") parser.add_argument("-g", "--save_term_graphs", required=False, default=False, action="store_true", help="Save vis.js graphs representing the neighborhood of each ontology term (default=False)") + + # TODO mapping to a cached ontology should be possible through the command-line interface + parser.add_argument("-rc", "--read_from_cache", required=False, default=False, 
action="store_true", + help="Load the target ontology from local cache") + # TODO writing to cache is an entirely different operation (mapping has 2 required arguments, caching has none)... + parser.add_argument("-wc", "--write_cache", required=False, default=False, action="store_true", + help="Write a local cache of ontology term details from the ontologies specified in the file: " + "'resources/ontologies.csv' for faster performance when mapping terms to those ontologies") arguments = parser.parse_args() if not os.path.exists(arguments.source): parser.error("The file '{}' does not exist".format(arguments.source)) diff --git a/text2term/resources/ontologies.csv b/text2term/resources/ontologies.csv new file mode 100644 index 0000000..f0fdecd --- /dev/null +++ b/text2term/resources/ontologies.csv @@ -0,0 +1,11 @@ +acronym,name,version,date,url,comments +CLO,Cell Line Ontology,2.1.178,3/20/22,http://purl.obolibrary.org/obo/clo.owl,non-versioned IRI +CL,Cell Ontology,9/15/22,9/15/22,http://purl.obolibrary.org/obo/cl/releases/2022-09-15/cl.owl, +EFO,Experimental Factor Ontology,3.46.0,9/15/22,https://github.com/EBISPOT/efo/releases/download/v3.46.0/efo.owl, +GO,Gene Ontology,9/19/22,9/19/22,http://purl.obolibrary.org/obo/go/releases/2022-09-19/go.owl, +HPO,Human Phenotype Ontology,6/11/22,6/11/22,http://purl.obolibrary.org/obo/hp/releases/2022-06-11/hp.owl, +MONDO,Monarch Disease Ontology,8/1/22,8/1/22,http://purl.obolibrary.org/obo/mondo/releases/2022-08-01/mondo.owl, +NCIT,NCI Thesaurus,22.07d,8/19/22,http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl, +PRO,Protein Ontology,67,8/8/22,http://purl.obolibrary.org/obo/pr/67.0/pr.owl, +UBERON,Uber-anatomy ontology,8/19/22,8/19/22,http://purl.obolibrary.org/obo/uberon/releases/2022-08-19/uberon.owl, +MP,Mammalian Phenotype Ontology,8/4/22,8/4/22,http://purl.obolibrary.org/obo/mp/releases/2022-08-04/mp.owl, \ No newline at end of file diff --git a/text2term/t2t.py b/text2term/t2t.py index a81ad90..0cc0053 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -2,8 +2,11 @@ import os import json +import pickle +import time import datetime import onto_utils +import pandas as pd from mapper import Mapper from term_collector import OntologyTermCollector from term_graph_generator import TermGraphGenerator @@ -11,6 +14,7 @@ from syntactic_mapper import SyntacticMapper from tfidf_mapper import TFIDFMapper from zooma_mapper import ZoomaMapper +from term_encoder import TermEncoder class Text2Term: @@ -120,6 +124,39 @@ def map(self, source_terms, target_ontology, base_iris=(), excl_deprecated=False self._save_graphs(target_terms, output_file) return mappings_df + def cache_ontology_set(self, ontology_registry_path): + registry = pd.read_csv(ontology_registry_path) + for index, row in registry.iterrows(): + self.cache_ontology(row.url, row.acronym) + + def cache_ontology(self, ontology_url, ontology_acronym, base_iris=()): + ontology_terms = self._load_ontology(ontology_url, base_iris, exclude_deprecated=False) + cache_dir = "cache/" + ontology_acronym + "/" + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + self.serialize_ontology_json(ontology_terms, ontology_acronym, cache_dir) + + self.serialize_ontology_pickle(ontology_terms, ontology_acronym, cache_dir) + + self._save_graphs(ontology_terms, output_file=cache_dir + ontology_acronym) + ontology_terms.clear() + + def serialize_ontology_pickle(self, ontology_terms, ontology_acronym, cache_dir): + start = time.time() + with open(cache_dir + ontology_acronym + 
"-term-details.pickle", 'wb+') as out_file: + pickle.dump(ontology_terms, out_file) + end = time.time() + print("Ontology-Term-Details Pickle serialization time: " + str(end - start)) + + def serialize_ontology_json(self, ontology_terms, ontology_acronym, cache_dir): + start = time.time() + term_dicts = [t.__dict__ for t in ontology_terms.values()] + with open(cache_dir + ontology_acronym + "-term-details.json", 'w+') as json_file: + json.dump(term_dicts, json_file, indent=2, cls=TermEncoder) + end = time.time() + print("Ontology-Term-Details JSON serialization time: " + str(end - start)) + def _load_data(self, input_file_path, csv_column_names, separator): if len(csv_column_names) >= 1: term_id_col_name = "" @@ -134,8 +171,8 @@ def _load_data(self, input_file_path, csv_column_names, separator): return terms, term_ids def _load_ontology(self, ontology, iris, exclude_deprecated): - term_collector = OntologyTermCollector(ontology) - onto_terms = term_collector.get_ontology_terms(base_iris=iris, exclude_deprecated=exclude_deprecated) + term_collector = OntologyTermCollector() + onto_terms = term_collector.get_ontology_terms(ontology, base_iris=iris, exclude_deprecated=exclude_deprecated) if len(onto_terms) == 0: raise RuntimeError("Could not find any terms in the given ontology.") return onto_terms @@ -162,6 +199,6 @@ def _save_mappings(self, mappings, output_file): mappings.to_csv(output_file, index=False) def _save_graphs(self, terms, output_file): - term_graphs = TermGraphGenerator().graphs_dicts(terms) + term_graphs = TermGraphGenerator(terms).graphs_dicts() with open(output_file + "-term-graphs.json", 'w') as json_file: json.dump(term_graphs, json_file, indent=2) diff --git a/text2term/term.py b/text2term/term.py index 5eef645..b86c667 100644 --- a/text2term/term.py +++ b/text2term/term.py @@ -3,11 +3,21 @@ class OntologyTerm: - def __init__(self, iri, labels, synonyms, definition, parents=(), children=(), instances=()): + def __init__(self, iri, labels, definitions=(), synonyms=(), parents=(), children=(), instances=()): + """ + Constructor for a succinct representation of an ontology term + :param iri: IRI of the ontology term + :param labels: Set of human-readable labels for the term (e.g., rdfs:label, skos:prefLabel) + :param definitions: Set of textual definitions of the term + :param synonyms: Set of synonyms of the term (e.g., alternative labels) + :param parents: Dictionary containing the IRIs of parent terms (superclasses) and their label(s) + :param children: Dictionary containing the IRIs of child terms (subclasses) and their label(s) + :param instances: Dictionary containing the IRIs of instances of the term (rdf:type) and their label(s) + """ self._iri = iri self._labels = labels self._synonyms = synonyms - self._definition = definition + self._definitions = definitions self._parents = parents self._children = children self._instances = instances @@ -25,8 +35,8 @@ def synonyms(self): return self._synonyms @property - def definition(self): - return self._definition + def definitions(self): + return self._definitions @property def parents(self): @@ -55,5 +65,5 @@ def __hash__(self): def __str__(self): return "Ontology Term: " + self.iri + ", Labels: " + str(self.labels) + ", Synonyms: " + \ - str(self.synonyms) + ", Definition: " + str(self.definition) + ", Parents: " + str(self.parents) + \ + str(self.synonyms) + ", Definitions: " + str(self.definitions) + ", Parents: " + str(self.parents) + \ ", Children: " + str(self.children) + ", Instances: " + str(self.instances) diff 
--git a/text2term/term_collector.py b/text2term/term_collector.py index 63aa313..adf885b 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -1,6 +1,5 @@ """Provides OntologyTermCollector class""" -import logging from owlready2 import * from text2term import onto_utils from text2term.term import OntologyTerm @@ -8,82 +7,99 @@ class OntologyTermCollector: - def __init__(self, ontology_iri): - """" - :param ontology_iri: IRI of the ontology (e.g., path of ontology document in the local file system, URL) - """ - self.logger = onto_utils.get_logger(__name__, logging.INFO) - self.ontology_iri = ontology_iri + def __init__(self): + self.logger = onto_utils.get_logger(__name__) - def get_ontology_terms(self, base_iris=(), use_reasoning=False, exclude_deprecated=False): + def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, exclude_deprecated=False): """ Collect the terms described in the ontology at the specified IRI + :param ontology_iri: IRI of the ontology (e.g., path of ontology document in the local file system, URL) :param base_iris: Limit ontology term collection to terms whose IRIs start with any IRI given in this tuple :param use_reasoning: Use a reasoner to compute inferred class hierarchy :param exclude_deprecated: Exclude ontology terms stated as deprecated using owl:deprecated 'true' - :return: Collection of ontology terms in the specified ontology + :return: Dictionary of ontology term IRIs and their respective details in the specified ontology """ - ontology = self._load_ontology(self.ontology_iri) + ontology = self._load_ontology(ontology_iri) if use_reasoning: self._classify_ontology(ontology) self.logger.info("Collecting ontology term details...") start = time.time() - ontology_terms = [] + ontology_terms = dict() if len(base_iris) > 0: for iri in base_iris: iri = iri.strip() query = iri + "*" self.logger.info("...collecting terms with IRIs starting in: " + iri) iris = list(default_world.search(iri=query)) - ontology_terms.extend(self._get_ontology_terms(iris, ontology, exclude_deprecated)) + ontology_terms = ontology_terms | self._get_ontology_terms(iris, ontology, exclude_deprecated) else: ontology_terms = self._get_ontology_terms(ontology.classes(), ontology, exclude_deprecated) end = time.time() - self.logger.info("...done: collected %i ontology terms (collection time: %.2fs)", len(ontology_terms), end-start) + self.logger.info("...done: collected %i ontology terms (collection time: %.2fs)", len(ontology_terms), + end - start) + # when multiple ontologies are loaded with owlready2, and they reference the same ontology term (IRI), a lookup + # for that IRI returns the term from the first ontology loaded —> need to unload previously loaded ontologies + ontology.destroy() return ontology_terms def _get_ontology_terms(self, term_list, ontology, exclude_deprecated): - ontology_terms = [] + ontology_terms = dict() for ontology_term in term_list: - if not isinstance(ontology_term, PropertyClass) and ontology_term is not Thing and ontology_term is not Nothing: + if not isinstance(ontology_term, PropertyClass) and ontology_term is not Thing \ + and ontology_term is not Nothing: if (exclude_deprecated and not deprecated[ontology_term]) or (not exclude_deprecated): + iri = ontology_term.iri labels = self._get_labels(ontology_term) synonyms = self._get_synonyms(ontology_term) parents = self._get_parents(ontology_term) children = self._get_children(ontology_term, ontology) instances = self._get_instances(ontology_term, ontology) - 
definition = self._get_definition(ontology_term) - term_details = OntologyTerm(ontology_term.iri, labels, synonyms, definition, + definitions = self._get_definitions(ontology_term) + term_details = OntologyTerm(iri, labels, definitions=definitions, synonyms=synonyms, parents=parents, children=children, instances=instances) - ontology_terms.append(term_details) + ontology_terms[iri] = term_details else: self.logger.debug("Excluding deprecated ontology term: %s", ontology_term.iri) return ontology_terms def _get_parents(self, ontology_term): - parents = set() # named/atomic superclasses except owl:Thing + parents = dict() # named/atomic superclasses except owl:Thing try: - all_parents = ontology_term.is_a # obtain all (direct and indirect) parents of this entity + all_parents = ontology_term.is_a # obtain direct parents of this entity for parent in all_parents: # exclude OWL restrictions and owl:Thing and Self if isinstance(parent, ThingClass) and parent is not Thing and parent is not ontology_term: - parents.add(parent) + if len(parent.label) > 0: + parents.update({parent.iri: parent.label[0]}) + else: + parents.update({parent.iri: onto_utils.label_from_iri(parent.iri)}) except AttributeError as err: self.logger.debug(err) return parents def _get_children(self, ontology_term, ontology): - children = set() + children = dict() try: - children = set(ontology.get_children_of(ontology_term)) + for child in ontology.get_children_of(ontology_term): + if len(child.iri) > 0: + if len(child.label) > 0: + children.update({child.iri: child.label[0]}) + else: + children.update({child.iri: onto_utils.label_from_iri(child.iri)}) except (TypeError, AttributeError) as err: self.logger.debug(err) return children def _get_instances(self, ontology_term, ontology): - instances = set() + instances = dict() try: - instances = set(ontology.get_instances_of(ontology_term)) + for instance in ontology.get_instances_of(ontology_term): + if len(instance.iri) > 0: + if len(instance.label) > 0: + instances.update({instance.iri: instance.label[0]}) + else: + instances.update({instance.iri: onto_utils.label_from_iri(instance.iri)}) + instances[instance.iri] = instance.label[0] except AttributeError as err: self.logger.debug(err) return instances @@ -101,8 +117,8 @@ def _get_labels(self, ontology_term): labels.add(skos_label) if len(labels) == 0: label_from_iri = onto_utils.label_from_iri(ontology_term.iri) - self.logger.info("...ontology term %s has no labels (rdfs:label or skos:prefLabel). " - "Using a label based on the term IRI: %s", ontology_term.iri, label_from_iri) + self.logger.debug("...ontology term %s has no labels (rdfs:label or skos:prefLabel). 
" + "Using a label based on the term IRI: %s", ontology_term.iri, label_from_iri) labels.add(label_from_iri) self.logger.debug("...collected %i labels and synonyms for %s", len(labels), ontology_term) return labels @@ -190,12 +206,29 @@ def _get_nci_synonyms(self, ontology_term): self.logger.debug(err) return nci_synonyms - def _get_definition(self, ontology_term): + def _get_definitions(self, ontology_term): """ - Get the definition (if one exists) of the given term as specified using the skos:definition annotation property + Get definitions (if any exist) of the given term as specified using either the skos:definition annotation + property or the IAO_0000115 ('definition') property :param ontology_term: Ontology term to collect definition of - :return: String value of the skos:definition annotation property assertion on the given term + :return: Set of term definition strings """ + definitions = set() + for definition in self._get_skos_definition(ontology_term): + definitions.add(definition) + for definition in self._get_iao_definition(ontology_term): + definitions.add(definition) + return definitions + + def _get_iao_definition(self, ontology_term): + definition = "" + try: + definition = ontology_term.IAO_0000115 + except AttributeError as err: + self.logger.debug(err) + return definition + + def _get_skos_definition(self, ontology_term): definition = "" try: definition = ontology_term.definition @@ -214,7 +247,7 @@ def _load_ontology(self, ontology_iri): ontology = get_ontology(ontology_iri).load() end = time.time() self._log_ontology_metrics(ontology) - self.logger.info("...done (ontology loading time: %.2fs)", end-start) + self.logger.info("...done (ontology loading time: %.2fs)", end - start) return ontology def _classify_ontology(self, ontology): diff --git a/text2term/term_encoder.py b/text2term/term_encoder.py new file mode 100644 index 0000000..eed0ca5 --- /dev/null +++ b/text2term/term_encoder.py @@ -0,0 +1,8 @@ +import json + + +class TermEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return json.JSONEncoder.default(self, obj) diff --git a/text2term/term_graph_generator.py b/text2term/term_graph_generator.py index b0101b5..55ee47d 100644 --- a/text2term/term_graph_generator.py +++ b/text2term/term_graph_generator.py @@ -1,12 +1,12 @@ import onto_utils from term_graph import TermGraph, Node, Edge -from owlready2 import Thing, ThingClass class TermGraphGenerator: - def __init__(self): - pass + def __init__(self, terms): + self._terms = terms + self._logger = onto_utils.get_logger(__name__) def graph(self, term): """ Build and return a graph representing the neighborhood of an ontology term. 
""" @@ -18,22 +18,23 @@ def graph(self, term): return TermGraph(term.iri, nodes, edges) def _add_superclasses(self, term, nodes, edges): - for parent in term.parents: - self._add_node(parent, nodes) - edges.add(Edge(term.iri, parent.iri, Edge.IS_A)) - self._add_ancestors(parent, nodes, edges) - - def _add_ancestors(self, node, nodes, edges): - for ancestor in node.is_a: - if ancestor is not Thing and isinstance(ancestor, ThingClass): - self._add_node(ancestor, nodes) - edges.add(Edge(node.iri, ancestor.iri, Edge.IS_A)) - self._add_ancestors(ancestor, nodes, edges) + parents = term.parents + for parent_iri in parents: + self._add_node(parent_iri, parents[parent_iri], nodes) + edges.add(Edge(term.iri, parent_iri, Edge.IS_A)) + self._add_ancestors(parent_iri, nodes, edges) + + def _add_ancestors(self, node_iri, nodes, edges): + ancestors = self._terms[node_iri].parents + for ancestor_iri in ancestors: + self._add_node(ancestor_iri, ancestors[ancestor_iri], nodes) + edges.add(Edge(node_iri, ancestor_iri, Edge.IS_A)) + self._add_ancestors(ancestor_iri, nodes, edges) def _add_children(self, term, children, edge_type, nodes, edges): - for child in children: - self._add_node(child, nodes) - edges.add(Edge(child.iri, term.iri, edge_type)) + for child_iri in children: + self._add_node(child_iri, children[child_iri], nodes) + edges.add(Edge(child_iri, term.iri, edge_type)) def _add_subclasses(self, term, subclasses, nodes, edges): self._add_children(term, subclasses, Edge.IS_A, nodes, edges) @@ -41,16 +42,24 @@ def _add_subclasses(self, term, subclasses, nodes, edges): def _add_instances(self, term, instances, nodes, edges): self._add_children(term, instances, Edge.INSTANCE_OF, nodes, edges) - def _add_node(self, term, term_set): - if len(term.label) == 0: - label = onto_utils.label_from_iri(term.iri) + def _add_node(self, term_iri, term_label, nodes): + if len(term_iri) > 0: + if isinstance(term_label, list) and len(term_label) > 0: + label = term_label[0] + elif isinstance(term_label, str): + label = term_label + else: + label = onto_utils.label_from_iri(term_iri) + if label is not None and len(label) > 0: + nodes.add(Node(term_iri, label)) + else: + self._logger.debug("Label is null or empty for term " + term_iri) else: - label = term.label[0] - term_set.add(Node(term.iri, label)) + self._logger.debug("The given term has no IRI") - def graphs_dicts(self, terms): + def graphs_dicts(self): """Convenience function to get a list of all term graphs' dictionary representations""" graph_dicts = [] - for term in terms: + for term in self._terms.values(): graph_dicts.append(self.graph(term).as_dict()) return graph_dicts From 4077b8fed219ccdf651e19cd3f8e1e95ce1dc9d2 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 17 Oct 2022 14:14:32 -0400 Subject: [PATCH 047/185] Add examples of caching and loading --- text2term/test_t2t_cache.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 text2term/test_t2t_cache.py diff --git a/text2term/test_t2t_cache.py b/text2term/test_t2t_cache.py new file mode 100644 index 0000000..7f4140a --- /dev/null +++ b/text2term/test_t2t_cache.py @@ -0,0 +1,24 @@ +import json +import pickle +import time +from t2t import Text2Term + +t2t = Text2Term() + +# Cache a single ontology +t2t.cache_ontology("https://github.com/EBISPOT/efo/releases/download/v3.46.0/efo.owl", "EFO_TEST") + +# Cache all ontologies in a CSV file +# 
t2t.cache_ontology_set(ontology_registry_path="/Users/rsgoncalves/Documents/Workspace/text2term/text2term/resources/ontologies.csv") + +# Deserialize a cached ontology to get the ontology-terms dictionary needed for mapping +start = time.time() +# terms = json.load(open("cache/EFO_TEST/EFO_TEST-term-details.json", "rb")) +terms = pickle.load(open("cache/EFO_TEST/EFO_TEST-term-details.pickle", "rb")) +end = time.time() +print("Deserialization time: " + str(end-start)) +print(str(len(terms))) +for term_iri, term_obj in terms.items(): + print(term_iri) + print(term_obj) + break From 3313e8b6e9ffde5f9347435fac5ed27eebff7616 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 18 Oct 2022 11:51:44 -0400 Subject: [PATCH 048/185] Update simple caching test --- text2term/test_t2t_cache.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/text2term/test_t2t_cache.py b/text2term/test_t2t_cache.py index 7f4140a..1b90627 100644 --- a/text2term/test_t2t_cache.py +++ b/text2term/test_t2t_cache.py @@ -1,6 +1,7 @@ import json import pickle import time +from mapper import Mapper from t2t import Text2Term t2t = Text2Term() @@ -22,3 +23,11 @@ print(term_iri) print(term_obj) break + +mappings = t2t._do_mapping(['heart attack', 'alzeimers'], + ['t1', 't2'], + terms, + Mapper.TFIDF, + max_mappings=3, + min_score=0.2) +print(mappings) From c0841cf65ae89f39b121d94e73b7e7f3ec4dd75e Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 19 Oct 2022 10:19:25 -0400 Subject: [PATCH 049/185] Updated Versioning Updated the Version to 1.0.0 and updated the status from Alpha to Beta. Also fixed some formatting issues in README. --- README.md | 10 ++++++++++ setup.py | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9ec7421..b4e876b 100644 --- a/README.md +++ b/README.md @@ -22,24 +22,33 @@ All other arguments are the same, and have the same functionality: `target_ontology` : str Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies + `base_iris` : tuple Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') + `source_terms_ids` : tuple Collection of identifiers for the given source terms + `excl_deprecated` : bool Exclude ontology terms stated as deprecated via `owl:deprecated true` + `mapper` : mapper.Mapper Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, fuzzy, tfidf, zooma, bioportal These can be initialized by invoking mapper.Mapper e.g. `mapper.Mapper.TFIDF` + `max_mappings` : int Maximum number of top-ranked mappings returned per source term + `min_score` : float Minimum similarity score [0,1] for the mappings (1=exact match) + `output_file` : str Path to desired output file for the mappings + `save_graphs` : bool Save vis.js graphs representing the neighborhood of each ontology term + `save_mappings` : bool Save the generated mappings to a file (specified by `output_file`) @@ -47,6 +56,7 @@ All default values, if they exist, can be seen above. 
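For illustration, a minimal sketch of how these optional arguments can be combined in a single call (the term list, EFO URL, and output filename are placeholders taken from the examples elsewhere in this README):

```
import text2term
from text2term import Mapper

# Keep the top 2 TF-IDF candidates per source term, require a similarity of at least 0.5,
# skip deprecated ontology terms, and save the resulting mappings to a CSV file.
df = text2term.map_terms(["asthma", "colon cancer"],
                         "http://www.ebi.ac.uk/efo/efo.owl",
                         mapper=Mapper.TFIDF,
                         max_mappings=2,
                         min_score=0.5,
                         excl_deprecated=True,
                         output_file="mappings.csv",
                         save_mappings=True)
```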
### Return Value Both functions return the same value: + `df` : Data frame containing the generated ontology mappings ## Command Line Usage diff --git a/setup.py b/setup.py index 13f7b24..01ab603 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -version = '0.4.1' +version = '1.0.0' description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology' long_description = open('README.md').read() @@ -21,7 +21,7 @@ author='Center for Computational Biomedicine, Harvard Medical School', author_email='rafael_goncalves@hms.harvard.edu', classifiers=[ - 'Development Status :: 3 - Alpha', + 'Development Status :: 4 - Beta', 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', 'Programming Language :: Python', From b594d364f2ea32ad3ae0b963555cbf2cefe0e12d Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Tue, 8 Nov 2022 14:35:37 -0500 Subject: [PATCH 050/185] Finishing Cache Changes Implemented the specifics of the cache and solidified the core functionality. --- text2term/onto_utils.py | 2 +- text2term/t2t.py | 374 ++++++++++++++++++------------------ text2term/term.py | 7 +- text2term/term_collector.py | 17 +- text2term/test_t2t_cache.py | 33 ---- text2term/tfidf_mapper.py | 2 +- 6 files changed, 212 insertions(+), 223 deletions(-) delete mode 100644 text2term/test_t2t_cache.py diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py index 63fe747..06e4d0c 100644 --- a/text2term/onto_utils.py +++ b/text2term/onto_utils.py @@ -94,7 +94,7 @@ def _get_iri(ont_name, term_name): return iri -def get_logger(name, level): +def get_logger(name, level=logging.INFO): formatter = logging.Formatter("%(asctime)s %(levelname)s [%(name)s]: %(message)s", "%Y-%m-%d %H:%M:%S") logger = logging.getLogger(name) logger.setLevel(level=level) diff --git a/text2term/t2t.py b/text2term/t2t.py index 0cc0053..801fc2b 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -6,7 +6,9 @@ import time import datetime import onto_utils +import owlready2 import pandas as pd +from shutil import rmtree from mapper import Mapper from term_collector import OntologyTermCollector from term_graph_generator import TermGraphGenerator @@ -16,189 +18,191 @@ from zooma_mapper import ZoomaMapper from term_encoder import TermEncoder - -class Text2Term: - """ Main class in text2term package """ - - def __init__(self): - pass - - def map_file(self, input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, - mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False, - separator=','): - """ - Map the terms in the given input file to the specified target ontology. - - Parameters - ---------- - input_file : str - Path to input file containing 'source' terms to map to ontology terms (list of terms or CSV file) - target_ontology : str - Path or URL of 'target' ontology to map the source terms to. 
When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies - base_iris : tuple - Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: - ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') - csv_columns : tuple - Name of the column that contains the terms to map, optionally followed by the name of the column that - contains identifiers for the terms (eg 'my_terms,my_term_ids') - separator : str - Specifies the cell separator to be used when reading a non-comma-separated tabular file - excl_deprecated : bool - Exclude ontology terms stated as deprecated via `owl:deprecated true` - mapper : mapper.Mapper - Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, - fuzzy, tfidf, zooma, bioportal - max_mappings : int - Maximum number of top-ranked mappings returned per source term - min_score : float - Minimum similarity score [0,1] for the mappings (1=exact match) - output_file : str - Path to desired output file for the mappings - save_graphs : bool - Save vis.js graphs representing the neighborhood of each ontology term - save_mappings : bool - Save the generated mappings to a file (specified by `output_file`) - - Returns - ---------- - df - Data frame containing the generated ontology mappings - """ - source_terms, source_terms_ids = self._load_data(input_file, csv_columns, separator) - return self.map(source_terms, target_ontology, source_terms_ids=source_terms_ids, base_iris=base_iris, - excl_deprecated=excl_deprecated, max_mappings=max_mappings, mapper=mapper, min_score=min_score, - output_file=output_file, save_graphs=save_graphs, save_mappings=save_mappings) - - def map(self, source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, - mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=()): - """ - Map the terms in the given list to the specified target ontology. - - Parameters - ---------- - source_terms : list - List of 'source' terms to map to ontology terms - target_ontology : str - Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies - base_iris : tuple - Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: - ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') - source_terms_ids : tuple - Collection of identifiers for the given source terms - excl_deprecated : bool - Exclude ontology terms stated as deprecated via `owl:deprecated true` - mapper : mapper.Mapper - Method used to compare source terms with ontology terms. 
One of: levenshtein, jaro, jarowinkler, jaccard, - fuzzy, tfidf, zooma, bioportal - max_mappings : int - Maximum number of top-ranked mappings returned per source term - min_score : float - Minimum similarity score [0,1] for the mappings (1=exact match) - output_file : str - Path to desired output file for the mappings - save_graphs : bool - Save vis.js graphs representing the neighborhood of each ontology term - save_mappings : bool - Save the generated mappings to a file (specified by `output_file`) - - Returns - ---------- - df - Data frame containing the generated ontology mappings - """ - if len(source_terms_ids) != len(source_terms): - source_terms_ids = onto_utils.generate_iris(len(source_terms)) - if output_file == '': - timestamp = datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") - output_file = "t2t-mappings-" + timestamp + ".csv" - if mapper in {Mapper.ZOOMA, Mapper.BIOPORTAL}: - target_terms = '' if target_ontology.lower() == 'all' else target_ontology - else: - target_terms = self._load_ontology(target_ontology, base_iris, excl_deprecated) - mappings_df = self._do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score) - if save_mappings: - self._save_mappings(mappings_df, output_file) - if save_graphs: - self._save_graphs(target_terms, output_file) - return mappings_df - - def cache_ontology_set(self, ontology_registry_path): - registry = pd.read_csv(ontology_registry_path) - for index, row in registry.iterrows(): - self.cache_ontology(row.url, row.acronym) - - def cache_ontology(self, ontology_url, ontology_acronym, base_iris=()): - ontology_terms = self._load_ontology(ontology_url, base_iris, exclude_deprecated=False) - cache_dir = "cache/" + ontology_acronym + "/" - if not os.path.exists(cache_dir): - os.makedirs(cache_dir) - - self.serialize_ontology_json(ontology_terms, ontology_acronym, cache_dir) - - self.serialize_ontology_pickle(ontology_terms, ontology_acronym, cache_dir) - - self._save_graphs(ontology_terms, output_file=cache_dir + ontology_acronym) - ontology_terms.clear() - - def serialize_ontology_pickle(self, ontology_terms, ontology_acronym, cache_dir): - start = time.time() - with open(cache_dir + ontology_acronym + "-term-details.pickle", 'wb+') as out_file: - pickle.dump(ontology_terms, out_file) - end = time.time() - print("Ontology-Term-Details Pickle serialization time: " + str(end - start)) - - def serialize_ontology_json(self, ontology_terms, ontology_acronym, cache_dir): - start = time.time() - term_dicts = [t.__dict__ for t in ontology_terms.values()] - with open(cache_dir + ontology_acronym + "-term-details.json", 'w+') as json_file: - json.dump(term_dicts, json_file, indent=2, cls=TermEncoder) - end = time.time() - print("Ontology-Term-Details JSON serialization time: " + str(end - start)) - - def _load_data(self, input_file_path, csv_column_names, separator): - if len(csv_column_names) >= 1: - term_id_col_name = "" - if len(csv_column_names) == 2: - term_id_col_name = csv_column_names[1] - terms, term_ids = onto_utils.parse_csv_file(input_file_path, separator=separator, - term_column_name=csv_column_names[0], - term_id_column_name=term_id_col_name) - else: - terms = onto_utils.parse_list_file(input_file_path) - term_ids = onto_utils.generate_iris(len(terms)) - return terms, term_ids - - def _load_ontology(self, ontology, iris, exclude_deprecated): - term_collector = OntologyTermCollector() +def map_file(input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, + 
mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False, + separator=',', use_cache=False): + """ + Map the terms in the given input file to the specified target ontology. + + Parameters + ---------- + input_file : str + Path to input file containing 'source' terms to map to ontology terms (list of terms or CSV file) + target_ontology : str + Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies + base_iris : tuple + Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: + ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') + csv_columns : tuple + Name of the column that contains the terms to map, optionally followed by the name of the column that + contains identifiers for the terms (eg 'my_terms,my_term_ids') + separator : str + Specifies the cell separator to be used when reading a non-comma-separated tabular file + excl_deprecated : bool + Exclude ontology terms stated as deprecated via `owl:deprecated true` + mapper : mapper.Mapper + Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, + fuzzy, tfidf, zooma, bioportal + max_mappings : int + Maximum number of top-ranked mappings returned per source term + min_score : float + Minimum similarity score [0,1] for the mappings (1=exact match) + output_file : str + Path to desired output file for the mappings + save_graphs : bool + Save vis.js graphs representing the neighborhood of each ontology term + save_mappings : bool + Save the generated mappings to a file (specified by `output_file`) + + Returns + ---------- + df + Data frame containing the generated ontology mappings + """ + source_terms, source_terms_ids = _load_data(input_file, csv_columns, separator) + return map_terms(source_terms, target_ontology, source_terms_ids=source_terms_ids, base_iris=base_iris, + excl_deprecated=excl_deprecated, max_mappings=max_mappings, mapper=mapper, min_score=min_score, + output_file=output_file, save_graphs=save_graphs, save_mappings=save_mappings, use_cache=use_cache) + +def map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, + mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), use_cache=False): + """ + Map the terms in the given list to the specified target ontology. + + Parameters + ---------- + source_terms : list + List of 'source' terms to map to ontology terms + target_ontology : str + Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies + base_iris : tuple + Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: + ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') + source_terms_ids : tuple + Collection of identifiers for the given source terms + excl_deprecated : bool + Exclude ontology terms stated as deprecated via `owl:deprecated true` + mapper : mapper.Mapper + Method used to compare source terms with ontology terms. 
One of: levenshtein, jaro, jarowinkler, jaccard, + fuzzy, tfidf, zooma, bioportal + max_mappings : int + Maximum number of top-ranked mappings returned per source term + min_score : float + Minimum similarity score [0,1] for the mappings (1=exact match) + output_file : str + Path to desired output file for the mappings + save_graphs : bool + Save vis.js graphs representing the neighborhood of each ontology term + save_mappings : bool + Save the generated mappings to a file (specified by `output_file`) + + Returns + ---------- + df + Data frame containing the generated ontology mappings + """ + if len(source_terms_ids) != len(source_terms): + source_terms_ids = onto_utils.generate_iris(len(source_terms)) + if output_file == '': + timestamp = datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") + output_file = "t2t-mappings-" + timestamp + ".csv" + if mapper in {Mapper.ZOOMA, Mapper.BIOPORTAL}: + target_terms = '' if target_ontology.lower() == 'all' else target_ontology + else: + target_terms = _load_ontology(target_ontology, base_iris, excl_deprecated, use_cache) + mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score) + if save_mappings: + _save_mappings(mappings_df, output_file) + if save_graphs: + _save_graphs(target_terms, output_file) + return mappings_df + +def cache_ontology_set(ontology_registry_path): + registry = pd.read_csv(ontology_registry_path) + for index, row in registry.iterrows(): + try: + cache_ontology(row.url, row.acronym) + except Exception as err: + print("Could not cache ontology ", row.acronym, " due to error: ", err) + owlready2.default_world.ontologies.clear() + +def cache_ontology(ontology_url, ontology_acronym, base_iris=()): + ontology_terms = _load_ontology(ontology_url, base_iris, exclude_deprecated=False) + cache_dir = "cache/" + ontology_acronym + "/" + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + _serialize_ontology(ontology_terms, ontology_acronym, cache_dir) + _save_graphs(ontology_terms, output_file=cache_dir + ontology_acronym) + ontology_terms.clear() + +def clear_cache(ontology_acronym=''): + cache_dir = "cache/" + if ontology_acronym != '': + cache_dir = os.path.join(cache_dir, ontology_acronym) + # rm -r cache_dir + try: + rmtree(cache_dir) + print("Cache has been cleared successfully") + except OSError as error: + print("Cache cannot be removed:") + print(error) + +def _serialize_ontology(ontology_terms, ontology_acronym, cache_dir): + start = time.time() + with open(cache_dir + ontology_acronym + "-term-details.pickle", 'wb+') as out_file: + pickle.dump(ontology_terms, out_file) + end = time.time() + +def _load_data(input_file_path, csv_column_names, separator): + if len(csv_column_names) >= 1: + term_id_col_name = "" + if len(csv_column_names) == 2: + term_id_col_name = csv_column_names[1] + terms, term_ids = onto_utils.parse_csv_file(input_file_path, separator=separator, + term_column_name=csv_column_names[0], + term_id_column_name=term_id_col_name) + else: + terms = onto_utils.parse_list_file(input_file_path) + term_ids = onto_utils.generate_iris(len(terms)) + return terms, term_ids + +def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False): + term_collector = OntologyTermCollector() + if use_cache: + pickle_file = "cache/" + ontology + "/" + ontology + "-term-details.pickle" + onto_terms_unfiltered = pickle.load(open(pickle_file, "rb")) + onto_terms = term_collector.filter_terms(onto_terms_unfiltered, iris, exclude_deprecated) + else: onto_terms = 
term_collector.get_ontology_terms(ontology, base_iris=iris, exclude_deprecated=exclude_deprecated) - if len(onto_terms) == 0: - raise RuntimeError("Could not find any terms in the given ontology.") - return onto_terms - - def _do_mapping(self, source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score): - if mapper == Mapper.TFIDF: - term_mapper = TFIDFMapper(ontology_terms) - return term_mapper.map(source_terms, source_term_ids, max_mappings=max_mappings, min_score=min_score) - elif mapper == Mapper.ZOOMA: - term_mapper = ZoomaMapper() - return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) - elif mapper == Mapper.BIOPORTAL: - term_mapper = BioPortalAnnotatorMapper("8f0cbe43-2906-431a-9572-8600d3f4266e") - return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) - elif mapper in {Mapper.LEVENSHTEIN, Mapper.JARO, Mapper.JARO_WINKLER, Mapper.INDEL, Mapper.FUZZY, Mapper.JACCARD}: - term_mapper = SyntacticMapper(ontology_terms) - return term_mapper.map(source_terms, source_term_ids, mapper, max_mappings=max_mappings) - else: - raise ValueError("Unsupported mapper: " + mapper) - - def _save_mappings(self, mappings, output_file): - if os.path.dirname(output_file): # create output directories if needed - os.makedirs(os.path.dirname(output_file), exist_ok=True) - mappings.to_csv(output_file, index=False) - - def _save_graphs(self, terms, output_file): - term_graphs = TermGraphGenerator(terms).graphs_dicts() - with open(output_file + "-term-graphs.json", 'w') as json_file: - json.dump(term_graphs, json_file, indent=2) + if len(onto_terms) == 0: + raise RuntimeError("Could not find any terms in the given ontology.") + return onto_terms + +def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score): + if mapper == Mapper.TFIDF: + term_mapper = TFIDFMapper(ontology_terms) + return term_mapper.map(source_terms, source_term_ids, max_mappings=max_mappings, min_score=min_score) + elif mapper == Mapper.ZOOMA: + term_mapper = ZoomaMapper() + return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + elif mapper == Mapper.BIOPORTAL: + term_mapper = BioPortalAnnotatorMapper("8f0cbe43-2906-431a-9572-8600d3f4266e") + return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + elif mapper in {Mapper.LEVENSHTEIN, Mapper.JARO, Mapper.JARO_WINKLER, Mapper.INDEL, Mapper.FUZZY, Mapper.JACCARD}: + term_mapper = SyntacticMapper(ontology_terms) + return term_mapper.map(source_terms, source_term_ids, mapper, max_mappings=max_mappings) + else: + raise ValueError("Unsupported mapper: " + mapper) + +def _save_mappings(mappings, output_file): + if os.path.dirname(output_file): # create output directories if needed + os.makedirs(os.path.dirname(output_file), exist_ok=True) + mappings.to_csv(output_file, index=False) + +def _save_graphs(terms, output_file): + term_graphs = TermGraphGenerator(terms).graphs_dicts() + with open(output_file + "-term-graphs.json", 'w') as json_file: + json.dump(term_graphs, json_file, indent=2) diff --git a/text2term/term.py b/text2term/term.py index b86c667..d24f35e 100644 --- a/text2term/term.py +++ b/text2term/term.py @@ -3,7 +3,7 @@ class OntologyTerm: - def __init__(self, iri, labels, definitions=(), synonyms=(), parents=(), children=(), instances=()): + def __init__(self, iri, labels, definitions=(), synonyms=(), parents=(), 
children=(), instances=(), deprecated=False): """ Constructor for a succinct representation of an ontology term :param iri: IRI of the ontology term @@ -21,6 +21,7 @@ def __init__(self, iri, labels, definitions=(), synonyms=(), parents=(), childre self._parents = parents self._children = children self._instances = instances + self._deprecated = deprecated @property def iri(self): @@ -55,6 +56,10 @@ def label(self): """Return a single label for this term""" return next(iter(self.labels)) + @property + def deprecated(self): + return self._deprecated + def __eq__(self, other): if isinstance(other, OntologyTerm): return self._iri == other._iri diff --git a/text2term/term_collector.py b/text2term/term_collector.py index adf885b..3f9338f 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -39,9 +39,20 @@ def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, ex end - start) # when multiple ontologies are loaded with owlready2, and they reference the same ontology term (IRI), a lookup # for that IRI returns the term from the first ontology loaded —> need to unload previously loaded ontologies - ontology.destroy() + try: + ontology.destroy() + except Exception as err: + print("Unable to destroy ontology: ", err) return ontology_terms + def filter_terms(self, onto_terms, iris=(), excl_deprecated=False): + for term in onto_terms: + begins_with_iri = (iris == ()) or any(term.iri().startswith(iri) for iri in iris) + is_not_depricated = (excl_deprecated and not term.deprecated()) or (not excl_deprecated) + if not (begins_with_iri and is_not_depricated): + onto_terms.pop(term.iri()) + return onto_terms + def _get_ontology_terms(self, term_list, ontology, exclude_deprecated): ontology_terms = dict() for ontology_term in term_list: @@ -55,8 +66,10 @@ def _get_ontology_terms(self, term_list, ontology, exclude_deprecated): children = self._get_children(ontology_term, ontology) instances = self._get_instances(ontology_term, ontology) definitions = self._get_definitions(ontology_term) + is_deprecated = deprecated[ontology_term] == [True] term_details = OntologyTerm(iri, labels, definitions=definitions, synonyms=synonyms, - parents=parents, children=children, instances=instances) + parents=parents, children=children, instances=instances, + deprecated=is_deprecated) ontology_terms[iri] = term_details else: self.logger.debug("Excluding deprecated ontology term: %s", ontology_term.iri) diff --git a/text2term/test_t2t_cache.py b/text2term/test_t2t_cache.py deleted file mode 100644 index 1b90627..0000000 --- a/text2term/test_t2t_cache.py +++ /dev/null @@ -1,33 +0,0 @@ -import json -import pickle -import time -from mapper import Mapper -from t2t import Text2Term - -t2t = Text2Term() - -# Cache a single ontology -t2t.cache_ontology("https://github.com/EBISPOT/efo/releases/download/v3.46.0/efo.owl", "EFO_TEST") - -# Cache all ontologies in a CSV file -# t2t.cache_ontology_set(ontology_registry_path="/Users/rsgoncalves/Documents/Workspace/text2term/text2term/resources/ontologies.csv") - -# Deserialize a cached ontology to get the ontology-terms dictionary needed for mapping -start = time.time() -# terms = json.load(open("cache/EFO_TEST/EFO_TEST-term-details.json", "rb")) -terms = pickle.load(open("cache/EFO_TEST/EFO_TEST-term-details.pickle", "rb")) -end = time.time() -print("Deserialization time: " + str(end-start)) -print(str(len(terms))) -for term_iri, term_obj in terms.items(): - print(term_iri) - print(term_obj) - break - -mappings = t2t._do_mapping(['heart attack', 
'alzeimers'], - ['t1', 't2'], - terms, - Mapper.TFIDF, - max_mappings=3, - min_score=0.2) -print(mappings) diff --git a/text2term/tfidf_mapper.py b/text2term/tfidf_mapper.py index 99817bf..098c04f 100644 --- a/text2term/tfidf_mapper.py +++ b/text2term/tfidf_mapper.py @@ -84,7 +84,7 @@ def _get_mappings(self, results_mtx, max_mappings, source_terms, source_terms_id def _get_target_labels_terms(self, ontology_terms): """Get lists of labels and terms to enable retrieving terms from their labels""" target_labels, target_terms = [], [] - for term in ontology_terms: + for term in ontology_terms.values(): for label in term.labels: target_labels.append(label) target_terms.append(term) From eb0da2ac248552d994faaaac3f9fe90e4fccc3ad Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 9 Nov 2022 11:04:11 -0500 Subject: [PATCH 051/185] Update README and version Updates the README to include information on caching. Also increments the version number to 1.1.0 and removes the unnecessary term_encoder.py. --- README.md | 35 +++++++++++++++++++++++++++++++---- setup.py | 2 +- text2term/t2t.py | 6 ++++++ text2term/term_encoder.py | 8 -------- 4 files changed, 38 insertions(+), 13 deletions(-) delete mode 100644 text2term/term_encoder.py diff --git a/README.md b/README.md index b4e876b..9b97d10 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,11 @@ Install package using **pip**: `pip install text2term` The tool can be executed in Python with either of the two following functions: -`text2term.map_files(input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, mapper=Mapper.TFIDF,min_score=0.3, output_file='', save_graphs=False, save_mappings=False, separator=',')` +`text2term.map_files(input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, mapper=Mapper.TFIDF,min_score=0.3, output_file='', save_graphs=False, save_mappings=False, separator=',', use_cache=False)` or -`map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=())` +`map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), use_cache=False)` ### Arguments For `map_files`, the first argument 'input_file' specifies a path to a file containing the names of every term that needs to be mapped. For `map_terms`, The first argument 'source_terms' takes in a list of the terms to be mapped. @@ -59,6 +59,26 @@ Both functions return the same value: `df` : Data frame containing the generated ontology mappings +### Caching +As of version 1.1.0, users can now cache ontologies that they want to use regularly or quickly. Programatically, there are two steps to using the cache: creating the cache, then accessing it. First, the user can cache ontologies using either of two functions: + +`cache_ontology(ontology_url, ontology_acronym, base_iris=())` +Or +`cache_ontology_set(ontology_registry_path)` + +The first of these will cache a single ontology from a URL or file path, with it being referenced by an acronym that will be used to reference it later. An example can be found below. 
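As an immediate sketch (the full example appears in the Examples section below; the EFO URL and acronym here are only placeholders for whichever ontology is being cached):
```
import text2term

text2term.cache_ontology("http://www.ebi.ac.uk/efo/efo.owl", "EFO")
df = text2term.map_terms(["asthma", "colon cancer"], "EFO", use_cache=True)
```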
+The second function allows the user to cache several ontologies at once by referencing a CSV file of the format: +`acronym,name,version,date,url,comments` + +Once an ontology has been cached by either function, it is stored in a cache folder locally, and thus can be referenced even in different Python instances. + +NOTE: Due to how ontologies are processed in memory, `cache_ontology_set` must be used to cache multiple ontologies in a single Python instance. If `cache_ontology` is used multiple times in one instance, the behavior is undefined and may cause visible or invisible errors. + +After an ontology is cached, the user can access the cache by using the assigned acronym in the place of `target_ontology` and setting the `use_cache` flag to `True`. +To clear the cache, one can call: +`clear_cache(ontology_acronym='')` +If no arguments are specified, the entire cache will be cleared. Otherwise, only the ontology with the given acronym will be cleared. + ## Command Line Usage Install package using **pip**: @@ -103,8 +123,15 @@ To display a help message with descriptions of tool arguments do: import text2term import pandas -df1 = text2term.map_file(unstruct_terms.txt, http://www.ebi.ac.uk/efo/efo.owl) -df2 = text2term.map_terms(["asthma", "colon cancer"], http://www.ebi.ac.uk/efo/efo.owl) +df1 = text2term.map_file(unstruct_terms.txt, "http://www.ebi.ac.uk/efo/efo.owl") +df2 = text2term.map_terms(["asthma", "colon cancer"], "http://www.ebi.ac.uk/efo/efo.owl") +``` +Below is an example of caching, assuming the same imports as above: +``` +text2term.cache_ontology("http://www.ebi.ac.uk/efo/efo.owl", "EFO") +df1 = text2term.map_file(unstruct_terms.txt, "EFO") +df2 = text2term.map_terms(["asthma", "colon cancer"], "EFO") +text2term.clear_cache("EFO") ``` ### Command Line diff --git a/setup.py b/setup.py index 01ab603..7674f8e 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -version = '1.0.0' +version = '1.1.0' description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology' long_description = open('README.md').read() diff --git a/text2term/t2t.py b/text2term/t2t.py index b1db6b4..f0ed2f8 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -118,6 +118,9 @@ def map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False _save_graphs(target_terms, output_file) return mappings_df +""" +CACHING FUNCTIONS -- Public +""" def cache_ontology_set(ontology_registry_path): registry = pd.read_csv(ontology_registry_path) for index, row in registry.iterrows(): @@ -149,6 +152,9 @@ def clear_cache(ontology_acronym=''): print("Cache cannot be removed:") print(error) +""" +PRIVATE/HELPER FUNCTIONS +""" def _serialize_ontology(ontology_terms, ontology_acronym, cache_dir): start = time.time() with open(cache_dir + ontology_acronym + "-term-details.pickle", 'wb+') as out_file: diff --git a/text2term/term_encoder.py b/text2term/term_encoder.py deleted file mode 100644 index eed0ca5..0000000 --- a/text2term/term_encoder.py +++ /dev/null @@ -1,8 +0,0 @@ -import json - - -class TermEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - return list(obj) - return json.JSONEncoder.default(self, obj) From ead2bf9327abc9b6707d2b9977f7bf28ca23b3e2 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Thu, 10 Nov 2022 15:05:11 -0500 Subject: [PATCH 052/185] Command-line Cache Adds cache functionality to the command-line interface, as well as updates the README to reflect it. 
--- README.md | 7 +++++++ text2term/__main__.py | 20 ++++++++++---------- text2term/t2t.py | 7 +++++++ 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 9b97d10..4f36df8 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,7 @@ After an ontology is cached, the user can access the cache by using the assigned To clear the cache, one can call: `clear_cache(ontology_acronym='')` If no arguments are specified, the entire cache will be cleared. Otherwise, only the ontology with the given acronym will be cleared. +Finally, `cache_exists(ontology_acronym)` is a simple program that returns `True` if the given acronym exists in the cache, and `False` otherwise. It is worth noting that while ontology URLs can repeat, acronyms must be distinct in a given environment. ## Command Line Usage @@ -116,6 +117,7 @@ To display a help message with descriptions of tool arguments do: `-g SAVE_TERM_GRAPHS` Save [vis.js](https://visjs.org) graphs representing the neighborhood of each ontology term. +`-c STORE_IN_CACHE` Using this flag followed by the acronym the ontology should be stored as, the program will same the target ontology to the cache. After that, referencing the acronym in `target` will reference the cache. Examples are below. ## Examples ### Programmatic @@ -151,3 +153,8 @@ Exclude deprecated ontology terms (declared as such via *owl:deprecated true*) u Limit search to only terms whose IRIs start with any IRI given in a list specified using `-iris`: `python text2term.py -s unstruct_terms.txt -t efo.owl -iris http://www.ebi.ac.uk/efo/EFO,http://purl.obolibrary.org/obo/HP` Here, because EFO reuses terms from other ontologies such as HP and GO, the HP terms would be included but the GO terms would be excluded. + +Use the cache on the command line, first by flagging it, then in the future using the acronym: +`python text2term -s unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl -c EFO` +Then, after running this, the following command is equivalent: +`python text2term -s unstruct_terms.txt -t EFO` diff --git a/text2term/__main__.py b/text2term/__main__.py index 83ef21b..e08c1e8 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -1,7 +1,7 @@ import argparse import os import sys -from t2t import map_file +from t2t import map_file, cache_ontology, cache_exists from mapper import Mapper if __name__ == "__main__": @@ -35,14 +35,9 @@ help="Exclude ontology terms stated as deprecated via `owl:deprecated true` (default=False)") parser.add_argument("-g", "--save_term_graphs", required=False, default=False, action="store_true", help="Save vis.js graphs representing the neighborhood of each ontology term (default=False)") + parser.add_argument("-c", "--store_in_cache", required=False, type=str, default="", + help="Store the target ontology into local cache under acronym") - # TODO mapping to a cached ontology should be possible through the command-line interface - parser.add_argument("-rc", "--read_from_cache", required=False, default=False, action="store_true", - help="Load the target ontology from local cache") - # TODO writing to cache is an entirely different operation (mapping has 2 required arguments, caching has none)... 
- parser.add_argument("-wc", "--write_cache", required=False, default=False, action="store_true", - help="Write a local cache of ontology term details from the ontologies specified in the file: " - "'resources/ontologies.csv' for faster performance when mapping terms to those ontologies") arguments = parser.parse_args() if not os.path.exists(arguments.source): parser.error("The file '{}' does not exist".format(arguments.source)) @@ -54,7 +49,12 @@ csv_columns = arguments.csv_input if len(csv_columns) > 0: csv_columns = tuple(csv_columns.split(',')) - map_file(arguments.source, arguments.target, output_file=arguments.output, csv_columns=csv_columns, + target = arguments.target + acronym = arguments.store_in_cache + if acronym != "": + cache_ontology(target, acronym, iris) + target = acronym + map_file(arguments.source, target, output_file=arguments.output, csv_columns=csv_columns, excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings, min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs, - save_mappings=True, separator=arguments.separator) + save_mappings=True, separator=arguments.separator, use_cache=cache_exists(target)) diff --git a/text2term/t2t.py b/text2term/t2t.py index f0ed2f8..0c6a245 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -121,6 +121,7 @@ def map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False """ CACHING FUNCTIONS -- Public """ +# Caches many ontologies from a csv def cache_ontology_set(ontology_registry_path): registry = pd.read_csv(ontology_registry_path) for index, row in registry.iterrows(): @@ -130,6 +131,7 @@ def cache_ontology_set(ontology_registry_path): print("Could not cache ontology ", row.acronym, " due to error: ", err) owlready2.default_world.ontologies.clear() +# Caches a single ontology def cache_ontology(ontology_url, ontology_acronym, base_iris=()): ontology_terms = _load_ontology(ontology_url, base_iris, exclude_deprecated=False) cache_dir = "cache/" + ontology_acronym + "/" @@ -140,6 +142,11 @@ def cache_ontology(ontology_url, ontology_acronym, base_iris=()): _save_graphs(ontology_terms, output_file=cache_dir + ontology_acronym) ontology_terms.clear() +# Will check if an acronym exists in the cache +def cache_exists(ontology_acronym): + return os.path.exists("cache/" + ontology_acronym) + +# Clears the cache def clear_cache(ontology_acronym=''): cache_dir = "cache/" if ontology_acronym != '': From abbcb3b373c88b2e159a9d8c1978ab5fd22952ac Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 16 Nov 2022 10:52:15 -0500 Subject: [PATCH 053/185] PyPi Upload Bugs Fixes a minor bug needed to upload to PyPi. Also corrects the README for an example in caching. 
--- README.md | 4 ++-- text2term/__init__.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4f36df8..d5965d2 100644 --- a/README.md +++ b/README.md @@ -131,8 +131,8 @@ df2 = text2term.map_terms(["asthma", "colon cancer"], "http://www.ebi.ac.uk/efo/ Below is an example of caching, assuming the same imports as above: ``` text2term.cache_ontology("http://www.ebi.ac.uk/efo/efo.owl", "EFO") -df1 = text2term.map_file(unstruct_terms.txt, "EFO") -df2 = text2term.map_terms(["asthma", "colon cancer"], "EFO") +df1 = text2term.map_file(unstruct_terms.txt, "EFO", use_cache=True) +df2 = text2term.map_terms(["asthma", "colon cancer"], "EFO", use_cache=True) text2term.clear_cache("EFO") ``` diff --git a/text2term/__init__.py b/text2term/__init__.py index 4287821..7a75bc7 100644 --- a/text2term/__init__.py +++ b/text2term/__init__.py @@ -1,3 +1,7 @@ from .t2t import map_terms from .t2t import map_file +from .t2t import cache_ontology +from .t2t import cache_ontology_set +from .t2t import clear_cache +from .t2t import cache_exists from .mapper import Mapper \ No newline at end of file From 030adae8d3a928eecbb6e47338d7dc7ca6224fb4 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Mon, 19 Dec 2022 11:44:57 -0500 Subject: [PATCH 054/185] Bug Fixes in term_collector.py Fixes the known bugs in the filtering function, which prevented the user from filtering on the base IRIs while using a cached ontology. Also fixed a bug that prevented the tool from working with stored terms. --- setup.py | 2 +- text2term/__init__.py | 4 ++++ text2term/term_collector.py | 17 ++++++++++------- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 7674f8e..38822c5 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -version = '1.1.0' +version = '1.1.1' description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology' long_description = open('README.md').read() diff --git a/text2term/__init__.py b/text2term/__init__.py index 4287821..ef2efad 100644 --- a/text2term/__init__.py +++ b/text2term/__init__.py @@ -1,3 +1,7 @@ from .t2t import map_terms from .t2t import map_file +from .t2t import cache_ontology_set +from .t2t import cache_ontology +from .t2t import cache_exists +from .t2t import clear_cache from .mapper import Mapper \ No newline at end of file diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 3f9338f..b0c02e8 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -46,12 +46,16 @@ def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, ex return ontology_terms def filter_terms(self, onto_terms, iris=(), excl_deprecated=False): - for term in onto_terms: - begins_with_iri = (iris == ()) or any(term.iri().startswith(iri) for iri in iris) - is_not_depricated = (excl_deprecated and not term.deprecated()) or (not excl_deprecated) - if not (begins_with_iri and is_not_depricated): - onto_terms.pop(term.iri()) - return onto_terms + filtered_onto_terms = {} + for base_iri, term in onto_terms.items(): + if type(iris) == str: + begins_with_iri = (iris == ()) or base_iri.startswith(iris) + else: + begins_with_iri = (iris == ()) or any(base_iri.startswith(iri) for iri in iris) + is_not_depricated = (not excl_deprecated) or (not term.deprecated) + if begins_with_iri and is_not_depricated: + filtered_onto_terms.update({base_iri: term}) + return filtered_onto_terms def 
_get_ontology_terms(self, term_list, ontology, exclude_deprecated): ontology_terms = dict() @@ -112,7 +116,6 @@ def _get_instances(self, ontology_term, ontology): instances.update({instance.iri: instance.label[0]}) else: instances.update({instance.iri: onto_utils.label_from_iri(instance.iri)}) - instances[instance.iri] = instance.label[0] except AttributeError as err: self.logger.debug(err) return instances From 20160dd19d1bd99324cd78c024f166eb5b2ae4e2 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Tue, 20 Dec 2022 13:23:34 -0500 Subject: [PATCH 055/185] Update .gitignore Updates .gitignore to include the scripts used to update PyPi, which are not publicly available but must live in the directory. --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index c67a9c3..96637bf 100644 --- a/.gitignore +++ b/.gitignore @@ -84,6 +84,10 @@ ipython_config.py # pyenv .python-version +# For PyPi upload +make-pypi.sh +test-pypi.py + # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies From c6b0328f0cd500657f1a3a13a509ecc1c3966a4a Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Fri, 13 Jan 2023 11:23:11 -0500 Subject: [PATCH 056/185] Added preprocessing Added the preprocessing functionality and initial basic documentation. --- README.md | 10 ++++++++++ setup.py | 2 +- text2term/__init__.py | 4 +++- text2term/preprocess.py | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 2 deletions(-) create mode 100644 text2term/preprocess.py diff --git a/README.md b/README.md index d5965d2..6b290ed 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,16 @@ To clear the cache, one can call: If no arguments are specified, the entire cache will be cleared. Otherwise, only the ontology with the given acronym will be cleared. Finally, `cache_exists(ontology_acronym)` is a simple program that returns `True` if the given acronym exists in the cache, and `False` otherwise. It is worth noting that while ontology URLs can repeat, acronyms must be distinct in a given environment. +### Preprocessing +As of version 1.2.0, text2term now includes a simple preprocessing functionality for input. Specifically, these functions take the original input text and Regex expressions, then match each text to a regular expression to simplify the input. + +Like the "map" functions above, the two functions differ on whether is input is a file or a list of strings: +`preprocess_file(file_path, template_path)` +or +`preprocess_terms(terms, template_path)` + +In both cases, the templates must be stored in a newline seperated file. 
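For example, a minimal sketch (the input terms and the `templates.txt` file are hypothetical; the file holds one regular expression per line):
```
from text2term import preprocess_terms

# "templates.txt" is a hypothetical newline-separated file of regular expressions,
# e.g. containing the single line:  Age when diagnosed with (.*)
processed = preprocess_terms(["Age when diagnosed with asthma"], "templates.txt")
# -> ["asthma"]; terms matching no template fall through the catch-all "(.*)" unchanged
```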
+ ## Command Line Usage Install package using **pip**: diff --git a/setup.py b/setup.py index 38822c5..9c8d78c 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -version = '1.1.1' +version = '1.2.0' description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology' long_description = open('README.md').read() diff --git a/text2term/__init__.py b/text2term/__init__.py index ef2efad..5f79211 100644 --- a/text2term/__init__.py +++ b/text2term/__init__.py @@ -4,4 +4,6 @@ from .t2t import cache_ontology from .t2t import cache_exists from .t2t import clear_cache -from .mapper import Mapper \ No newline at end of file +from .mapper import Mapper +from .preprocess import preprocess_file +from .preprocess import preprocess_terms \ No newline at end of file diff --git a/text2term/preprocess.py b/text2term/preprocess.py new file mode 100644 index 0000000..76e8948 --- /dev/null +++ b/text2term/preprocess.py @@ -0,0 +1,35 @@ +import re +import os + +def preprocess_file(file_path, template_path): + terms = _get_values(file_path) + processed_terms = preprocess_terms(terms, template_path) + + filename, file_extension = os.path.splitext(file_path) + output_file = filename + "-preprocessed.txt" + with open(output_file, 'w') as fp: + fp.write('\n'.join(processed_terms)) + +def preprocess_terms(terms, template_path): + template_strings = _get_values(template_path) + template_strings.append("(.*)") + + # Form the templates as regular expressions + templates = [] + for template_string in template_strings: + templates.append(re.compile(template_string)) + + # Checks all terms against each template + processed_terms = [] + for term in terms: + for template in templates: + match = template.fullmatch(term) + if match: + combined_matches = ' '.join(map(str, match.groups())) + if combined_matches: + processed_terms.append(combined_matches) + break + return processed_terms + +def _get_values(path): + return open(path).read().splitlines() \ No newline at end of file From dc7a13e6f360ed5ef2f281cd053a72f3df4da38f Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 17 Jan 2023 10:35:57 -0500 Subject: [PATCH 057/185] Collect OBO related synonyms during term detail collection --- text2term/term_collector.py | 39 ++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/text2term/term_collector.py b/text2term/term_collector.py index b0c02e8..486fb45 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -52,8 +52,8 @@ def filter_terms(self, onto_terms, iris=(), excl_deprecated=False): begins_with_iri = (iris == ()) or base_iri.startswith(iris) else: begins_with_iri = (iris == ()) or any(base_iri.startswith(iri) for iri in iris) - is_not_depricated = (not excl_deprecated) or (not term.deprecated) - if begins_with_iri and is_not_depricated: + is_not_deprecated = (not excl_deprecated) or (not term.deprecated) + if begins_with_iri and is_not_deprecated: filtered_onto_terms.update({base_iri: term}) return filtered_onto_terms @@ -90,7 +90,7 @@ def _get_parents(self, ontology_term): parents.update({parent.iri: parent.label[0]}) else: parents.update({parent.iri: onto_utils.label_from_iri(parent.iri)}) - except AttributeError as err: + except (AttributeError, ValueError) as err: self.logger.debug(err) return parents @@ -103,7 +103,7 @@ def _get_children(self, ontology_term, ontology): children.update({child.iri: child.label[0]}) else: children.update({child.iri: 
onto_utils.label_from_iri(child.iri)}) - except (TypeError, AttributeError) as err: + except (TypeError, AttributeError, ValueError) as err: self.logger.debug(err) return children @@ -148,6 +148,8 @@ def _get_synonyms(self, ontology_term): synonyms = set() for synonym in self._get_obo_exact_synonyms(ontology_term): synonyms.add(synonym) + for synonym in self._get_obo_related_synonyms(ontology_term): + synonyms.add(synonym) for nci_synonym in self._get_nci_synonyms(ontology_term): synonyms.add(nci_synonym) for efo_alt_term in self._get_efo_alt_terms(ontology_term): @@ -165,7 +167,7 @@ def _get_rdfs_labels(self, ontology_term): try: for rdfs_label in ontology_term.label: rdfs_labels.append(rdfs_label) - except AttributeError as err: + except (AttributeError, ValueError) as err: self.logger.debug(err) return rdfs_labels @@ -194,14 +196,33 @@ def _get_efo_alt_terms(self, ontology_term): def _get_obo_exact_synonyms(self, ontology_term): """ - Collect synonyms of the given term that are specified using the annotation property used by DOID, MONDO, EFO, - HPO, and other OBO ontologies: . - :param ontology_term: Ontology term to collect synonyms from - :return: Collection of synonyms + Collect exact synonyms of the given term that are specified using the annotation property: + . + :param ontology_term: Ontology term to collect exact synonyms from + :return: Collection of exact synonyms """ synonyms = [] try: for synonym in ontology_term.hasExactSynonym: + if synonym.iri is not None: + synonym = synonym.iri + synonyms.append(synonym) + except AttributeError as err: + self.logger.debug(err) + return synonyms + + def _get_obo_related_synonyms(self, ontology_term): + """ + Collect related synonyms of the given term that are specified using the annotation property: + . 
+ :param ontology_term: Ontology term to collect related synonyms from + :return: Collection of related synonyms + """ + synonyms = [] + try: + for synonym in ontology_term.hasRelatedSynonym: + if synonym.iri is not None: + synonym = synonym.iri synonyms.append(synonym) except AttributeError as err: self.logger.debug(err) From 4199ed194fd53aadd80844995dc436d1947a49f3 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 17 Jan 2023 10:38:23 -0500 Subject: [PATCH 058/185] Upgrade setuptools version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e43c523..f0bbc23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ numpy==1.23.2 gensim==4.1.2 scipy==1.8.0 scikit-learn==1.0.2 -setuptools==60.9.3 +setuptools==66.0.0 requests==2.27.1 tqdm==4.62.3 sparse_dot_topn==0.3.1 From 48a7fdc3aa8fbecdb2b033b3e5e7ebc0c4c3c4a4 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 17 Jan 2023 18:52:18 -0500 Subject: [PATCH 059/185] Include imported ontology terms in term collection & mapping --- setup.py | 5 ++--- text2term/term_collector.py | 10 +++++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 9c8d78c..7623067 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -version = '1.2.0' +version = '1.2.1' description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology' long_description = open('README.md').read() @@ -27,8 +27,7 @@ 'Programming Language :: Python', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.9', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Bio-Informatics' + 'Topic :: Scientific/Engineering' ], python_requires=">=3.9", ) diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 486fb45..b6f8119 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -33,7 +33,8 @@ def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, ex iris = list(default_world.search(iri=query)) ontology_terms = ontology_terms | self._get_ontology_terms(iris, ontology, exclude_deprecated) else: - ontology_terms = self._get_ontology_terms(ontology.classes(), ontology, exclude_deprecated) + ontology_signature = self._get_ontology_signature(ontology) + ontology_terms = self._get_ontology_terms(ontology_signature, ontology, exclude_deprecated) end = time.time() self.logger.info("...done: collected %i ontology terms (collection time: %.2fs)", len(ontology_terms), end - start) @@ -57,6 +58,13 @@ def filter_terms(self, onto_terms, iris=(), excl_deprecated=False): filtered_onto_terms.update({base_iri: term}) return filtered_onto_terms + def _get_ontology_signature(self, ontology): + signature = list(ontology.classes()) + # ontology.classes() does not include classes in imported ontologies; we need to explicitly add them to our list + for imported_ontology in ontology.imported_ontologies: + signature.extend(list(imported_ontology.classes())) + return signature + def _get_ontology_terms(self, term_list, ontology, exclude_deprecated): ontology_terms = dict() for ontology_term in term_list: From 0e729c7b8ea994f17bf01f400abd9c9b061e9693 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 18 Jan 2023 16:47:36 -0500 Subject: [PATCH 060/185] Output changes Minor changes to the outputs, mainly printing errors to stderr and changing some messages to logging. 
--- text2term/t2t.py | 11 ++++++----- text2term/term_collector.py | 8 ++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/text2term/t2t.py b/text2term/t2t.py index 0c6a245..fdaf1eb 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -1,6 +1,7 @@ """Provides Text2Term class""" import os +import sys import json import pickle import time @@ -128,7 +129,7 @@ def cache_ontology_set(ontology_registry_path): try: cache_ontology(row.url, row.acronym) except Exception as err: - print("Could not cache ontology ", row.acronym, " due to error: ", err) + sys.stderr.write("Could not cache ontology", row.acronym, "due to error:", err) owlready2.default_world.ontologies.clear() # Caches a single ontology @@ -151,13 +152,13 @@ def clear_cache(ontology_acronym=''): cache_dir = "cache/" if ontology_acronym != '': cache_dir = os.path.join(cache_dir, ontology_acronym) - # rm -r cache_dir + # Is equivalent to: rm -r cache_dir try: rmtree(cache_dir) - print("Cache has been cleared successfully") + sys.stderr.write("Cache has been cleared successfully") except OSError as error: - print("Cache cannot be removed:") - print(error) + sys.stderr.write("Cache cannot be removed:") + sys.stderr.write(error) """ PRIVATE/HELPER FUNCTIONS diff --git a/text2term/term_collector.py b/text2term/term_collector.py index b6f8119..25a5a15 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -3,12 +3,12 @@ from owlready2 import * from text2term import onto_utils from text2term.term import OntologyTerm - +import logging class OntologyTermCollector: - def __init__(self): - self.logger = onto_utils.get_logger(__name__) + def __init__(self, log_level=logging.INFO): + self.logger = onto_utils.get_logger(__name__, level=log_level) def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, exclude_deprecated=False): """ @@ -43,7 +43,7 @@ def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, ex try: ontology.destroy() except Exception as err: - print("Unable to destroy ontology: ", err) + self.logger.debug("Unable to destroy ontology: ", err) return ontology_terms def filter_terms(self, onto_terms, iris=(), excl_deprecated=False): From a9fbfa201417eb8b01968844b21dcdef0a5d9ffb Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 18 Jan 2023 18:01:54 -0500 Subject: [PATCH 061/185] Include OBO 'broad' synonyms in ontology term details collection --- text2term/term_collector.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 25a5a15..25c3102 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -5,6 +5,7 @@ from text2term.term import OntologyTerm import logging + class OntologyTermCollector: def __init__(self, log_level=logging.INFO): @@ -158,6 +159,8 @@ def _get_synonyms(self, ontology_term): synonyms.add(synonym) for synonym in self._get_obo_related_synonyms(ontology_term): synonyms.add(synonym) + for synonym in self._get_obo_broad_synonyms(ontology_term): + synonyms.add(synonym) for nci_synonym in self._get_nci_synonyms(ontology_term): synonyms.add(nci_synonym) for efo_alt_term in self._get_efo_alt_terms(ontology_term): @@ -212,7 +215,7 @@ def _get_obo_exact_synonyms(self, ontology_term): synonyms = [] try: for synonym in ontology_term.hasExactSynonym: - if synonym.iri is not None: + if hasattr(synonym, 'iri'): synonym = synonym.iri synonyms.append(synonym) except AttributeError as err: @@ -229,7 +232,24 @@ def 
_get_obo_related_synonyms(self, ontology_term): synonyms = [] try: for synonym in ontology_term.hasRelatedSynonym: - if synonym.iri is not None: + if hasattr(synonym, 'iri'): + synonym = synonym.iri + synonyms.append(synonym) + except AttributeError as err: + self.logger.debug(err) + return synonyms + + def _get_obo_broad_synonyms(self, ontology_term): + """ + Collect broad synonyms of the given term that are specified using the annotation property: + . + :param ontology_term: Ontology term to collect broad synonyms from + :return: Collection of broad synonyms + """ + synonyms = [] + try: + for synonym in ontology_term.hasBroadSynonym: + if hasattr(synonym, 'iri'): synonym = synonym.iri synonyms.append(synonym) except AttributeError as err: From d6d9e8be37923c73e28812739f8dd67f9f165c49 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 18 Jan 2023 18:11:55 -0500 Subject: [PATCH 062/185] Enable outputting term graphs when ancestor terms are missing when terms are filtered by their IRIs upfront, the term details dictionary (and resulting term graphs) are incomplete by design --- text2term/term_graph_generator.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/text2term/term_graph_generator.py b/text2term/term_graph_generator.py index 0c6ccc8..231c602 100644 --- a/text2term/term_graph_generator.py +++ b/text2term/term_graph_generator.py @@ -24,11 +24,15 @@ def _add_superclasses(self, term, nodes, edges): self._add_ancestors(parent_iri, nodes, edges) def _add_ancestors(self, node_iri, nodes, edges): - ancestors = self._terms[node_iri].parents - for ancestor_iri in ancestors: - self._add_node(ancestor_iri, ancestors[ancestor_iri], nodes) - edges.add(Edge(node_iri, ancestor_iri, Edge.IS_A)) - self._add_ancestors(ancestor_iri, nodes, edges) + if node_iri in self._terms: + ancestors = self._terms[node_iri].parents + for ancestor_iri in ancestors: + self._add_node(ancestor_iri, ancestors[ancestor_iri], nodes) + edges.add(Edge(node_iri, ancestor_iri, Edge.IS_A)) + self._add_ancestors(ancestor_iri, nodes, edges) + else: + self._logger.debug("Unable to get ancestor term %s from the ontology term details dictionary " + "(possibly filtered out through the `base_iris` option)", node_iri) def _add_children(self, term, children, edge_type, nodes, edges): for child_iri in children: From 270e47bd678b33ce4141ddefa79bfb7a668da455 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 19 Jan 2023 15:46:36 -0500 Subject: [PATCH 063/185] Single section for installation. Minor formatting changes Starting to use ```python ...``` for better readability of code --- README.md | 50 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 6b290ed..2fa20a4 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,49 @@ -# text2term ontology mapper - +# *text2term* ontology mapper A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology. -## Programmatic Usage +## Installation Install package using **pip**: -`pip install text2term` +``` +pip install . 
+``` +## Programmatic Usage The tool can be executed in Python with either of the two following functions: -`text2term.map_files(input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, mapper=Mapper.TFIDF,min_score=0.3, output_file='', save_graphs=False, save_mappings=False, separator=',', use_cache=False)` +```python +text2term.map_file(input_file='/some/file.txt', + target_ontology='http://some.ontology/v1.owl', + base_iris=(), + csv_columns=(), + excl_deprecated=False, + max_mappings=3, + mapper=Mapper.TFIDF, + min_score=0.3, + output_file='', + save_graphs=False, + save_mappings=False, + separator=',', + use_cache=False) +``` or - -`map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), use_cache=False)` +```python +map_terms(source_terms=['term one', 'term two'], + target_ontology='http://some.ontology/v1.owl', + base_iris=(), + excl_deprecated=False, + max_mappings=3, + min_score=0.3, + mapper=Mapper.TFIDF, + output_file='', + save_graphs=False, + save_mappings=False, + source_terms_ids=(), + use_cache=False) +``` ### Arguments -For `map_files`, the first argument 'input_file' specifies a path to a file containing the names of every term that needs to be mapped. For `map_terms`, The first argument 'source_terms' takes in a list of the terms to be mapped. +For `map_file`, the first argument 'input_file' specifies a path to a file containing the terms to be mapped. For `map_terms`, the first argument 'source_terms' takes in a list of the terms to be mapped. All other arguments are the same, and have the same functionality: @@ -92,11 +120,7 @@ In both cases, the templates must be stored in a newline seperated file. 
## Command Line Usage -Install package using **pip**: - -`pip install .` - -Execute the tool as follows: +After installation, execute the tool from a command line as follows: `python text2term -s SOURCE -t TARGET [-o OUTPUT] [-m MAPPER] [-csv CSV_INPUT] [-top TOP_MAPPINGS] [-min MIN_SCORE] [-iris BASE_IRIS] [-d EXCL_DEPRECATED] [-g SAVE_TERM_GRAPHS]` From f31dbc71eca1337c2f55b37ba93ac284f1d38b9f Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 19 Jan 2023 15:49:46 -0500 Subject: [PATCH 064/185] Include 'text2term' in map_terms function call --- README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 2fa20a4..fb30c51 100644 --- a/README.md +++ b/README.md @@ -28,18 +28,18 @@ text2term.map_file(input_file='/some/file.txt', ``` or ```python -map_terms(source_terms=['term one', 'term two'], - target_ontology='http://some.ontology/v1.owl', - base_iris=(), - excl_deprecated=False, - max_mappings=3, - min_score=0.3, - mapper=Mapper.TFIDF, - output_file='', - save_graphs=False, - save_mappings=False, - source_terms_ids=(), - use_cache=False) +text2term.map_terms(source_terms=['term one', 'term two'], + target_ontology='http://some.ontology/v1.owl', + base_iris=(), + excl_deprecated=False, + max_mappings=3, + min_score=0.3, + mapper=Mapper.TFIDF, + output_file='', + save_graphs=False, + save_mappings=False, + source_terms_ids=(), + use_cache=False) ``` ### Arguments From ad009f4aad552a4f733512c1d0cc69b2e5a9d087 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Tue, 24 Jan 2023 09:59:23 -0500 Subject: [PATCH 065/185] Added Blacklisting to String Preprocessing Added the blacklisting functionality to the preprocess functions. Also changed the output so it is only a file output if specified. --- README.md | 8 ++++--- setup.py | 2 +- text2term/preprocess.py | 48 ++++++++++++++++++++++++++++++----------- 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index fb30c51..c5ad391 100644 --- a/README.md +++ b/README.md @@ -112,11 +112,13 @@ Finally, `cache_exists(ontology_acronym)` is a simple program that returns `True As of version 1.2.0, text2term now includes a simple preprocessing functionality for input. Specifically, these functions take the original input text and Regex expressions, then match each text to a regular expression to simplify the input. Like the "map" functions above, the two functions differ on whether is input is a file or a list of strings: -`preprocess_file(file_path, template_path)` +`preprocess_file(file_path, template_path, output_file="", blacklist_path="", blacklist_char='')` or -`preprocess_terms(terms, template_path)` +`preprocess_terms(terms, template_path, output_file="", blacklist_path="", blacklist_char='')` -In both cases, the templates must be stored in a newline seperated file. +In both cases, the templates and the blacklist must be stored in a newline seperated file. If an output file is specified, the preprocessed strings are written to that file and the list is passed back regardless. + +The blacklist functionality allows the user to specify another regex file. If any terms match any regex in blacklist, they are removed from the terms, or, if a blacklist character is specified, replaced with that character for placeholding. 
## Command Line Usage diff --git a/setup.py b/setup.py index 7623067..942ac3e 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -version = '1.2.1' +version = '1.3.0' description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology' long_description = open('README.md').read() diff --git a/text2term/preprocess.py b/text2term/preprocess.py index 76e8948..25b1277 100644 --- a/text2term/preprocess.py +++ b/text2term/preprocess.py @@ -1,27 +1,39 @@ import re import os -def preprocess_file(file_path, template_path): +def preprocess_file(file_path, template_path, output_file="", blacklist_path="", \ + blacklist_char=''): terms = _get_values(file_path) - processed_terms = preprocess_terms(terms, template_path) + processed_terms = preprocess_terms(terms, template_path, output_file=output_file, \ + blacklist_path=blacklist_path, blacklist_char=blacklist_char) - filename, file_extension = os.path.splitext(file_path) - output_file = filename + "-preprocessed.txt" - with open(output_file, 'w') as fp: - fp.write('\n'.join(processed_terms)) + return processed_terms -def preprocess_terms(terms, template_path): +def preprocess_terms(terms, template_path, output_file="", blacklist_path="", \ + blacklist_char=''): + # Form the templates as regular expressions template_strings = _get_values(template_path) template_strings.append("(.*)") + templates = _make_regex_list(template_strings) - # Form the templates as regular expressions - templates = [] - for template_string in template_strings: - templates.append(re.compile(template_string)) + # Create the blacklist, if it exists + if blacklist_path != "": + blacklist_strings = _get_values(blacklist_path) + blacklist = _make_regex_list(blacklist_strings) - # Checks all terms against each template + # Checks all terms against each blacklist then template processed_terms = [] for term in terms: + blacklisted = False + for banned in blacklist: + match = banned.fullmatch(term) + if match: + if blacklist_char != '': + processed_terms.append(blacklist_char) + blacklisted = True + break + if blacklisted: + continue for template in templates: match = template.fullmatch(term) if match: @@ -29,7 +41,17 @@ def preprocess_terms(terms, template_path): if combined_matches: processed_terms.append(combined_matches) break + + if output_file != "": + with open(output_file, 'w') as fp: + fp.write('\n'.join(processed_terms)) return processed_terms def _get_values(path): - return open(path).read().splitlines() \ No newline at end of file + return open(path).read().splitlines() + +def _make_regex_list(strings): + regexes = [] + for string in strings: + regexes.append(re.compile(string)) + return regexes From b3159dc09d9d1c44add378f4bd3e6cfac2159733 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Thu, 26 Jan 2023 10:24:11 -0500 Subject: [PATCH 066/185] Added Remove Duplicates to Preprocessing Added functionality to remove duplicate terms in the string preprocessing module. Also updates README accordingly. --- README.md | 15 +++++++++++++-- text2term/preprocess.py | 25 ++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c5ad391..d0e0fd1 100644 --- a/README.md +++ b/README.md @@ -112,14 +112,25 @@ Finally, `cache_exists(ontology_acronym)` is a simple program that returns `True As of version 1.2.0, text2term now includes a simple preprocessing functionality for input. 
Specifically, these functions take the original input text and Regex expressions, then match each text to a regular expression to simplify the input. Like the "map" functions above, the two functions differ on whether is input is a file or a list of strings: -`preprocess_file(file_path, template_path, output_file="", blacklist_path="", blacklist_char='')` +`preprocess_file(file_path, template_path, output_file="", blacklist_path="", blacklist_char='', rem_duplicates=DupSetting.NO_REM)` or -`preprocess_terms(terms, template_path, output_file="", blacklist_path="", blacklist_char='')` +`preprocess_terms(terms, template_path, output_file="", blacklist_path="", blacklist_char='', rem_duplicates=DupSetting.NO_REM)` In both cases, the templates and the blacklist must be stored in a newline seperated file. If an output file is specified, the preprocessed strings are written to that file and the list is passed back regardless. The blacklist functionality allows the user to specify another regex file. If any terms match any regex in blacklist, they are removed from the terms, or, if a blacklist character is specified, replaced with that character for placeholding. +Finally, the Remove Duplicates functionality will remove all duplicate terms before processing, after, or both depending on the setting. To represent the settings, the following emnumeration class exists in the preprocessing module: +```python +class DupSetting(Enum): + NO_REM = 0 + REM_BEFORE = 1 + REM_AFTER = 2 + REM_BOTH = 3 +``` + +WARNING: Removing duplicates at any point does not guarantee that the order of the terms are maintained. As such, if order is important to the output, this functionality is not recommended. + ## Command Line Usage After installation, execute the tool from a command line as follows: diff --git a/text2term/preprocess.py b/text2term/preprocess.py index 25b1277..ab0fe5f 100644 --- a/text2term/preprocess.py +++ b/text2term/preprocess.py @@ -1,22 +1,35 @@ import re import os +from enum import Enum + +class DupSetting(Enum): + NO_REM = 0 + REM_BEFORE = 1 + REM_AFTER = 2 + REM_BOTH = 3 def preprocess_file(file_path, template_path, output_file="", blacklist_path="", \ - blacklist_char=''): + blacklist_char='', rem_duplicates=DupSetting.NO_REM): terms = _get_values(file_path) processed_terms = preprocess_terms(terms, template_path, output_file=output_file, \ - blacklist_path=blacklist_path, blacklist_char=blacklist_char) + blacklist_path=blacklist_path, blacklist_char=blacklist_char, \ + rem_duplicates=rem_duplicates) return processed_terms def preprocess_terms(terms, template_path, output_file="", blacklist_path="", \ - blacklist_char=''): + blacklist_char='', rem_duplicates=DupSetting.NO_REM): + # Remove duplicate terms, if settings indicate + if rem_duplicates == DupSetting.REM_BEFORE or rem_duplicates == DupSetting.REM_BOTH: + terms = _remove_duplicates(terms) + # Form the templates as regular expressions template_strings = _get_values(template_path) template_strings.append("(.*)") templates = _make_regex_list(template_strings) # Create the blacklist, if it exists + blacklist = [] if blacklist_path != "": blacklist_strings = _get_values(blacklist_path) blacklist = _make_regex_list(blacklist_strings) @@ -42,6 +55,9 @@ def preprocess_terms(terms, template_path, output_file="", blacklist_path="", \ processed_terms.append(combined_matches) break + if rem_duplicates == DupSetting.REM_AFTER or rem_duplicates == DupSetting.REM_BOTH: + processed_terms = _remove_duplicates(processed_terms) + if output_file != "": with 
open(output_file, 'w') as fp: fp.write('\n'.join(processed_terms)) @@ -55,3 +71,6 @@ def _make_regex_list(strings): for string in strings: regexes.append(re.compile(string)) return regexes + +def _remove_duplicates(list): + return [*set(list)] From baea078c971c9eae42792beede479c7f585efa64 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 31 Jan 2023 17:07:12 -0500 Subject: [PATCH 067/185] Update pip install reference --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d0e0fd1..ccafc64 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ A tool for mapping free-text descriptions of (biomedical) entities to controlled Install package using **pip**: ``` -pip install . +pip install text2term ``` ## Programmatic Usage From ea3a2c6c12a2f0e0a419144a04a493fb1068239a Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 7 Feb 2023 17:37:10 -0500 Subject: [PATCH 068/185] Update syntactic mapper to use dict of terms (due to change in return type) Specify python engine in onto_utils csv parser to address a pandas warning --- setup.py | 2 +- text2term/onto_utils.py | 2 +- text2term/syntactic_mapper.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 942ac3e..e149233 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -version = '1.3.0' +version = '1.3.1' description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology' long_description = open('README.md').read() diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py index 06e4d0c..20a9217 100644 --- a/text2term/onto_utils.py +++ b/text2term/onto_utils.py @@ -114,7 +114,7 @@ def parse_list_file(file_path): def parse_csv_file(file_path, term_column_name, term_id_column_name, separator=','): - data = pd.read_csv(file_path, sep=separator) + data = pd.read_csv(file_path, sep=separator, engine='python') if term_column_name not in data.columns: raise ValueError("Could not find specified column name for input terms: " + term_column_name) terms = data[term_column_name].values diff --git a/text2term/syntactic_mapper.py b/text2term/syntactic_mapper.py index c85dbbe..a9ab4ff 100644 --- a/text2term/syntactic_mapper.py +++ b/text2term/syntactic_mapper.py @@ -39,7 +39,7 @@ def map(self, source_terms, source_terms_ids, mapper=Mapper.JARO_WINKLER, max_ma def _map(self, source_term, source_term_id, mapper, max_matches=3): self.logger.debug("Matching %s...", source_term) term_matches = [] - for term in self.target_ontology_terms: + for term in self.target_ontology_terms.values(): highest_similarity = 0.0 for target_name in self._term_names(term): similarity = self.compare(source_term, target_name, mapper) From d675170d449181d75f802d78250997fb0147d322 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 7 Feb 2023 18:35:16 -0500 Subject: [PATCH 069/185] Update Zooma mapper to only search in ontologies --- text2term/zooma_mapper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/text2term/zooma_mapper.py b/text2term/zooma_mapper.py index 1a6b245..26df493 100644 --- a/text2term/zooma_mapper.py +++ b/text2term/zooma_mapper.py @@ -33,9 +33,10 @@ def map(self, source_terms, source_terms_ids, ontologies, max_mappings=3, api_pa def _map_term(self, source_term, source_term_id, ontologies, max_mappings, api_params): # see https://www.ebi.ac.uk/spot/zooma/docs/api for details of API parameters + # If 'required:[none]' is specified, Zooma 
will search the OLS without looking into the datasources. params = { "propertyValue": onto_utils.normalize(source_term), - "filter": "required:[gwas,cttv,atlas,eva-clinvar,sysmicro],ontologies:[" + ontologies + "]" + "filter": "required:[none],ontologies:[" + ontologies + "]" } if len(api_params) > 0: params.update(api_params) From cf65473fead1357eea19fc0771172a3c6c9398cf Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Thu, 9 Feb 2023 11:14:21 -0500 Subject: [PATCH 070/185] Added Tagging Added tagging functionality to both the text2term mapping functions (by adding a new function) and to the preprocessing module. Also changed the preprocessing to return a dictionary. --- README.md | 45 ++++++++++---- setup.py | 2 +- text2term/__init__.py | 4 +- text2term/preprocess.py | 125 +++++++++++++++++++++++++++++--------- text2term/t2t.py | 34 +++++++++++ text2term/tagged_terms.py | 25 ++++++++ 6 files changed, 194 insertions(+), 41 deletions(-) create mode 100644 text2term/tagged_terms.py diff --git a/README.md b/README.md index ccafc64..138aa98 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ pip install text2term ``` ## Programmatic Usage -The tool can be executed in Python with either of the two following functions: +The tool can be executed in Python with any of the three following functions: ```python text2term.map_file(input_file='/some/file.txt', @@ -41,9 +41,26 @@ text2term.map_terms(source_terms=['term one', 'term two'], source_terms_ids=(), use_cache=False) ``` +or +```python +text2term.map_tagged_terms(tagged_terms_dict={'term one': ["tag 1", "tag 2"]}, + target_ontology='http://some.ontology/v1.owl', + base_iris=(), + excl_deprecated=False, + max_mappings=3, + min_score=0.3, + mapper=Mapper.TFIDF, + output_file='', + save_graphs=False, + save_mappings=False, + source_terms_ids=(), + use_cache=False) +``` ### Arguments -For `map_file`, the first argument 'input_file' specifies a path to a file containing the terms to be mapped. For `map_terms`, the first argument 'source_terms' takes in a list of the terms to be mapped. +For `map_file`, the first argument 'input_file' specifies a path to a file containing the terms to be mapped. It also has a `csv_column` argument that allows the user to specify a column to map if a csv is passed in as the input file. +For `map_terms`, the first argument 'source_terms' takes in a list of the terms to be mapped. +For `map_tagged_terms`, everything is the same as `map_terms` except the first argument is either a dictionary of terms to a list of tags, or a list of TaggedTerm objects (see below). Currently, the tags do not affect the mapping in any way, but they are added to the output dataframe at the end of the process. All other arguments are the same, and have the same functionality: @@ -115,21 +132,27 @@ Like the "map" functions above, the two functions differ on whether is input is `preprocess_file(file_path, template_path, output_file="", blacklist_path="", blacklist_char='', rem_duplicates=DupSetting.NO_REM)` or `preprocess_terms(terms, template_path, output_file="", blacklist_path="", blacklist_char='', rem_duplicates=DupSetting.NO_REM)` +or +`preprocess_tagged_terms(file_path, template_path="", blacklist_path="", blacklist_char='', rem_duplicates=False, separator=";:;")` -In both cases, the templates and the blacklist must be stored in a newline seperated file. If an output file is specified, the preprocessed strings are written to that file and the list is passed back regardless. 
+In all cases, the templates and the blacklist must be stored in a newline seperated file. If an output file is specified, the preprocessed strings are written to that file and the list is passed back regardless. The blacklist functionality allows the user to specify another regex file. If any terms match any regex in blacklist, they are removed from the terms, or, if a blacklist character is specified, replaced with that character for placeholding. -Finally, the Remove Duplicates functionality will remove all duplicate terms before processing, after, or both depending on the setting. To represent the settings, the following emnumeration class exists in the preprocessing module: +The Remove Duplicates functionality will remove all duplicate terms after processing, if true. +WARNING: Removing duplicates at any point does not guarantee which original term is kept. This is particularly important if original terms have different tags, so user caution is advised. + +The non-tagged functions both return a dictionary where the keys are the original terms and the values are the preprocessed terms. +The tagged function returns a list of TaggedTerm items with the following function contracts: ```python -class DupSetting(Enum): - NO_REM = 0 - REM_BEFORE = 1 - REM_AFTER = 2 - REM_BOTH = 3 +def __init__(self, term=None, tags=[], original_term=None) +def add_tags(self, new_tags) +def update_term(self, term) +def get_original_term(self) +def get_term(self) +def get_tags(self) ``` - -WARNING: Removing duplicates at any point does not guarantee that the order of the terms are maintained. As such, if order is important to the output, this functionality is not recommended. +As mentioned in the mapping section above, this can then be passed directly to map_tagged_terms(), allowing for easy prgorammatic usage. Note that this allows multiple of the same preprocessed term with different tags. 
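For instance, a minimal end-to-end sketch (the input file name and its contents are hypothetical; each line holds a term, the separator `;:;`, and comma-separated tags):
```python
import text2term

# "tagged_terms.txt" is a hypothetical file with lines such as:
#   asthma ;:; disease
#   heart attack ;:; disease,cardiac
tagged = text2term.preprocess_tagged_terms("tagged_terms.txt")
df = text2term.map_tagged_terms(tagged, target_ontology="http://www.ebi.ac.uk/efo/efo.owl")
# the returned dataframe includes a "Tags" column populated from the input tags
```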
## Command Line Usage diff --git a/setup.py b/setup.py index e149233..71e597c 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -version = '1.3.1' +version = '2.0.0' description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology' long_description = open('README.md').read() diff --git a/text2term/__init__.py b/text2term/__init__.py index 5f79211..c8764a4 100644 --- a/text2term/__init__.py +++ b/text2term/__init__.py @@ -4,6 +4,8 @@ from .t2t import cache_ontology from .t2t import cache_exists from .t2t import clear_cache +from .t2t import map_tagged_terms from .mapper import Mapper from .preprocess import preprocess_file -from .preprocess import preprocess_terms \ No newline at end of file +from .preprocess import preprocess_terms +from .preprocess import preprocess_tagged_terms \ No newline at end of file diff --git a/text2term/preprocess.py b/text2term/preprocess.py index ab0fe5f..479dd73 100644 --- a/text2term/preprocess.py +++ b/text2term/preprocess.py @@ -1,15 +1,10 @@ import re import os from enum import Enum - -class DupSetting(Enum): - NO_REM = 0 - REM_BEFORE = 1 - REM_AFTER = 2 - REM_BOTH = 3 +from .tagged_terms import TaggedTerm def preprocess_file(file_path, template_path, output_file="", blacklist_path="", \ - blacklist_char='', rem_duplicates=DupSetting.NO_REM): + blacklist_char='', rem_duplicates=False): terms = _get_values(file_path) processed_terms = preprocess_terms(terms, template_path, output_file=output_file, \ blacklist_path=blacklist_path, blacklist_char=blacklist_char, \ @@ -17,14 +12,67 @@ def preprocess_file(file_path, template_path, output_file="", blacklist_path="", return processed_terms -def preprocess_terms(terms, template_path, output_file="", blacklist_path="", \ - blacklist_char='', rem_duplicates=DupSetting.NO_REM): - # Remove duplicate terms, if settings indicate - if rem_duplicates == DupSetting.REM_BEFORE or rem_duplicates == DupSetting.REM_BOTH: - terms = _remove_duplicates(terms) +## Tags should be stored with their terms in the same line, delineated by ";:;" +## ex: Age when diagnosed with (.*) ;:; age,diagnosis +## "Age when diagnosed with cancer" becomes: {"cancer", ["age", "diagnosis"]} +def preprocess_tagged_terms(file_path, template_path="", blacklist_path="", \ + blacklist_char='', rem_duplicates=False, separator=";:;"): + # Seperate tags from the terms, put in TaggedTerm and add to list + raw_terms = _get_values(file_path) + terms = [] + for raw_term in raw_terms: + seperated = raw_term.split(separator) + try: + tags = seperated[1].split(",") + term = TaggedTerm(original_term=seperated[0], tags=tags) + except IndexError: + term = TaggedTerm(original_term=raw_term) + terms.append(term) + + # Seperate tags from templates, store together in dictionary + templates = {} + if template_path != "": + raw_templates = _get_values(template_path) + for raw_template in raw_templates: + seperated = raw_template.split(separator) + try: + tags = seperated[1].split(",") + regex_term = re.compile(seperated[0]) + templates[regex_term] = tags + except IndexError: + regex_term = re.compile(raw_template) + templates[regex_term] = [] + templates[re.compile("(.*)")] = [] + + # Create the blacklist, if it exists + blacklist = [] + if blacklist_path != "": + blacklist_strings = _get_values(blacklist_path) + blacklist = _make_regex_list(blacklist_strings) + + processed_terms = [] + for term in terms: + if _blacklist_term(processed_terms, term, blacklist, blacklist_char, 
tagged=True): + continue + for template, tem_tags in templates.items(): + match = template.fullmatch(term.get_original_term()) + if match: + combined_matches = ' '.join(map(str, match.groups())) + if combined_matches: + _update_tagged_term(processed_terms, term, combined_matches, tem_tags) + break + if rem_duplicates: + processed_terms = _remove_duplicates(processed_terms) + + return processed_terms + +def preprocess_terms(terms, template_path, output_file="", blacklist_path="", \ + blacklist_char='', rem_duplicates=False): # Form the templates as regular expressions - template_strings = _get_values(template_path) + template_strings = [] + if template_path != "": + template_strings = _get_values(template_path) template_strings.append("(.*)") templates = _make_regex_list(template_strings) @@ -35,34 +83,45 @@ def preprocess_terms(terms, template_path, output_file="", blacklist_path="", \ blacklist = _make_regex_list(blacklist_strings) # Checks all terms against each blacklist then template - processed_terms = [] + processed_terms = {} for term in terms: - blacklisted = False - for banned in blacklist: - match = banned.fullmatch(term) - if match: - if blacklist_char != '': - processed_terms.append(blacklist_char) - blacklisted = True - break - if blacklisted: + if _blacklist_term(processed_terms, term, blacklist, blacklist_char): continue for template in templates: match = template.fullmatch(term) if match: combined_matches = ' '.join(map(str, match.groups())) if combined_matches: - processed_terms.append(combined_matches) + processed_terms[term] = combined_matches break - if rem_duplicates == DupSetting.REM_AFTER or rem_duplicates == DupSetting.REM_BOTH: + if rem_duplicates: processed_terms = _remove_duplicates(processed_terms) if output_file != "": with open(output_file, 'w') as fp: - fp.write('\n'.join(processed_terms)) + fp.write('\n'.join(processed_terms.values())) return processed_terms +## Note: Because Python Dictionaries and Lists are passed by reference (sort of), updating the +## dictionary/list here will update the dictionary in the caller +def _blacklist_term(processed_terms, term, blacklist, blacklist_char, tagged=False): + for banned in blacklist: + match = banned.fullmatch(term if type(term) is not TaggedTerm else term.get_original_term()) + if match: + if blacklist_char != '': + if tagged: + _update_tagged_term(processed_terms, term, blacklist_char) + else: + processed_terms[term] = blacklist_char + return True + return False + +def _update_tagged_term(processed_terms, term, new_term, tags=[]): + term.update_term(new_term) + term.add_tags(tags) + processed_terms.append(term) + def _get_values(path): return open(path).read().splitlines() @@ -72,5 +131,15 @@ def _make_regex_list(strings): regexes.append(re.compile(string)) return regexes -def _remove_duplicates(list): - return [*set(list)] +def _remove_duplicates(terms): + if type(terms) is dict: + temp = {val : key for key, val in terms.items()} + final = {val : key for key, val in temp.items()} + else: + temp = [] + final = [] + for term in terms: + if term.get_term() not in temp: + temp.append(term.get_term()) + final.append(term) + return final diff --git a/text2term/t2t.py b/text2term/t2t.py index fdaf1eb..d488b12 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -65,6 +65,40 @@ def map_file(input_file, target_ontology, base_iris=(), csv_columns=(), excl_dep excl_deprecated=excl_deprecated, max_mappings=max_mappings, mapper=mapper, min_score=min_score, output_file=output_file, save_graphs=save_graphs, 
save_mappings=save_mappings, use_cache=use_cache) +def map_tagged_terms(tagged_terms_dict, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, + mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), use_cache=False): + """ + All parameters are the same as below, but tagged_terms_dict is a dictionary where the + key is the source term and the value is a list of all tags (or a single string for + one tag). It can also be a list of TaggedTerm objects. + The dataframe returned is the same but contains a tags column + """ + # If the input is a dict, use keys. If it is a list, it is a list of TaggedTerms + if tagged_terms_dict is dict: + terms = list(tagged_terms_dict.keys()) + else: + terms = [tagged_term.get_term() for tagged_term in tagged_terms_dict] + df = map_terms(terms, target_ontology, base_iris=base_iris, excl_deprecated=excl_deprecated, \ + max_mappings=max_mappings, min_score=min_score, mapper=mapper, output_file=output_file, \ + save_graphs=save_graphs, source_terms_ids=source_terms_ids, use_cache=use_cache) + + # For each term in dict, add tags to corresponding mappings row in "Tags" Column + if tagged_terms_dict is dict: + for key, value in tagged_terms_dict.items(): + if value is list: + to_store = ','.join(value) + else: + to_store = str(value) + df.loc[df['Source Term'] == key, "Tags"] = to_store + else: + for term in tagged_terms_dict: + to_store = ','.join(term.get_tags()) + df.loc[df['Source Term'] == term.get_term(), "Tags"] = to_store + + if save_mappings: + _save_mappings(df, output_file) + return df + def map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), use_cache=False): """ diff --git a/text2term/tagged_terms.py b/text2term/tagged_terms.py new file mode 100644 index 0000000..c90ac53 --- /dev/null +++ b/text2term/tagged_terms.py @@ -0,0 +1,25 @@ + +class TaggedTerm: + def __init__(self, term=None, tags=[], original_term=None): + self.term = term + self.tags = tags + self.original_term = original_term + + def __repr__(self): + return f" Date: Fri, 17 Feb 2023 14:41:17 -0500 Subject: [PATCH 071/185] Instance Bug Fix Fixes a bug where the the types are not properly checked in all situations. --- setup.py | 2 +- text2term/t2t.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 71e597c..e079d7b 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -version = '2.0.0' +version = '2.0.1' description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology' long_description = open('README.md').read() diff --git a/text2term/t2t.py b/text2term/t2t.py index d488b12..22543b9 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -74,7 +74,7 @@ def map_tagged_terms(tagged_terms_dict, target_ontology, base_iris=(), excl_depr The dataframe returned is the same but contains a tags column """ # If the input is a dict, use keys. 
If it is a list, it is a list of TaggedTerms - if tagged_terms_dict is dict: + if isinstance(tagged_terms_dict, dict): terms = list(tagged_terms_dict.keys()) else: terms = [tagged_term.get_term() for tagged_term in tagged_terms_dict] @@ -83,9 +83,9 @@ def map_tagged_terms(tagged_terms_dict, target_ontology, base_iris=(), excl_depr save_graphs=save_graphs, source_terms_ids=source_terms_ids, use_cache=use_cache) # For each term in dict, add tags to corresponding mappings row in "Tags" Column - if tagged_terms_dict is dict: + if isinstance(tagged_terms_dict, dict): for key, value in tagged_terms_dict.items(): - if value is list: + if isinstance(value, list): to_store = ','.join(value) else: to_store = str(value) From 947e72a83bc6c441286a15094065acbacbc6269a Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Fri, 24 Feb 2023 11:02:23 -0500 Subject: [PATCH 072/185] Source Term ID Bug Fixes Fixes bugs pertaining to adding a Source Term ID to the tagged terms. It does this by implementing an extra data member and functions to the TaggedTerms object. There is also a warning if source_terms_ids is not empty but is not used due to unequal length. Also exposes the TaggedTerm object on the top layer of t2t. --- README.md | 5 ++++- setup.py | 2 +- text2term/__init__.py | 3 ++- text2term/t2t.py | 13 ++++++++++++- text2term/tagged_terms.py | 9 ++++++++- 5 files changed, 27 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 138aa98..7fcd6c4 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,7 @@ All other arguments are the same, and have the same functionality: `source_terms_ids` : tuple Collection of identifiers for the given source terms + WARNING: While this is still available for the tagged term function, it is worth noting that dictionaries do not necessarily preserve order, so it is not recommended. If using the TaggedTerm object, the source terms can be attached there to guarantee order. `excl_deprecated` : bool Exclude ontology terms stated as deprecated via `owl:deprecated true` @@ -145,12 +146,14 @@ WARNING: Removing duplicates at any point does not guarantee which original term The non-tagged functions both return a dictionary where the keys are the original terms and the values are the preprocessed terms. The tagged function returns a list of TaggedTerm items with the following function contracts: ```python -def __init__(self, term=None, tags=[], original_term=None) +def __init__(self, term=None, tags=[], original_term=None, source_term_id=None) def add_tags(self, new_tags) def update_term(self, term) +def update_source_term_id(self, source_term_id) def get_original_term(self) def get_term(self) def get_tags(self) +def get_source_term_id(self) ``` As mentioned in the mapping section above, this can then be passed directly to map_tagged_terms(), allowing for easy prgorammatic usage. Note that this allows multiple of the same preprocessed term with different tags. 
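A minimal sketch of attaching source term IDs to TaggedTerm objects, as the warning above recommends when term order and identifiers matter; the terms, tags and IDs below are illustrative only:

```python
import text2term
from text2term import TaggedTerm

# Hypothetical terms with tags and identifiers attached up front
tagged = [
    TaggedTerm(term="asthma", tags=["respiratory"], source_term_id="var_001"),
    TaggedTerm(term="acute bronchitis", tags=["respiratory"], source_term_id="var_002"),
]
# map_tagged_terms() picks up the attached IDs instead of generating its own
df = text2term.map_tagged_terms(tagged, "http://www.ebi.ac.uk/efo/efo.owl")
```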
diff --git a/setup.py b/setup.py index e079d7b..f9471a6 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -version = '2.0.1' +version = '2.0.2' description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology' long_description = open('README.md').read() diff --git a/text2term/__init__.py b/text2term/__init__.py index c8764a4..6ed6d92 100644 --- a/text2term/__init__.py +++ b/text2term/__init__.py @@ -8,4 +8,5 @@ from .mapper import Mapper from .preprocess import preprocess_file from .preprocess import preprocess_terms -from .preprocess import preprocess_tagged_terms \ No newline at end of file +from .preprocess import preprocess_tagged_terms +from .tagged_terms import TaggedTerm \ No newline at end of file diff --git a/text2term/t2t.py b/text2term/t2t.py index 22543b9..a02e01d 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -77,7 +77,16 @@ def map_tagged_terms(tagged_terms_dict, target_ontology, base_iris=(), excl_depr if isinstance(tagged_terms_dict, dict): terms = list(tagged_terms_dict.keys()) else: - terms = [tagged_term.get_term() for tagged_term in tagged_terms_dict] + terms = [] + source_terms_id_list = [] + for tagged_term in tagged_terms_dict: + terms.append(tagged_term.get_term()) + if tagged_term.get_source_term_id() != None: + source_terms_id_list.append(tagged_term.get_source_term_id()) + if len(source_terms_id_list) > 0: + source_terms_ids = tuple(source_terms_id_list) + + # Run the mapper df = map_terms(terms, target_ontology, base_iris=base_iris, excl_deprecated=excl_deprecated, \ max_mappings=max_mappings, min_score=min_score, mapper=mapper, output_file=output_file, \ save_graphs=save_graphs, source_terms_ids=source_terms_ids, use_cache=use_cache) @@ -138,6 +147,8 @@ def map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False Data frame containing the generated ontology mappings """ if len(source_terms_ids) != len(source_terms): + if len(source_terms_ids) > 0: + sys.stderr.write("Warning: Source Term Ids are non-zero, but will not be used.") source_terms_ids = onto_utils.generate_iris(len(source_terms)) if output_file == '': timestamp = datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") diff --git a/text2term/tagged_terms.py b/text2term/tagged_terms.py index c90ac53..d845999 100644 --- a/text2term/tagged_terms.py +++ b/text2term/tagged_terms.py @@ -1,9 +1,10 @@ class TaggedTerm: - def __init__(self, term=None, tags=[], original_term=None): + def __init__(self, term=None, tags=[], original_term=None, source_term_id=None): self.term = term self.tags = tags self.original_term = original_term + self.source_term_id = source_term_id def __repr__(self): return f" Date: Wed, 8 Mar 2023 10:59:50 -0500 Subject: [PATCH 073/185] Ignore NA values in input table --- text2term/onto_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py index 20a9217..1f02cea 100644 --- a/text2term/onto_utils.py +++ b/text2term/onto_utils.py @@ -115,6 +115,7 @@ def parse_list_file(file_path): def parse_csv_file(file_path, term_column_name, term_id_column_name, separator=','): data = pd.read_csv(file_path, sep=separator, engine='python') + data = data.dropna(subset=[term_column_name, term_id_column_name]) if term_column_name not in data.columns: raise ValueError("Could not find specified column name for input terms: " + term_column_name) terms = data[term_column_name].values From 78725021e865cbf0fa457705de7098ce28088ce3 Mon 
Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 8 Mar 2023 11:00:51 -0500 Subject: [PATCH 074/185] Modify output table ID column name --- text2term/term_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text2term/term_mapping.py b/text2term/term_mapping.py index 243af5d..b97e2a0 100644 --- a/text2term/term_mapping.py +++ b/text2term/term_mapping.py @@ -5,7 +5,7 @@ class TermMapping: SRC_TERM = "Source Term" - SRC_TERM_ID = "Source Term Id" + SRC_TERM_ID = "Source Term ID" TGT_TERM_LBL = "Mapped Term Label" TGT_TERM_IRI = "Mapped Term IRI" MAPPING_SCORE = "Mapping Score" From 94d9fcb1b82b4dcabefab00b968f414f5dc01ca8 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 8 Mar 2023 11:01:19 -0500 Subject: [PATCH 075/185] Update requirements, readme and version --- README.md | 2 +- requirements.txt | 28 ++++++++++++++-------------- setup.py | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 7fcd6c4..3d01282 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # *text2term* ontology mapper -A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology. +A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in ontologies. ## Installation Install package using **pip**: diff --git a/requirements.txt b/requirements.txt index f0bbc23..3ddde78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,15 @@ -Owlready2==0.36 +Owlready2==0.40 argparse==1.4.0 -pandas==1.4.1 -numpy==1.23.2 -gensim==4.1.2 -scipy==1.8.0 -scikit-learn==1.0.2 -setuptools==66.0.0 -requests==2.27.1 -tqdm==4.62.3 -sparse_dot_topn==0.3.1 -bioregistry==0.4.63 -nltk==3.7 -rapidfuzz==2.6.0 -shortuuid==1.0.9 +pandas==1.5.3 +numpy==1.24.2 +gensim==4.3.0 +scipy==1.10.1 +scikit-learn==1.2.1 +setuptools==67.6.0 +requests==2.28.2 +tqdm==4.65.0 +sparse_dot_topn==0.3.4 +bioregistry==0.6.92 +nltk==3.8.1 +rapidfuzz==2.13.7 +shortuuid==1.0.11 diff --git a/setup.py b/setup.py index f9471a6..85655f8 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages -version = '2.0.2' -description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in an ontology' +version = '2.0.3' +description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in ontologies' long_description = open('README.md').read() with open('requirements.txt') as f: From 3d4a991ecc2c7e1b957f8c7f8176fb45c5508550 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 9 Mar 2023 17:01:24 -0500 Subject: [PATCH 076/185] Minor readme updates & added note on NAs in input --- README.md | 58 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 3d01282..c90a889 100644 --- a/README.md +++ b/README.md @@ -105,16 +105,20 @@ Both functions return the same value: `df` : Data frame containing the generated ontology mappings -### Caching -As of version 1.1.0, users can now cache ontologies that they want to use regularly or quickly. Programatically, there are two steps to using the cache: creating the cache, then accessing it. First, the user can cache ontologies using either of two functions: +### Ontology Caching +As of version 1.1.0, users can cache ontologies that they want to use regularly or quickly. Programmatically, there are two steps to using the cache: creating the cache, then accessing it. 
First, the user can cache ontologies using either of two functions: -`cache_ontology(ontology_url, ontology_acronym, base_iris=())` -Or -`cache_ontology_set(ontology_registry_path)` +```python +cache_ontology(ontology_url, ontology_acronym, base_iris=()) +``` + +```python +cache_ontology_set(ontology_registry_path) +``` The first of these will cache a single ontology from a URL or file path, with it being referenced by an acronym that will be used to reference it later. An example can be found below. The second function allows the user to cache several ontologies at once by referencing a CSV file of the format: -`acronym,name,version,date,url,comments` +`acronym,name,version,date,url,comments`. An example is provided in `resources/ontologies.csv` Once an ontology has been cached by either function, it is stored in a cache folder locally, and thus can be referenced even in different Python instances. @@ -124,27 +128,31 @@ After an ontology is cached, the user can access the cache by using the assigned To clear the cache, one can call: `clear_cache(ontology_acronym='')` If no arguments are specified, the entire cache will be cleared. Otherwise, only the ontology with the given acronym will be cleared. -Finally, `cache_exists(ontology_acronym)` is a simple program that returns `True` if the given acronym exists in the cache, and `False` otherwise. It is worth noting that while ontology URLs can repeat, acronyms must be distinct in a given environment. +Finally, `cache_exists(ontology_acronym)` is a simple function that returns `True` if the given acronym exists in the cache, and `False` otherwise. It is worth noting that while ontology URLs can repeat, acronyms must be distinct in a given environment. -### Preprocessing -As of version 1.2.0, text2term now includes a simple preprocessing functionality for input. Specifically, these functions take the original input text and Regex expressions, then match each text to a regular expression to simplify the input. +### Input Preprocessing +As of version 1.2.0, text2term includes regex-based preprocessing functionality for input terms. Specifically, these functions take the input terms and a collection of (user-defined) regular expressions, then match each term to each regular expression to simplify the input term. -Like the "map" functions above, the two functions differ on whether is input is a file or a list of strings: -`preprocess_file(file_path, template_path, output_file="", blacklist_path="", blacklist_char='', rem_duplicates=DupSetting.NO_REM)` -or -`preprocess_terms(terms, template_path, output_file="", blacklist_path="", blacklist_char='', rem_duplicates=DupSetting.NO_REM)` -or -`preprocess_tagged_terms(file_path, template_path="", blacklist_path="", blacklist_char='', rem_duplicates=False, separator=";:;")` +Like the "map" functions above, the two functions differ on whether the input is a file or a list of strings: +```python +preprocess_file(file_path, template_path, output_file='', blacklist_path='', blacklist_char='', rem_duplicates=False) +``` +```python +preprocess_terms(terms, template_path, output_file='', blacklist_path='', blacklist_char='', rem_duplicates=False) +``` +```python +preprocess_tagged_terms(file_path, template_path='', blacklist_path='', blacklist_char='', rem_duplicates=False, separator=';:;') +``` -In all cases, the templates and the blacklist must be stored in a newline seperated file. If an output file is specified, the preprocessed strings are written to that file and the list is passed back regardless. 
+In all cases, the regex templates and blacklist must be stored in a newline-separated file. If an output file is specified, the preprocessed strings are written to that file and the list of preprocessed strings is returned. The blacklist functionality allows the user to specify another regex file. If any terms match any regex in blacklist, they are removed from the terms, or, if a blacklist character is specified, replaced with that character for placeholding. -The Remove Duplicates functionality will remove all duplicate terms after processing, if true. +The Remove Duplicates `rem_duplicates` functionality will remove all duplicate terms after processing, if set to `True`. WARNING: Removing duplicates at any point does not guarantee which original term is kept. This is particularly important if original terms have different tags, so user caution is advised. -The non-tagged functions both return a dictionary where the keys are the original terms and the values are the preprocessed terms. -The tagged function returns a list of TaggedTerm items with the following function contracts: +The functions `preprocess_file()` and `preprocess_terms()` both return a dictionary where the keys are the original terms and the values are the preprocessed terms. +The `preprocess_tagged_terms()` function returns a list of TaggedTerm items with the following function contracts: ```python def __init__(self, term=None, tags=[], original_term=None, source_term_id=None) def add_tags(self, new_tags) @@ -155,7 +163,9 @@ def get_term(self) def get_tags(self) def get_source_term_id(self) ``` -As mentioned in the mapping section above, this can then be passed directly to map_tagged_terms(), allowing for easy prgorammatic usage. Note that this allows multiple of the same preprocessed term with different tags. +As mentioned in the mapping section above, this can then be passed directly to map_tagged_terms(), allowing for easy programmatic usage. Note that this allows multiple of the same preprocessed term with different tags. + +**Note on NA values in input**: As of v2.0.3, when the input to text2term is a table file, any rows that contain `NA` values in the specified term column, or in the term ID column (if provided), will be ignored. 
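A minimal sketch of mapping a tabular input with the documented `csv_columns` and `separator` arguments; the file name and column names are hypothetical, and rows with `NA` in either column would be skipped as noted above:

```python
import text2term

# Hypothetical tab-separated table with a term column and a term ID column
df = text2term.map_file("observations.tsv", "http://www.ebi.ac.uk/efo/efo.owl",
                        csv_columns=("trait", "trait_id"), separator="\t")
```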
## Command Line Usage @@ -194,18 +204,18 @@ To display a help message with descriptions of tool arguments do: ## Examples ### Programmatic -``` +```python import text2term import pandas df1 = text2term.map_file(unstruct_terms.txt, "http://www.ebi.ac.uk/efo/efo.owl") -df2 = text2term.map_terms(["asthma", "colon cancer"], "http://www.ebi.ac.uk/efo/efo.owl") +df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://www.ebi.ac.uk/efo/efo.owl") ``` Below is an example of caching, assuming the same imports as above: -``` +```python text2term.cache_ontology("http://www.ebi.ac.uk/efo/efo.owl", "EFO") df1 = text2term.map_file(unstruct_terms.txt, "EFO", use_cache=True) -df2 = text2term.map_terms(["asthma", "colon cancer"], "EFO", use_cache=True) +df2 = text2term.map_terms(["asthma", "acute bronchitis"], "EFO", use_cache=True) text2term.clear_cache("EFO") ``` From a33539bc2748f4e9b1aa7aa5b6c9c0b5a07c2e4f Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 15 Mar 2023 10:49:30 -0400 Subject: [PATCH 077/185] Include ontology term CURIEs in output mappings file --- text2term/onto_utils.py | 7 ++++++- text2term/term_mapping.py | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py index 1f02cea..222f6c2 100644 --- a/text2term/onto_utils.py +++ b/text2term/onto_utils.py @@ -56,7 +56,12 @@ def remove_whitespace(string): def curie_from_iri(iri): - return bioregistry.curie_from_iri(iri) + curie = bioregistry.curie_from_iri(iri) + if curie is None: + sys.stderr.write("Error obtaining CURIE for IRI: " + iri) + return "" + else: + return curie.upper() def label_from_iri(iri): diff --git a/text2term/term_mapping.py b/text2term/term_mapping.py index b97e2a0..8abfc1f 100644 --- a/text2term/term_mapping.py +++ b/text2term/term_mapping.py @@ -1,12 +1,14 @@ """Provides TermMapping and TermMappingCollection classes""" import pandas as pd +import onto_utils class TermMapping: SRC_TERM = "Source Term" SRC_TERM_ID = "Source Term ID" TGT_TERM_LBL = "Mapped Term Label" + TGT_TERM_CURIE = "Mapped Term CURIE" TGT_TERM_IRI = "Mapped Term IRI" MAPPING_SCORE = "Mapping Score" @@ -42,6 +44,7 @@ def to_dict(self): self.SRC_TERM_ID: self.source_term_id, self.SRC_TERM: self.source_term, self.TGT_TERM_LBL: self.mapped_term_label, + self.TGT_TERM_CURIE: onto_utils.curie_from_iri(self.mapped_term_iri), self.TGT_TERM_IRI: self.mapped_term_iri, self.MAPPING_SCORE: self.mapping_score } From 180d48bce8bf40f5f07feafb26323e41f958ee62 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 15 Mar 2023 14:42:54 -0400 Subject: [PATCH 078/185] Include only essential fields in ontology registry table --- README.md | 2 +- text2term/resources/ontologies.csv | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index c90a889..5fdfe81 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ cache_ontology_set(ontology_registry_path) The first of these will cache a single ontology from a URL or file path, with it being referenced by an acronym that will be used to reference it later. An example can be found below. The second function allows the user to cache several ontologies at once by referencing a CSV file of the format: -`acronym,name,version,date,url,comments`. An example is provided in `resources/ontologies.csv` +`acronym,version,url`. 
An example is provided in `resources/ontologies.csv` Once an ontology has been cached by either function, it is stored in a cache folder locally, and thus can be referenced even in different Python instances. diff --git a/text2term/resources/ontologies.csv b/text2term/resources/ontologies.csv index f0fdecd..77edfb6 100644 --- a/text2term/resources/ontologies.csv +++ b/text2term/resources/ontologies.csv @@ -1,11 +1,11 @@ -acronym,name,version,date,url,comments -CLO,Cell Line Ontology,2.1.178,3/20/22,http://purl.obolibrary.org/obo/clo.owl,non-versioned IRI -CL,Cell Ontology,9/15/22,9/15/22,http://purl.obolibrary.org/obo/cl/releases/2022-09-15/cl.owl, -EFO,Experimental Factor Ontology,3.46.0,9/15/22,https://github.com/EBISPOT/efo/releases/download/v3.46.0/efo.owl, -GO,Gene Ontology,9/19/22,9/19/22,http://purl.obolibrary.org/obo/go/releases/2022-09-19/go.owl, -HPO,Human Phenotype Ontology,6/11/22,6/11/22,http://purl.obolibrary.org/obo/hp/releases/2022-06-11/hp.owl, -MONDO,Monarch Disease Ontology,8/1/22,8/1/22,http://purl.obolibrary.org/obo/mondo/releases/2022-08-01/mondo.owl, -NCIT,NCI Thesaurus,22.07d,8/19/22,http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl, -PRO,Protein Ontology,67,8/8/22,http://purl.obolibrary.org/obo/pr/67.0/pr.owl, -UBERON,Uber-anatomy ontology,8/19/22,8/19/22,http://purl.obolibrary.org/obo/uberon/releases/2022-08-19/uberon.owl, -MP,Mammalian Phenotype Ontology,8/4/22,8/4/22,http://purl.obolibrary.org/obo/mp/releases/2022-08-04/mp.owl, \ No newline at end of file +acronym,version,url +CLO,2.1.178,http://purl.obolibrary.org/obo/clo.owl +CL,9/15/22,http://purl.obolibrary.org/obo/cl/releases/2022-09-15/cl.owl +EFO,3.46.0,https://github.com/EBISPOT/efo/releases/download/v3.46.0/efo.owl +GO,9/19/22,http://purl.obolibrary.org/obo/go/releases/2022-09-19/go.owl +HPO,6/11/22,http://purl.obolibrary.org/obo/hp/releases/2022-06-11/hp.owl +MONDO,8/1/22,http://purl.obolibrary.org/obo/mondo/releases/2022-08-01/mondo.owl +NCIT,22.07d,http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl +PRO,67,http://purl.obolibrary.org/obo/pr/67.0/pr.owl +UBERON,8/19/22,http://purl.obolibrary.org/obo/uberon/releases/2022-08-19/uberon.owl +MP,8/4/22,http://purl.obolibrary.org/obo/mp/releases/2022-08-04/mp.owl \ No newline at end of file From a21b1cbd31eb2ce2c97e01cdc5ddefef03a5ae58 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 20 Mar 2023 17:01:43 -0400 Subject: [PATCH 079/185] Fix module import statement --- text2term/term_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text2term/term_mapping.py b/text2term/term_mapping.py index 8abfc1f..f2ac1d2 100644 --- a/text2term/term_mapping.py +++ b/text2term/term_mapping.py @@ -1,7 +1,7 @@ """Provides TermMapping and TermMappingCollection classes""" import pandas as pd -import onto_utils +from text2term import onto_utils class TermMapping: From 4c7f65cd6db9c60fe87943a4e77af922160ef1a5 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Mon, 27 Mar 2023 13:33:37 -0400 Subject: [PATCH 080/185] Fixes to Clarity and Terminology Adds comments to the outputted csv files for text2term when the save_mappings options in enabled. These comments include information on the mapper and settings used, as well as the time and version that produced the results. Also changed all references from 'blacklist' to 'blocklist' to use modern terminology in the preprocessing. Still allows for backwards compatibility. 
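Since saved mapping files gain `#`-prefixed metadata lines with this change, a minimal sketch of reading such a file back, assuming pandas is available and using a hypothetical output path:

```python
import pandas as pd

# The '#' comment lines hold the run metadata; pandas can skip them on load
mappings = pd.read_csv("my-mappings.csv", comment="#")
print(mappings.columns.tolist())
```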
--- README.md | 11 +-- setup.py | 4 +- text2term/config.py | 1 + text2term/preprocess.py | 68 +++++++++------ text2term/t2t.py | 182 +++++++++++++++++++++------------------- 5 files changed, 148 insertions(+), 118 deletions(-) create mode 100644 text2term/config.py diff --git a/README.md b/README.md index 5fdfe81..2ebe50d 100644 --- a/README.md +++ b/README.md @@ -135,18 +135,19 @@ As of version 1.2.0, text2term includes regex-based preprocessing functionality Like the "map" functions above, the two functions differ on whether the input is a file or a list of strings: ```python -preprocess_file(file_path, template_path, output_file='', blacklist_path='', blacklist_char='', rem_duplicates=False) +preprocess_file(file_path, template_path, output_file='', blocklist_path='', blocklist_char='', rem_duplicates=False) ``` ```python -preprocess_terms(terms, template_path, output_file='', blacklist_path='', blacklist_char='', rem_duplicates=False) +preprocess_terms(terms, template_path, output_file='', blocklist_path='', blocklist_char='', rem_duplicates=False) ``` ```python -preprocess_tagged_terms(file_path, template_path='', blacklist_path='', blacklist_char='', rem_duplicates=False, separator=';:;') +preprocess_tagged_terms(file_path, template_path='', blocklist_path='', blocklist_char='', rem_duplicates=False, separator=';:;') ``` -In all cases, the regex templates and blacklist must be stored in a newline-separated file. If an output file is specified, the preprocessed strings are written to that file and the list of preprocessed strings is returned. +In all cases, the regex templates and blocklist must be stored in a newline-separated file. If an output file is specified, the preprocessed strings are written to that file and the list of preprocessed strings is returned. -The blacklist functionality allows the user to specify another regex file. If any terms match any regex in blacklist, they are removed from the terms, or, if a blacklist character is specified, replaced with that character for placeholding. +The blocklist functionality allows the user to specify another regex file. If any terms match any regex in blocklist, they are removed from the terms, or, if a blocklist character is specified, replaced with that character for placeholding. +NOTE: As of version 2.1.0, the arguments were changed to "blocklist" from "blacklist". Backwards compatibility is currently supported, but will likely be discontinued at the next major release. The Remove Duplicates `rem_duplicates` functionality will remove all duplicate terms after processing, if set to `True`. WARNING: Removing duplicates at any point does not guarantee which original term is kept. This is particularly important if original terms have different tags, so user caution is advised. 
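A minimal sketch of the blocklist behaviour described above, assuming hypothetical `templates.txt` and `blocklist.txt` regex files and illustrative input terms:

```python
import text2term

processed = text2term.preprocess_terms(
    ["age when diagnosed with asthma", "subject withdrew consent"],
    "templates.txt",
    blocklist_path="blocklist.txt",
    blocklist_char="-",   # blocked terms are kept as "-" placeholders instead of being dropped
    rem_duplicates=True,
)
# processed is a dictionary mapping each original term to its preprocessed form
```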
diff --git a/setup.py b/setup.py index 85655f8..7292f02 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages +from text2term.config import VERSION -version = '2.0.3' description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in ontologies' long_description = open('README.md').read() @@ -9,7 +9,7 @@ setup( name='text2term', - version=version, + version=VERSION, install_requires=requirements, packages=find_packages(), include_package_data=True, diff --git a/text2term/config.py b/text2term/config.py new file mode 100644 index 0000000..7aceefd --- /dev/null +++ b/text2term/config.py @@ -0,0 +1 @@ +VERSION = "2.1.0" \ No newline at end of file diff --git a/text2term/preprocess.py b/text2term/preprocess.py index 479dd73..17ea7ea 100644 --- a/text2term/preprocess.py +++ b/text2term/preprocess.py @@ -3,11 +3,17 @@ from enum import Enum from .tagged_terms import TaggedTerm -def preprocess_file(file_path, template_path, output_file="", blacklist_path="", \ - blacklist_char='', rem_duplicates=False): +def preprocess_file(file_path, template_path, output_file="", blocklist_path="", \ + blocklist_char='', blacklist_path="", blacklist_char='', \ + rem_duplicates=False): + # Allows backwards compatibility to blacklist. Will eventually be deleted + if blocklist_char == '': + blocklist_char = blacklist_char + if blocklist_path == "": + blocklist_path = blacklist_path terms = _get_values(file_path) processed_terms = preprocess_terms(terms, template_path, output_file=output_file, \ - blacklist_path=blacklist_path, blacklist_char=blacklist_char, \ + blocklist_path=blocklist_path, blocklist_char=blocklist_char, \ rem_duplicates=rem_duplicates) return processed_terms @@ -15,8 +21,14 @@ def preprocess_file(file_path, template_path, output_file="", blacklist_path="", ## Tags should be stored with their terms in the same line, delineated by ";:;" ## ex: Age when diagnosed with (.*) ;:; age,diagnosis ## "Age when diagnosed with cancer" becomes: {"cancer", ["age", "diagnosis"]} -def preprocess_tagged_terms(file_path, template_path="", blacklist_path="", \ - blacklist_char='', rem_duplicates=False, separator=";:;"): +def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", \ + blocklist_char='', blacklist_path="", blacklist_char='', \ + rem_duplicates=False, separator=";:;"): + # Allows backwards compatibility to blacklist. 
Will eventually be deleted + if blocklist_char == '': + blocklist_char = blacklist_char + if blocklist_path == "": + blocklist_path = blacklist_path # Seperate tags from the terms, put in TaggedTerm and add to list raw_terms = _get_values(file_path) terms = [] @@ -44,15 +56,15 @@ def preprocess_tagged_terms(file_path, template_path="", blacklist_path="", \ templates[regex_term] = [] templates[re.compile("(.*)")] = [] - # Create the blacklist, if it exists - blacklist = [] - if blacklist_path != "": - blacklist_strings = _get_values(blacklist_path) - blacklist = _make_regex_list(blacklist_strings) + # Create the blocklist, if it exists + blocklist = [] + if blocklist_path != "": + blocklist_strings = _get_values(blocklist_path) + blocklist = _make_regex_list(blocklist_strings) processed_terms = [] for term in terms: - if _blacklist_term(processed_terms, term, blacklist, blacklist_char, tagged=True): + if _blocklist_term(processed_terms, term, blocklist, blocklist_char, tagged=True): continue for template, tem_tags in templates.items(): match = template.fullmatch(term.get_original_term()) @@ -67,8 +79,14 @@ def preprocess_tagged_terms(file_path, template_path="", blacklist_path="", \ return processed_terms -def preprocess_terms(terms, template_path, output_file="", blacklist_path="", \ - blacklist_char='', rem_duplicates=False): +def preprocess_terms(terms, template_path, output_file="", blocklist_path="", \ + blocklist_char='', blacklist_path="", blacklist_char='', \ + rem_duplicates=False): + # Allows backwards compatibility to blacklist. Will eventually be deleted + if blocklist_char == '': + blocklist_char = blacklist_char + if blocklist_path == "": + blocklist_path = blacklist_path # Form the templates as regular expressions template_strings = [] if template_path != "": @@ -76,16 +94,16 @@ def preprocess_terms(terms, template_path, output_file="", blacklist_path="", \ template_strings.append("(.*)") templates = _make_regex_list(template_strings) - # Create the blacklist, if it exists - blacklist = [] - if blacklist_path != "": - blacklist_strings = _get_values(blacklist_path) - blacklist = _make_regex_list(blacklist_strings) + # Create the blocklist, if it exists + blocklist = [] + if blocklist_path != "": + blocklist_strings = _get_values(blocklist_path) + blocklist = _make_regex_list(blocklist_strings) - # Checks all terms against each blacklist then template + # Checks all terms against each blocklist then template processed_terms = {} for term in terms: - if _blacklist_term(processed_terms, term, blacklist, blacklist_char): + if _blocklist_term(processed_terms, term, blocklist, blocklist_char): continue for template in templates: match = template.fullmatch(term) @@ -105,15 +123,15 @@ def preprocess_terms(terms, template_path, output_file="", blacklist_path="", \ ## Note: Because Python Dictionaries and Lists are passed by reference (sort of), updating the ## dictionary/list here will update the dictionary in the caller -def _blacklist_term(processed_terms, term, blacklist, blacklist_char, tagged=False): - for banned in blacklist: +def _blocklist_term(processed_terms, term, blocklist, blocklist_char, tagged=False): + for banned in blocklist: match = banned.fullmatch(term if type(term) is not TaggedTerm else term.get_original_term()) if match: - if blacklist_char != '': + if blocklist_char != '': if tagged: - _update_tagged_term(processed_terms, term, blacklist_char) + _update_tagged_term(processed_terms, term, blocklist_char) else: - processed_terms[term] = blacklist_char + 
processed_terms[term] = blocklist_char return True return False diff --git a/text2term/t2t.py b/text2term/t2t.py index a02e01d..17621e7 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -1,5 +1,4 @@ """Provides Text2Term class""" - import os import sys import json @@ -17,62 +16,63 @@ from text2term.syntactic_mapper import SyntacticMapper from text2term.tfidf_mapper import TFIDFMapper from text2term.zooma_mapper import ZoomaMapper +from text2term.config import VERSION + +""" +Maps the terms in the given input file to the specified target ontology. + +Parameters +---------- +input_file : str + Path to input file containing 'source' terms to map to ontology terms (list of terms or CSV file) +target_ontology : str + Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies +base_iris : tuple + Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: + ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') +csv_columns : tuple + Name of the column that contains the terms to map, optionally followed by the name of the column that + contains identifiers for the terms (eg 'my_terms,my_term_ids') +separator : str + Specifies the cell separator to be used when reading a non-comma-separated tabular file +excl_deprecated : bool + Exclude ontology terms stated as deprecated via `owl:deprecated true` +mapper : mapper.Mapper + Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, + fuzzy, tfidf, zooma, bioportal +max_mappings : int + Maximum number of top-ranked mappings returned per source term +min_score : float + Minimum similarity score [0,1] for the mappings (1=exact match) +output_file : str + Path to desired output file for the mappings +save_graphs : bool + Save vis.js graphs representing the neighborhood of each ontology term +save_mappings : bool + Save the generated mappings to a file (specified by `output_file`) +Returns +---------- +df + Data frame containing the generated ontology mappings +""" def map_file(input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False, separator=',', use_cache=False): - """ - Maps the terms in the given input file to the specified target ontology. - - Parameters - ---------- - input_file : str - Path to input file containing 'source' terms to map to ontology terms (list of terms or CSV file) - target_ontology : str - Path or URL of 'target' ontology to map the source terms to. 
When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies - base_iris : tuple - Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: - ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') - csv_columns : tuple - Name of the column that contains the terms to map, optionally followed by the name of the column that - contains identifiers for the terms (eg 'my_terms,my_term_ids') - separator : str - Specifies the cell separator to be used when reading a non-comma-separated tabular file - excl_deprecated : bool - Exclude ontology terms stated as deprecated via `owl:deprecated true` - mapper : mapper.Mapper - Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, - fuzzy, tfidf, zooma, bioportal - max_mappings : int - Maximum number of top-ranked mappings returned per source term - min_score : float - Minimum similarity score [0,1] for the mappings (1=exact match) - output_file : str - Path to desired output file for the mappings - save_graphs : bool - Save vis.js graphs representing the neighborhood of each ontology term - save_mappings : bool - Save the generated mappings to a file (specified by `output_file`) - - Returns - ---------- - df - Data frame containing the generated ontology mappings - """ source_terms, source_terms_ids = _load_data(input_file, csv_columns, separator) return map_terms(source_terms, target_ontology, source_terms_ids=source_terms_ids, base_iris=base_iris, excl_deprecated=excl_deprecated, max_mappings=max_mappings, mapper=mapper, min_score=min_score, output_file=output_file, save_graphs=save_graphs, save_mappings=save_mappings, use_cache=use_cache) +""" +All parameters are the same as below, but tagged_terms_dict is a dictionary where the + key is the source term and the value is a list of all tags (or a single string for + one tag). It can also be a list of TaggedTerm objects. + The dataframe returned is the same but contains a tags column +""" def map_tagged_terms(tagged_terms_dict, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), use_cache=False): - """ - All parameters are the same as below, but tagged_terms_dict is a dictionary where the - key is the source term and the value is a list of all tags (or a single string for - one tag). It can also be a list of TaggedTerm objects. - The dataframe returned is the same but contains a tags column - """ # If the input is a dict, use keys. If it is a list, it is a list of TaggedTerms if isinstance(tagged_terms_dict, dict): terms = list(tagged_terms_dict.keys()) @@ -105,47 +105,47 @@ def map_tagged_terms(tagged_terms_dict, target_ontology, base_iris=(), excl_depr df.loc[df['Source Term'] == term.get_term(), "Tags"] = to_store if save_mappings: - _save_mappings(df, output_file) + _save_mappings(df, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings) return df -def map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, - mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), use_cache=False): - """ - Maps the terms in the given list to the specified target ontology. +""" +Maps the terms in the given list to the specified target ontology. 
- Parameters - ---------- - source_terms : list - List of 'source' terms to map to ontology terms - target_ontology : str - Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies - base_iris : tuple - Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: - ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') - source_terms_ids : tuple - Collection of identifiers for the given source terms - excl_deprecated : bool - Exclude ontology terms stated as deprecated via `owl:deprecated true` - mapper : mapper.Mapper - Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, - fuzzy, tfidf, zooma, bioportal - max_mappings : int - Maximum number of top-ranked mappings returned per source term - min_score : float - Minimum similarity score [0,1] for the mappings (1=exact match) - output_file : str - Path to desired output file for the mappings - save_graphs : bool - Save vis.js graphs representing the neighborhood of each ontology term - save_mappings : bool - Save the generated mappings to a file (specified by `output_file`) +Parameters +---------- +source_terms : list + List of 'source' terms to map to ontology terms +target_ontology : str + Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies +base_iris : tuple + Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: + ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') +source_terms_ids : tuple + Collection of identifiers for the given source terms +excl_deprecated : bool + Exclude ontology terms stated as deprecated via `owl:deprecated true` +mapper : mapper.Mapper + Method used to compare source terms with ontology terms. 
One of: levenshtein, jaro, jarowinkler, jaccard, + fuzzy, tfidf, zooma, bioportal +max_mappings : int + Maximum number of top-ranked mappings returned per source term +min_score : float + Minimum similarity score [0,1] for the mappings (1=exact match) +output_file : str + Path to desired output file for the mappings +save_graphs : bool + Save vis.js graphs representing the neighborhood of each ontology term +save_mappings : bool + Save the generated mappings to a file (specified by `output_file`) - Returns - ---------- - df - Data frame containing the generated ontology mappings - """ +Returns +---------- +df + Data frame containing the generated ontology mappings +""" +def map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, + mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), use_cache=False): if len(source_terms_ids) != len(source_terms): if len(source_terms_ids) > 0: sys.stderr.write("Warning: Source Term Ids are non-zero, but will not be used.") @@ -159,7 +159,7 @@ def map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False target_terms = _load_ontology(target_ontology, base_iris, excl_deprecated, use_cache) mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score) if save_mappings: - _save_mappings(mappings_df, output_file) + _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings) if save_graphs: _save_graphs(target_terms, output_file) return mappings_df @@ -255,10 +255,20 @@ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappi else: raise ValueError("Unsupported mapper: " + mapper) -def _save_mappings(mappings, output_file): +def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings): if os.path.dirname(output_file): # create output directories if needed os.makedirs(os.path.dirname(output_file), exist_ok=True) - mappings.to_csv(output_file, index=False) + with open(output_file, "a") as f: + f.write("# Date and time run: %s\n" % datetime.datetime.now()) + f.write("# Target Ontology: %s\n" % target_ontology) + f.write("# Text2term version: %s\n" % VERSION) + f.write("# Minimum Score: %.2f\n" % min_score) + f.write("# Mapper: %s\n" % mapper.value) + f.write("# Base IRIs: %s\n" % (base_iris,)) + f.write("# Max Mappings: %d\n" % max_mappings) + f.write("# Depricated Terms ") + f.write("Excluded\n" if excl_deprecated else "Included\n") + mappings.to_csv(output_file, index=False, mode='a') def _save_graphs(terms, output_file): term_graphs = TermGraphGenerator(terms).graphs_dicts() From 5115e9aa57d5e5133d9e37c84d1fbb757f889009 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 29 Mar 2023 16:05:15 -0400 Subject: [PATCH 081/185] Add ontology term CURIE as property of mapping object --- text2term/term_mapping.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/text2term/term_mapping.py b/text2term/term_mapping.py index f2ac1d2..39ef795 100644 --- a/text2term/term_mapping.py +++ b/text2term/term_mapping.py @@ -35,6 +35,10 @@ def mapped_term_label(self): def mapped_term_iri(self): return self._mapped_term_iri + @property + def mapped_term_curie(self): + return onto_utils.curie_from_iri(self.mapped_term_iri) + @property def mapping_score(self): return self._mapping_score @@ -44,7 +48,7 @@ def to_dict(self): self.SRC_TERM_ID: self.source_term_id, 
self.SRC_TERM: self.source_term, self.TGT_TERM_LBL: self.mapped_term_label, - self.TGT_TERM_CURIE: onto_utils.curie_from_iri(self.mapped_term_iri), + self.TGT_TERM_CURIE: self.mapped_term_curie, self.TGT_TERM_IRI: self.mapped_term_iri, self.MAPPING_SCORE: self.mapping_score } From bb0e18bb641bcc6dae9206b77c47251df2bfd30b Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Tue, 25 Apr 2023 14:48:44 -0400 Subject: [PATCH 082/185] Allow Properties Add functionality to text2term to let the user match and work with Property Classes in ontologies as well as the standard class ontologies. --- .gitignore | 3 +- README.md | 100 +++++++++++++++++++----------------- text2term/__main__.py | 4 +- text2term/config.py | 2 +- text2term/t2t.py | 26 ++++++---- text2term/term.py | 7 ++- text2term/term_collector.py | 47 +++++++++++++---- 7 files changed, 117 insertions(+), 72 deletions(-) diff --git a/.gitignore b/.gitignore index 96637bf..b38afab 100644 --- a/.gitignore +++ b/.gitignore @@ -84,9 +84,8 @@ ipython_config.py # pyenv .python-version -# For PyPi upload +# For PyPi upload and testing make-pypi.sh -test-pypi.py # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. diff --git a/README.md b/README.md index 2ebe50d..1dd89ab 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,45 @@ Install package using **pip**: ``` pip install text2term ``` +## Examples +### Programmatic +```python +import text2term +import pandas + +df1 = text2term.map_file(unstruct_terms.txt, "http://www.ebi.ac.uk/efo/efo.owl") +df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://www.ebi.ac.uk/efo/efo.owl") +``` +Below is an example of caching, assuming the same imports as above: +```python +text2term.cache_ontology("http://www.ebi.ac.uk/efo/efo.owl", "EFO") +df1 = text2term.map_file(unstruct_terms.txt, "EFO", use_cache=True) +df2 = text2term.map_terms(["asthma", "acute bronchitis"], "EFO", use_cache=True) +text2term.clear_cache("EFO") +``` + +### Command Line +The basic use of the tool requires a `source` file containing a list of terms to map to the given `target` ontology: +`python text2term -s unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl` + +Specify an output file where the mappings should be saved using `-o`: +`python text2term -s unstruct_terms.txt -t efo.owl -o /Documents/my-mappings.csv` + +Set the minimum acceptable similarity score for mapping each given term to an ontology term using `-min`: +`python text2term -s unstruct_terms.txt -t efo.owl -min 0.8` +The mapped terms returned will have been determined to be 0.8 similar to their source terms in a 0-1 scale. + +Exclude deprecated ontology terms (declared as such via *owl:deprecated true*) using `-d`: +`python text2term -s unstruct_terms.txt -t efo.owl -d` + +Limit search to only terms whose IRIs start with any IRI given in a list specified using `-iris`: +`python text2term.py -s unstruct_terms.txt -t efo.owl -iris http://www.ebi.ac.uk/efo/EFO,http://purl.obolibrary.org/obo/HP` +Here, because EFO reuses terms from other ontologies such as HP and GO, the HP terms would be included but the GO terms would be excluded. 
+ +Use the cache on the command line, first by flagging it, then in the future using the acronym: +`python text2term -s unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl -c EFO` +Then, after running this, the following command is equivalent: +`python text2term -s unstruct_terms.txt -t EFO` ## Programmatic Usage The tool can be executed in Python with any of the three following functions: @@ -24,7 +63,8 @@ text2term.map_file(input_file='/some/file.txt', save_graphs=False, save_mappings=False, separator=',', - use_cache=False) + use_cache=False, + term_type='classes') ``` or ```python @@ -39,7 +79,8 @@ text2term.map_terms(source_terms=['term one', 'term two'], save_graphs=False, save_mappings=False, source_terms_ids=(), - use_cache=False) + use_cache=False, + term_type='classes') ``` or ```python @@ -54,7 +95,8 @@ text2term.map_tagged_terms(tagged_terms_dict={'term one': ["tag 1", "tag 2"]}, save_graphs=False, save_mappings=False, source_terms_ids=(), - use_cache=False) + use_cache=False, + term_type='classes') ``` ### Arguments @@ -98,6 +140,12 @@ All other arguments are the same, and have the same functionality: `save_mappings` : bool Save the generated mappings to a file (specified by `output_file`) +`use_cache` : bool + Use the cache for the ontology. More details are below. + +`term_type` : str + Determines whether the ontology should be parsed for its classes (ThingClass), properties (PropertyClass), or both. Possible values are ['classes', 'properties', 'both']. If it does not match one of these values, the program will throw a ValueError. + All default values, if they exist, can be seen above. ### Return Value @@ -116,7 +164,7 @@ cache_ontology(ontology_url, ontology_acronym, base_iris=()) cache_ontology_set(ontology_registry_path) ``` -The first of these will cache a single ontology from a URL or file path, with it being referenced by an acronym that will be used to reference it later. An example can be found below. +The first of these will cache a single ontology from a URL or file path, with it being referenced by an acronym that will be used to reference it later. An example can be found above. The second function allows the user to cache several ontologies at once by referencing a CSV file of the format: `acronym,version,url`. An example is provided in `resources/ontologies.csv` @@ -128,7 +176,7 @@ After an ontology is cached, the user can access the cache by using the assigned To clear the cache, one can call: `clear_cache(ontology_acronym='')` If no arguments are specified, the entire cache will be cleared. Otherwise, only the ontology with the given acronym will be cleared. -Finally, `cache_exists(ontology_acronym)` is a simple function that returns `True` if the given acronym exists in the cache, and `False` otherwise. It is worth noting that while ontology URLs can repeat, acronyms must be distinct in a given environment. +Finally, `cache_exists(ontology_acronym='')` is a simple function that returns `True` if the given acronym exists in the cache, and `False` otherwise. It is worth noting that while ontology URLs can repeat, acronyms must be distinct in a given environment. ### Input Preprocessing As of version 1.2.0, text2term includes regex-based preprocessing functionality for input terms. Specifically, these functions take the input terms and a collection of (user-defined) regular expressions, then match each term to each regular expression to simplify the input term. 
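A minimal sketch of the IRI restriction shown in the command-line examples above, done programmatically via the documented `base_iris` parameter; the input terms are illustrative:

```python
import text2term

# Keep only EFO- and HP-namespaced ontology terms as mapping targets
df = text2term.map_terms(["asthma", "liver"], "http://www.ebi.ac.uk/efo/efo.owl",
                         base_iris=("http://www.ebi.ac.uk/efo/EFO",
                                    "http://purl.obolibrary.org/obo/HP"))
```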
@@ -201,44 +249,4 @@ To display a help message with descriptions of tool arguments do: `-g SAVE_TERM_GRAPHS` Save [vis.js](https://visjs.org) graphs representing the neighborhood of each ontology term. -`-c STORE_IN_CACHE` Using this flag followed by the acronym the ontology should be stored as, the program will same the target ontology to the cache. After that, referencing the acronym in `target` will reference the cache. Examples are below. - -## Examples -### Programmatic -```python -import text2term -import pandas - -df1 = text2term.map_file(unstruct_terms.txt, "http://www.ebi.ac.uk/efo/efo.owl") -df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://www.ebi.ac.uk/efo/efo.owl") -``` -Below is an example of caching, assuming the same imports as above: -```python -text2term.cache_ontology("http://www.ebi.ac.uk/efo/efo.owl", "EFO") -df1 = text2term.map_file(unstruct_terms.txt, "EFO", use_cache=True) -df2 = text2term.map_terms(["asthma", "acute bronchitis"], "EFO", use_cache=True) -text2term.clear_cache("EFO") -``` - -### Command Line -The basic use of the tool requires a `source` file containing a list of terms to map to the given `target` ontology: -`python text2term -s unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl` - -Specify an output file where the mappings should be saved using `-o`: -`python text2term -s unstruct_terms.txt -t efo.owl -o /Documents/my-mappings.csv` - -Set the minimum acceptable similarity score for mapping each given term to an ontology term using `-min`: -`python text2term -s unstruct_terms.txt -t efo.owl -min 0.8` -The mapped terms returned will have been determined to be 0.8 similar to their source terms in a 0-1 scale. - -Exclude deprecated ontology terms (declared as such via *owl:deprecated true*) using `-d`: -`python text2term -s unstruct_terms.txt -t efo.owl -d` - -Limit search to only terms whose IRIs start with any IRI given in a list specified using `-iris`: -`python text2term.py -s unstruct_terms.txt -t efo.owl -iris http://www.ebi.ac.uk/efo/EFO,http://purl.obolibrary.org/obo/HP` -Here, because EFO reuses terms from other ontologies such as HP and GO, the HP terms would be included but the GO terms would be excluded. - -Use the cache on the command line, first by flagging it, then in the future using the acronym: -`python text2term -s unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl -c EFO` -Then, after running this, the following command is equivalent: -`python text2term -s unstruct_terms.txt -t EFO` +`-c STORE_IN_CACHE` Using this flag followed by the acronym the ontology should be stored as, the program will same the target ontology to the cache. After that, referencing the acronym in `target` will reference the cache. Examples are above. 
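A minimal sketch of the new `term_type` argument documented above; the input term is illustrative and `'classes'` remains the default:

```python
import text2term

# Collect ontology properties (PropertyClass) instead of classes as mapping targets
df = text2term.map_terms(["has disease location"], "http://www.ebi.ac.uk/efo/efo.owl",
                         term_type="properties")
```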
diff --git a/text2term/__main__.py b/text2term/__main__.py index e08c1e8..d8069d3 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -37,6 +37,8 @@ help="Save vis.js graphs representing the neighborhood of each ontology term (default=False)") parser.add_argument("-c", "--store_in_cache", required=False, type=str, default="", help="Store the target ontology into local cache under acronym") + parser.add_argument("-type", "--term_type", required=False, type=str, default="classes", + help="Define whether to return ontology classes, properties, or both") arguments = parser.parse_args() if not os.path.exists(arguments.source): @@ -57,4 +59,4 @@ map_file(arguments.source, target, output_file=arguments.output, csv_columns=csv_columns, excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings, min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs, - save_mappings=True, separator=arguments.separator, use_cache=cache_exists(target)) + save_mappings=True, separator=arguments.separator, use_cache=cache_exists(target), term_types=arguments.term_type) diff --git a/text2term/config.py b/text2term/config.py index 7aceefd..5138066 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "2.1.0" \ No newline at end of file +VERSION = "2.2.0" \ No newline at end of file diff --git a/text2term/t2t.py b/text2term/t2t.py index 17621e7..f8b88d5 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -59,11 +59,12 @@ """ def map_file(input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False, - separator=',', use_cache=False): + separator=',', use_cache=False, term_type='classes'): source_terms, source_terms_ids = _load_data(input_file, csv_columns, separator) return map_terms(source_terms, target_ontology, source_terms_ids=source_terms_ids, base_iris=base_iris, excl_deprecated=excl_deprecated, max_mappings=max_mappings, mapper=mapper, min_score=min_score, - output_file=output_file, save_graphs=save_graphs, save_mappings=save_mappings, use_cache=use_cache) + output_file=output_file, save_graphs=save_graphs, save_mappings=save_mappings, + use_cache=use_cache, term_type=term_type) """ All parameters are the same as below, but tagged_terms_dict is a dictionary where the @@ -72,7 +73,8 @@ def map_file(input_file, target_ontology, base_iris=(), csv_columns=(), excl_dep The dataframe returned is the same but contains a tags column """ def map_tagged_terms(tagged_terms_dict, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, - mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), use_cache=False): + mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), use_cache=False, + term_type='classes'): # If the input is a dict, use keys. 
If it is a list, it is a list of TaggedTerms if isinstance(tagged_terms_dict, dict): terms = list(tagged_terms_dict.keys()) @@ -89,7 +91,8 @@ def map_tagged_terms(tagged_terms_dict, target_ontology, base_iris=(), excl_depr # Run the mapper df = map_terms(terms, target_ontology, base_iris=base_iris, excl_deprecated=excl_deprecated, \ max_mappings=max_mappings, min_score=min_score, mapper=mapper, output_file=output_file, \ - save_graphs=save_graphs, source_terms_ids=source_terms_ids, use_cache=use_cache) + save_graphs=save_graphs, source_terms_ids=source_terms_ids, use_cache=use_cache, \ + term_type=term_type) # For each term in dict, add tags to corresponding mappings row in "Tags" Column if isinstance(tagged_terms_dict, dict): @@ -145,7 +148,8 @@ def map_tagged_terms(tagged_terms_dict, target_ontology, base_iris=(), excl_depr Data frame containing the generated ontology mappings """ def map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, - mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), use_cache=False): + mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), + use_cache=False, term_type='classes'): if len(source_terms_ids) != len(source_terms): if len(source_terms_ids) > 0: sys.stderr.write("Warning: Source Term Ids are non-zero, but will not be used.") @@ -156,7 +160,7 @@ def map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False if mapper in {Mapper.ZOOMA, Mapper.BIOPORTAL}: target_terms = '' if target_ontology.lower() == 'all' else target_ontology else: - target_terms = _load_ontology(target_ontology, base_iris, excl_deprecated, use_cache) + target_terms = _load_ontology(target_ontology, base_iris, excl_deprecated, use_cache, term_type) mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score) if save_mappings: _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings) @@ -179,7 +183,7 @@ def cache_ontology_set(ontology_registry_path): # Caches a single ontology def cache_ontology(ontology_url, ontology_acronym, base_iris=()): - ontology_terms = _load_ontology(ontology_url, base_iris, exclude_deprecated=False) + ontology_terms = _load_ontology(ontology_url, base_iris, exclude_deprecated=False, term_type='both') cache_dir = "cache/" + ontology_acronym + "/" if not os.path.exists(cache_dir): os.makedirs(cache_dir) @@ -189,7 +193,7 @@ def cache_ontology(ontology_url, ontology_acronym, base_iris=()): ontology_terms.clear() # Will check if an acronym exists in the cache -def cache_exists(ontology_acronym): +def cache_exists(ontology_acronym=''): return os.path.exists("cache/" + ontology_acronym) # Clears the cache @@ -227,14 +231,14 @@ def _load_data(input_file_path, csv_column_names, separator): term_ids = onto_utils.generate_iris(len(terms)) return terms, term_ids -def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False): +def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_type='classes'): term_collector = OntologyTermCollector() if use_cache: pickle_file = "cache/" + ontology + "/" + ontology + "-term-details.pickle" onto_terms_unfiltered = pickle.load(open(pickle_file, "rb")) - onto_terms = term_collector.filter_terms(onto_terms_unfiltered, iris, exclude_deprecated) + onto_terms = term_collector.filter_terms(onto_terms_unfiltered, iris, exclude_deprecated, term_type) 
else: - onto_terms = term_collector.get_ontology_terms(ontology, base_iris=iris, exclude_deprecated=exclude_deprecated) + onto_terms = term_collector.get_ontology_terms(ontology, base_iris=iris, exclude_deprecated=exclude_deprecated, term_type=term_type) if len(onto_terms) == 0: raise RuntimeError("Could not find any terms in the given ontology.") return onto_terms diff --git a/text2term/term.py b/text2term/term.py index d24f35e..4698982 100644 --- a/text2term/term.py +++ b/text2term/term.py @@ -3,7 +3,7 @@ class OntologyTerm: - def __init__(self, iri, labels, definitions=(), synonyms=(), parents=(), children=(), instances=(), deprecated=False): + def __init__(self, iri, labels, definitions=(), synonyms=(), parents=(), children=(), instances=(), deprecated=False, termtype='class'): """ Constructor for a succinct representation of an ontology term :param iri: IRI of the ontology term @@ -22,6 +22,7 @@ def __init__(self, iri, labels, definitions=(), synonyms=(), parents=(), childre self._children = children self._instances = instances self._deprecated = deprecated + self._termtype = termtype @property def iri(self): @@ -60,6 +61,10 @@ def label(self): def deprecated(self): return self._deprecated + @property + def termtype(self): + return self._termtype + def __eq__(self, other): if isinstance(other, OntologyTerm): return self._iri == other._iri diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 25c3102..34d2b71 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -5,13 +5,14 @@ from text2term.term import OntologyTerm import logging +options = ['classes', 'properties', 'both'] class OntologyTermCollector: def __init__(self, log_level=logging.INFO): self.logger = onto_utils.get_logger(__name__, level=log_level) - def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, exclude_deprecated=False): + def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, exclude_deprecated=False, term_type="classes"): """ Collect the terms described in the ontology at the specified IRI :param ontology_iri: IRI of the ontology (e.g., path of ontology document in the local file system, URL) @@ -32,10 +33,10 @@ def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, ex query = iri + "*" self.logger.info("...collecting terms with IRIs starting in: " + iri) iris = list(default_world.search(iri=query)) - ontology_terms = ontology_terms | self._get_ontology_terms(iris, ontology, exclude_deprecated) + ontology_terms = ontology_terms | self._get_ontology_terms(iris, ontology, exclude_deprecated, term_type) else: ontology_signature = self._get_ontology_signature(ontology) - ontology_terms = self._get_ontology_terms(ontology_signature, ontology, exclude_deprecated) + ontology_terms = self._get_ontology_terms(ontology_signature, ontology, exclude_deprecated, term_type) end = time.time() self.logger.info("...done: collected %i ontology terms (collection time: %.2fs)", len(ontology_terms), end - start) @@ -47,7 +48,7 @@ def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, ex self.logger.debug("Unable to destroy ontology: ", err) return ontology_terms - def filter_terms(self, onto_terms, iris=(), excl_deprecated=False): + def filter_terms(self, onto_terms, iris=(), excl_deprecated=False, term_type='classes'): filtered_onto_terms = {} for base_iri, term in onto_terms.items(): if type(iris) == str: @@ -55,22 +56,26 @@ def filter_terms(self, onto_terms, iris=(), excl_deprecated=False): 
else: begins_with_iri = (iris == ()) or any(base_iri.startswith(iri) for iri in iris) is_not_deprecated = (not excl_deprecated) or (not term.deprecated) - if begins_with_iri and is_not_deprecated: + include = self._filter_term_type(term, term_type, True) + if begins_with_iri and is_not_deprecated and include: filtered_onto_terms.update({base_iri: term}) return filtered_onto_terms - def _get_ontology_signature(self, ontology): + def _get_ontology_signature(self, ontology, term_type='classes'): signature = list(ontology.classes()) + signature.extend(list(ontology.properties())) # ontology.classes() does not include classes in imported ontologies; we need to explicitly add them to our list for imported_ontology in ontology.imported_ontologies: signature.extend(list(imported_ontology.classes())) + signature.extend(list(imported_ontology.properties())) return signature - def _get_ontology_terms(self, term_list, ontology, exclude_deprecated): + def _get_ontology_terms(self, term_list, ontology, exclude_deprecated, term_type): ontology_terms = dict() for ontology_term in term_list: - if not isinstance(ontology_term, PropertyClass) and ontology_term is not Thing \ - and ontology_term is not Nothing: + # Parse if should include ontology classes, properties, or both + include = self._filter_term_type(ontology_term, term_type, False) + if include and ontology_term is not Thing and ontology_term is not Nothing: if (exclude_deprecated and not deprecated[ontology_term]) or (not exclude_deprecated): iri = ontology_term.iri labels = self._get_labels(ontology_term) @@ -80,14 +85,36 @@ def _get_ontology_terms(self, term_list, ontology, exclude_deprecated): instances = self._get_instances(ontology_term, ontology) definitions = self._get_definitions(ontology_term) is_deprecated = deprecated[ontology_term] == [True] + if self._filter_term_type(ontology_term, "classes", False): + termtype = 'class' + elif self._filter_term_type(ontology_term, "properties", False): + termtype = 'property' + else: + termtype = None term_details = OntologyTerm(iri, labels, definitions=definitions, synonyms=synonyms, parents=parents, children=children, instances=instances, - deprecated=is_deprecated) + deprecated=is_deprecated, termtype=termtype) ontology_terms[iri] = term_details else: self.logger.debug("Excluding deprecated ontology term: %s", ontology_term.iri) return ontology_terms + def _filter_term_type(self, ontology_term, term_type, cached): + if term_type == 'classes': + if cached: + return ontology_term.termtype == 'class' + else: + return not isinstance(ontology_term, PropertyClass) + elif term_type == 'properties': + if cached: + return ontology_term.termtype == 'property' + else: + return isinstance(ontology_term, PropertyClass) + elif term_type == 'both': + return True + else: + raise ValueError("Option to include Properties or Classes is not valid") + def _get_parents(self, ontology_term): parents = dict() # named/atomic superclasses except owl:Thing try: From f1ab790a53ce84a94fda11a9210b9f2babb63224 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Thu, 27 Apr 2023 10:18:50 -0400 Subject: [PATCH 083/185] Added example Added the test/unstruct_terms.txt to the repo to allow users to try functionality from the start --- README.md | 18 +++++++++--------- text2term/__main__.py | 2 +- text2term/t2t.py | 1 + 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 1dd89ab..edb6db7 100644 --- a/README.md +++ b/README.md @@ -13,39 +13,39 @@ pip install text2term import text2term import 
pandas -df1 = text2term.map_file(unstruct_terms.txt, "http://www.ebi.ac.uk/efo/efo.owl") +df1 = text2term.map_file("test/unstruct_terms.txt", "http://www.ebi.ac.uk/efo/efo.owl") df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://www.ebi.ac.uk/efo/efo.owl") ``` Below is an example of caching, assuming the same imports as above: ```python text2term.cache_ontology("http://www.ebi.ac.uk/efo/efo.owl", "EFO") -df1 = text2term.map_file(unstruct_terms.txt, "EFO", use_cache=True) +df1 = text2term.map_file("test/unstruct_terms.txt", "EFO", use_cache=True) df2 = text2term.map_terms(["asthma", "acute bronchitis"], "EFO", use_cache=True) text2term.clear_cache("EFO") ``` ### Command Line The basic use of the tool requires a `source` file containing a list of terms to map to the given `target` ontology: -`python text2term -s unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl` +`python text2term -s test/unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl` Specify an output file where the mappings should be saved using `-o`: -`python text2term -s unstruct_terms.txt -t efo.owl -o /Documents/my-mappings.csv` +`python text2term -s test/unstruct_terms.txt -t efo.owl -o /Documents/my-mappings.csv` Set the minimum acceptable similarity score for mapping each given term to an ontology term using `-min`: -`python text2term -s unstruct_terms.txt -t efo.owl -min 0.8` +`python text2term -s test/unstruct_terms.txt -t efo.owl -min 0.8` The mapped terms returned will have been determined to be 0.8 similar to their source terms in a 0-1 scale. Exclude deprecated ontology terms (declared as such via *owl:deprecated true*) using `-d`: -`python text2term -s unstruct_terms.txt -t efo.owl -d` +`python text2term -s test/unstruct_terms.txt -t efo.owl -d` Limit search to only terms whose IRIs start with any IRI given in a list specified using `-iris`: -`python text2term.py -s unstruct_terms.txt -t efo.owl -iris http://www.ebi.ac.uk/efo/EFO,http://purl.obolibrary.org/obo/HP` +`python text2term.py -s test/unstruct_terms.txt -t efo.owl -iris http://www.ebi.ac.uk/efo/EFO,http://purl.obolibrary.org/obo/HP` Here, because EFO reuses terms from other ontologies such as HP and GO, the HP terms would be included but the GO terms would be excluded. 
Use the cache on the command line, first by flagging it, then in the future using the acronym: -`python text2term -s unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl -c EFO` +`python text2term -s test/unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl -c EFO` Then, after running this, the following command is equivalent: -`python text2term -s unstruct_terms.txt -t EFO` +`python text2term -s test/unstruct_terms.txt -t EFO` ## Programmatic Usage The tool can be executed in Python with any of the three following functions: diff --git a/text2term/__main__.py b/text2term/__main__.py index d8069d3..34e1d39 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -59,4 +59,4 @@ map_file(arguments.source, target, output_file=arguments.output, csv_columns=csv_columns, excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings, min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs, - save_mappings=True, separator=arguments.separator, use_cache=cache_exists(target), term_types=arguments.term_type) + save_mappings=True, separator=arguments.separator, use_cache=cache_exists(target), term_type=arguments.term_type) diff --git a/text2term/t2t.py b/text2term/t2t.py index f8b88d5..61c3241 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -238,6 +238,7 @@ def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_typ onto_terms_unfiltered = pickle.load(open(pickle_file, "rb")) onto_terms = term_collector.filter_terms(onto_terms_unfiltered, iris, exclude_deprecated, term_type) else: + onto_terms = term_collector.get_ontology_terms(ontology, base_iris=iris, exclude_deprecated=exclude_deprecated, term_type=term_type) if len(onto_terms) == 0: raise RuntimeError("Could not find any terms in the given ontology.") From b809fb782dcae8a938961bdb3af045eab744db94 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Thu, 27 Apr 2023 10:20:47 -0400 Subject: [PATCH 084/185] Removed test/ from .gitignore Exposes the tests we have in the local folder, including unstruct_terms.txt --- .gitignore | 3 +-- test/simple-test.py | 14 ++++++++++++++ test/test-pypi.py | 39 +++++++++++++++++++++++++++++++++++++++ test/unstruct_terms.txt | 2 ++ 4 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 test/simple-test.py create mode 100644 test/test-pypi.py create mode 100644 test/unstruct_terms.txt diff --git a/.gitignore b/.gitignore index b38afab..e66c4c9 100644 --- a/.gitignore +++ b/.gitignore @@ -133,5 +133,4 @@ dmypy.json # Other .idea -.DS_Store -test/* \ No newline at end of file +.DS_Store \ No newline at end of file diff --git a/test/simple-test.py b/test/simple-test.py new file mode 100644 index 0000000..6ac2607 --- /dev/null +++ b/test/simple-test.py @@ -0,0 +1,14 @@ +import text2term + +def main(): + efo = "http://www.ebi.ac.uk/efo/efo.owl#" + pizza = "https://protege.stanford.edu/ontologies/pizza/pizza.owl" + ncit = "http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl" + if not text2term.cache_exists("EFO"): + text2term.cache_ontology(efo, "EFO") + df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", excl_deprecated=True, use_cache=True, term_type="classes") + # df = text2term.map_terms(["contains", "asthma"], efo, term_type="classes") + print(df.to_string()) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/test/test-pypi.py b/test/test-pypi.py new file mode 100644 index 0000000..813fca1 --- /dev/null +++ b/test/test-pypi.py 
@@ -0,0 +1,39 @@ +from contextlib import contextmanager +import sys, os +import text2term + +def main(): + try: + with suppress_stdout(): + # Simple set up and testing + text2term.map_terms(["fever", "headache"], "https://github.com/EBISPOT/efo/releases/download/current/efo.owl") + text2term.cache_ontology("https://github.com/EBISPOT/efo/releases/download/current/efo.owl", "EFO") + text2term.map_terms(["fever", "headache"], "EFO", use_cache=True) + text2term.map_terms(["fever", "headache"], "EFO", base_iris=("www."), mapper=text2term.mapper.Mapper.levenshtein, max_mappings=4, use_cache=True) + + # Properties and classes tests + text2term.map_terms(["fever", "headache"], "EFO", term_type="classes", use_cache=True) + text2term.map_terms(["contains", "location"], "EFO", term_type="properties", use_cache=True) + text2term.map_terms(["fever", "contains"], "EFO", term_type="both", use_cache=True) + + # Clear cache and set down + text2term.clear_cache("EFO") + except: + print("ERROR") + +# From https://stackoverflow.com/questions/2125702/how-to-suppress-console-output-in-python +@contextmanager +def suppress_stdout(): + with open(os.devnull, "w") as devnull: + old_stdout = sys.stdout + old_stderr = sys.stderr + sys.stdout = devnull + sys.stderr = devnull + try: + yield + finally: + sys.stdout = old_stdout + sys.stderr = old_stderr + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/test/unstruct_terms.txt b/test/unstruct_terms.txt new file mode 100644 index 0000000..2dc3f97 --- /dev/null +++ b/test/unstruct_terms.txt @@ -0,0 +1,2 @@ +asthma +acute bronchitis \ No newline at end of file From 79014ba9cf17f86783c1586cda31d0459c6c60f6 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 1 May 2023 10:44:32 -0400 Subject: [PATCH 085/185] Minor changes + By default do not use broad synonyms --- text2term/t2t.py | 3 +-- text2term/term_collector.py | 9 +++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/text2term/t2t.py b/text2term/t2t.py index 61c3241..45a4a84 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -1,4 +1,3 @@ -"""Provides Text2Term class""" import os import sys import json @@ -271,7 +270,7 @@ def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, ba f.write("# Mapper: %s\n" % mapper.value) f.write("# Base IRIs: %s\n" % (base_iris,)) f.write("# Max Mappings: %d\n" % max_mappings) - f.write("# Depricated Terms ") + f.write("# Deprecated Terms ") f.write("Excluded\n" if excl_deprecated else "Included\n") mappings.to_csv(output_file, index=False, mode='a') diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 34d2b71..4cc5a60 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -5,7 +5,6 @@ from text2term.term import OntologyTerm import logging -options = ['classes', 'properties', 'both'] class OntologyTermCollector: @@ -175,10 +174,11 @@ def _get_labels(self, ontology_term): self.logger.debug("...collected %i labels and synonyms for %s", len(labels), ontology_term) return labels - def _get_synonyms(self, ontology_term): + def _get_synonyms(self, ontology_term, include_broad_synonyms=False): """ Collect the synonyms of the given ontology term :param ontology_term: Ontology term + :param include_broad_synonyms: true if broad (i.e. 
more generic) synonyms should be included, false otherwise :return: Collection of synonyms of the ontology term """ synonyms = set() @@ -186,12 +186,13 @@ def _get_synonyms(self, ontology_term): synonyms.add(synonym) for synonym in self._get_obo_related_synonyms(ontology_term): synonyms.add(synonym) - for synonym in self._get_obo_broad_synonyms(ontology_term): - synonyms.add(synonym) for nci_synonym in self._get_nci_synonyms(ontology_term): synonyms.add(nci_synonym) for efo_alt_term in self._get_efo_alt_terms(ontology_term): synonyms.add(efo_alt_term) + if include_broad_synonyms: + for synonym in self._get_obo_broad_synonyms(ontology_term): + synonyms.add(synonym) self.logger.debug("...collected %i synonyms for %s", len(synonyms), ontology_term) return synonyms From 38b7fdeff77f531d0a48d33035859b9c2600910c Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Thu, 4 May 2023 10:52:40 -0400 Subject: [PATCH 086/185] Ontology Acronyms and Cache Objects This update allows the user to use a known ontology acronym instead of an explicit download link. Additionally, the cache mechanisms have been reorganized. Now, cache_ontology returns a class object that allows a user to directly call the map function with the cache. --- .gitignore | 5 ++- README.md | 6 ++- test/simple-test.py | 12 ++++-- text2term/__init__.py | 8 ++-- text2term/config.py | 2 +- text2term/onto_cache.py | 80 +++++++++++++++++++++++++++++++++++++ text2term/t2t.py | 44 +++++--------------- text2term/term_collector.py | 4 ++ 8 files changed, 114 insertions(+), 47 deletions(-) create mode 100644 text2term/onto_cache.py diff --git a/.gitignore b/.gitignore index e66c4c9..423af80 100644 --- a/.gitignore +++ b/.gitignore @@ -84,9 +84,12 @@ ipython_config.py # pyenv .python-version -# For PyPi upload and testing +# For PyPi upload make-pypi.sh +# Cache should not be uploaded +cache/ + # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies diff --git a/README.md b/README.md index edb6db7..d2ed2a4 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,7 @@ All other arguments are the same, and have the same functionality: `target_ontology` : str Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies + As of version 2.3.0, passing a recognized acronym to `target_ontology` will generate the download link automatically. This is done using the `bioregistry` python package. `base_iris` : tuple Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: @@ -157,18 +158,19 @@ Both functions return the same value: As of version 1.1.0, users can cache ontologies that they want to use regularly or quickly. Programmatically, there are two steps to using the cache: creating the cache, then accessing it. First, the user can cache ontologies using either of two functions: ```python -cache_ontology(ontology_url, ontology_acronym, base_iris=()) +cache_ontology(ontology_url, ontology_acronym="", base_iris=()) ``` ```python cache_ontology_set(ontology_registry_path) ``` -The first of these will cache a single ontology from a URL or file path, with it being referenced by an acronym that will be used to reference it later. An example can be found above. 
+The first of these will cache a single ontology from a URL or file path, with it being referenced by an acronym that will be used to reference it later. If no acronym is given, it will use the URL as the cache name. An example can be found above. The second function allows the user to cache several ontologies at once by referencing a CSV file of the format: `acronym,version,url`. An example is provided in `resources/ontologies.csv` Once an ontology has been cached by either function, it is stored in a cache folder locally, and thus can be referenced even in different Python instances. +As of version 2.3.0, the `cache_ontology` function also returns an object that can be used to call any of the `map` functions, as well as `clear_cache` and `cache_exists`. These have the same arguments, except `ontology_target` is not specified and there is no `use_cache` option, as it is always True. NOTE: Due to how ontologies are processed in memory, `cache_ontology_set` must be used to cache multiple ontologies in a single Python instance. If `cache_ontology` is used multiple times in one instance, the behavior is undefined and may cause visible or invisible errors. diff --git a/test/simple-test.py b/test/simple-test.py index 6ac2607..8bfec72 100644 --- a/test/simple-test.py +++ b/test/simple-test.py @@ -1,13 +1,17 @@ import text2term +import bioregistry def main(): efo = "http://www.ebi.ac.uk/efo/efo.owl#" pizza = "https://protege.stanford.edu/ontologies/pizza/pizza.owl" ncit = "http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl" - if not text2term.cache_exists("EFO"): - text2term.cache_ontology(efo, "EFO") - df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", excl_deprecated=True, use_cache=True, term_type="classes") - # df = text2term.map_terms(["contains", "asthma"], efo, term_type="classes") + # print(bioregistry.get_owl_download("eFo")) + # if not text2term.cache_exists("EFO"): + # cached_onto = text2term.cache_ontology("EFO") + # # df = cached_onto.map_terms(["asthma", "disease location", "obsolete food allergy"], excl_deprecated=True, term_type="classes") + # print("Cache exists:", cached_onto.cache_exists()) + # df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", excl_deprecated=True, use_cache=True, term_type="classes") + df = text2term.map_terms(["contains", "asthma"], "EFO", term_type="classes") print(df.to_string()) if __name__ == '__main__': diff --git a/text2term/__init__.py b/text2term/__init__.py index 6ed6d92..9e3c8a0 100644 --- a/text2term/__init__.py +++ b/text2term/__init__.py @@ -1,10 +1,10 @@ from .t2t import map_terms from .t2t import map_file -from .t2t import cache_ontology_set -from .t2t import cache_ontology -from .t2t import cache_exists -from .t2t import clear_cache from .t2t import map_tagged_terms +from .t2t import cache_ontology +from .onto_cache import cache_ontology_set +from .onto_cache import cache_exists +from .onto_cache import clear_cache from .mapper import Mapper from .preprocess import preprocess_file from .preprocess import preprocess_terms diff --git a/text2term/config.py b/text2term/config.py index 5138066..7e8ebeb 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "2.2.0" \ No newline at end of file +VERSION = "2.3.0" \ No newline at end of file diff --git a/text2term/onto_cache.py b/text2term/onto_cache.py new file mode 100644 index 0000000..f15e737 --- /dev/null +++ b/text2term/onto_cache.py @@ -0,0 +1,80 @@ +import text2term +from 
.mapper import Mapper +import os +from shutil import rmtree +import sys + +""" +CACHING FUNCTIONS -- Public +""" +# Caches many ontologies from a csv +def cache_ontology_set(ontology_registry_path): + registry = pd.read_csv(ontology_registry_path) + cache_set = {} + for index, row in registry.iterrows(): + try: + cache = text2term.cache_ontology(row.url, row.acronym) + cache_set.update({row.acronym : cache}) + except Exception as err: + sys.stderr.write("Could not cache ontology", row.acronym, "due to error:", err) + owlready2.default_world.ontologies.clear() + return cache_set + +# Will check if an acronym exists in the cache +def cache_exists(ontology_acronym=''): + return os.path.exists("cache/" + ontology_acronym) + +# Clears the cache +def clear_cache(ontology_acronym=''): + cache_dir = "cache/" + if ontology_acronym != '': + cache_dir = os.path.join(cache_dir, ontology_acronym) + # Is equivalent to: rm -r cache_dir + try: + rmtree(cache_dir) + sys.stderr.write("Cache has been cleared successfully\n") + except OSError as error: + sys.stderr.write("Cache cannot be removed:") + sys.stderr.write(error) + +## Class that is returned to run +class OntologyCache: + def __init__(self, ontology_acronym): + self.acronym = ontology_acronym + self.ontology = "cache/" + ontology_acronym + "/" + + def map_terms(self, source_terms, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, + mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), + term_type='classes'): + return text2term.map_terms(source_terms, self.acronym, base_iris=base_iris, \ + excl_deprecated=excl_deprecated, max_mappings=max_mappings, min_score=min_score, \ + mapper=mapper, output_file=output_file, save_graphs=save_graphs, \ + save_mappings=save_mappings, source_terms_ids=source_terms_ids, use_cache=True, \ + term_type=term_type) + + def map_tagged_terms(self, tagged_terms_dict, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, + mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), + term_type='classes'): + return text2term.map_tagged_terms(tagged_terms_dict, self.acronym, base_iris=base_iris, \ + excl_deprecated=excl_deprecated, max_mappings=max_mappings, min_score=min_score, \ + mapper=mapper, output_file=output_file, save_graphs=save_graphs, \ + save_mappings=save_mappings, source_terms_ids=source_terms_ids, use_cache=True, \ + term_type=term_type) + + def map_file(self, input_file, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, + mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False, + separator=',', term_type='classes'): + return text2term.map_file(source_terms, self.acronym, base_iris=base_iris, csv_columns=csv_columns, \ + excl_deprecated=excl_deprecated, max_mappings=max_mappings, min_score=min_score, \ + mapper=mapper, output_file=output_file, save_graphs=save_graphs, separator=separator, \ + save_mappings=save_mappings, source_terms_ids=source_terms_ids, use_cache=True, \ + term_type=term_type) + + def clear_cache(self): + clear_cache(self.acronym) + + def cache_exists(self): + return cache_exists(self.acronym) + + def acroynm(self): + return self.acronym diff --git a/text2term/t2t.py b/text2term/t2t.py index 45a4a84..711c3c0 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -6,9 +6,9 @@ import datetime import owlready2 import pandas as pd -from shutil import rmtree from text2term import onto_utils from text2term.mapper import Mapper +from 
text2term import onto_cache from text2term.term_collector import OntologyTermCollector from text2term.term_graph_generator import TermGraphGenerator from text2term.bioportal_mapper import BioPortalAnnotatorMapper @@ -107,7 +107,7 @@ def map_tagged_terms(tagged_terms_dict, target_ontology, base_iris=(), excl_depr df.loc[df['Source Term'] == term.get_term(), "Tags"] = to_store if save_mappings: - _save_mappings(df, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings) + _save_mappings(df, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings, term_type) return df """ @@ -162,26 +162,15 @@ def map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False target_terms = _load_ontology(target_ontology, base_iris, excl_deprecated, use_cache, term_type) mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score) if save_mappings: - _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings) + _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings, term_type) if save_graphs: _save_graphs(target_terms, output_file) return mappings_df -""" -CACHING FUNCTIONS -- Public -""" -# Caches many ontologies from a csv -def cache_ontology_set(ontology_registry_path): - registry = pd.read_csv(ontology_registry_path) - for index, row in registry.iterrows(): - try: - cache_ontology(row.url, row.acronym) - except Exception as err: - sys.stderr.write("Could not cache ontology", row.acronym, "due to error:", err) - owlready2.default_world.ontologies.clear() - # Caches a single ontology -def cache_ontology(ontology_url, ontology_acronym, base_iris=()): +def cache_ontology(ontology_url, ontology_acronym="", base_iris=()): + if ontology_acronym == "": + ontology_acronym = ontology_url ontology_terms = _load_ontology(ontology_url, base_iris, exclude_deprecated=False, term_type='both') cache_dir = "cache/" + ontology_acronym + "/" if not os.path.exists(cache_dir): @@ -190,23 +179,7 @@ def cache_ontology(ontology_url, ontology_acronym, base_iris=()): _serialize_ontology(ontology_terms, ontology_acronym, cache_dir) _save_graphs(ontology_terms, output_file=cache_dir + ontology_acronym) ontology_terms.clear() - -# Will check if an acronym exists in the cache -def cache_exists(ontology_acronym=''): - return os.path.exists("cache/" + ontology_acronym) - -# Clears the cache -def clear_cache(ontology_acronym=''): - cache_dir = "cache/" - if ontology_acronym != '': - cache_dir = os.path.join(cache_dir, ontology_acronym) - # Is equivalent to: rm -r cache_dir - try: - rmtree(cache_dir) - sys.stderr.write("Cache has been cleared successfully") - except OSError as error: - sys.stderr.write("Cache cannot be removed:") - sys.stderr.write(error) + return onto_cache.OntologyCache(ontology_acronym) """ PRIVATE/HELPER FUNCTIONS @@ -259,7 +232,7 @@ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappi else: raise ValueError("Unsupported mapper: " + mapper) -def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings): +def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings, term_type): if os.path.dirname(output_file): # create output directories if needed os.makedirs(os.path.dirname(output_file), exist_ok=True) with open(output_file, "a") as 
f: @@ -270,6 +243,7 @@ def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, ba f.write("# Mapper: %s\n" % mapper.value) f.write("# Base IRIs: %s\n" % (base_iris,)) f.write("# Max Mappings: %d\n" % max_mappings) + f.write("# Term Type: %s\n" % term_type) f.write("# Deprecated Terms ") f.write("Excluded\n" if excl_deprecated else "Included\n") mappings.to_csv(output_file, index=False, mode='a') diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 4cc5a60..4074728 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -4,6 +4,7 @@ from text2term import onto_utils from text2term.term import OntologyTerm import logging +import bioregistry class OntologyTermCollector: @@ -337,6 +338,9 @@ def _load_ontology(self, ontology_iri): """ self.logger.info("Loading ontology %s...", ontology_iri) start = time.time() + owl_link = bioregistry.get_owl_download(ontology_iri) + if owl_link != None: + ontology_iri = owl_link ontology = get_ontology(ontology_iri).load() end = time.time() self._log_ontology_metrics(ontology) From 70d95c2bd3d30e17a16ae391436f551858c82a22 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 10 May 2023 12:22:40 -0400 Subject: [PATCH 087/185] Fix Cache Ontology Set Bug The Cache Ontology Set functionality was broken due to the previous refactoring. It has now been fixed --- test/simple-test.py | 5 +++-- text2term/config.py | 2 +- text2term/onto_cache.py | 5 ++++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/test/simple-test.py b/test/simple-test.py index 8bfec72..0145ca7 100644 --- a/test/simple-test.py +++ b/test/simple-test.py @@ -10,8 +10,9 @@ def main(): # cached_onto = text2term.cache_ontology("EFO") # # df = cached_onto.map_terms(["asthma", "disease location", "obsolete food allergy"], excl_deprecated=True, term_type="classes") # print("Cache exists:", cached_onto.cache_exists()) - # df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", excl_deprecated=True, use_cache=True, term_type="classes") - df = text2term.map_terms(["contains", "asthma"], "EFO", term_type="classes") + caches = text2term.cache_ontology_set("text2term/resources/ontologies.csv") + df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", excl_deprecated=True, use_cache=True, term_type="classes") + # df = text2term.map_terms(["contains", "asthma"], "EFO", term_type="classes") print(df.to_string()) if __name__ == '__main__': diff --git a/text2term/config.py b/text2term/config.py index 7e8ebeb..9b332ca 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "2.3.0" \ No newline at end of file +VERSION = "2.3.1" \ No newline at end of file diff --git a/text2term/onto_cache.py b/text2term/onto_cache.py index f15e737..7af3e40 100644 --- a/text2term/onto_cache.py +++ b/text2term/onto_cache.py @@ -3,6 +3,8 @@ import os from shutil import rmtree import sys +import pandas as pd +import owlready2 """ CACHING FUNCTIONS -- Public @@ -16,7 +18,8 @@ def cache_ontology_set(ontology_registry_path): cache = text2term.cache_ontology(row.url, row.acronym) cache_set.update({row.acronym : cache}) except Exception as err: - sys.stderr.write("Could not cache ontology", row.acronym, "due to error:", err) + err_message = "Could not cache ontology " + row.acronym + " due to error: " + str(err) + sys.stderr.write(err_message) owlready2.default_world.ontologies.clear() return cache_set From f76c562fca03498b415be8839e872d38953440c6 Mon Sep 17 00:00:00 2001 
From: Jason Payne Date: Fri, 12 May 2023 11:10:31 -0400 Subject: [PATCH 088/185] Fixes Min Score Bug Fixes a bug where the min_score argument in the mapper was ignored when not using TFIDF --- test/simple-test.py | 12 ++++++------ text2term/config.py | 2 +- text2term/t2t.py | 17 +++++++++++++---- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/test/simple-test.py b/test/simple-test.py index 0145ca7..7143c1f 100644 --- a/test/simple-test.py +++ b/test/simple-test.py @@ -6,12 +6,12 @@ def main(): pizza = "https://protege.stanford.edu/ontologies/pizza/pizza.owl" ncit = "http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl" # print(bioregistry.get_owl_download("eFo")) - # if not text2term.cache_exists("EFO"): - # cached_onto = text2term.cache_ontology("EFO") - # # df = cached_onto.map_terms(["asthma", "disease location", "obsolete food allergy"], excl_deprecated=True, term_type="classes") - # print("Cache exists:", cached_onto.cache_exists()) - caches = text2term.cache_ontology_set("text2term/resources/ontologies.csv") - df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", excl_deprecated=True, use_cache=True, term_type="classes") + if not text2term.cache_exists("EFO"): + cached_onto = text2term.cache_ontology("EFO") + # df = cached_onto.map_terms(["asthma", "disease location", "obsolete food allergy"], excl_deprecated=True, term_type="classes") + print("Cache exists:", cached_onto.cache_exists()) + # caches = text2term.cache_ontology_set("text2term/resources/ontologies.csv") + df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", min_score=.8, mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, term_type="classes") # df = text2term.map_terms(["contains", "asthma"], "EFO", term_type="classes") print(df.to_string()) diff --git a/text2term/config.py b/text2term/config.py index 9b332ca..388faa9 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "2.3.1" \ No newline at end of file +VERSION = "2.3.2" \ No newline at end of file diff --git a/text2term/t2t.py b/text2term/t2t.py index 711c3c0..12cc402 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -219,18 +219,27 @@ def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_typ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score): if mapper == Mapper.TFIDF: term_mapper = TFIDFMapper(ontology_terms) - return term_mapper.map(source_terms, source_term_ids, max_mappings=max_mappings, min_score=min_score) + mappings_df = term_mapper.map(source_terms, source_term_ids, max_mappings=max_mappings, min_score=min_score) elif mapper == Mapper.ZOOMA: term_mapper = ZoomaMapper() - return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + mappings_df = term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) elif mapper == Mapper.BIOPORTAL: term_mapper = BioPortalAnnotatorMapper("8f0cbe43-2906-431a-9572-8600d3f4266e") - return term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + mappings_df = term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) elif mapper in {Mapper.LEVENSHTEIN, Mapper.JARO, Mapper.JARO_WINKLER, Mapper.INDEL, Mapper.FUZZY, Mapper.JACCARD}: term_mapper = SyntacticMapper(ontology_terms) - return term_mapper.map(source_terms, 
source_term_ids, mapper, max_mappings=max_mappings) + mappings_df = term_mapper.map(source_terms, source_term_ids, mapper, max_mappings=max_mappings) else: raise ValueError("Unsupported mapper: " + mapper) + df = _filter_mappings(mappings_df, min_score) + return df + +def _filter_mappings(mappings_df, min_score): + new_df = pd.DataFrame(columns=mappings_df.columns) + for index, row in mappings_df.iterrows(): + if row['Mapping Score'] >= min_score: + new_df.loc[len(new_df.index)] = row + return new_df def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings, term_type): if os.path.dirname(output_file): # create output directories if needed From d2634fb3af2be123453c34a96d071deeb49ff358 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Thu, 3 Aug 2023 10:33:06 -0400 Subject: [PATCH 089/185] Remove extra functions and add unmapped Major update. This removes the functionality for map_file and map_tagged_terms, as well as several other similar functions. Also adds functionality to include unmapped terms and ignore items tagged "ignore" --- .readthedocs.yaml | 32 ++++++ README.md | 76 ++++++------- docs/Makefile | 20 ++++ docs/conf.py | 27 +++++ docs/index.rst | 20 ++++ docs/make.bat | 35 ++++++ test/simple-test.py | 6 +- test/simple_preprocess.txt | 3 + text2term/__init__.py | 3 - text2term/preprocess.py | 33 +----- text2term/t2t.py | 223 ++++++++++++++++++++----------------- text2term/tagged_terms.py | 6 + text2term/term_mapping.py | 2 + 13 files changed, 304 insertions(+), 182 deletions(-) create mode 100644 .readthedocs.yaml create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/make.bat create mode 100644 test/simple_preprocess.txt diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..c409646 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,32 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.11" + # You can also specify other tool versions: + # nodejs: "19" + # rust: "1.64" + # golang: "1.19" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: requirements.txt \ No newline at end of file diff --git a/README.md b/README.md index d2ed2a4..08a52ee 100644 --- a/README.md +++ b/README.md @@ -13,13 +13,14 @@ pip install text2term import text2term import pandas -df1 = text2term.map_file("test/unstruct_terms.txt", "http://www.ebi.ac.uk/efo/efo.owl") +df1 = text2term.map_terms("test/unstruct_terms.txt", "http://www.ebi.ac.uk/efo/efo.owl") df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://www.ebi.ac.uk/efo/efo.owl") +df3 = text2term.map_terms({"asthma":"disease", "acute bronchitis":["disease", "lungs"]}, "http://www.ebi.ac.uk/efo/efo.owl") ``` Below is an example of caching, assuming the same imports as above: ```python text2term.cache_ontology("http://www.ebi.ac.uk/efo/efo.owl", "EFO") -df1 = text2term.map_file("test/unstruct_terms.txt", 
"EFO", use_cache=True) +df1 = text2term.map_terms("test/unstruct_terms.txt", "EFO", use_cache=True) df2 = text2term.map_terms(["asthma", "acute bronchitis"], "EFO", use_cache=True) text2term.clear_cache("EFO") ``` @@ -48,10 +49,10 @@ Then, after running this, the following command is equivalent: `python text2term -s test/unstruct_terms.txt -t EFO` ## Programmatic Usage -The tool can be executed in Python with any of the three following functions: +The tool can be executed in Python with the `map_terms` function: ```python -text2term.map_file(input_file='/some/file.txt', +text2term.map_terms(source_terms, target_ontology='http://some.ontology/v1.owl', base_iris=(), csv_columns=(), @@ -64,45 +65,15 @@ text2term.map_file(input_file='/some/file.txt', save_mappings=False, separator=',', use_cache=False, - term_type='classes') -``` -or -```python -text2term.map_terms(source_terms=['term one', 'term two'], - target_ontology='http://some.ontology/v1.owl', - base_iris=(), - excl_deprecated=False, - max_mappings=3, - min_score=0.3, - mapper=Mapper.TFIDF, - output_file='', - save_graphs=False, - save_mappings=False, - source_terms_ids=(), - use_cache=False, - term_type='classes') -``` -or -```python -text2term.map_tagged_terms(tagged_terms_dict={'term one': ["tag 1", "tag 2"]}, - target_ontology='http://some.ontology/v1.owl', - base_iris=(), - excl_deprecated=False, - max_mappings=3, - min_score=0.3, - mapper=Mapper.TFIDF, - output_file='', - save_graphs=False, - save_mappings=False, - source_terms_ids=(), - use_cache=False, - term_type='classes') + term_type='classes', + incl_unmapped=False) + ``` +NOTE: As of 3.0.0, the former three functions (`map_file`, `map_terms`, `map_tagged_terms`) have been condensed into one function. Users can now change the name of any function in old code to `map_terms` and it reads the input context to maintain the functionality of each one. ### Arguments -For `map_file`, the first argument 'input_file' specifies a path to a file containing the terms to be mapped. It also has a `csv_column` argument that allows the user to specify a column to map if a csv is passed in as the input file. -For `map_terms`, the first argument 'source_terms' takes in a list of the terms to be mapped. -For `map_tagged_terms`, everything is the same as `map_terms` except the first argument is either a dictionary of terms to a list of tags, or a list of TaggedTerm objects (see below). Currently, the tags do not affect the mapping in any way, but they are added to the output dataframe at the end of the process. +For `map_terms`, the first argument can be any of the following: 1) a string that specifies a path to a file containing the terms to be mapped, 2) a list of the terms to be mapped, or 3)dictionary of terms to a list of tags, or a list of TaggedTerm objects (see below). +Currently, the tags do not affect the mapping in any way, but they are added to the output dataframe at the end of the process. The exception is the Ignore tag, which causes the term to not be mapped at all, but still be outputted in the results if the incl_unmapped argument is True (see below). All other arguments are the same, and have the same functionality: @@ -115,6 +86,9 @@ All other arguments are the same, and have the same functionality: Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') +`csv_column` : tuple + Allows the user to specify a column to map if a csv is passed in as the input file. 
Ignored if the input is not a file path. + `source_terms_ids` : tuple Collection of identifiers for the given source terms WARNING: While this is still available for the tagged term function, it is worth noting that dictionaries do not necessarily preserve order, so it is not recommended. If using the TaggedTerm object, the source terms can be attached there to guarantee order. @@ -141,12 +115,18 @@ All other arguments are the same, and have the same functionality: `save_mappings` : bool Save the generated mappings to a file (specified by `output_file`) +`seperator` : str + Character that seperates the source term values if a file input is given. Ignored if the input is not a file path. + `use_cache` : bool Use the cache for the ontology. More details are below. `term_type` : str Determines whether the ontology should be parsed for its classes (ThingClass), properties (PropertyClass), or both. Possible values are ['classes', 'properties', 'both']. If it does not match one of these values, the program will throw a ValueError. +`incl_unmapped` : bool + Include all unmapped terms in the output. If something has been tagged Ignore (see below) or falls below the `min_score` threshold, it is included without a mapped term at the end of the output. + All default values, if they exist, can be seen above. ### Return Value @@ -185,9 +165,6 @@ As of version 1.2.0, text2term includes regex-based preprocessing functionality Like the "map" functions above, the two functions differ on whether the input is a file or a list of strings: ```python -preprocess_file(file_path, template_path, output_file='', blocklist_path='', blocklist_char='', rem_duplicates=False) -``` -```python preprocess_terms(terms, template_path, output_file='', blocklist_path='', blocklist_char='', rem_duplicates=False) ``` ```python @@ -202,7 +179,7 @@ NOTE: As of version 2.1.0, the arguments were changed to "blocklist" from "black The Remove Duplicates `rem_duplicates` functionality will remove all duplicate terms after processing, if set to `True`. WARNING: Removing duplicates at any point does not guarantee which original term is kept. This is particularly important if original terms have different tags, so user caution is advised. -The functions `preprocess_file()` and `preprocess_terms()` both return a dictionary where the keys are the original terms and the values are the preprocessed terms. +The function `preprocess_terms()` returns a dictionary where the keys are the original terms and the values are the preprocessed terms. The `preprocess_tagged_terms()` function returns a list of TaggedTerm items with the following function contracts: ```python def __init__(self, term=None, tags=[], original_term=None, source_term_id=None) @@ -214,10 +191,19 @@ def get_term(self) def get_tags(self) def get_source_term_id(self) ``` -As mentioned in the mapping section above, this can then be passed directly to map_tagged_terms(), allowing for easy programmatic usage. Note that this allows multiple of the same preprocessed term with different tags. +As mentioned in the mapping section above, this can then be passed directly to `map_terms`, allowing for easy programmatic usage. Note that this allows multiple of the same preprocessed term with different tags. **Note on NA values in input**: As of v2.0.3, when the input to text2term is a table file, any rows that contain `NA` values in the specified term column, or in the term ID column (if provided), will be ignored. 
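To tie the pieces above together, a hedged sketch of mapping TaggedTerm objects with unmapped terms included; the terms, tags, identifiers, and threshold are illustrative:

```python
import text2term
from text2term import TaggedTerm

# Tagged terms can also come from preprocess_tagged_terms(); here they are built directly.
tagged_terms = [
    TaggedTerm(term="asthma", tags=["disease"], source_term_id="t1"),
    TaggedTerm(term="not a real phenotype", tags=["ignore"], source_term_id="t2"),  # skipped by the mapper
]

# With incl_unmapped=True, ignored terms and terms scoring below min_score still appear
# in the output data frame, only without a mapped ontology term.
df = text2term.map_terms(tagged_terms, "http://www.ebi.ac.uk/efo/efo.owl",
                         min_score=0.5, incl_unmapped=True)
print(df.to_string())
```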
+### Tag Usage +As of 3.0.0, some tags have additional functionality that is added when attached to a term: + +IGNORE: + If an ignore tag is added to a term, that term will not be mapped to any terms in the ontology. It will only be included in the output if the `incl_unmapped` argument is True. Here are the following values that count as ignore tags: +```python + IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "] +``` + ## Command Line Usage After installation, execute the tool from a command line as follows: diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..ded1330 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,27 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'text2term' +copyright = '2023, Harvard Medical School' +author = 'Rafael Goncalves and Jason Payne' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ["myst_parser"] + +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'alabaster' +html_static_path = ['_static'] diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..6456e30 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,20 @@ +.. text2term documentation master file, created by + sphinx-quickstart on Tue Jul 11 10:34:29 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to text2term's documentation! +===================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..32bb245 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/test/simple-test.py b/test/simple-test.py index 7143c1f..44577d0 100644 --- a/test/simple-test.py +++ b/test/simple-test.py @@ -5,14 +5,16 @@ def main(): efo = "http://www.ebi.ac.uk/efo/efo.owl#" pizza = "https://protege.stanford.edu/ontologies/pizza/pizza.owl" ncit = "http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl" - # print(bioregistry.get_owl_download("eFo")) if not text2term.cache_exists("EFO"): cached_onto = text2term.cache_ontology("EFO") # df = cached_onto.map_terms(["asthma", "disease location", "obsolete food allergy"], excl_deprecated=True, term_type="classes") print("Cache exists:", cached_onto.cache_exists()) # caches = text2term.cache_ontology_set("text2term/resources/ontologies.csv") - df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", min_score=.8, mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, term_type="classes") + # df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", min_score=.8, mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, term_type="classes") # df = text2term.map_terms(["contains", "asthma"], "EFO", term_type="classes") + df = text2term.map_terms({"asthma":"disease", "allergy":["ignore", "response"], "assdhfbswif":["sent"], "isdjfnsdfwd":""}, "EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) + # taggedterms = text2term.preprocess_tagged_terms("test/simple_preprocess.txt") + # df = text2term.map_terms(taggedterms, "EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) print(df.to_string()) if __name__ == '__main__': diff --git a/test/simple_preprocess.txt b/test/simple_preprocess.txt new file mode 100644 index 0000000..fdd7467 --- /dev/null +++ b/test/simple_preprocess.txt @@ -0,0 +1,3 @@ +asthma;:;disease +acute bronchitis;:;important,tags +colon disease diff --git a/text2term/__init__.py b/text2term/__init__.py index 9e3c8a0..33b75b5 100644 --- a/text2term/__init__.py +++ b/text2term/__init__.py @@ -1,12 +1,9 @@ from .t2t import map_terms -from .t2t import map_file -from .t2t import map_tagged_terms from .t2t import cache_ontology from .onto_cache import cache_ontology_set from .onto_cache import cache_exists from .onto_cache import clear_cache from .mapper import Mapper -from .preprocess import preprocess_file from .preprocess import preprocess_terms from .preprocess import preprocess_tagged_terms from .tagged_terms import TaggedTerm \ No newline at end of file diff --git a/text2term/preprocess.py b/text2term/preprocess.py index 17ea7ea..44e4f0f 100644 --- a/text2term/preprocess.py +++ b/text2term/preprocess.py @@ -3,32 +3,11 @@ from enum import Enum from .tagged_terms import TaggedTerm -def preprocess_file(file_path, template_path, output_file="", blocklist_path="", \ - blocklist_char='', blacklist_path="", blacklist_char='', \ - rem_duplicates=False): - # Allows backwards compatibility to blacklist. 
Will eventually be deleted - if blocklist_char == '': - blocklist_char = blacklist_char - if blocklist_path == "": - blocklist_path = blacklist_path - terms = _get_values(file_path) - processed_terms = preprocess_terms(terms, template_path, output_file=output_file, \ - blocklist_path=blocklist_path, blocklist_char=blocklist_char, \ - rem_duplicates=rem_duplicates) - - return processed_terms - ## Tags should be stored with their terms in the same line, delineated by ";:;" ## ex: Age when diagnosed with (.*) ;:; age,diagnosis ## "Age when diagnosed with cancer" becomes: {"cancer", ["age", "diagnosis"]} def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", \ - blocklist_char='', blacklist_path="", blacklist_char='', \ - rem_duplicates=False, separator=";:;"): - # Allows backwards compatibility to blacklist. Will eventually be deleted - if blocklist_char == '': - blocklist_char = blacklist_char - if blocklist_path == "": - blocklist_path = blacklist_path + blocklist_char='', rem_duplicates=False, separator=";:;"): # Seperate tags from the terms, put in TaggedTerm and add to list raw_terms = _get_values(file_path) terms = [] @@ -80,13 +59,9 @@ def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", \ return processed_terms def preprocess_terms(terms, template_path, output_file="", blocklist_path="", \ - blocklist_char='', blacklist_path="", blacklist_char='', \ - rem_duplicates=False): - # Allows backwards compatibility to blacklist. Will eventually be deleted - if blocklist_char == '': - blocklist_char = blacklist_char - if blocklist_path == "": - blocklist_path = blacklist_path + blocklist_char='', rem_duplicates=False): + if isinstance(terms, str): + terms = _get_values(file_path) # Form the templates as regular expressions template_strings = [] if template_path != "": diff --git a/text2term/t2t.py b/text2term/t2t.py index 12cc402..cb7b18c 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -16,99 +16,11 @@ from text2term.tfidf_mapper import TFIDFMapper from text2term.zooma_mapper import ZoomaMapper from text2term.config import VERSION +from text2term.tagged_terms import TaggedTerm +from text2term.term_mapping import TermMapping -""" -Maps the terms in the given input file to the specified target ontology. - -Parameters ----------- -input_file : str - Path to input file containing 'source' terms to map to ontology terms (list of terms or CSV file) -target_ontology : str - Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies -base_iris : tuple - Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: - ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') -csv_columns : tuple - Name of the column that contains the terms to map, optionally followed by the name of the column that - contains identifiers for the terms (eg 'my_terms,my_term_ids') -separator : str - Specifies the cell separator to be used when reading a non-comma-separated tabular file -excl_deprecated : bool - Exclude ontology terms stated as deprecated via `owl:deprecated true` -mapper : mapper.Mapper - Method used to compare source terms with ontology terms. 
One of: levenshtein, jaro, jarowinkler, jaccard, - fuzzy, tfidf, zooma, bioportal -max_mappings : int - Maximum number of top-ranked mappings returned per source term -min_score : float - Minimum similarity score [0,1] for the mappings (1=exact match) -output_file : str - Path to desired output file for the mappings -save_graphs : bool - Save vis.js graphs representing the neighborhood of each ontology term -save_mappings : bool - Save the generated mappings to a file (specified by `output_file`) - -Returns ----------- -df - Data frame containing the generated ontology mappings -""" -def map_file(input_file, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, - mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, save_mappings=False, - separator=',', use_cache=False, term_type='classes'): - source_terms, source_terms_ids = _load_data(input_file, csv_columns, separator) - return map_terms(source_terms, target_ontology, source_terms_ids=source_terms_ids, base_iris=base_iris, - excl_deprecated=excl_deprecated, max_mappings=max_mappings, mapper=mapper, min_score=min_score, - output_file=output_file, save_graphs=save_graphs, save_mappings=save_mappings, - use_cache=use_cache, term_type=term_type) - -""" -All parameters are the same as below, but tagged_terms_dict is a dictionary where the - key is the source term and the value is a list of all tags (or a single string for - one tag). It can also be a list of TaggedTerm objects. - The dataframe returned is the same but contains a tags column -""" -def map_tagged_terms(tagged_terms_dict, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, - mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), use_cache=False, - term_type='classes'): - # If the input is a dict, use keys. If it is a list, it is a list of TaggedTerms - if isinstance(tagged_terms_dict, dict): - terms = list(tagged_terms_dict.keys()) - else: - terms = [] - source_terms_id_list = [] - for tagged_term in tagged_terms_dict: - terms.append(tagged_term.get_term()) - if tagged_term.get_source_term_id() != None: - source_terms_id_list.append(tagged_term.get_source_term_id()) - if len(source_terms_id_list) > 0: - source_terms_ids = tuple(source_terms_id_list) - - # Run the mapper - df = map_terms(terms, target_ontology, base_iris=base_iris, excl_deprecated=excl_deprecated, \ - max_mappings=max_mappings, min_score=min_score, mapper=mapper, output_file=output_file, \ - save_graphs=save_graphs, source_terms_ids=source_terms_ids, use_cache=use_cache, \ - term_type=term_type) - - # For each term in dict, add tags to corresponding mappings row in "Tags" Column - if isinstance(tagged_terms_dict, dict): - for key, value in tagged_terms_dict.items(): - if isinstance(value, list): - to_store = ','.join(value) - else: - to_store = str(value) - df.loc[df['Source Term'] == key, "Tags"] = to_store - else: - for term in tagged_terms_dict: - to_store = ','.join(term.get_tags()) - df.loc[df['Source Term'] == term.get_term(), "Tags"] = to_store - - if save_mappings: - _save_mappings(df, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings, term_type) - return df +IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "] +UNMAPPED_TAG = "unmapped" """ Maps the terms in the given list to the specified target ontology. 
@@ -146,23 +58,31 @@ def map_tagged_terms(tagged_terms_dict, target_ontology, base_iris=(), excl_depr df Data frame containing the generated ontology mappings """ -def map_terms(source_terms, target_ontology, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, - mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), - use_cache=False, term_type='classes'): +def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, + min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), + separator=',', use_cache=False, term_type='classes', incl_unmapped=False): + # Parse the possible source terms options and tags + source_terms, source_term_ids, tags = _parse_source_terms(source_terms, source_terms_ids, csv_columns, separator) + # Create Source Term Ids if they are not provided if len(source_terms_ids) != len(source_terms): if len(source_terms_ids) > 0: sys.stderr.write("Warning: Source Term Ids are non-zero, but will not be used.") source_terms_ids = onto_utils.generate_iris(len(source_terms)) + # Create the output file if output_file == '': timestamp = datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") output_file = "t2t-mappings-" + timestamp + ".csv" + # Load the ontology for either Zooma, Bioportal, or directly if mapper in {Mapper.ZOOMA, Mapper.BIOPORTAL}: target_terms = '' if target_ontology.lower() == 'all' else target_ontology else: target_terms = _load_ontology(target_ontology, base_iris, excl_deprecated, use_cache, term_type) - mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score) + # Run the mapper + mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score, tags, incl_unmapped) + mappings_df["Mapping Score"] = mappings_df["Mapping Score"].astype(float).round(decimals=3) if save_mappings: - _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings, term_type) + _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris, \ + excl_deprecated, max_mappings, term_type, source_terms, incl_unmapped) if save_graphs: _save_graphs(target_terms, output_file) return mappings_df @@ -184,6 +104,31 @@ def cache_ontology(ontology_url, ontology_acronym="", base_iris=()): """ PRIVATE/HELPER FUNCTIONS """ +# Parses the source terms and returns what is to be mapped, the term ids, and the tags +def _parse_source_terms(source_terms, source_terms_ids=(), csv_columns=(), separator=','): + # If source_terms is a string, we assume it is a file location + if isinstance(source_terms, str): + terms, source_terms_ids = _load_data(source_terms, csv_columns, separator) + tags = dict.fromkeys(terms) + # If source_terms is a dictionary, the keys are terms and the values are tags + elif isinstance(source_terms, dict): + terms = list(source_terms.keys()) + tags = source_terms + # Otherwise, it is a list of either TaggedTerms or strings + elif isinstance(source_terms[0], TaggedTerm): + terms = [] + source_terms_id_list = [] + for tagged_term in source_terms: + terms.append(tagged_term.get_term()) + if tagged_term.get_source_term_id() != None: + source_terms_id_list.append(tagged_term.get_source_term_id()) + source_terms_ids = source_terms_id_list + tags = source_terms + else: + terms = source_terms + tags = dict.fromkeys(terms) + return terms, source_terms_ids, tags + def 
_serialize_ontology(ontology_terms, ontology_acronym, cache_dir): start = time.time() with open(cache_dir + ontology_acronym + "-term-details.pickle", 'wb+') as out_file: @@ -216,22 +161,62 @@ def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_typ raise RuntimeError("Could not find any terms in the given ontology.") return onto_terms -def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score): +def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score, tags, incl_unmapped): + to_map, tags = _process_tags(source_terms, tags) if mapper == Mapper.TFIDF: term_mapper = TFIDFMapper(ontology_terms) - mappings_df = term_mapper.map(source_terms, source_term_ids, max_mappings=max_mappings, min_score=min_score) + mappings_df = term_mapper.map(to_map, source_term_ids, max_mappings=max_mappings, min_score=min_score) elif mapper == Mapper.ZOOMA: term_mapper = ZoomaMapper() - mappings_df = term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + mappings_df = term_mapper.map(to_map, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) elif mapper == Mapper.BIOPORTAL: term_mapper = BioPortalAnnotatorMapper("8f0cbe43-2906-431a-9572-8600d3f4266e") - mappings_df = term_mapper.map(source_terms, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + mappings_df = term_mapper.map(to_map, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) elif mapper in {Mapper.LEVENSHTEIN, Mapper.JARO, Mapper.JARO_WINKLER, Mapper.INDEL, Mapper.FUZZY, Mapper.JACCARD}: term_mapper = SyntacticMapper(ontology_terms) - mappings_df = term_mapper.map(source_terms, source_term_ids, mapper, max_mappings=max_mappings) + mappings_df = term_mapper.map(to_map, source_term_ids, mapper, max_mappings=max_mappings) else: raise ValueError("Unsupported mapper: " + mapper) + + # Add tags, process, and filter df = _filter_mappings(mappings_df, min_score) + if incl_unmapped: + df = _add_unmapped_terms(mappings_df, tags, source_terms, source_term_ids) + df = _add_tags_to_df(mappings_df, tags) + return df + +# Takes in the tags and source terms and processes them accordingly +def _process_tags(source_terms, tags): + to_map = [] + # IGNORE TAGS SECTION + for term in source_terms: + if isinstance(tags, dict): + term_tags = tags[term] + else: + for tag in tags: + if tag.get_term() == term: + term_tags = tag.get_tags() + break + if isinstance(term_tags, list): + if not any(tag in IGNORE_TAGS for tag in term_tags): + to_map.append(term) + else: + if term_tags not in IGNORE_TAGS: + to_map.append(term) + return to_map, tags + +def _add_tags_to_df(df, tags): + if isinstance(tags, dict): + for key, value in tags.items(): + if isinstance(value, list): + to_store = ','.join(value) + else: + to_store = str(value) + df.loc[df['Source Term'] == key, "Tags"] = to_store + else: + for term in tags: + to_store = ','.join(term.get_tags()) + df.loc[df['Source Term'] == term.get_term(), "Tags"] = to_store return df def _filter_mappings(mappings_df, min_score): @@ -241,7 +226,34 @@ def _filter_mappings(mappings_df, min_score): new_df.loc[len(new_df.index)] = row return new_df -def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings, term_type): +def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids): + mapped = pd.unique(mappings_df["Source Term"]) + for (term, term_id) in zip(source_terms, 
source_terms_ids): + if term not in mapped: + non_mapping = TermMapping(term, term_id, "", "", 0) + _add_tag(tags, term, UNMAPPED_TAG, ignore=True) + mappings_df.loc[len(mappings_df.index)] = non_mapping.to_dict() + return mappings_df + +def _add_tag(tags, term, to_add, ignore=False): + if isinstance(tags, dict): + new_tags = tags.get(term, []) + if not any(tag in IGNORE_TAGS for tag in new_tags): + if isinstance(new_tags, list): + new_tags.append(to_add) + elif new_tags != "": + new_tags = [new_tags, to_add] + else: + new_tags = [to_add] + tags[term] = new_tags + else: + for tagged_term in tags: + check_ignore = not ignore and not any(tagged_term.has_tag(tag) for tag in IGNORE_TAGS) + if tagged_term.get_term() == term and check_ignore: + tagged_term.add_tags([to_add]) + +def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, base_iris, \ + excl_deprecated, max_mappings, term_type, source_terms, incl_unmapped): if os.path.dirname(output_file): # create output directories if needed os.makedirs(os.path.dirname(output_file), exist_ok=True) with open(output_file, "a") as f: @@ -255,6 +267,11 @@ def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, ba f.write("# Term Type: %s\n" % term_type) f.write("# Deprecated Terms ") f.write("Excluded\n" if excl_deprecated else "Included\n") + f.write("# Unmapped Terms ") + f.write("Excluded\n" if not incl_unmapped else "Included\n") + writestring = "# Of " + str(len(source_terms)) + " entries, " + str(len(pd.unique(mappings["Source Term ID"]))) + writestring += " were successfully mapped to " + str(len(pd.unique(mappings["Mapped Term IRI"]))) + " unique terms\n" + f.write(writestring) mappings.to_csv(output_file, index=False, mode='a') def _save_graphs(terms, output_file): diff --git a/text2term/tagged_terms.py b/text2term/tagged_terms.py index d845999..53d3441 100644 --- a/text2term/tagged_terms.py +++ b/text2term/tagged_terms.py @@ -18,6 +18,9 @@ def update_term(self, term): def update_source_term_id(self, source_term_id): self.source_term_id = source_term_id + def has_tag(self, tag): + return tag in self.tags + def get_original_term(self): return self.original_term @@ -29,4 +32,7 @@ def get_tags(self): def get_source_term_id(self): return self.source_term_id + + def to_dict(self): + return {term : tags} \ No newline at end of file diff --git a/text2term/term_mapping.py b/text2term/term_mapping.py index 39ef795..8da155c 100644 --- a/text2term/term_mapping.py +++ b/text2term/term_mapping.py @@ -37,6 +37,8 @@ def mapped_term_iri(self): @property def mapped_term_curie(self): + if self.mapped_term_iri == "": + return "" return onto_utils.curie_from_iri(self.mapped_term_iri) @property From e7adade4e6860cdae920732bce1a3ca1578af7e0 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Thu, 3 Aug 2023 13:08:15 -0400 Subject: [PATCH 090/185] Add version update Makes the version 3.0.0 --- text2term/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text2term/config.py b/text2term/config.py index 388faa9..91c27a9 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "2.3.2" \ No newline at end of file +VERSION = "3.0.0" \ No newline at end of file From 7ecae8b2dbfe611d39e8e5f5be08f79193d9807a Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 9 Aug 2023 11:08:02 -0400 Subject: [PATCH 091/185] Fixes None bug Fixes a bug that prevents adding tags when the current tag variable is just None --- test/simple-test.py | 2 +- text2term/config.py | 2 +- text2term/t2t.py | 4 
+++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/test/simple-test.py b/test/simple-test.py index 44577d0..be7ddaa 100644 --- a/test/simple-test.py +++ b/test/simple-test.py @@ -12,7 +12,7 @@ def main(): # caches = text2term.cache_ontology_set("text2term/resources/ontologies.csv") # df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", min_score=.8, mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, term_type="classes") # df = text2term.map_terms(["contains", "asthma"], "EFO", term_type="classes") - df = text2term.map_terms({"asthma":"disease", "allergy":["ignore", "response"], "assdhfbswif":["sent"], "isdjfnsdfwd":""}, "EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) + df = text2term.map_terms({"asthma":"disease", "allergy":["ignore", "response"], "assdhfbswif":["sent"], "isdjfnsdfwd":None}, "EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) # taggedterms = text2term.preprocess_tagged_terms("test/simple_preprocess.txt") # df = text2term.map_terms(taggedterms, "EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) print(df.to_string()) diff --git a/text2term/config.py b/text2term/config.py index 91c27a9..2e42105 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "3.0.0" \ No newline at end of file +VERSION = "3.0.1" \ No newline at end of file diff --git a/text2term/t2t.py b/text2term/t2t.py index cb7b18c..66f1233 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -238,7 +238,9 @@ def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids): def _add_tag(tags, term, to_add, ignore=False): if isinstance(tags, dict): new_tags = tags.get(term, []) - if not any(tag in IGNORE_TAGS for tag in new_tags): + if new_tags is None: + new_tags = [] + if not (ignore and any(tag in IGNORE_TAGS for tag in new_tags)): if isinstance(new_tags, list): new_tags.append(to_add) elif new_tags != "": From 6845220e5147c880f681078f9fe91024a6798f3d Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 15 Aug 2023 13:10:02 -0400 Subject: [PATCH 092/185] Update some dependencies and allow compatible versions --- requirements.txt | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3ddde78..0617121 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,15 @@ -Owlready2==0.40 -argparse==1.4.0 -pandas==1.5.3 -numpy==1.24.2 -gensim==4.3.0 -scipy==1.10.1 -scikit-learn==1.2.1 -setuptools==67.6.0 -requests==2.28.2 -tqdm==4.65.0 -sparse_dot_topn==0.3.4 -bioregistry==0.6.92 -nltk==3.8.1 -rapidfuzz==2.13.7 -shortuuid==1.0.11 +Owlready2~=0.44 +argparse~=1.4.0 +pandas~=2.0.3 +numpy~=1.24.2 +gensim~=4.3.0 +scipy~=1.10.1 +scikit-learn~=1.2.1 +setuptools~=67.6.0 +requests~=2.31.0 +tqdm~=4.66.1 +sparse_dot_topn~=0.3.4 +bioregistry~=0.10.6 +nltk~=3.8.1 +rapidfuzz~=2.13.7 +shortuuid~=1.0.11 From 8e85ee7d0724daaebbfea559b9bb68e08fd144e9 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 15 Aug 2023 14:24:50 -0400 Subject: [PATCH 093/185] Fix command-line interface imports. 
Increment t2t version --- text2term/__main__.py | 12 +++++++----- text2term/config.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/text2term/__main__.py b/text2term/__main__.py index 34e1d39..39fa830 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -1,7 +1,8 @@ import argparse import os import sys -from t2t import map_file, cache_ontology, cache_exists +from t2t import map_terms, cache_ontology +from onto_cache import cache_exists from mapper import Mapper if __name__ == "__main__": @@ -56,7 +57,8 @@ if acronym != "": cache_ontology(target, acronym, iris) target = acronym - map_file(arguments.source, target, output_file=arguments.output, csv_columns=csv_columns, - excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings, - min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs, - save_mappings=True, separator=arguments.separator, use_cache=cache_exists(target), term_type=arguments.term_type) + map_terms(arguments.source, target, output_file=arguments.output, csv_columns=csv_columns, + excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings, + min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs, + save_mappings=True, separator=arguments.separator, use_cache=cache_exists(target), + term_type=arguments.term_type) diff --git a/text2term/config.py b/text2term/config.py index 2e42105..a2ded2f 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "3.0.1" \ No newline at end of file +VERSION = "3.0.2" From 7ffa22f6bbeee3bc84e1c2a76db0eb66166bf747 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 15 Aug 2023 15:41:36 -0400 Subject: [PATCH 094/185] Fix base_iris and mapper reference in test-pypi --- test/test-pypi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test-pypi.py b/test/test-pypi.py index 813fca1..54e2390 100644 --- a/test/test-pypi.py +++ b/test/test-pypi.py @@ -9,8 +9,8 @@ def main(): text2term.map_terms(["fever", "headache"], "https://github.com/EBISPOT/efo/releases/download/current/efo.owl") text2term.cache_ontology("https://github.com/EBISPOT/efo/releases/download/current/efo.owl", "EFO") text2term.map_terms(["fever", "headache"], "EFO", use_cache=True) - text2term.map_terms(["fever", "headache"], "EFO", base_iris=("www."), mapper=text2term.mapper.Mapper.levenshtein, max_mappings=4, use_cache=True) - + text2term.map_terms(["fever", "headache"], "EFO", base_iris=("http://www.ebi.ac.uk/efo",), mapper=text2term.mapper.Mapper.LEVENSHTEIN, max_mappings=4, use_cache=True) + # Properties and classes tests text2term.map_terms(["fever", "headache"], "EFO", term_type="classes", use_cache=True) text2term.map_terms(["contains", "location"], "EFO", term_type="properties", use_cache=True) From 94688ccc00c1c5a7afbabf368d70a7a978313ad8 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 5 Oct 2023 19:36:02 -0400 Subject: [PATCH 095/185] Expose OntologyTermCollector module --- text2term/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/text2term/__init__.py b/text2term/__init__.py index 33b75b5..ad9f676 100644 --- a/text2term/__init__.py +++ b/text2term/__init__.py @@ -6,4 +6,5 @@ from .mapper import Mapper from .preprocess import preprocess_terms from .preprocess import preprocess_tagged_terms -from .tagged_terms import TaggedTerm \ No newline at end of file +from .tagged_term import TaggedTerm +from .term_collector 
import OntologyTermCollector From 3bbdefceae7c8a6fdcbf279dca9020ff22c20968 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 5 Oct 2023 20:03:17 -0400 Subject: [PATCH 096/185] Modify OntologyTerm to have a dictionary of complex OWL restrictions Add OntologyTermType enumeration , some missing docs and minor refactoring --- text2term/term.py | 93 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 13 deletions(-) diff --git a/text2term/term.py b/text2term/term.py index 4698982..334e218 100644 --- a/text2term/term.py +++ b/text2term/term.py @@ -1,9 +1,18 @@ -"""Provides OntologyTerm class""" +"""Provides OntologyTerm and OntologyTermType classes""" + +from enum import Enum + + +class OntologyTermType(str, Enum): + CLASS = "class" + PROPERTY = "property" + ANY = "any" class OntologyTerm: - def __init__(self, iri, labels, definitions=(), synonyms=(), parents=(), children=(), instances=(), deprecated=False, termtype='class'): + def __init__(self, iri, labels, definitions=(), synonyms=(), parents=(), children=(), instances=(), restrictions=(), + deprecated=False, term_type=OntologyTermType.CLASS): """ Constructor for a succinct representation of an ontology term :param iri: IRI of the ontology term @@ -13,6 +22,9 @@ def __init__(self, iri, labels, definitions=(), synonyms=(), parents=(), childre :param parents: Dictionary containing the IRIs of parent terms (superclasses) and their label(s) :param children: Dictionary containing the IRIs of child terms (subclasses) and their label(s) :param instances: Dictionary containing the IRIs of instances of the term (rdf:type) and their label(s) + :param restrictions: Dictionary containing complex class restrictions (such as located_in.Hand) on this term + :param deprecated: true if term is stated to be owl:deprecated, false otherwise + :param term_type: Type of term: class or property """ self._iri = iri self._labels = labels @@ -21,49 +33,103 @@ def __init__(self, iri, labels, definitions=(), synonyms=(), parents=(), childre self._parents = parents self._children = children self._instances = instances + self._restrictions = restrictions self._deprecated = deprecated - self._termtype = termtype + self._term_type = term_type @property def iri(self): + """ + Returns the IRI of this term + :return: str + """ return self._iri @property def labels(self): + """ + Returns the set of human-readable labels for the term specified using rdfs:label or skos:prefLabel properties + :return: set + """ return self._labels - @property - def synonyms(self): - return self._synonyms - @property def definitions(self): + """ + Returns the set of textual definitions of the term specified using either the skos:definition or the + IAO:0000115 ('definition') annotation properties + :return: set + """ return self._definitions + @property + def synonyms(self): + """ + Returns the set of synonyms of the term specified using obo:hasExactSynonym or ncit:P90 properties + :return: set + """ + return self._synonyms + @property def parents(self): + """ + Returns a dictionary containing the IRIs of parent terms as keys, and their respective labels as values + :return: dict + """ return self._parents @property def children(self): + """ + Returns a dictionary containing the IRIs of child terms as keys, and their respective labels as values + :return: dict + """ return self._children @property def instances(self): + """ + Returns a dictionary containing the IRIs of instance terms as keys, and their respective labels as values + :return: dict + """ return 
self._instances + @property + def restrictions(self): + """ + Returns a dictionary containing the IRIs of properties as keys, and the respective fillers as values + For example, for a restriction such as ':has_disease_location :pancreas', the dictionary would have: + {':has_disease_location': ':pancreas'} + For nested expressions such as 'has_disease_location (:pancreas or :liver);, the dictionary would have a string + representation of that expression (using owlready2s to_str): + {':has_disease_location': ':pancreas | :liver'} + :return: dict + """ + return self._restrictions + @property def label(self): - """Return a single label for this term""" + """ + Returns a single label for this term + :return: str + """ return next(iter(self.labels)) @property def deprecated(self): + """ + Returns true if this term is stated to be 'owl:deprecated True', false otherwise + :return: bool + """ return self._deprecated @property - def termtype(self): - return self._termtype + def term_type(self): + """ + Returns the ontology term type specified using OntologyTermType enum + :return: OntologyTermType + """ + return self._term_type def __eq__(self, other): if isinstance(other, OntologyTerm): @@ -74,6 +140,7 @@ def __hash__(self): return hash(str(self._iri)) def __str__(self): - return "Ontology Term: " + self.iri + ", Labels: " + str(self.labels) + ", Synonyms: " + \ - str(self.synonyms) + ", Definitions: " + str(self.definitions) + ", Parents: " + str(self.parents) + \ - ", Children: " + str(self.children) + ", Instances: " + str(self.instances) + return "Ontology Term: " + self.iri + ", Type: " + self.term_type + ", Labels: " + str(self.labels) + \ + ", Synonyms: " + str(self.synonyms) + ", Definitions: " + str(self.definitions) + \ + ", Parents: " + str(self.parents) + ", Children: " + str(self.children) + \ + ", Instances: " + str(self.instances) + ", Restrictions: " + str(self.restrictions) From c60bc2c0cc755397b74f12a4362eebb8e6b50979 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 5 Oct 2023 20:08:57 -0400 Subject: [PATCH 097/185] Modify OntologyTermCollector to include complex OWL restrictions --- text2term/term_collector.py | 89 ++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 4074728..3ee622f 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -2,7 +2,7 @@ from owlready2 import * from text2term import onto_utils -from text2term.term import OntologyTerm +from text2term.term import OntologyTerm, OntologyTermType import logging import bioregistry @@ -12,13 +12,15 @@ class OntologyTermCollector: def __init__(self, log_level=logging.INFO): self.logger = onto_utils.get_logger(__name__, level=log_level) - def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, exclude_deprecated=False, term_type="classes"): + def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, exclude_deprecated=False, + term_type=OntologyTermType.ANY): """ Collect the terms described in the ontology at the specified IRI :param ontology_iri: IRI of the ontology (e.g., path of ontology document in the local file system, URL) :param base_iris: Limit ontology term collection to terms whose IRIs start with any IRI given in this tuple :param use_reasoning: Use a reasoner to compute inferred class hierarchy :param exclude_deprecated: Exclude ontology terms stated as deprecated using owl:deprecated 'true' + :param term_type: Type of term--can 
be 'class' or 'property' or 'any' (individuals may be added in the future) :return: Dictionary of ontology term IRIs and their respective details in the specified ontology """ ontology = self._load_ontology(ontology_iri) @@ -48,7 +50,7 @@ def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, ex self.logger.debug("Unable to destroy ontology: ", err) return ontology_terms - def filter_terms(self, onto_terms, iris=(), excl_deprecated=False, term_type='classes'): + def filter_terms(self, onto_terms, iris=(), excl_deprecated=False, term_type=OntologyTermType.ANY): filtered_onto_terms = {} for base_iri, term in onto_terms.items(): if type(iris) == str: @@ -61,10 +63,10 @@ def filter_terms(self, onto_terms, iris=(), excl_deprecated=False, term_type='cl filtered_onto_terms.update({base_iri: term}) return filtered_onto_terms - def _get_ontology_signature(self, ontology, term_type='classes'): + def _get_ontology_signature(self, ontology): signature = list(ontology.classes()) signature.extend(list(ontology.properties())) - # ontology.classes() does not include classes in imported ontologies; we need to explicitly add them to our list + # owlready2::ontology.classes() does not include classes in imported ontologies; we need to explicitly add them for imported_ontology in ontology.imported_ontologies: signature.extend(list(imported_ontology.classes())) signature.extend(list(imported_ontology.properties())) @@ -80,55 +82,79 @@ def _get_ontology_terms(self, term_list, ontology, exclude_deprecated, term_type iri = ontology_term.iri labels = self._get_labels(ontology_term) synonyms = self._get_synonyms(ontology_term) - parents = self._get_parents(ontology_term) + named_parents, complex_parents = self._get_parents(ontology_term) children = self._get_children(ontology_term, ontology) instances = self._get_instances(ontology_term, ontology) definitions = self._get_definitions(ontology_term) is_deprecated = deprecated[ontology_term] == [True] - if self._filter_term_type(ontology_term, "classes", False): - termtype = 'class' - elif self._filter_term_type(ontology_term, "properties", False): - termtype = 'property' - else: - termtype = None + if self._filter_term_type(ontology_term, OntologyTermType.CLASS, False): + term_type = OntologyTermType.CLASS + elif self._filter_term_type(ontology_term, OntologyTermType.PROPERTY, False): + term_type = OntologyTermType.PROPERTY term_details = OntologyTerm(iri, labels, definitions=definitions, synonyms=synonyms, - parents=parents, children=children, instances=instances, - deprecated=is_deprecated, termtype=termtype) + parents=named_parents, children=children, instances=instances, + restrictions=complex_parents, deprecated=is_deprecated, term_type=term_type) ontology_terms[iri] = term_details else: self.logger.debug("Excluding deprecated ontology term: %s", ontology_term.iri) return ontology_terms def _filter_term_type(self, ontology_term, term_type, cached): - if term_type == 'classes': + if term_type == OntologyTermType.CLASS: if cached: - return ontology_term.termtype == 'class' + return ontology_term.term_type == OntologyTermType.CLASS else: - return not isinstance(ontology_term, PropertyClass) - elif term_type == 'properties': + return isinstance(ontology_term, ThingClass) + elif term_type == OntologyTermType.PROPERTY: if cached: - return ontology_term.termtype == 'property' + return ontology_term.term_type == OntologyTermType.PROPERTY else: return isinstance(ontology_term, PropertyClass) - elif term_type == 'both': + elif term_type == 
OntologyTermType.ANY: return True else: - raise ValueError("Option to include Properties or Classes is not valid") + raise ValueError("Invalid term-type option. Acceptable term types are: 'class' or 'property' or 'any'") def _get_parents(self, ontology_term): parents = dict() # named/atomic superclasses except owl:Thing + restrictions = dict() # restrictions are class expressions such as 'pancreatitis disease_has_location pancreas' try: all_parents = ontology_term.is_a # obtain direct parents of this entity for parent in all_parents: - # exclude OWL restrictions and owl:Thing and Self - if isinstance(parent, ThingClass) and parent is not Thing and parent is not ontology_term: - if len(parent.label) > 0: - parents.update({parent.iri: parent.label[0]}) - else: - parents.update({parent.iri: onto_utils.label_from_iri(parent.iri)}) + # exclude owl:Thing and Self + if parent is not Thing and parent is not ontology_term: + if isinstance(parent, ThingClass): # get named parents (i.e. classes with IRIs) + self._add_named_parent(parent, parents) + elif isinstance(parent, And): # get conjuncts and add them to the respective structures + for conjunct in parent.Classes: + if isinstance(conjunct, ThingClass): # if conjunct is a named class, add it to parents dict + self._add_named_parent(conjunct, parents) + else: + self._add_complex_parent(conjunct, restrictions) + elif isinstance(parent, Restriction): # get complex parents, i.e. restrictions or class expressions + self._add_complex_parent(parent, restrictions) except (AttributeError, ValueError) as err: self.logger.debug(err) - return parents + return parents, restrictions + + def _add_named_parent(self, parent, parents): + if len(parent.label) > 0: + parents.update({parent.iri: parent.label[0]}) + else: + parents.update({parent.iri: onto_utils.label_from_iri(parent.iri)}) + + def _add_complex_parent(self, parent, restrictions): + property_iri = parent.property.iri + if isinstance(parent.value, ThingClass): # the filler is a named term (i.e., it has an IRI) + value = parent.value.iri + else: # the filler is another complex class expression + value = parent.value + if property_iri in restrictions.keys(): + current_restrictions = restrictions[property_iri] + current_restrictions.add(value) + restrictions.update({property_iri: current_restrictions}) + else: + restrictions.update({property_iri: str(value)}) def _get_children(self, ontology_term, ontology): children = dict() @@ -175,7 +201,7 @@ def _get_labels(self, ontology_term): self.logger.debug("...collected %i labels and synonyms for %s", len(labels), ontology_term) return labels - def _get_synonyms(self, ontology_term, include_broad_synonyms=False): + def _get_synonyms(self, ontology_term, include_related_synonyms=False, include_broad_synonyms=False): """ Collect the synonyms of the given ontology term :param ontology_term: Ontology term @@ -185,12 +211,13 @@ def _get_synonyms(self, ontology_term, include_broad_synonyms=False): synonyms = set() for synonym in self._get_obo_exact_synonyms(ontology_term): synonyms.add(synonym) - for synonym in self._get_obo_related_synonyms(ontology_term): - synonyms.add(synonym) for nci_synonym in self._get_nci_synonyms(ontology_term): synonyms.add(nci_synonym) for efo_alt_term in self._get_efo_alt_terms(ontology_term): synonyms.add(efo_alt_term) + if include_related_synonyms: + for synonym in self._get_obo_related_synonyms(ontology_term): + synonyms.add(synonym) if include_broad_synonyms: for synonym in self._get_obo_broad_synonyms(ontology_term): 
synonyms.add(synonym) From fbfe58322da338b9db377c40534dd006be73f86b Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 5 Oct 2023 20:11:15 -0400 Subject: [PATCH 098/185] Rename tagged_terms -> tagged_term and fix some references --- text2term/{tagged_terms.py => tagged_term.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename text2term/{tagged_terms.py => tagged_term.py} (96%) diff --git a/text2term/tagged_terms.py b/text2term/tagged_term.py similarity index 96% rename from text2term/tagged_terms.py rename to text2term/tagged_term.py index 53d3441..f0204e1 100644 --- a/text2term/tagged_terms.py +++ b/text2term/tagged_term.py @@ -34,5 +34,5 @@ def get_source_term_id(self): return self.source_term_id def to_dict(self): - return {term : tags} + return {self.term: self.tags} \ No newline at end of file From f2eab117a1093162608fb5413c04f79b2e66782f Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 5 Oct 2023 20:17:14 -0400 Subject: [PATCH 099/185] Fix imports to deal with file rename --- text2term/preprocess.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/text2term/preprocess.py b/text2term/preprocess.py index 44e4f0f..2ef2838 100644 --- a/text2term/preprocess.py +++ b/text2term/preprocess.py @@ -1,13 +1,12 @@ import re -import os -from enum import Enum -from .tagged_terms import TaggedTerm +from .tagged_term import TaggedTerm + ## Tags should be stored with their terms in the same line, delineated by ";:;" ## ex: Age when diagnosed with (.*) ;:; age,diagnosis ## "Age when diagnosed with cancer" becomes: {"cancer", ["age", "diagnosis"]} -def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", \ - blocklist_char='', rem_duplicates=False, separator=";:;"): +def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", + blocklist_char='', rem_duplicates=False, separator=";:;"): # Seperate tags from the terms, put in TaggedTerm and add to list raw_terms = _get_values(file_path) terms = [] @@ -58,10 +57,10 @@ def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", \ return processed_terms -def preprocess_terms(terms, template_path, output_file="", blocklist_path="", \ - blocklist_char='', rem_duplicates=False): + +def preprocess_terms(terms, template_path, output_file="", blocklist_path="", blocklist_char='', rem_duplicates=False): if isinstance(terms, str): - terms = _get_values(file_path) + terms = _get_values(file_path) # TODO: Unresolved reference 'file_path' # Form the templates as regular expressions template_strings = [] if template_path != "": @@ -96,6 +95,7 @@ def preprocess_terms(terms, template_path, output_file="", blocklist_path="", \ fp.write('\n'.join(processed_terms.values())) return processed_terms + ## Note: Because Python Dictionaries and Lists are passed by reference (sort of), updating the ## dictionary/list here will update the dictionary in the caller def _blocklist_term(processed_terms, term, blocklist, blocklist_char, tagged=False): @@ -110,20 +110,24 @@ def _blocklist_term(processed_terms, term, blocklist, blocklist_char, tagged=Fal return True return False + def _update_tagged_term(processed_terms, term, new_term, tags=[]): term.update_term(new_term) term.add_tags(tags) processed_terms.append(term) + def _get_values(path): return open(path).read().splitlines() + def _make_regex_list(strings): regexes = [] for string in strings: regexes.append(re.compile(string)) return regexes + def _remove_duplicates(terms): if type(terms) is dict: temp = {val : 
key for key, val in terms.items()} From eeb7f7e9a95613f0aff3c5f94cefa735ea0b6359 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 5 Oct 2023 20:20:29 -0400 Subject: [PATCH 100/185] Fix imports in t2t, minor fixes & shift docs to inside function --- text2term/t2t.py | 135 +++++++++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 58 deletions(-) diff --git a/text2term/t2t.py b/text2term/t2t.py index 66f1233..14a7342 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -2,13 +2,12 @@ import sys import json import pickle -import time import datetime -import owlready2 import pandas as pd from text2term import onto_utils -from text2term.mapper import Mapper from text2term import onto_cache +from text2term.mapper import Mapper +from text2term.term import OntologyTermType from text2term.term_collector import OntologyTermCollector from text2term.term_graph_generator import TermGraphGenerator from text2term.bioportal_mapper import BioPortalAnnotatorMapper @@ -16,57 +15,60 @@ from text2term.tfidf_mapper import TFIDFMapper from text2term.zooma_mapper import ZoomaMapper from text2term.config import VERSION -from text2term.tagged_terms import TaggedTerm +from text2term.tagged_term import TaggedTerm from text2term.term_mapping import TermMapping IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "] UNMAPPED_TAG = "unmapped" -""" -Maps the terms in the given list to the specified target ontology. - -Parameters ----------- -source_terms : list - List of 'source' terms to map to ontology terms -target_ontology : str - Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies -base_iris : tuple - Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: - ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') -source_terms_ids : tuple - Collection of identifiers for the given source terms -excl_deprecated : bool - Exclude ontology terms stated as deprecated via `owl:deprecated true` -mapper : mapper.Mapper - Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, - fuzzy, tfidf, zooma, bioportal -max_mappings : int - Maximum number of top-ranked mappings returned per source term -min_score : float - Minimum similarity score [0,1] for the mappings (1=exact match) -output_file : str - Path to desired output file for the mappings -save_graphs : bool - Save vis.js graphs representing the neighborhood of each ontology term -save_mappings : bool - Save the generated mappings to a file (specified by `output_file`) - -Returns ----------- -df - Data frame containing the generated ontology mappings -""" + +# TODO missing parameters in docs def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, - min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), - separator=',', use_cache=False, term_type='classes', incl_unmapped=False): + min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, + source_terms_ids=(), separator=',', use_cache=False, term_type=OntologyTermType.CLASS, + incl_unmapped=False): + """ + Maps the terms in the given list to the specified target ontology. 
+ + Parameters + ---------- + source_terms : list + List of 'source' terms to map to ontology terms + target_ontology : str + Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies + base_iris : tuple + Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: + ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') + source_terms_ids : tuple + Collection of identifiers for the given source terms + excl_deprecated : bool + Exclude ontology terms stated as deprecated via `owl:deprecated true` + mapper : mapper.Mapper + Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, + fuzzy, tfidf, zooma, bioportal + max_mappings : int + Maximum number of top-ranked mappings returned per source term + min_score : float + Minimum similarity score [0,1] for the mappings (1=exact match) + output_file : str + Path to desired output file for the mappings + save_graphs : bool + Save vis.js graphs representing the neighborhood of each ontology term + save_mappings : bool + Save the generated mappings to a file (specified by `output_file`) + + Returns + ---------- + df + Data frame containing the generated ontology mappings + """ # Parse the possible source terms options and tags source_terms, source_term_ids, tags = _parse_source_terms(source_terms, source_terms_ids, csv_columns, separator) - # Create Source Term Ids if they are not provided + # Create source term IDs if they are not provided if len(source_terms_ids) != len(source_terms): if len(source_terms_ids) > 0: - sys.stderr.write("Warning: Source Term Ids are non-zero, but will not be used.") + sys.stderr.write("Warning: Source Term IDs are non-zero, but will not be used.") source_terms_ids = onto_utils.generate_iris(len(source_terms)) # Create the output file if output_file == '': @@ -81,17 +83,18 @@ def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_ mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score, tags, incl_unmapped) mappings_df["Mapping Score"] = mappings_df["Mapping Score"].astype(float).round(decimals=3) if save_mappings: - _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris, \ - excl_deprecated, max_mappings, term_type, source_terms, incl_unmapped) + _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris, + excl_deprecated, max_mappings, term_type, source_terms, incl_unmapped) if save_graphs: _save_graphs(target_terms, output_file) return mappings_df + # Caches a single ontology def cache_ontology(ontology_url, ontology_acronym="", base_iris=()): if ontology_acronym == "": ontology_acronym = ontology_url - ontology_terms = _load_ontology(ontology_url, base_iris, exclude_deprecated=False, term_type='both') + ontology_terms = _load_ontology(ontology_url, base_iris, exclude_deprecated=False, term_type=OntologyTermType.ANY) cache_dir = "cache/" + ontology_acronym + "/" if not os.path.exists(cache_dir): os.makedirs(cache_dir) @@ -101,9 +104,12 @@ def cache_ontology(ontology_url, ontology_acronym="", base_iris=()): ontology_terms.clear() return onto_cache.OntologyCache(ontology_acronym) + """ PRIVATE/HELPER FUNCTIONS """ + + # Parses the source terms and returns what is to be mapped, the term ids, and the tags def 
_parse_source_terms(source_terms, source_terms_ids=(), csv_columns=(), separator=','): # If source_terms is a string, we assume it is a file location @@ -120,7 +126,7 @@ def _parse_source_terms(source_terms, source_terms_ids=(), csv_columns=(), separ source_terms_id_list = [] for tagged_term in source_terms: terms.append(tagged_term.get_term()) - if tagged_term.get_source_term_id() != None: + if tagged_term.get_source_term_id() is None: source_terms_id_list.append(tagged_term.get_source_term_id()) source_terms_ids = source_terms_id_list tags = source_terms @@ -129,11 +135,11 @@ def _parse_source_terms(source_terms, source_terms_ids=(), csv_columns=(), separ tags = dict.fromkeys(terms) return terms, source_terms_ids, tags + def _serialize_ontology(ontology_terms, ontology_acronym, cache_dir): - start = time.time() with open(cache_dir + ontology_acronym + "-term-details.pickle", 'wb+') as out_file: pickle.dump(ontology_terms, out_file) - end = time.time() + def _load_data(input_file_path, csv_column_names, separator): if len(csv_column_names) >= 1: @@ -148,19 +154,22 @@ def _load_data(input_file_path, csv_column_names, separator): term_ids = onto_utils.generate_iris(len(terms)) return terms, term_ids + def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_type='classes'): term_collector = OntologyTermCollector() if use_cache: - pickle_file = "cache/" + ontology + "/" + ontology + "-term-details.pickle" + pickle_file = os.path.join("cache", ontology, ontology + "-term-details.pickle") onto_terms_unfiltered = pickle.load(open(pickle_file, "rb")) onto_terms = term_collector.filter_terms(onto_terms_unfiltered, iris, exclude_deprecated, term_type) else: - onto_terms = term_collector.get_ontology_terms(ontology, base_iris=iris, exclude_deprecated=exclude_deprecated, term_type=term_type) + onto_terms = term_collector.get_ontology_terms(ontology, base_iris=iris, exclude_deprecated=exclude_deprecated, + term_type=term_type) if len(onto_terms) == 0: raise RuntimeError("Could not find any terms in the given ontology.") return onto_terms + def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score, tags, incl_unmapped): to_map, tags = _process_tags(source_terms, tags) if mapper == Mapper.TFIDF: @@ -185,6 +194,7 @@ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappi df = _add_tags_to_df(mappings_df, tags) return df + # Takes in the tags and source terms and processes them accordingly def _process_tags(source_terms, tags): to_map = [] @@ -197,14 +207,17 @@ def _process_tags(source_terms, tags): if tag.get_term() == term: term_tags = tag.get_tags() break + # TODO: Local variable 'term_tags' might be referenced before assignmen if isinstance(term_tags, list): if not any(tag in IGNORE_TAGS for tag in term_tags): to_map.append(term) + # TODO: Local variable 'term_tags' might be referenced before assignmen else: if term_tags not in IGNORE_TAGS: to_map.append(term) return to_map, tags + def _add_tags_to_df(df, tags): if isinstance(tags, dict): for key, value in tags.items(): @@ -213,12 +226,13 @@ def _add_tags_to_df(df, tags): else: to_store = str(value) df.loc[df['Source Term'] == key, "Tags"] = to_store - else: + else: for term in tags: to_store = ','.join(term.get_tags()) df.loc[df['Source Term'] == term.get_term(), "Tags"] = to_store return df + def _filter_mappings(mappings_df, min_score): new_df = pd.DataFrame(columns=mappings_df.columns) for index, row in mappings_df.iterrows(): @@ -226,6 +240,7 @@ def 
_filter_mappings(mappings_df, min_score): new_df.loc[len(new_df.index)] = row return new_df + def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids): mapped = pd.unique(mappings_df["Source Term"]) for (term, term_id) in zip(source_terms, source_terms_ids): @@ -235,6 +250,7 @@ def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids): mappings_df.loc[len(mappings_df.index)] = non_mapping.to_dict() return mappings_df + def _add_tag(tags, term, to_add, ignore=False): if isinstance(tags, dict): new_tags = tags.get(term, []) @@ -254,14 +270,15 @@ def _add_tag(tags, term, to_add, ignore=False): if tagged_term.get_term() == term and check_ignore: tagged_term.add_tags([to_add]) -def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, base_iris, \ - excl_deprecated, max_mappings, term_type, source_terms, incl_unmapped): + +def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, base_iris, + excl_deprecated, max_mappings, term_type, source_terms, incl_unmapped): if os.path.dirname(output_file): # create output directories if needed os.makedirs(os.path.dirname(output_file), exist_ok=True) with open(output_file, "a") as f: - f.write("# Date and time run: %s\n" % datetime.datetime.now()) + f.write("# Timestamp: %s\n" % datetime.datetime.now()) f.write("# Target Ontology: %s\n" % target_ontology) - f.write("# Text2term version: %s\n" % VERSION) + f.write("# text2term version: %s\n" % VERSION) f.write("# Minimum Score: %.2f\n" % min_score) f.write("# Mapper: %s\n" % mapper.value) f.write("# Base IRIs: %s\n" % (base_iris,)) @@ -272,10 +289,12 @@ def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, ba f.write("# Unmapped Terms ") f.write("Excluded\n" if not incl_unmapped else "Included\n") writestring = "# Of " + str(len(source_terms)) + " entries, " + str(len(pd.unique(mappings["Source Term ID"]))) - writestring += " were successfully mapped to " + str(len(pd.unique(mappings["Mapped Term IRI"]))) + " unique terms\n" + writestring += " were mapped to " + str( + len(pd.unique(mappings["Mapped Term IRI"]))) + " unique terms\n" f.write(writestring) mappings.to_csv(output_file, index=False, mode='a') + def _save_graphs(terms, output_file): term_graphs = TermGraphGenerator(terms).graphs_dicts() with open(output_file + "-term-graphs.json", 'w') as json_file: From 9a3f375780f548ab351a16053a0c943af3683268 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 5 Oct 2023 20:21:07 -0400 Subject: [PATCH 101/185] Bump version --- text2term/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text2term/config.py b/text2term/config.py index a2ded2f..189c03b 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "3.0.2" +VERSION = "4.0.0" From ee5a0cc1753c8cd85b469eb049019b703354bbb7 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 6 Oct 2023 15:55:43 -0400 Subject: [PATCH 102/185] Minor changes to adhere to PEP 8 code style --- text2term/__main__.py | 2 +- text2term/onto_utils.py | 6 ++---- text2term/tagged_term.py | 2 +- text2term/term.py | 2 +- text2term/term_collector.py | 8 +++++--- text2term/term_graph_generator.py | 3 +++ text2term/tfidf_mapper.py | 3 ++- 7 files changed, 15 insertions(+), 11 deletions(-) diff --git a/text2term/__main__.py b/text2term/__main__.py index 39fa830..193f16d 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -9,7 +9,7 @@ parser = argparse.ArgumentParser(description='A tool for mapping free-text 
descriptions of (biomedical) ' 'entities to controlled terms in an ontology') parser.add_argument("-s", "--source", required=True, type=str, - help="Input file containing 'source' terms to map to ontology terms (list of terms or CSV file)") + help="Input file containing 'source' terms to map to ontology terms: list of terms or CSV file") parser.add_argument("-t", "--target", required=True, type=str, help="Path or URL of 'target' ontology to map source terms to. When the chosen mapper is " "BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write " diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py index 222f6c2..9cbd9ac 100644 --- a/text2term/onto_utils.py +++ b/text2term/onto_utils.py @@ -1,6 +1,4 @@ import logging -import re -import sys import pandas as pd import bioregistry import shortuuid @@ -21,8 +19,8 @@ 'later', 'trimester'} QUANTITY_WORDS = {'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'frequently', 'per', 'hour', 'day', 'week', 'month', - 'year', 'years', 'total', 'quantity', 'amount', 'level', 'levels', 'volume', 'count', 'counts', 'percentage', - 'abundance', 'proportion', 'content', 'average', 'prevalence', 'mean', 'ratio'} + 'year', 'years', 'total', 'quantity', 'amount', 'level', 'levels', 'volume', 'count', 'counts', + 'percentage', 'abundance', 'proportion', 'content', 'average', 'prevalence', 'mean', 'ratio'} def normalize_list(token_list): diff --git a/text2term/tagged_term.py b/text2term/tagged_term.py index f0204e1..20d6468 100644 --- a/text2term/tagged_term.py +++ b/text2term/tagged_term.py @@ -1,6 +1,6 @@ class TaggedTerm: - def __init__(self, term=None, tags=[], original_term=None, source_term_id=None): + def __init__(self, term=None, tags=(), original_term=None, source_term_id=None): self.term = term self.tags = tags self.original_term = original_term diff --git a/text2term/term.py b/text2term/term.py index 334e218..618bda8 100644 --- a/text2term/term.py +++ b/text2term/term.py @@ -1,4 +1,4 @@ -"""Provides OntologyTerm and OntologyTermType classes""" +"""Provides OntologyTerm class and OntologyTermType string enumeration""" from enum import Enum diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 3ee622f..f96548b 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -35,7 +35,8 @@ def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, ex query = iri + "*" self.logger.info("...collecting terms with IRIs starting in: " + iri) iris = list(default_world.search(iri=query)) - ontology_terms = ontology_terms | self._get_ontology_terms(iris, ontology, exclude_deprecated, term_type) + ontology_terms = ontology_terms | self._get_ontology_terms(iris, ontology, exclude_deprecated, + term_type) else: ontology_signature = self._get_ontology_signature(ontology) ontology_terms = self._get_ontology_terms(ontology_signature, ontology, exclude_deprecated, term_type) @@ -93,7 +94,8 @@ def _get_ontology_terms(self, term_list, ontology, exclude_deprecated, term_type term_type = OntologyTermType.PROPERTY term_details = OntologyTerm(iri, labels, definitions=definitions, synonyms=synonyms, parents=named_parents, children=children, instances=instances, - restrictions=complex_parents, deprecated=is_deprecated, term_type=term_type) + restrictions=complex_parents, deprecated=is_deprecated, + term_type=term_type) ontology_terms[iri] = term_details else: self.logger.debug("Excluding deprecated ontology term: %s", ontology_term.iri) @@ -366,7 +368,7 @@ def _load_ontology(self, 
ontology_iri): self.logger.info("Loading ontology %s...", ontology_iri) start = time.time() owl_link = bioregistry.get_owl_download(ontology_iri) - if owl_link != None: + if owl_link is not None: ontology_iri = owl_link ontology = get_ontology(ontology_iri).load() end = time.time() diff --git a/text2term/term_graph_generator.py b/text2term/term_graph_generator.py index 231c602..c2a061b 100644 --- a/text2term/term_graph_generator.py +++ b/text2term/term_graph_generator.py @@ -1,6 +1,9 @@ +"""Provides TermGraphGenerator class""" + from text2term import onto_utils from text2term.term_graph import TermGraph, Node, Edge + class TermGraphGenerator: def __init__(self, terms): diff --git a/text2term/tfidf_mapper.py b/text2term/tfidf_mapper.py index 098c04f..2e5566f 100644 --- a/text2term/tfidf_mapper.py +++ b/text2term/tfidf_mapper.py @@ -28,7 +28,8 @@ def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3): Default set to 0, so consider all candidates """ self.logger.info("Mapping %i source terms...", len(source_terms)) - self.logger.info("...against %i ontology terms (%i labels/synonyms)", len(self.target_ontology_terms), len(self.target_labels)) + self.logger.info("...against %i ontology terms (%i labels/synonyms)", len(self.target_ontology_terms), + len(self.target_labels)) start = time.time() source_terms_norm = onto_utils.normalize_list(source_terms) vectorizer = self._tokenize(source_terms_norm, self.target_labels) From a7332ed6592f80bff3f6cb08be2014fe89109ae6 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 6 Oct 2023 16:06:58 -0400 Subject: [PATCH 103/185] Add documentation about parameters in map_terms Specify cache directory using os.path.join --- text2term/t2t.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/text2term/t2t.py b/text2term/t2t.py index 14a7342..45946bd 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -22,7 +22,6 @@ UNMAPPED_TAG = "unmapped" -# TODO missing parameters in docs def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), separator=',', use_cache=False, term_type=OntologyTermType.CLASS, @@ -35,11 +34,15 @@ def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_ source_terms : list List of 'source' terms to map to ontology terms target_ontology : str - Path or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies + Filepath or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies. 
+ When the target ontology has been previously cached, provide the ontology name as used when it was cached base_iris : tuple Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') + csv_columns : tuple + Name of column containing the terms to map, optionally followed by another column name containing the term IDs, + for example: ('disease', 'disease_identifier') source_terms_ids : tuple Collection of identifiers for the given source terms excl_deprecated : bool @@ -57,6 +60,14 @@ def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_ Save vis.js graphs representing the neighborhood of each ontology term save_mappings : bool Save the generated mappings to a file (specified by `output_file`) + separator : str + Symbol used to separate columns in the input table (eg ',' or '\t' for csv or tsv, respectively) + use_cache : bool + Use a previously cached ontology + term_type : OntologyTermType + The type(s) of ontology terms to map to, which can be 'class' or 'property' or 'any' + incl_unmapped : bool + Include unmapped terms in the output data frame Returns ---------- @@ -80,7 +91,8 @@ def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_ else: target_terms = _load_ontology(target_ontology, base_iris, excl_deprecated, use_cache, term_type) # Run the mapper - mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score, tags, incl_unmapped) + mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score, tags, + incl_unmapped) mappings_df["Mapping Score"] = mappings_df["Mapping Score"].astype(float).round(decimals=3) if save_mappings: _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris, @@ -95,7 +107,7 @@ def cache_ontology(ontology_url, ontology_acronym="", base_iris=()): if ontology_acronym == "": ontology_acronym = ontology_url ontology_terms = _load_ontology(ontology_url, base_iris, exclude_deprecated=False, term_type=OntologyTermType.ANY) - cache_dir = "cache/" + ontology_acronym + "/" + cache_dir = os.path.join("cache", ontology_acronym) if not os.path.exists(cache_dir): os.makedirs(cache_dir) @@ -207,11 +219,9 @@ def _process_tags(source_terms, tags): if tag.get_term() == term: term_tags = tag.get_tags() break - # TODO: Local variable 'term_tags' might be referenced before assignmen if isinstance(term_tags, list): if not any(tag in IGNORE_TAGS for tag in term_tags): to_map.append(term) - # TODO: Local variable 'term_tags' might be referenced before assignmen else: if term_tags not in IGNORE_TAGS: to_map.append(term) From 48cfbfb745b3c7ab1899aa00b3ad58cd103864f4 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 6 Oct 2023 16:10:06 -0400 Subject: [PATCH 104/185] Fix variable reference and typo --- text2term/preprocess.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/text2term/preprocess.py b/text2term/preprocess.py index 2ef2838..06cd199 100644 --- a/text2term/preprocess.py +++ b/text2term/preprocess.py @@ -7,14 +7,14 @@ ## "Age when diagnosed with cancer" becomes: {"cancer", ["age", "diagnosis"]} def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", blocklist_char='', rem_duplicates=False, separator=";:;"): - # Seperate tags from the terms, put in TaggedTerm and add to list + # Separate tags from the terms, put in TaggedTerm and add to list 
raw_terms = _get_values(file_path) terms = [] for raw_term in raw_terms: - seperated = raw_term.split(separator) + separated = raw_term.split(separator) try: - tags = seperated[1].split(",") - term = TaggedTerm(original_term=seperated[0], tags=tags) + tags = separated[1].split(",") + term = TaggedTerm(original_term=separated[0], tags=tags) except IndexError: term = TaggedTerm(original_term=raw_term) terms.append(term) @@ -24,10 +24,10 @@ def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", if template_path != "": raw_templates = _get_values(template_path) for raw_template in raw_templates: - seperated = raw_template.split(separator) + separated = raw_template.split(separator) try: - tags = seperated[1].split(",") - regex_term = re.compile(seperated[0]) + tags = separated[1].split(",") + regex_term = re.compile(separated[0]) templates[regex_term] = tags except IndexError: regex_term = re.compile(raw_template) @@ -60,7 +60,7 @@ def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", def preprocess_terms(terms, template_path, output_file="", blocklist_path="", blocklist_char='', rem_duplicates=False): if isinstance(terms, str): - terms = _get_values(file_path) # TODO: Unresolved reference 'file_path' + terms = _get_values(terms) # if 'terms' is a string, we assume it is a filepath # Form the templates as regular expressions template_strings = [] if template_path != "": @@ -111,7 +111,7 @@ def _blocklist_term(processed_terms, term, blocklist, blocklist_char, tagged=Fal return False -def _update_tagged_term(processed_terms, term, new_term, tags=[]): +def _update_tagged_term(processed_terms, term, new_term, tags=()): term.update_term(new_term) term.add_tags(tags) processed_terms.append(term) From ccd9119a5029baebe4e366bdfb426060c9466301 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 6 Oct 2023 16:12:54 -0400 Subject: [PATCH 105/185] Remove no longer needed mapping functions Specify paths using os.path.join. Fix typo in function name. 
PEP8 style changes --- text2term/onto_cache.py | 65 +++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 38 deletions(-) diff --git a/text2term/onto_cache.py b/text2term/onto_cache.py index 7af3e40..a2ccc61 100644 --- a/text2term/onto_cache.py +++ b/text2term/onto_cache.py @@ -1,14 +1,19 @@ -import text2term -from .mapper import Mapper import os -from shutil import rmtree import sys -import pandas as pd +import text2term import owlready2 +import pandas as pd +from .term import OntologyTermType +from .mapper import Mapper +from shutil import rmtree + +CACHE_FOLDER = "cache" """ CACHING FUNCTIONS -- Public """ + + # Caches many ontologies from a csv def cache_ontology_set(ontology_registry_path): registry = pd.read_csv(ontology_registry_path) @@ -16,62 +21,46 @@ def cache_ontology_set(ontology_registry_path): for index, row in registry.iterrows(): try: cache = text2term.cache_ontology(row.url, row.acronym) - cache_set.update({row.acronym : cache}) + cache_set.update({row.acronym: cache}) except Exception as err: err_message = "Could not cache ontology " + row.acronym + " due to error: " + str(err) sys.stderr.write(err_message) owlready2.default_world.ontologies.clear() return cache_set + # Will check if an acronym exists in the cache def cache_exists(ontology_acronym=''): - return os.path.exists("cache/" + ontology_acronym) + return os.path.exists(os.path.join(CACHE_FOLDER, ontology_acronym)) + # Clears the cache def clear_cache(ontology_acronym=''): - cache_dir = "cache/" if ontology_acronym != '': - cache_dir = os.path.join(cache_dir, ontology_acronym) + cache_dir = os.path.join(CACHE_FOLDER, ontology_acronym) # Is equivalent to: rm -r cache_dir try: - rmtree(cache_dir) + rmtree(CACHE_FOLDER) sys.stderr.write("Cache has been cleared successfully\n") except OSError as error: sys.stderr.write("Cache cannot be removed:") - sys.stderr.write(error) + sys.stderr.write(str(error)) + -## Class that is returned to run +# Class that is returned to run class OntologyCache: def __init__(self, ontology_acronym): self.acronym = ontology_acronym - self.ontology = "cache/" + ontology_acronym + "/" + self.ontology = os.path.join(CACHE_FOLDER, ontology_acronym) def map_terms(self, source_terms, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, - mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), - term_type='classes'): - return text2term.map_terms(source_terms, self.acronym, base_iris=base_iris, \ - excl_deprecated=excl_deprecated, max_mappings=max_mappings, min_score=min_score, \ - mapper=mapper, output_file=output_file, save_graphs=save_graphs, \ - save_mappings=save_mappings, source_terms_ids=source_terms_ids, use_cache=True, \ - term_type=term_type) - - def map_tagged_terms(self, tagged_terms_dict, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, - mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), - term_type='classes'): - return text2term.map_tagged_terms(tagged_terms_dict, self.acronym, base_iris=base_iris, \ - excl_deprecated=excl_deprecated, max_mappings=max_mappings, min_score=min_score, \ - mapper=mapper, output_file=output_file, save_graphs=save_graphs, \ - save_mappings=save_mappings, source_terms_ids=source_terms_ids, use_cache=True, \ - term_type=term_type) - - def map_file(self, input_file, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, - mapper=Mapper.TFIDF, min_score=0.3, output_file='', save_graphs=False, 
save_mappings=False, - separator=',', term_type='classes'): - return text2term.map_file(source_terms, self.acronym, base_iris=base_iris, csv_columns=csv_columns, \ - excl_deprecated=excl_deprecated, max_mappings=max_mappings, min_score=min_score, \ - mapper=mapper, output_file=output_file, save_graphs=save_graphs, separator=separator, \ - save_mappings=save_mappings, source_terms_ids=source_terms_ids, use_cache=True, \ - term_type=term_type) + mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), + term_type=OntologyTermType.CLASS): + return text2term.map_terms(source_terms, self.acronym, base_iris=base_iris, + excl_deprecated=excl_deprecated, max_mappings=max_mappings, min_score=min_score, + mapper=mapper, output_file=output_file, save_graphs=save_graphs, + save_mappings=save_mappings, source_terms_ids=source_terms_ids, use_cache=True, + term_type=term_type) def clear_cache(self): clear_cache(self.acronym) @@ -79,5 +68,5 @@ def clear_cache(self): def cache_exists(self): return cache_exists(self.acronym) - def acroynm(self): + def acronym(self): return self.acronym From 1ea32a90e99cbc1e11cb9bdab3be3a054936c8ac Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 6 Oct 2023 17:42:59 -0400 Subject: [PATCH 106/185] Update docs of source_terms parameter in map_terms() Use os.path.join to specify file paths --- text2term/t2t.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/text2term/t2t.py b/text2term/t2t.py index 45946bd..764c83d 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -31,8 +31,9 @@ def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_ Parameters ---------- - source_terms : list - List of 'source' terms to map to ontology terms + source_terms : str or list or dict + Path to file containing the terms to map to. Or list of terms to map to an ontology. Or dictionary containing + tagged terms, where the keys are the source terms and the values are tags attached to those terms target_ontology : str Filepath or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies. 
@@ -112,7 +113,7 @@ def cache_ontology(ontology_url, ontology_acronym="", base_iris=()): os.makedirs(cache_dir) _serialize_ontology(ontology_terms, ontology_acronym, cache_dir) - _save_graphs(ontology_terms, output_file=cache_dir + ontology_acronym) + _save_graphs(ontology_terms, output_file=os.path.join(cache_dir, ontology_acronym)) ontology_terms.clear() return onto_cache.OntologyCache(ontology_acronym) @@ -149,7 +150,7 @@ def _parse_source_terms(source_terms, source_terms_ids=(), csv_columns=(), separ def _serialize_ontology(ontology_terms, ontology_acronym, cache_dir): - with open(cache_dir + ontology_acronym + "-term-details.pickle", 'wb+') as out_file: + with open(os.path.join(cache_dir, ontology_acronym + "-term-details.pickle"), 'wb+') as out_file: pickle.dump(ontology_terms, out_file) @@ -167,14 +168,13 @@ def _load_data(input_file_path, csv_column_names, separator): return terms, term_ids -def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_type='classes'): +def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_type=OntologyTermType.CLASS): term_collector = OntologyTermCollector() if use_cache: pickle_file = os.path.join("cache", ontology, ontology + "-term-details.pickle") onto_terms_unfiltered = pickle.load(open(pickle_file, "rb")) onto_terms = term_collector.filter_terms(onto_terms_unfiltered, iris, exclude_deprecated, term_type) else: - onto_terms = term_collector.get_ontology_terms(ontology, base_iris=iris, exclude_deprecated=exclude_deprecated, term_type=term_type) if len(onto_terms) == 0: From ae45a81389c5dfa62e381a46aea2060eb12be476 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 6 Oct 2023 19:06:52 -0400 Subject: [PATCH 107/185] Update README.md --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 08a52ee..f936def 100644 --- a/README.md +++ b/README.md @@ -65,22 +65,22 @@ text2term.map_terms(source_terms, save_mappings=False, separator=',', use_cache=False, - term_type='classes', + term_type=OntologyTermType.CLASS, incl_unmapped=False) ``` NOTE: As of 3.0.0, the former three functions (`map_file`, `map_terms`, `map_tagged_terms`) have been condensed into one function. Users can now change the name of any function in old code to `map_terms` and it reads the input context to maintain the functionality of each one. ### Arguments -For `map_terms`, the first argument can be any of the following: 1) a string that specifies a path to a file containing the terms to be mapped, 2) a list of the terms to be mapped, or 3)dictionary of terms to a list of tags, or a list of TaggedTerm objects (see below). +For `map_terms`, the first argument can be any of the following: 1) a string that specifies a path to a file containing the terms to be mapped, 2) a list of the terms to be mapped, or 3) a dictionary where the keys are the terms to be mapped, and values can be a list of tags or a list of TaggedTerm objects (see below). Currently, the tags do not affect the mapping in any way, but they are added to the output dataframe at the end of the process. The exception is the Ignore tag, which causes the term to not be mapped at all, but still be outputted in the results if the incl_unmapped argument is True (see below). All other arguments are the same, and have the same functionality: `target_ontology` : str - Path or URL of 'target' ontology to map the source terms to. 
When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies - As of version 2.3.0, passing a recognized acronym to `target_ontology` will generate the download link automatically. This is done using the `bioregistry` python package. + Path or URL or acronym of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies. When the target ontology has been previously cached, provide the ontology name that was used to cache it. + As of version 2.3.0, it is possible to specify ontology acronyms as the `target_ontology` (eg "EFO" or "CL"), which is achieved using [bioregistry](https://bioregistry.io) to retrieve URLs for those acronyms. `base_iris` : tuple Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: @@ -116,16 +116,16 @@ All other arguments are the same, and have the same functionality: Save the generated mappings to a file (specified by `output_file`) `seperator` : str - Character that seperates the source term values if a file input is given. Ignored if the input is not a file path. + Character that separates the source term values if a file input is given. Ignored if the input is not a file path. `use_cache` : bool Use the cache for the ontology. More details are below. -`term_type` : str - Determines whether the ontology should be parsed for its classes (ThingClass), properties (PropertyClass), or both. Possible values are ['classes', 'properties', 'both']. If it does not match one of these values, the program will throw a ValueError. +`term_type` : term.OntologyTermType + Specifies whether to map to ontology classes, properties or any of the two. Possible values are ['class', 'property', 'any']. `incl_unmapped` : bool - Include all unmapped terms in the output. If something has been tagged Ignore (see below) or falls below the `min_score` threshold, it is included without a mapped term at the end of the output. + Include all unmapped terms in the output. If something has been tagged 'Ignore' (see below) or falls below the `min_score` threshold, it is included without a mapped term at the end of the output data frame. All default values, if they exist, can be seen above. 
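As an illustration of the arguments documented above, a minimal usage sketch (the ontology acronym "EFO", the example terms, and the tag values are placeholders rather than values prescribed by the package):

```python
import text2term
from text2term import Mapper
from text2term.term import OntologyTermType

# Cache the ontology once (here resolved from its acronym via bioregistry), then map against the cache
if not text2term.cache_exists("EFO"):
    text2term.cache_ontology("EFO", "EFO")

# Tagged input: keys are the terms to map, values are lists of tags carried through to the output
mappings = text2term.map_terms(
    source_terms={"asthma": ["disease"], "protein level": ["measurement"]},
    target_ontology="EFO",
    use_cache=True,
    mapper=Mapper.TFIDF,
    max_mappings=3,
    min_score=0.3,
    excl_deprecated=True,
    term_type=OntologyTermType.CLASS,
    incl_unmapped=True)
print(mappings.head())
```

The tags do not influence the mapping itself (except 'Ignore', which skips a term); with `incl_unmapped=True`, terms that were ignored or fell below `min_score` are appended to the returned data frame without a mapped term.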
From eeddaf2cad53a6077e66d708a1f68d73f1c6f7ff Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 6 Oct 2023 19:37:07 -0400 Subject: [PATCH 108/185] Fix error thrown when concatenating list and tuple --- text2term/tagged_term.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/text2term/tagged_term.py b/text2term/tagged_term.py index 20d6468..7891f63 100644 --- a/text2term/tagged_term.py +++ b/text2term/tagged_term.py @@ -1,3 +1,5 @@ +"""Provides TaggedTerm class""" + class TaggedTerm: def __init__(self, term=None, tags=(), original_term=None, source_term_id=None): @@ -10,7 +12,7 @@ def __repr__(self): return f" Date: Fri, 6 Oct 2023 19:38:08 -0400 Subject: [PATCH 109/185] Use enums for mapper and term_type where possible --- text2term/__main__.py | 5 +++-- text2term/preprocess.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/text2term/__main__.py b/text2term/__main__.py index 193f16d..df9863b 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -4,6 +4,7 @@ from t2t import map_terms, cache_ontology from onto_cache import cache_exists from mapper import Mapper +from term import OntologyTermType if __name__ == "__main__": parser = argparse.ArgumentParser(description='A tool for mapping free-text descriptions of (biomedical) ' @@ -16,7 +17,7 @@ "'all' to search all ontologies") parser.add_argument("-o", "--output", required=False, type=str, default="", help="Path to desired output file for the mappings (default=current working directory)") - parser.add_argument("-m", "--mapper", required=False, type=str, default="tfidf", + parser.add_argument("-m", "--mapper", required=False, type=str, default=Mapper.TFIDF, help="Method used to compare source terms with ontology terms. One of: " + str(Mapper.list()) + " (default=tfidf)") parser.add_argument("-csv", "--csv_input", required=False, type=str, default=(), @@ -38,7 +39,7 @@ help="Save vis.js graphs representing the neighborhood of each ontology term (default=False)") parser.add_argument("-c", "--store_in_cache", required=False, type=str, default="", help="Store the target ontology into local cache under acronym") - parser.add_argument("-type", "--term_type", required=False, type=str, default="classes", + parser.add_argument("-type", "--term_type", required=False, type=str, default=OntologyTermType.CLASS, help="Define whether to return ontology classes, properties, or both") arguments = parser.parse_args() diff --git a/text2term/preprocess.py b/text2term/preprocess.py index 06cd199..2e97883 100644 --- a/text2term/preprocess.py +++ b/text2term/preprocess.py @@ -44,12 +44,12 @@ def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", for term in terms: if _blocklist_term(processed_terms, term, blocklist, blocklist_char, tagged=True): continue - for template, tem_tags in templates.items(): + for template, term_tags in templates.items(): match = template.fullmatch(term.get_original_term()) if match: combined_matches = ' '.join(map(str, match.groups())) if combined_matches: - _update_tagged_term(processed_terms, term, combined_matches, tem_tags) + _update_tagged_term(processed_terms, term, combined_matches, term_tags) break if rem_duplicates: From b9f776a2e9bb1cd3dc14ba5cce2ba571bca39852 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Sat, 7 Oct 2023 15:47:11 -0400 Subject: [PATCH 110/185] Add logging to t2t module --- text2term/t2t.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/text2term/t2t.py b/text2term/t2t.py index 
764c83d..3c9d8d1 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -1,7 +1,7 @@ import os -import sys import json import pickle +import logging import datetime import pandas as pd from text2term import onto_utils @@ -21,6 +21,8 @@ IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "] UNMAPPED_TAG = "unmapped" +LOGGER = onto_utils.get_logger(__name__, level=logging.INFO) + def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, @@ -80,7 +82,8 @@ def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_ # Create source term IDs if they are not provided if len(source_terms_ids) != len(source_terms): if len(source_terms_ids) > 0: - sys.stderr.write("Warning: Source Term IDs are non-zero, but will not be used.") + LOGGER.warning(f"The number of Source Term IDs provided ({len(source_terms_ids)}) is different than the " + f"number of Source Terms ({len(source_terms)}). New Source Term IDs will be used instead.") source_terms_ids = onto_utils.generate_iris(len(source_terms)) # Create the output file if output_file == '': @@ -109,9 +112,9 @@ def cache_ontology(ontology_url, ontology_acronym="", base_iris=()): ontology_acronym = ontology_url ontology_terms = _load_ontology(ontology_url, base_iris, exclude_deprecated=False, term_type=OntologyTermType.ANY) cache_dir = os.path.join("cache", ontology_acronym) + LOGGER.info(f"Caching ontology {ontology_url} to: {cache_dir}") if not os.path.exists(cache_dir): os.makedirs(cache_dir) - _serialize_ontology(ontology_terms, ontology_acronym, cache_dir) _save_graphs(ontology_terms, output_file=os.path.join(cache_dir, ontology_acronym)) ontology_terms.clear() @@ -172,11 +175,13 @@ def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_typ term_collector = OntologyTermCollector() if use_cache: pickle_file = os.path.join("cache", ontology, ontology + "-term-details.pickle") + LOGGER.info(f"Loading cached ontology from: {pickle_file}") onto_terms_unfiltered = pickle.load(open(pickle_file, "rb")) onto_terms = term_collector.filter_terms(onto_terms_unfiltered, iris, exclude_deprecated, term_type) else: onto_terms = term_collector.get_ontology_terms(ontology, base_iris=iris, exclude_deprecated=exclude_deprecated, term_type=term_type) + LOGGER.info(f"Filtered ontology terms to those of type: {term_type}") if len(onto_terms) == 0: raise RuntimeError("Could not find any terms in the given ontology.") return onto_terms From 5b69d13edabab5aff2843137d67f8ee45f07409a Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Sat, 7 Oct 2023 15:50:52 -0400 Subject: [PATCH 111/185] Rename variable used both for term type and term type filter --- text2term/term_collector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/text2term/term_collector.py b/text2term/term_collector.py index f96548b..85a5d61 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -89,13 +89,13 @@ def _get_ontology_terms(self, term_list, ontology, exclude_deprecated, term_type definitions = self._get_definitions(ontology_term) is_deprecated = deprecated[ontology_term] == [True] if self._filter_term_type(ontology_term, OntologyTermType.CLASS, False): - term_type = OntologyTermType.CLASS + owl_term_type = OntologyTermType.CLASS elif self._filter_term_type(ontology_term, OntologyTermType.PROPERTY, False): - term_type = OntologyTermType.PROPERTY + owl_term_type = 
OntologyTermType.PROPERTY term_details = OntologyTerm(iri, labels, definitions=definitions, synonyms=synonyms, parents=named_parents, children=children, instances=instances, restrictions=complex_parents, deprecated=is_deprecated, - term_type=term_type) + term_type=owl_term_type) ontology_terms[iri] = term_details else: self.logger.debug("Excluding deprecated ontology term: %s", ontology_term.iri) From 480952a3c17777c2fa4dcac5df53de32b40cbe3b Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Sat, 7 Oct 2023 18:28:51 -0400 Subject: [PATCH 112/185] Update ontologies with latest releases. Add FoodOn --- text2term/resources/ontologies.csv | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/text2term/resources/ontologies.csv b/text2term/resources/ontologies.csv index 77edfb6..910acbd 100644 --- a/text2term/resources/ontologies.csv +++ b/text2term/resources/ontologies.csv @@ -1,11 +1,9 @@ acronym,version,url -CLO,2.1.178,http://purl.obolibrary.org/obo/clo.owl -CL,9/15/22,http://purl.obolibrary.org/obo/cl/releases/2022-09-15/cl.owl -EFO,3.46.0,https://github.com/EBISPOT/efo/releases/download/v3.46.0/efo.owl -GO,9/19/22,http://purl.obolibrary.org/obo/go/releases/2022-09-19/go.owl -HPO,6/11/22,http://purl.obolibrary.org/obo/hp/releases/2022-06-11/hp.owl -MONDO,8/1/22,http://purl.obolibrary.org/obo/mondo/releases/2022-08-01/mondo.owl -NCIT,22.07d,http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl -PRO,67,http://purl.obolibrary.org/obo/pr/67.0/pr.owl -UBERON,8/19/22,http://purl.obolibrary.org/obo/uberon/releases/2022-08-19/uberon.owl -MP,8/4/22,http://purl.obolibrary.org/obo/mp/releases/2022-08-04/mp.owl \ No newline at end of file +CL,2023-09-21,https://github.com/obophenotype/cell-ontology/releases/download/v2023-09-21/cl.owl +EFO,3.57.0,https://github.com/EBISPOT/efo/releases/download/v3.57.0/efo.owl +FOODON,0.6.0,https://github.com/FoodOntology/foodon/raw/v0.6.0/foodon.owl +GO,2023-07-27,http://purl.obolibrary.org/obo/go/releases/2023-07-27/go.owl +HPO,2023-09-01,https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2023-09-01/hp.owl +MONDO,2023-09-12,https://github.com/monarch-initiative/mondo/releases/download/v2023-08-02/mondo.owl +NCIT,2022-08-19,https://github.com/NCI-Thesaurus/thesaurus-obo-edition/releases/download/v2022-08-19/ncit.owl +UBERON,2023-09-05,https://github.com/obophenotype/uberon/releases/download/v2023-09-05/uberon.owl \ No newline at end of file From 9496ec2792e807a53cb1aa42cf736b406cc3a265 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 10 Oct 2023 15:37:31 -0400 Subject: [PATCH 113/185] Add some tests and documentation about the tests --- test/simple-test.py | 119 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 101 insertions(+), 18 deletions(-) diff --git a/test/simple-test.py b/test/simple-test.py index be7ddaa..2e6a6f3 100644 --- a/test/simple-test.py +++ b/test/simple-test.py @@ -1,21 +1,104 @@ +import os +import pandas as pd import text2term -import bioregistry - -def main(): - efo = "http://www.ebi.ac.uk/efo/efo.owl#" - pizza = "https://protege.stanford.edu/ontologies/pizza/pizza.owl" - ncit = "http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl" - if not text2term.cache_exists("EFO"): - cached_onto = text2term.cache_ontology("EFO") - # df = cached_onto.map_terms(["asthma", "disease location", "obsolete food allergy"], excl_deprecated=True, term_type="classes") - print("Cache exists:", cached_onto.cache_exists()) - # caches = 
text2term.cache_ontology_set("text2term/resources/ontologies.csv") - # df = text2term.map_terms(["asthma", "disease location", "obsolete food allergy"], "EFO", min_score=.8, mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, term_type="classes") - # df = text2term.map_terms(["contains", "asthma"], "EFO", term_type="classes") - df = text2term.map_terms({"asthma":"disease", "allergy":["ignore", "response"], "assdhfbswif":["sent"], "isdjfnsdfwd":None}, "EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) - # taggedterms = text2term.preprocess_tagged_terms("test/simple_preprocess.txt") - # df = text2term.map_terms(taggedterms, "EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) - print(df.to_string()) +from term import OntologyTermType + +pd.set_option('display.max_columns', None) + + +def run_tests(): + efo_url = "https://github.com/EBISPOT/efo/releases/download/v3.57.0/efo.owl" + pizza = "https://protege.stanford.edu/ontologies/pizza/pizza.owl" + ncit = "http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl" + hpo = "http://purl.obolibrary.org/obo/hp/releases/2022-06-11/hp.owl" + ecto = "http://purl.obolibrary.org/obo/ecto/releases/2022-12-12/ecto.owl" + + # ONTOLOGY CACHING + # Test caching an ontology loaded from a URL + print("Test caching an ontology loaded from a URL...") + efo_cache = text2term.cache_ontology(ontology_url=efo_url, ontology_acronym="EFO") + print(f"Cache exists: {efo_cache.cache_exists()}\n") + + # Test caching an ontology by resolving its acronym using bioregistry + print("Test caching an ontology by resolving its acronym using bioregistry...") + clo_cache = text2term.cache_ontology(ontology_url="CLO", ontology_acronym="CLO") + print(f"Cache exists: {clo_cache.cache_exists()}\n") + print() + + # Test caching the set of ontologies specified in resources/ontologies.csv + caches = text2term.cache_ontology_set(os.path.join("..", "text2term", "resources", "ontologies.csv")) + + # MAPPING TO A (CACHED) ONTOLOGY + # Test mapping a list of terms to cached EFO ontology + print("Test mapping a list of terms to cached EFO ontology...") + mappings_efo_cache = efo_cache.map_terms(["asthma", "disease location", "food allergy"], + term_type=OntologyTermType.ANY) + print(f"{mappings_efo_cache}\n") + + # Test mapping a list of terms to EFO loaded from a URL + print("Test mapping a list of terms to EFO loaded from a URL...") + mappings_efo_url = text2term.map_terms(["asthma", "disease location", "food allergy"], target_ontology=efo_url, + term_type=OntologyTermType.ANY) + print(f"{mappings_efo_url}\n") + + # Test that mapping to cached ontology is the same as to ontology loaded from its URL + print("Test that mapping to cached ontology is the same as to ontology loaded from its URL...") + mappings_match = test_df_equals(drop_source_term_ids(mappings_efo_cache), + drop_source_term_ids(mappings_efo_url)) + print(f"...{mappings_match}") + + # Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric + print("Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric...") + df1 = text2term.map_terms(["asthma", "disease location", "food allergy"], "EFO", min_score=.8, + mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, + term_type=OntologyTermType.ANY) + print(f"{df1}\n") + + # Test mapping a list of terms to EFO by specifying the ontology acronym, which gets resolved by bioregistry + print("Test mapping a list of terms to EFO by specifying the 
ontology acronym, which gets resolved by bioregistry") + df2 = text2term.map_terms(["contains", "asthma"], "EFO", term_type=OntologyTermType.CLASS) + print(f"{df2}\n") + + # TAGGED TERMS + # Test mapping a dictionary of tagged terms to cached EFO, and include unmapped terms in the output + print("Test mapping a dictionary of tagged terms to cached EFO, and include unmapped terms in the output...") + df3 = text2term.map_terms( + {"asthma": "disease", "allergy": ["ignore", "response"], "protein level": ["measurement"], "isdjfnsdfwd": None}, + target_ontology="EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) + print(f"{df3}\n") + + # Test processing tagged terms where the tags are provided in a file + print("Test processing tagged terms where the tags are provided in a file...") + tagged_terms = text2term.preprocess_tagged_terms("simple_preprocess.txt") + df4 = text2term.map_terms(tagged_terms, target_ontology="EFO", use_cache=True, incl_unmapped=True) + print(f"{df4}\n") + + # MAPPING TO PROPERTIES + # Test mapping a list of properties to EFO loaded from a URL and restrict search to properties + print("Test mapping a list of properties to EFO loaded from a URL and restrict search to properties...") + df5 = text2term.map_terms(source_terms=["contains", "location"], target_ontology=efo_url, + term_type=OntologyTermType.PROPERTY) + print(f"{df5}\n") + + # Test mapping a list of properties to EFO loaded from cache and restrict search to properties + print("Test mapping a list of properties to EFO loaded from cache and restrict search to properties...") + df6 = text2term.map_terms(source_terms=["contains", "location"], target_ontology="EFO", use_cache=True, + term_type=OntologyTermType.PROPERTY) + print(f"{df6}\n") + + # Test that mapping to properties in cached ontology is the same as to ontology loaded from its URL + properties_df_match = test_df_equals(drop_source_term_ids(df5), drop_source_term_ids(df6)) + print(f"...{properties_df_match}") + + +def drop_source_term_ids(df): + return df.drop('Source Term ID', axis=1) + + +def test_df_equals(df, expected_df): + pd.testing.assert_frame_equal(df, expected_df, check_names=False, check_like=True) + return True + if __name__ == '__main__': - main() \ No newline at end of file + run_tests() From 8af2a07a8ac38fab51b7f6a69c4b8add9bd529ca Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 10 Oct 2023 15:38:18 -0400 Subject: [PATCH 114/185] Rename to simple_tests.py --- test/{simple-test.py => simple_tests.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/{simple-test.py => simple_tests.py} (100%) diff --git a/test/simple-test.py b/test/simple_tests.py similarity index 100% rename from test/simple-test.py rename to test/simple_tests.py From aaf253f043de2822b3d22fea80654957e8e0c411 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 10 Oct 2023 16:05:47 -0400 Subject: [PATCH 115/185] Move some logging from individual mappers to t2t --- text2term/bioportal_mapper.py | 3 --- text2term/syntactic_mapper.py | 5 ----- text2term/t2t.py | 4 ++++ text2term/tfidf_mapper.py | 7 ------- text2term/zooma_mapper.py | 4 ---- 5 files changed, 4 insertions(+), 19 deletions(-) diff --git a/text2term/bioportal_mapper.py b/text2term/bioportal_mapper.py index dedcb3e..2e08bf0 100644 --- a/text2term/bioportal_mapper.py +++ b/text2term/bioportal_mapper.py @@ -30,12 +30,9 @@ def map(self, source_terms, source_terms_ids, ontologies, max_mappings=3, api_pa :param max_mappings: The maximum number of (top scoring) ontology term 
mappings that should be returned :param api_params: Additional BioPortal Annotator-specific parameters to include in the request """ - self.logger.info("Mapping %i source terms against ontologies: %s...", len(source_terms), ontologies) - start = time.time() mappings = [] for term, term_id in zip(source_terms, source_terms_ids): mappings.extend(self._map_term(term, term_id, ontologies, max_mappings, api_params)) - self.logger.info('done (mapping time: %.2fs seconds)', time.time()-start) return TermMappingCollection(mappings).mappings_df() def _map_term(self, source_term, source_term_id, ontologies, max_mappings, api_params): diff --git a/text2term/syntactic_mapper.py b/text2term/syntactic_mapper.py index a9ab4ff..5316303 100644 --- a/text2term/syntactic_mapper.py +++ b/text2term/syntactic_mapper.py @@ -1,7 +1,6 @@ """Provides SyntacticMapper class""" import logging -import time import nltk import rapidfuzz from tqdm import tqdm @@ -26,14 +25,10 @@ def map(self, source_terms, source_terms_ids, mapper=Mapper.JARO_WINKLER, max_ma :param mapper: Mapping method to be used for matching :param max_mappings: Maximum number of (top scoring) ontology term mappings that should be returned """ - self.logger.info("Mapping %i source terms...", len(source_terms)) - start = time.time() mappings = [] for term, term_id in tqdm(zip(source_terms, source_terms_ids), total=len(source_terms)): matches = self._map(term, term_id, mapper, max_mappings) mappings.extend(matches) - end = time.time() - self.logger.info('done (mapping time: %.2fs seconds)', end - start) return TermMappingCollection(mappings).mappings_df() def _map(self, source_term, source_term_id, mapper, max_matches=3): diff --git a/text2term/t2t.py b/text2term/t2t.py index 3c9d8d1..8892c24 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -3,6 +3,7 @@ import pickle import logging import datetime +import time import pandas as pd from text2term import onto_utils from text2term import onto_cache @@ -95,6 +96,7 @@ def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_ else: target_terms = _load_ontology(target_ontology, base_iris, excl_deprecated, use_cache, term_type) # Run the mapper + LOGGER.info(f"Mapping {len(source_terms)} source terms to {target_ontology}") mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score, tags, incl_unmapped) mappings_df["Mapping Score"] = mappings_df["Mapping Score"].astype(float).round(decimals=3) @@ -189,6 +191,7 @@ def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_typ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score, tags, incl_unmapped): to_map, tags = _process_tags(source_terms, tags) + start = time.time() if mapper == Mapper.TFIDF: term_mapper = TFIDFMapper(ontology_terms) mappings_df = term_mapper.map(to_map, source_term_ids, max_mappings=max_mappings, min_score=min_score) @@ -203,6 +206,7 @@ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappi mappings_df = term_mapper.map(to_map, source_term_ids, mapper, max_mappings=max_mappings) else: raise ValueError("Unsupported mapper: " + mapper) + LOGGER.info("...done (mapping time: %.2fs seconds)", time.time() - start) # Add tags, process, and filter df = _filter_mappings(mappings_df, min_score) diff --git a/text2term/tfidf_mapper.py b/text2term/tfidf_mapper.py index 2e5566f..c90c7f9 100644 --- a/text2term/tfidf_mapper.py +++ b/text2term/tfidf_mapper.py @@ -1,7 +1,6 @@ """Provides TFIDFMapper 
class""" import logging -import time import sparse_dot_topn as ct from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from text2term import onto_utils @@ -27,16 +26,10 @@ def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3): :param min_score: The lower-bound threshold for keeping a candidate term mapping, between 0-1. Default set to 0, so consider all candidates """ - self.logger.info("Mapping %i source terms...", len(source_terms)) - self.logger.info("...against %i ontology terms (%i labels/synonyms)", len(self.target_ontology_terms), - len(self.target_labels)) - start = time.time() source_terms_norm = onto_utils.normalize_list(source_terms) vectorizer = self._tokenize(source_terms_norm, self.target_labels) results_mtx = self._sparse_dot_top(vectorizer, source_terms_norm, self.target_labels, min_score) results_df = self._get_mappings(results_mtx, max_mappings, source_terms, source_terms_ids, self.target_terms) - end = time.time() - self.logger.info("...done (mapping time: %.2fs seconds)", end-start) return results_df def _tokenize(self, source_terms, target_labels, analyzer='char_wb', n=3): diff --git a/text2term/zooma_mapper.py b/text2term/zooma_mapper.py index 26df493..8f72377 100644 --- a/text2term/zooma_mapper.py +++ b/text2term/zooma_mapper.py @@ -2,7 +2,6 @@ import json import logging -import time import requests from text2term import onto_utils from text2term.term_mapping import TermMappingCollection, TermMapping @@ -23,12 +22,9 @@ def map(self, source_terms, source_terms_ids, ontologies, max_mappings=3, api_pa :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned :param api_params: Additional Zooma API-specific parameters to include in the request """ - self.logger.info("Mapping %i source terms against ontologies: %s...", len(source_terms), ontologies) - start = time.time() mappings = [] for term, term_id in zip(source_terms, source_terms_ids): mappings.extend(self._map_term(term, term_id, ontologies, max_mappings, api_params)) - self.logger.info('done (mapping time: %.2fs seconds)', time.time()-start) return TermMappingCollection(mappings).mappings_df() def _map_term(self, source_term, source_term_id, ontologies, max_mappings, api_params): From 5849bf5dd483f858337b6e22f2825773adbce7ef Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 10 Oct 2023 16:06:41 -0400 Subject: [PATCH 116/185] Add tests for Zooma and Bioportal mappers --- test/simple_tests.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/simple_tests.py b/test/simple_tests.py index 2e6a6f3..f745c2d 100644 --- a/test/simple_tests.py +++ b/test/simple_tests.py @@ -2,6 +2,7 @@ import pandas as pd import text2term from term import OntologyTermType +from mapper import Mapper pd.set_option('display.max_columns', None) @@ -90,6 +91,18 @@ def run_tests(): properties_df_match = test_df_equals(drop_source_term_ids(df5), drop_source_term_ids(df6)) print(f"...{properties_df_match}") + # Test mapping a list of terms to multiple ontologies using the Zooma mapper + print("Test mapping a list of terms to multiple ontologies using the Zooma mapper...") + df_zooma = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", + mapper=Mapper.ZOOMA, term_type=OntologyTermType.ANY) + print(f"{df_zooma}\n") + + # Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper + print("Test mapping a list of terms to multiple ontologies using the BioPortal 
Annotator mapper...") + df_bioportal = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", + mapper=Mapper.BIOPORTAL, term_type=OntologyTermType.ANY) + print(f"{df_bioportal}\n") + def drop_source_term_ids(df): return df.drop('Source Term ID', axis=1) From 04704de1ebe96379b470dbc1ebbed70d81542696 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 10 Oct 2023 16:12:16 -0400 Subject: [PATCH 117/185] Fix cache folder misnaming introduced after refactoring --- text2term/onto_cache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/text2term/onto_cache.py b/text2term/onto_cache.py index a2ccc61..614f912 100644 --- a/text2term/onto_cache.py +++ b/text2term/onto_cache.py @@ -36,11 +36,12 @@ def cache_exists(ontology_acronym=''): # Clears the cache def clear_cache(ontology_acronym=''): + cache_dir = CACHE_FOLDER if ontology_acronym != '': cache_dir = os.path.join(CACHE_FOLDER, ontology_acronym) # Is equivalent to: rm -r cache_dir try: - rmtree(CACHE_FOLDER) + rmtree(cache_dir) sys.stderr.write("Cache has been cleared successfully\n") except OSError as error: sys.stderr.write("Cache cannot be removed:") From db84bcd29e7eb6c474a4dc83f1258be1fd2044c5 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 10 Oct 2023 16:12:42 -0400 Subject: [PATCH 118/185] Rename to test_pypi.py to be consistent --- test/test-t2t.py | 17 +++++++++++++++++ test/test_pypi.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 test/test-t2t.py create mode 100644 test/test_pypi.py diff --git a/test/test-t2t.py b/test/test-t2t.py new file mode 100644 index 0000000..0470d94 --- /dev/null +++ b/test/test-t2t.py @@ -0,0 +1,17 @@ +import text2term +from text2term import Mapper + +text2term.map_file(input_file="/Users/rsgoncalves/Documents/Harvard/gwaslake/all-traits-list-July23.csv", + target_ontology="EFO", use_cache=True, + csv_columns=("trait", "trait_id"), + max_mappings=1, + min_score=0.6, + excl_deprecated=True, + separator=",", + save_mappings=True, + mapper=Mapper.TFIDF, + output_file="/Users/rsgoncalves/Documents/Workspace/text2term/test/output/opengwas-mappings.csv", + base_iris=("http://www.ebi.ac.uk/efo/", "http://purl.obolibrary.org/obo/MONDO", + "http://purl.obolibrary.org/obo/HP"), + save_graphs=False + ) diff --git a/test/test_pypi.py b/test/test_pypi.py new file mode 100644 index 0000000..6d04fe2 --- /dev/null +++ b/test/test_pypi.py @@ -0,0 +1,46 @@ +import os +import sys +import text2term +from text2term.term import OntologyTermType +from contextlib import contextmanager + + +def main(): + try: + with suppress_stdout(): + # Simple set up and testing + text2term.map_terms(["fever", "headache"], + "https://github.com/EBISPOT/efo/releases/download/current/efo.owl") + text2term.cache_ontology("https://github.com/EBISPOT/efo/releases/download/current/efo.owl", "EFO") + text2term.map_terms(["fever", "headache"], "EFO", use_cache=True) + text2term.map_terms(["fever", "headache"], "EFO", base_iris=("http://www.ebi.ac.uk/efo",), + mapper=text2term.mapper.Mapper.LEVENSHTEIN, max_mappings=4, use_cache=True) + + # Properties and classes tests + text2term.map_terms(["fever", "headache"], "EFO", term_type=OntologyTermType.CLASS, use_cache=True) + text2term.map_terms(["contains", "location"], "EFO", term_type=OntologyTermType.PROPERTY, use_cache=True) + text2term.map_terms(["fever", "contains"], "EFO", term_type=OntologyTermType.ANY, use_cache=True) + + # Clear cache and set down + 
text2term.clear_cache("EFO") + except: + print("ERROR") + + +# From https://stackoverflow.com/questions/2125702/how-to-suppress-console-output-in-python +@contextmanager +def suppress_stdout(): + with open(os.devnull, "w") as devnull: + old_stdout = sys.stdout + old_stderr = sys.stderr + sys.stdout = devnull + sys.stderr = devnull + try: + yield + finally: + sys.stdout = old_stdout + sys.stderr = old_stderr + + +if __name__ == '__main__': + main() From dcec08e281dbecf8b421f851d5617e2bf05ea3b1 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 10 Oct 2023 17:58:13 -0400 Subject: [PATCH 119/185] Revert adding test with local file system references This partly reverts commit db84bcd29e7eb6c474a4dc83f1258be1fd2044c5. --- test/test-t2t.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 test/test-t2t.py diff --git a/test/test-t2t.py b/test/test-t2t.py deleted file mode 100644 index 0470d94..0000000 --- a/test/test-t2t.py +++ /dev/null @@ -1,17 +0,0 @@ -import text2term -from text2term import Mapper - -text2term.map_file(input_file="/Users/rsgoncalves/Documents/Harvard/gwaslake/all-traits-list-July23.csv", - target_ontology="EFO", use_cache=True, - csv_columns=("trait", "trait_id"), - max_mappings=1, - min_score=0.6, - excl_deprecated=True, - separator=",", - save_mappings=True, - mapper=Mapper.TFIDF, - output_file="/Users/rsgoncalves/Documents/Workspace/text2term/test/output/opengwas-mappings.csv", - base_iris=("http://www.ebi.ac.uk/efo/", "http://purl.obolibrary.org/obo/MONDO", - "http://purl.obolibrary.org/obo/HP"), - save_graphs=False - ) From 4475e2617bb1f057e64d56411f5e363205247e12 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 10 Oct 2023 17:59:50 -0400 Subject: [PATCH 120/185] Delete test-pypi.py --- test/test-pypi.py | 39 --------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 test/test-pypi.py diff --git a/test/test-pypi.py b/test/test-pypi.py deleted file mode 100644 index 54e2390..0000000 --- a/test/test-pypi.py +++ /dev/null @@ -1,39 +0,0 @@ -from contextlib import contextmanager -import sys, os -import text2term - -def main(): - try: - with suppress_stdout(): - # Simple set up and testing - text2term.map_terms(["fever", "headache"], "https://github.com/EBISPOT/efo/releases/download/current/efo.owl") - text2term.cache_ontology("https://github.com/EBISPOT/efo/releases/download/current/efo.owl", "EFO") - text2term.map_terms(["fever", "headache"], "EFO", use_cache=True) - text2term.map_terms(["fever", "headache"], "EFO", base_iris=("http://www.ebi.ac.uk/efo",), mapper=text2term.mapper.Mapper.LEVENSHTEIN, max_mappings=4, use_cache=True) - - # Properties and classes tests - text2term.map_terms(["fever", "headache"], "EFO", term_type="classes", use_cache=True) - text2term.map_terms(["contains", "location"], "EFO", term_type="properties", use_cache=True) - text2term.map_terms(["fever", "contains"], "EFO", term_type="both", use_cache=True) - - # Clear cache and set down - text2term.clear_cache("EFO") - except: - print("ERROR") - -# From https://stackoverflow.com/questions/2125702/how-to-suppress-console-output-in-python -@contextmanager -def suppress_stdout(): - with open(os.devnull, "w") as devnull: - old_stdout = sys.stdout - old_stderr = sys.stderr - sys.stdout = devnull - sys.stderr = devnull - try: - yield - finally: - sys.stdout = old_stdout - sys.stderr = old_stderr - -if __name__ == '__main__': - main() \ No newline at end of file From 1f6bfa8fd1e525add03434087f1a597f8a2aa293 Mon Sep 17 00:00:00 2001 
From: Rafael Goncalves Date: Tue, 10 Oct 2023 19:15:09 -0400 Subject: [PATCH 121/185] Modify TermCollector to be able to reuse previously loaded ontology --- text2term/t2t.py | 5 +++-- text2term/term_collector.py | 41 ++++++++++++++++++++++--------------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/text2term/t2t.py b/text2term/t2t.py index 8892c24..890088c 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -174,15 +174,16 @@ def _load_data(input_file_path, csv_column_names, separator): def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_type=OntologyTermType.CLASS): - term_collector = OntologyTermCollector() + term_collector = OntologyTermCollector(ontology_iri=ontology) if use_cache: pickle_file = os.path.join("cache", ontology, ontology + "-term-details.pickle") LOGGER.info(f"Loading cached ontology from: {pickle_file}") onto_terms_unfiltered = pickle.load(open(pickle_file, "rb")) onto_terms = term_collector.filter_terms(onto_terms_unfiltered, iris, exclude_deprecated, term_type) else: - onto_terms = term_collector.get_ontology_terms(ontology, base_iris=iris, exclude_deprecated=exclude_deprecated, + onto_terms = term_collector.get_ontology_terms(base_iris=iris, exclude_deprecated=exclude_deprecated, term_type=term_type) + term_collector.close() LOGGER.info(f"Filtered ontology terms to those of type: {term_type}") if len(onto_terms) == 0: raise RuntimeError("Could not find any terms in the given ontology.") diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 85a5d61..31f9a23 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -9,23 +9,25 @@ class OntologyTermCollector: - def __init__(self, log_level=logging.INFO): + def __init__(self, ontology_iri, use_reasoning=False, log_level=logging.INFO): + """ + Construct an ontology term collector for the ontology at the given IRI + :param ontology_iri: IRI of the ontology (e.g., path of ontology document in the local file system, URL) + :param use_reasoning: Use a reasoner to compute inferred class hierarchy + """ self.logger = onto_utils.get_logger(__name__, level=log_level) + self.ontology = self._load_ontology(ontology_iri) + if use_reasoning: + self._classify_ontology(self.ontology) - def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, exclude_deprecated=False, - term_type=OntologyTermType.ANY): + def get_ontology_terms(self, base_iris=(), exclude_deprecated=False, term_type=OntologyTermType.ANY): """ Collect the terms described in the ontology at the specified IRI - :param ontology_iri: IRI of the ontology (e.g., path of ontology document in the local file system, URL) :param base_iris: Limit ontology term collection to terms whose IRIs start with any IRI given in this tuple - :param use_reasoning: Use a reasoner to compute inferred class hierarchy :param exclude_deprecated: Exclude ontology terms stated as deprecated using owl:deprecated 'true' :param term_type: Type of term--can be 'class' or 'property' or 'any' (individuals may be added in the future) :return: Dictionary of ontology term IRIs and their respective details in the specified ontology """ - ontology = self._load_ontology(ontology_iri) - if use_reasoning: - self._classify_ontology(ontology) self.logger.info("Collecting ontology term details...") start = time.time() ontology_terms = dict() @@ -35,20 +37,14 @@ def get_ontology_terms(self, ontology_iri, base_iris=(), use_reasoning=False, ex query = iri + "*" self.logger.info("...collecting terms with IRIs 
starting in: " + iri) iris = list(default_world.search(iri=query)) - ontology_terms = ontology_terms | self._get_ontology_terms(iris, ontology, exclude_deprecated, + ontology_terms = ontology_terms | self._get_ontology_terms(iris, self.ontology, exclude_deprecated, term_type) else: - ontology_signature = self._get_ontology_signature(ontology) - ontology_terms = self._get_ontology_terms(ontology_signature, ontology, exclude_deprecated, term_type) + ontology_signature = self._get_ontology_signature(self.ontology) + ontology_terms = self._get_ontology_terms(ontology_signature, self.ontology, exclude_deprecated, term_type) end = time.time() self.logger.info("...done: collected %i ontology terms (collection time: %.2fs)", len(ontology_terms), end - start) - # when multiple ontologies are loaded with owlready2, and they reference the same ontology term (IRI), a lookup - # for that IRI returns the term from the first ontology loaded —> need to unload previously loaded ontologies - try: - ontology.destroy() - except Exception as err: - self.logger.debug("Unable to destroy ontology: ", err) return ontology_terms def filter_terms(self, onto_terms, iris=(), excl_deprecated=False, term_type=OntologyTermType.ANY): @@ -92,6 +88,9 @@ def _get_ontology_terms(self, term_list, ontology, exclude_deprecated, term_type owl_term_type = OntologyTermType.CLASS elif self._filter_term_type(ontology_term, OntologyTermType.PROPERTY, False): owl_term_type = OntologyTermType.PROPERTY + else: + owl_term_type = "undetermined" + self.logger.info("Term has undetermined type %s %s", iri, labels) term_details = OntologyTerm(iri, labels, definitions=definitions, synonyms=synonyms, parents=named_parents, children=children, instances=instances, restrictions=complex_parents, deprecated=is_deprecated, @@ -388,6 +387,14 @@ def _classify_ontology(self, ontology): end = time.time() self.logger.info("...done (reasoning time: %.2fs)", end - start) + def close(self): + # when multiple ontologies are loaded with owlready2, and they reference the same ontology term (IRI), a lookup + # for that IRI returns the term from the first ontology loaded —> need to unload previously loaded ontologies + try: + self.ontology.destroy() + except Exception as err: + self.logger.debug("Unable to destroy ontology: ", err) + def _log_ontology_metrics(self, ontology): self.logger.debug(" Ontology IRI: %s", ontology.base_iri) self.logger.debug(" Class count: %i", len(list(ontology.classes()))) From 2a4ccc5ede65d66994ed66746825028523856e7e Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 10 Oct 2023 19:27:06 -0400 Subject: [PATCH 122/185] Update setuptools dependency version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0617121..98714ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ numpy~=1.24.2 gensim~=4.3.0 scipy~=1.10.1 scikit-learn~=1.2.1 -setuptools~=67.6.0 +setuptools~=68.2.2 requests~=2.31.0 tqdm~=4.66.1 sparse_dot_topn~=0.3.4 From 9e3a39658899565d1690da571d60a731cc541d3d Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 11 Oct 2023 11:06:21 -0400 Subject: [PATCH 123/185] Add TermCollector tests and assert statements to some tests --- test/simple_tests.py | 86 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 75 insertions(+), 11 deletions(-) diff --git a/test/simple_tests.py b/test/simple_tests.py index f745c2d..90a798a 100644 --- a/test/simple_tests.py +++ b/test/simple_tests.py @@ -3,12 +3,17 @@ import text2term 
from term import OntologyTermType from mapper import Mapper +from text2term import OntologyTermCollector pd.set_option('display.max_columns', None) +EFO_URL = "https://github.com/EBISPOT/efo/releases/download/v3.57.0/efo.owl" + +MAPPED_TERM_CURIE_COLUMN = "Mapped Term CURIE" +TAGS_COLUMN = "Tags" + def run_tests(): - efo_url = "https://github.com/EBISPOT/efo/releases/download/v3.57.0/efo.owl" pizza = "https://protege.stanford.edu/ontologies/pizza/pizza.owl" ncit = "http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl" hpo = "http://purl.obolibrary.org/obo/hp/releases/2022-06-11/hp.owl" @@ -17,14 +22,13 @@ def run_tests(): # ONTOLOGY CACHING # Test caching an ontology loaded from a URL print("Test caching an ontology loaded from a URL...") - efo_cache = text2term.cache_ontology(ontology_url=efo_url, ontology_acronym="EFO") + efo_cache = text2term.cache_ontology(ontology_url=EFO_URL, ontology_acronym="EFO") print(f"Cache exists: {efo_cache.cache_exists()}\n") # Test caching an ontology by resolving its acronym using bioregistry print("Test caching an ontology by resolving its acronym using bioregistry...") clo_cache = text2term.cache_ontology(ontology_url="CLO", ontology_acronym="CLO") print(f"Cache exists: {clo_cache.cache_exists()}\n") - print() # Test caching the set of ontologies specified in resources/ontologies.csv caches = text2term.cache_ontology_set(os.path.join("..", "text2term", "resources", "ontologies.csv")) @@ -38,14 +42,14 @@ def run_tests(): # Test mapping a list of terms to EFO loaded from a URL print("Test mapping a list of terms to EFO loaded from a URL...") - mappings_efo_url = text2term.map_terms(["asthma", "disease location", "food allergy"], target_ontology=efo_url, + mappings_efo_url = text2term.map_terms(["asthma", "disease location", "food allergy"], target_ontology=EFO_URL, term_type=OntologyTermType.ANY) print(f"{mappings_efo_url}\n") # Test that mapping to cached ontology is the same as to ontology loaded from its URL print("Test that mapping to cached ontology is the same as to ontology loaded from its URL...") - mappings_match = test_df_equals(drop_source_term_ids(mappings_efo_cache), - drop_source_term_ids(mappings_efo_url)) + mappings_match = check_df_equals(drop_source_term_ids(mappings_efo_cache), + drop_source_term_ids(mappings_efo_url)) print(f"...{mappings_match}") # Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric @@ -60,55 +64,115 @@ def run_tests(): df2 = text2term.map_terms(["contains", "asthma"], "EFO", term_type=OntologyTermType.CLASS) print(f"{df2}\n") - # TAGGED TERMS + +def test_mapping_tagged_terms(): # Test mapping a dictionary of tagged terms to cached EFO, and include unmapped terms in the output print("Test mapping a dictionary of tagged terms to cached EFO, and include unmapped terms in the output...") df3 = text2term.map_terms( {"asthma": "disease", "allergy": ["ignore", "response"], "protein level": ["measurement"], "isdjfnsdfwd": None}, target_ontology="EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) print(f"{df3}\n") + assert df3.size > 0 + assert df3[TAGS_COLUMN].str.contains("disease").any() + assert df3[TAGS_COLUMN].str.contains("measurement").any() + +def test_preprocessing_from_file(): # Test processing tagged terms where the tags are provided in a file print("Test processing tagged terms where the tags are provided in a file...") tagged_terms = text2term.preprocess_tagged_terms("simple_preprocess.txt") df4 = text2term.map_terms(tagged_terms, 
target_ontology="EFO", use_cache=True, incl_unmapped=True) print(f"{df4}\n") + assert df4.size > 0 + assert df4[TAGS_COLUMN].str.contains("disease").any() + assert df4[TAGS_COLUMN].str.contains("important").any() + - # MAPPING TO PROPERTIES +def test_mapping_to_properties(): # Test mapping a list of properties to EFO loaded from a URL and restrict search to properties print("Test mapping a list of properties to EFO loaded from a URL and restrict search to properties...") - df5 = text2term.map_terms(source_terms=["contains", "location"], target_ontology=efo_url, + df5 = text2term.map_terms(source_terms=["contains", "location"], target_ontology=EFO_URL, term_type=OntologyTermType.PROPERTY) print(f"{df5}\n") + assert df5.size > 0 # Test mapping a list of properties to EFO loaded from cache and restrict search to properties print("Test mapping a list of properties to EFO loaded from cache and restrict search to properties...") + if not text2term.cache_exists("EFO"): + text2term.cache_ontology(ontology_url=EFO_URL, ontology_acronym="EFO") df6 = text2term.map_terms(source_terms=["contains", "location"], target_ontology="EFO", use_cache=True, term_type=OntologyTermType.PROPERTY) print(f"{df6}\n") + assert df6.size > 0 # Test that mapping to properties in cached ontology is the same as to ontology loaded from its URL - properties_df_match = test_df_equals(drop_source_term_ids(df5), drop_source_term_ids(df6)) + properties_df_match = check_df_equals(drop_source_term_ids(df5), drop_source_term_ids(df6)) print(f"...{properties_df_match}") + +def test_mapping_zooma_ontologies(): # Test mapping a list of terms to multiple ontologies using the Zooma mapper print("Test mapping a list of terms to multiple ontologies using the Zooma mapper...") df_zooma = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", mapper=Mapper.ZOOMA, term_type=OntologyTermType.ANY) print(f"{df_zooma}\n") + assert df_zooma.size > 0 + assert df_zooma[MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() # returns true if any of the values contains EFO + assert df_zooma[MAPPED_TERM_CURIE_COLUMN].str.contains("NCIT:").any() + +def test_mapping_bioportal_ontologies(): # Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper print("Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper...") df_bioportal = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", mapper=Mapper.BIOPORTAL, term_type=OntologyTermType.ANY) print(f"{df_bioportal}\n") + assert df_bioportal.size > 0 + assert df_bioportal[MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() + assert df_bioportal[MAPPED_TERM_CURIE_COLUMN].str.contains("NCIT:").any() + + +# TEST ONTOLOGY TERM COLLECTOR +EFO_TERM_COLLECTOR = OntologyTermCollector(ontology_iri=EFO_URL) + + +def test_term_collector(): + expected_nr_efo_terms = 50867 + terms = EFO_TERM_COLLECTOR.get_ontology_terms() + assert len(terms) == expected_nr_efo_terms + + +def test_term_collector_classes_only(): + expected_nr_efo_classes = 50643 + terms = EFO_TERM_COLLECTOR.get_ontology_terms(term_type=OntologyTermType.CLASS) + assert len(terms) == expected_nr_efo_classes + + +def test_term_collector_properties_only(): + expected_nr_efo_properties = 224 + terms = EFO_TERM_COLLECTOR.get_ontology_terms(term_type=OntologyTermType.PROPERTY) + assert len(terms) == expected_nr_efo_properties + + +def test_term_collector_iri_limit(): + iri = "http://www.ebi.ac.uk/efo/" + expected_nr_terms_with_efo_iri 
= 17383 + terms = EFO_TERM_COLLECTOR.get_ontology_terms(base_iris=[iri], term_type=OntologyTermType.ANY) + assert len(terms) == expected_nr_terms_with_efo_iri + + +def test_term_collector_iri_limit_properties_only(): + iri = "http://www.ebi.ac.uk/efo/" + expected_nr_properties_with_efo_iri = 29 + terms = EFO_TERM_COLLECTOR.get_ontology_terms(base_iris=[iri], term_type=OntologyTermType.PROPERTY) + assert len(terms) == expected_nr_properties_with_efo_iri def drop_source_term_ids(df): return df.drop('Source Term ID', axis=1) -def test_df_equals(df, expected_df): +def check_df_equals(df, expected_df): pd.testing.assert_frame_equal(df, expected_df, check_names=False, check_like=True) return True From d1718eacedfb518770f910b307fd0821485a58fc Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 11 Oct 2023 11:07:22 -0400 Subject: [PATCH 124/185] Adds Sphinx Docs Creates a rudimentary Sphinx document with documentation pulled from the READMEs of both text2term and the UI (which is copied) --- README-UI.md | 90 +++++++++++++++++++++++++++++++ docs/source/_static/ccb_logo.jpg | Bin 0 -> 60131 bytes docs/{ => source}/conf.py | 3 +- docs/{ => source}/index.rst | 16 ++++-- 4 files changed, 104 insertions(+), 5 deletions(-) create mode 100644 README-UI.md create mode 100644 docs/source/_static/ccb_logo.jpg rename docs/{ => source}/conf.py (95%) rename docs/{ => source}/index.rst (61%) diff --git a/README-UI.md b/README-UI.md new file mode 100644 index 0000000..a096884 --- /dev/null +++ b/README-UI.md @@ -0,0 +1,90 @@ +# ontology-mapper-ui +The following information pertains to the text2term UI, which is written [here](https://github.com/ccb-hms/ontology-mapper-ui) and runs online [here](https://text2term.hms.harvard.edu/). It supports fewer features than the base package does, but provides a user interface for non-programmers. + +### Running Locally via Node + Python + +##### Requirements + +- Node >= 16.0.0 +- npm >= 8.0.0 +- Python >= 3.9.0 +- pip >= 21.0.0 +- text2term >= 1.1.0 + +**\*** These are the versions I have that work; while I know Python 3.9 or higher is necessary, the others may not strictly require the listed versions. + +**\*\*** If you are running this locally on Google Chrome, you will likely run into issues with CORS (Cross-Origin Requests) that I have been unable to completely resolve. I would recommend using a different browser, using the Docker method, or finding some way to disable CORS on Chrome while running this. + +#### Instructions + +##### Initial Setup + +When first cloned, run the command: + + +``` +npm install +``` + +to install all necessary packages for the React frontend. + +Next, go into the `flask-api` folder (perhaps by running `cd flask-api`) and run + +``` +pip install -r requirements-flask.txt +``` + +to install necessary packages for the Flask api. + +##### Running + +To run, make sure you are in the root of the repository and run, in two separate command line instances, the command + +``` +npm start +``` + +to start the front-end, which can be seen at `localhost:3000`, and the command + +``` +npm run flask-api +``` + +to start the back-end, which can be interacted with at `localhost:5000`. 
+ +### Running Locally via Docker + +#### Requirements + +- Docker + +#### Instructions + +##### Initial Setup + +Before running, make sure you have the latest version of the repository built by running the command + +``` +docker-compose build +``` + +Docker should build two images: + +- `ontology-mapper-api`: the Flask backend API +- `ontology-mapper-client`: the React frontend + +##### Running + +To run the website, run the command: + +``` +docker-compose up +``` + +Docker should build two containers corresponding to the two images. + +In a browser, navigate to `localhost:8602` to see the front-end. + +### Acknowledgements + +Initial setup of React and Flask and Dockerization aided by an [article series](https://blog.miguelgrinberg.com/post/how-to-dockerize-a-react-flask-project) by Miguel Grinberg. \ No newline at end of file diff --git a/docs/source/_static/ccb_logo.jpg b/docs/source/_static/ccb_logo.jpg new file mode 100644 index 0000000000000000000000000000000000000000..422182be10e0e390a585ee185ff818e24ebe1b50 GIT binary patch literal 60131 zcmdSAcUV(jw=NorA|e7J(uAmh2uKI%#0H26h=@py3P_2F^cE7ON{xUNX;A^`M0zK5 zq#5Z#XrXr!N(iKI^83y{-#Pc*{p{!7``;cbbFJ|_nQLZ_wPqRP9dAzlp3DI_AL{Ds z0vH$ofLrtfaDoHG==eFm1^^5V0g?a!;0)jt!vz2nU1Ojh00sfTsekDJz(WSXf9p>e zZv00bMgSnf8NmD>bu8)Ee}vxhKX?DfmFX+P|8DUs4r^~-|v45$YBSJ9~2tYaW+I!a2P10T1Neh7M3_}v* zA0~!t07fnbCN74P4giqei&G5$l7HPsKQS;eF`qika)$LR8@)mmCxDTGiHVV!>C~xz zeug24{yl)1>(qs-@^?;io4jPX=6+G(L;SZhB6rJLd7cd7MHOFp1hbyK#LLGoAa-5+ zhJ>V&@+}orHT8Sjy~ArqCubK=FK-`TKmUM`(6I1`kC9Oc ziJy~_Q&PXAeb3Ix{gIbnP*`43Syf$ATUX!K-qG3B{i~;UXn16FZ2a%UByN6TacOyF zb#0xnySINpJR}_*|6>>ZdHzqc=;!}w+5gQhF1lTe%*;&8EdSWW!07vr;atq8uF9Xj zaL0t@r91aEg%4*g-i`lO*2*fP_yo`M%46{CB~c}u7~vn&{$<(!JHvwie_8f#!~VBj zSin61X6BE;^)2HZi`poHnCof6EEI5e9l2%*@Oz z^baQ+D;wwkXE~Xs4@>EjNx(TK2Kr!P;sSsGRLb|L8-V|2h3Vwaf75;s{x|LS(0|i@ z5C5nAW@IZWf(Tlm+{B){ZVl!n=&3%v&;MFmFQH{E5f3>Fsv!={ot;pc#5by&(4&pEB z57^~OgRg$%$Mx0GA6yIMMqYORkhxcmIDGzWsBc!+f8Vc^gY>E92YL~1Hex<++_poK zx>dl|ZbwLLC18LIs7go4_UV!z;smVtAM>@HHqfNwTwWL1%PnH&dQC&4h_r+Jt}N#< zpM5*bf~UunLvk}Bu5POHm`9|hbMCN)J2oq|9LSLO;F8tc6jmeee!En!1mzFd!pbDC^P3ae@nlMmPabE=J5Z*AHPyOb^QO zon<(O)S&7OVf3xPpHBcSf*aOV?w@fDTF~n{ktM};4%4eJths!VAGS4(L8^3P5|*Y1 zNzeaw0{HXSe9wIT1Q3Vhhx&kN7cc@YDTF%FPGU;b`Jijj_I@XT_v7)smd~6;#U|>L z7m*#~7x&DFuY!bu!C0D@aUF1Y0JFt_zcmmwMLGf8yQ_Kv_!P==q-9f1!r>iFG~P6N zf9o~Att<$op}>9qOle`CeR)EDi#GSH>3xs@9r@7~;g-ifs9YC3HCw+am?%Pd_n=&M zIVb85Y7t>P>TM%1XXA8O@z*Htnt{J9FUyDmCu9|U3N`_+M7+=BH6|o=6}(9WI*yvT zefog)LN+l`$9ZxTQ>pnG2Ndlh$Y3F|u;vIk6w7mT6+cWn8)2u>X-DvED|v0iWV`mr z0kQt)f$s;}hB7P;szjQ=*UTS$Y$gTGOWBw+$)0V-emnupM*dvbMd`*VWEzJd#Zwpg z8fB^qDjT+&_X9F7n%IcPT{O`F{O83CC3|z9Y~kj%5;r-+Gc8*F^S+MIL^^=O^Y$et z)CZUxyj2(OhzY-7oV%KmK*E?ui}Vb&IGJ@j*1i%ECK~`i%~LQBKvfPW5+4Bag;9jFLfAy42( zP19WGIQ8hF)vKVyTGZkRpbLB%ra-wHkL;RJ8B_L#l@0jc2z4B~D8&cVLEUuAj8^II zF3?O{ivv$*N!#88ze3JnHn8{U@6@fMei|K4(O3u)d^-9zl5=u$_MXxZ55I;9U*dbk z&VVGiIH&^f8ER9_X`TBy&v4XC++Xu^HU**nLn!f{u_Joe`C!xKQm z);NvfSO<7k!Z`WwXb}Xr6$F#12K}hhG{vQ6eory4(7St2qBe$8@9`aF<-Ctdf=6Rx z-A}6PzLek9Q_$d4Q$6e12^UMFFB_Mq{dx~|Bc*djt~WR(a|N=-=e%yQc3VueoaKNC z`Blj%6RYYg!p(+cjMHRs#&KI>&&@mzyL17vxm|NyIU3tC~q}{u5 z0&oM4QW!$ie?oD2?HN9>~>Nu(62z zYf3W&;by7+`X_+wMN~=AYW~xeP{--(FqO2q=8~wQh^8aa1sKui?2YeLs*F)LzW!H* z<7DJN-*HL~l1g(O-1ETqE3h`Y1OvfyMcKi{d4EGfz+x`5Kf#;nMIEnFa0i(vK6aNq ztDnbw(~ym0$?RLzug%`=uMj#xrn{)_S^PuUP=S6A41TX84H^E{=%JpP&E3DkEk&Ct zK;jS@2sOt8JJgLoLhpU^6M0fa*!8njTOM&3T&SIPX0O;xS`PQi_C7P~S9&1iYnMi= zDkAL7r9_?p95zrN(U-5{Q$h~Xg^Y%E$HCUgfuji;MfUNup--}u3P-icz-MNAmX*1u 
[... GIT binary patch data for docs/source/_static/ccb_logo.jpg (60131 bytes) omitted ...]
[GIT binary patch data for the ccb_logo.jpg logo image omitted]

diff --git a/docs/conf.py b/docs/source/conf.py
similarity index 95%
rename from docs/conf.py
rename to docs/source/conf.py
index ded1330..d745823 100644
--- a/docs/conf.py
+++ b/docs/source/conf.py
@@ -23,5 +23,6 @@
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 
-html_theme = 'alabaster'
+html_theme = 'pyramid'
 html_static_path = ['_static']
+html_logo = "ccb_logo.jpg"
diff --git a/docs/index.rst b/docs/source/index.rst
similarity index 61%
rename from docs/index.rst
rename to docs/source/index.rst
index 6456e30..46ed444 100644
--- a/docs/index.rst
+++ b/docs/source/index.rst
@@ -3,18 +3,26 @@
    You can adapt this file completely to your liking, but it should at least
    contain the root `toctree` directive.
 
-Welcome to text2term's documentation!
+.. .. image:: ccb_logo.jpg
+..    :alt: CCB's Logo
+..    :scale: 50 %
+..    :align: left
+
+Text2term
 =====================================
 
 .. toctree::
    :maxdepth: 2
-   :caption: Contents:
-
+.. include:: ../../README.md
+   :parser: myst_parser.sphinx_
+.. 
include:: ../../README-UI.md + :parser: myst_parser.sphinx_ Indices and tables ================== * :ref:`genindex` -* :ref:`modindex` * :ref:`search` + + From 58ea820a036552f8a8bca96cfa8d8dcf5a61b541 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 11 Oct 2023 11:13:33 -0400 Subject: [PATCH 125/185] Update .readthedocs.yaml Updates the readthedocs file to account for changes in document structure --- .readthedocs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index c409646..8d044c5 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -17,7 +17,7 @@ build: # Build documentation in the "docs/" directory with Sphinx sphinx: - configuration: docs/conf.py + configuration: docs/source/conf.py # Optionally build your docs in additional formats such as PDF and ePub # formats: From 1aecce4f062c814cf29a2aaba1097c5cd4026dcd Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 11 Oct 2023 14:24:43 -0400 Subject: [PATCH 126/185] Set up term collector instance before tests are run --- test/simple_tests.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/test/simple_tests.py b/test/simple_tests.py index 90a798a..642194d 100644 --- a/test/simple_tests.py +++ b/test/simple_tests.py @@ -8,6 +8,7 @@ pd.set_option('display.max_columns', None) EFO_URL = "https://github.com/EBISPOT/efo/releases/download/v3.57.0/efo.owl" +EFO_TERM_COLLECTOR = OntologyTermCollector(ontology_iri=EFO_URL) MAPPED_TERM_CURIE_COLUMN = "Mapped Term CURIE" TAGS_COLUMN = "Tags" @@ -117,7 +118,7 @@ def test_mapping_zooma_ontologies(): mapper=Mapper.ZOOMA, term_type=OntologyTermType.ANY) print(f"{df_zooma}\n") assert df_zooma.size > 0 - assert df_zooma[MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() # returns true if any of the values contains EFO + assert df_zooma[MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() assert df_zooma[MAPPED_TERM_CURIE_COLUMN].str.contains("NCIT:").any() @@ -132,10 +133,6 @@ def test_mapping_bioportal_ontologies(): assert df_bioportal[MAPPED_TERM_CURIE_COLUMN].str.contains("NCIT:").any() -# TEST ONTOLOGY TERM COLLECTOR -EFO_TERM_COLLECTOR = OntologyTermCollector(ontology_iri=EFO_URL) - - def test_term_collector(): expected_nr_efo_terms = 50867 terms = EFO_TERM_COLLECTOR.get_ontology_terms() From 83ea587de1cefaded18c410b1e28f4e5c6260351 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 12 Oct 2023 10:15:35 -0400 Subject: [PATCH 127/185] Turn simple_tests into a more modular test suite Each test should have a descriptive name and message, be executable on its own, and have indicators of success or failure via 'assert' statements --- test/simple_tests.py | 346 ++++++++++++++++++++++--------------------- 1 file changed, 181 insertions(+), 165 deletions(-) diff --git a/test/simple_tests.py b/test/simple_tests.py index 642194d..b9c7db4 100644 --- a/test/simple_tests.py +++ b/test/simple_tests.py @@ -1,4 +1,5 @@ import os +import unittest import pandas as pd import text2term from term import OntologyTermType @@ -7,172 +8,187 @@ pd.set_option('display.max_columns', None) -EFO_URL = "https://github.com/EBISPOT/efo/releases/download/v3.57.0/efo.owl" -EFO_TERM_COLLECTOR = OntologyTermCollector(ontology_iri=EFO_URL) - -MAPPED_TERM_CURIE_COLUMN = "Mapped Term CURIE" -TAGS_COLUMN = "Tags" - - -def run_tests(): - pizza = "https://protege.stanford.edu/ontologies/pizza/pizza.owl" - ncit = "http://purl.obolibrary.org/obo/ncit/releases/2022-08-19/ncit.owl" - hpo = 
"http://purl.obolibrary.org/obo/hp/releases/2022-06-11/hp.owl" - ecto = "http://purl.obolibrary.org/obo/ecto/releases/2022-12-12/ecto.owl" - - # ONTOLOGY CACHING - # Test caching an ontology loaded from a URL - print("Test caching an ontology loaded from a URL...") - efo_cache = text2term.cache_ontology(ontology_url=EFO_URL, ontology_acronym="EFO") - print(f"Cache exists: {efo_cache.cache_exists()}\n") - - # Test caching an ontology by resolving its acronym using bioregistry - print("Test caching an ontology by resolving its acronym using bioregistry...") - clo_cache = text2term.cache_ontology(ontology_url="CLO", ontology_acronym="CLO") - print(f"Cache exists: {clo_cache.cache_exists()}\n") - - # Test caching the set of ontologies specified in resources/ontologies.csv - caches = text2term.cache_ontology_set(os.path.join("..", "text2term", "resources", "ontologies.csv")) - - # MAPPING TO A (CACHED) ONTOLOGY - # Test mapping a list of terms to cached EFO ontology - print("Test mapping a list of terms to cached EFO ontology...") - mappings_efo_cache = efo_cache.map_terms(["asthma", "disease location", "food allergy"], - term_type=OntologyTermType.ANY) - print(f"{mappings_efo_cache}\n") - - # Test mapping a list of terms to EFO loaded from a URL - print("Test mapping a list of terms to EFO loaded from a URL...") - mappings_efo_url = text2term.map_terms(["asthma", "disease location", "food allergy"], target_ontology=EFO_URL, - term_type=OntologyTermType.ANY) - print(f"{mappings_efo_url}\n") - - # Test that mapping to cached ontology is the same as to ontology loaded from its URL - print("Test that mapping to cached ontology is the same as to ontology loaded from its URL...") - mappings_match = check_df_equals(drop_source_term_ids(mappings_efo_cache), - drop_source_term_ids(mappings_efo_url)) - print(f"...{mappings_match}") - - # Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric - print("Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric...") - df1 = text2term.map_terms(["asthma", "disease location", "food allergy"], "EFO", min_score=.8, - mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, - term_type=OntologyTermType.ANY) - print(f"{df1}\n") - - # Test mapping a list of terms to EFO by specifying the ontology acronym, which gets resolved by bioregistry - print("Test mapping a list of terms to EFO by specifying the ontology acronym, which gets resolved by bioregistry") - df2 = text2term.map_terms(["contains", "asthma"], "EFO", term_type=OntologyTermType.CLASS) - print(f"{df2}\n") - - -def test_mapping_tagged_terms(): - # Test mapping a dictionary of tagged terms to cached EFO, and include unmapped terms in the output - print("Test mapping a dictionary of tagged terms to cached EFO, and include unmapped terms in the output...") - df3 = text2term.map_terms( - {"asthma": "disease", "allergy": ["ignore", "response"], "protein level": ["measurement"], "isdjfnsdfwd": None}, - target_ontology="EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) - print(f"{df3}\n") - assert df3.size > 0 - assert df3[TAGS_COLUMN].str.contains("disease").any() - assert df3[TAGS_COLUMN].str.contains("measurement").any() - - -def test_preprocessing_from_file(): - # Test processing tagged terms where the tags are provided in a file - print("Test processing tagged terms where the tags are provided in a file...") - tagged_terms = text2term.preprocess_tagged_terms("simple_preprocess.txt") - df4 = 
text2term.map_terms(tagged_terms, target_ontology="EFO", use_cache=True, incl_unmapped=True) - print(f"{df4}\n") - assert df4.size > 0 - assert df4[TAGS_COLUMN].str.contains("disease").any() - assert df4[TAGS_COLUMN].str.contains("important").any() - - -def test_mapping_to_properties(): - # Test mapping a list of properties to EFO loaded from a URL and restrict search to properties - print("Test mapping a list of properties to EFO loaded from a URL and restrict search to properties...") - df5 = text2term.map_terms(source_terms=["contains", "location"], target_ontology=EFO_URL, - term_type=OntologyTermType.PROPERTY) - print(f"{df5}\n") - assert df5.size > 0 - - # Test mapping a list of properties to EFO loaded from cache and restrict search to properties - print("Test mapping a list of properties to EFO loaded from cache and restrict search to properties...") - if not text2term.cache_exists("EFO"): - text2term.cache_ontology(ontology_url=EFO_URL, ontology_acronym="EFO") - df6 = text2term.map_terms(source_terms=["contains", "location"], target_ontology="EFO", use_cache=True, - term_type=OntologyTermType.PROPERTY) - print(f"{df6}\n") - assert df6.size > 0 - - # Test that mapping to properties in cached ontology is the same as to ontology loaded from its URL - properties_df_match = check_df_equals(drop_source_term_ids(df5), drop_source_term_ids(df6)) - print(f"...{properties_df_match}") - - -def test_mapping_zooma_ontologies(): - # Test mapping a list of terms to multiple ontologies using the Zooma mapper - print("Test mapping a list of terms to multiple ontologies using the Zooma mapper...") - df_zooma = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", - mapper=Mapper.ZOOMA, term_type=OntologyTermType.ANY) - print(f"{df_zooma}\n") - assert df_zooma.size > 0 - assert df_zooma[MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() - assert df_zooma[MAPPED_TERM_CURIE_COLUMN].str.contains("NCIT:").any() - - -def test_mapping_bioportal_ontologies(): - # Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper - print("Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper...") - df_bioportal = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", - mapper=Mapper.BIOPORTAL, term_type=OntologyTermType.ANY) - print(f"{df_bioportal}\n") - assert df_bioportal.size > 0 - assert df_bioportal[MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() - assert df_bioportal[MAPPED_TERM_CURIE_COLUMN].str.contains("NCIT:").any() - - -def test_term_collector(): - expected_nr_efo_terms = 50867 - terms = EFO_TERM_COLLECTOR.get_ontology_terms() - assert len(terms) == expected_nr_efo_terms - - -def test_term_collector_classes_only(): - expected_nr_efo_classes = 50643 - terms = EFO_TERM_COLLECTOR.get_ontology_terms(term_type=OntologyTermType.CLASS) - assert len(terms) == expected_nr_efo_classes - - -def test_term_collector_properties_only(): - expected_nr_efo_properties = 224 - terms = EFO_TERM_COLLECTOR.get_ontology_terms(term_type=OntologyTermType.PROPERTY) - assert len(terms) == expected_nr_efo_properties - - -def test_term_collector_iri_limit(): - iri = "http://www.ebi.ac.uk/efo/" - expected_nr_terms_with_efo_iri = 17383 - terms = EFO_TERM_COLLECTOR.get_ontology_terms(base_iris=[iri], term_type=OntologyTermType.ANY) - assert len(terms) == expected_nr_terms_with_efo_iri - -def test_term_collector_iri_limit_properties_only(): - iri = "http://www.ebi.ac.uk/efo/" - 
expected_nr_properties_with_efo_iri = 29 - terms = EFO_TERM_COLLECTOR.get_ontology_terms(base_iris=[iri], term_type=OntologyTermType.PROPERTY) - assert len(terms) == expected_nr_properties_with_efo_iri - - -def drop_source_term_ids(df): - return df.drop('Source Term ID', axis=1) - - -def check_df_equals(df, expected_df): - pd.testing.assert_frame_equal(df, expected_df, check_names=False, check_like=True) - return True +class Text2TermTestSuite(unittest.TestCase): + + @classmethod + def setUpClass(cls): + super(Text2TermTestSuite, cls).setUpClass() + print("Setting up test suite global variables...") + cls.EFO_URL = "https://github.com/EBISPOT/efo/releases/download/v3.57.0/efo.owl" + cls.SOURCE_TERM_ID_COLUMN = "Source Term ID" + cls.MAPPED_TERM_CURIE_COLUMN = "Mapped Term CURIE" + cls.TAGS_COLUMN = "Tags" + + def test_caching_ontology_from_url(self): + # Test caching an ontology loaded from a URL + print("Test caching an ontology loaded from a URL...") + efo_cache = text2term.cache_ontology(ontology_url=self.EFO_URL, ontology_acronym="EFO") + print(f"Cache exists: {efo_cache.cache_exists()}\n") + assert efo_cache.cache_exists() is True + + print("Test using the returned ontology cache object to map a list of terms...") + mappings_efo_cache = efo_cache.map_terms(["asthma", "disease location", "food allergy"], + term_type=OntologyTermType.ANY) + assert mappings_efo_cache.size > 0 + + def test_caching_ontology_from_acronym(self): + # Test caching an ontology by resolving its acronym using bioregistry + print("Test caching an ontology by resolving its acronym using bioregistry...") + clo_cache = text2term.cache_ontology(ontology_url="CLO", ontology_acronym="CLO") + print(f"Cache exists: {clo_cache.cache_exists()}\n") + assert clo_cache.cache_exists() is True + + def test_caching_ontology_set(self): + nr_ontologies_in_registry = 8 + # Test caching the set of ontologies specified in resources/ontologies.csv + caches = text2term.cache_ontology_set(os.path.join("..", "text2term", "resources", "ontologies.csv")) + assert len(caches) == nr_ontologies_in_registry + + def test_mapping_to_cached_ontology(self): + # Test mapping a list of terms to EFO loaded from cache + print("Test mapping a list of terms to EFO loaded from cache...") + mappings_efo_cache = text2term.map_terms(["asthma", "disease location", "food allergy"], target_ontology="EFO", + use_cache=True, term_type=OntologyTermType.ANY) + print(f"{mappings_efo_cache}\n") + assert mappings_efo_cache.size > 0 + + # Test mapping a list of terms to EFO loaded from a URL + print("Test mapping a list of terms to EFO loaded from a URL...") + mappings_efo_url = text2term.map_terms(["asthma", "disease location", "food allergy"], + target_ontology=self.EFO_URL, term_type=OntologyTermType.ANY) + print(f"{mappings_efo_url}\n") + assert mappings_efo_url.size > 0 + + # Test that mapping to cached ontology is the same as to ontology loaded from its URL + print("Test that mapping to cached ontology is the same as to ontology loaded from its URL...") + mappings_match = self.check_df_equals(self.drop_source_term_ids(mappings_efo_cache), + self.drop_source_term_ids(mappings_efo_url)) + print(f"...{mappings_match}") + assert mappings_match is True + + def test_mapping_to_cached_efo_using_syntactic_mapper(self): + # Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric + print("Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric...") + df = text2term.map_terms(["asthma", "disease location", 
"food allergy"], "EFO", min_score=.8, + mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, + term_type=OntologyTermType.ANY) + print(f"{df}\n") + assert df.size > 0 + + def test_mapping_to_efo_using_ontology_acronym(self): + # Test mapping a list of terms to EFO by specifying the ontology acronym, which gets resolved by bioregistry + print( + "Test mapping a list of terms to EFO by specifying the ontology acronym, which gets resolved by bioregistry") + df2 = text2term.map_terms(["contains", "asthma"], "EFO", term_type=OntologyTermType.CLASS) + print(f"{df2}\n") + assert df2.size > 0 + + def test_mapping_tagged_terms(self): + # Test mapping a dictionary of tagged terms to cached EFO, and include unmapped terms in the output + print("Test mapping a dictionary of tagged terms to cached EFO, and include unmapped terms in the output...") + df3 = text2term.map_terms( + {"asthma": "disease", "allergy": ["ignore", "response"], "protein level": ["measurement"], + "isdjfnsdfwd": None}, target_ontology="EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) + print(f"{df3}\n") + assert df3.size > 0 + assert df3[self.TAGS_COLUMN].str.contains("disease").any() + assert df3[self.TAGS_COLUMN].str.contains("measurement").any() + + def test_preprocessing_from_file(self): + # Test processing tagged terms where the tags are provided in a file + print("Test processing tagged terms where the tags are provided in a file...") + tagged_terms = text2term.preprocess_tagged_terms("simple_preprocess.txt") + df4 = text2term.map_terms(tagged_terms, target_ontology="EFO", use_cache=True, incl_unmapped=True) + print(f"{df4}\n") + assert df4.size > 0 + assert df4[self.TAGS_COLUMN].str.contains("disease").any() + assert df4[self.TAGS_COLUMN].str.contains("important").any() + + def test_mapping_to_properties(self): + # Test mapping a list of properties to EFO loaded from a URL and restrict search to properties + print("Test mapping a list of properties to EFO loaded from a URL and restrict search to properties...") + df5 = text2term.map_terms(source_terms=["contains", "location"], target_ontology=self.EFO_URL, + term_type=OntologyTermType.PROPERTY) + print(f"{df5}\n") + assert df5.size > 0 + + # Test mapping a list of properties to EFO loaded from cache and restrict search to properties + print("Test mapping a list of properties to EFO loaded from cache and restrict search to properties...") + if not text2term.cache_exists("EFO"): + text2term.cache_ontology(ontology_url=self.EFO_URL, ontology_acronym="EFO") + df6 = text2term.map_terms(source_terms=["contains", "location"], target_ontology="EFO", use_cache=True, + term_type=OntologyTermType.PROPERTY) + print(f"{df6}\n") + assert df6.size > 0 + + # Test that mapping to properties in cached ontology is the same as to ontology loaded from its URL + properties_df_match = self.check_df_equals(self.drop_source_term_ids(df5), self.drop_source_term_ids(df6)) + print(f"Properties match: {properties_df_match}") + assert properties_df_match is True + + def test_mapping_zooma_ontologies(self): + # Test mapping a list of terms to multiple ontologies using the Zooma mapper + print("Test mapping a list of terms to multiple ontologies using the Zooma mapper...") + df_zooma = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", + mapper=Mapper.ZOOMA, term_type=OntologyTermType.ANY) + print(f"{df_zooma}\n") + assert df_zooma.size > 0 + assert df_zooma[self.MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() + assert 
df_zooma[self.MAPPED_TERM_CURIE_COLUMN].str.contains("NCIT:").any() + + def test_mapping_bioportal_ontologies(self): + # Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper + print("Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper...") + df_bioportal = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", + mapper=Mapper.BIOPORTAL, term_type=OntologyTermType.ANY) + print(f"{df_bioportal}\n") + assert df_bioportal.size > 0 + assert df_bioportal[self.MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() + assert df_bioportal[self.MAPPED_TERM_CURIE_COLUMN].str.contains("NCIT:").any() + + def test_term_collector(self): + expected_nr_efo_terms = 50867 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms() + assert len(terms) == expected_nr_efo_terms + + def test_term_collector_classes_only(self): + expected_nr_efo_classes = 50643 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms(term_type=OntologyTermType.CLASS) + assert len(terms) == expected_nr_efo_classes + + def test_term_collector_properties_only(self): + expected_nr_efo_properties = 224 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms(term_type=OntologyTermType.PROPERTY) + assert len(terms) == expected_nr_efo_properties + + def test_term_collector_iri_limit(self): + efo_base_iri = "http://www.ebi.ac.uk/efo/" + expected_nr_terms_with_efo_iri = 17383 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms(base_iris=[efo_base_iri], term_type=OntologyTermType.ANY) + assert len(terms) == expected_nr_terms_with_efo_iri + + def test_term_collector_iri_limit_properties_only(self): + efo_base_iri = "http://www.ebi.ac.uk/efo/" + expected_nr_properties_with_efo_iri = 29 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms(base_iris=[efo_base_iri], term_type=OntologyTermType.PROPERTY) + assert len(terms) == expected_nr_properties_with_efo_iri + + def drop_source_term_ids(self, df): + # Unless specified, source term IDs are randomly generated UUIDs. We have to drop the ID column to be able to + # get a meaningful diff between two dataframes. 
Otherwise, the dataframes would always differ because of the IDs + return df.drop(self.SOURCE_TERM_ID_COLUMN, axis=1) + + def check_df_equals(self, df, expected_df): + # Use pandas::assert_frame_equal function to determine if two data frames are equal + pd.testing.assert_frame_equal(df, expected_df, check_names=False, check_like=True) + return True if __name__ == '__main__': - run_tests() + unittest.main() From 2fa6aebe72ae18b0ffb321fe67d42fd07179017f Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 12 Oct 2023 10:16:00 -0400 Subject: [PATCH 128/185] Change logging message type from info to warning --- text2term/term_collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 31f9a23..5d93db4 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -90,7 +90,7 @@ def _get_ontology_terms(self, term_list, ontology, exclude_deprecated, term_type owl_term_type = OntologyTermType.PROPERTY else: owl_term_type = "undetermined" - self.logger.info("Term has undetermined type %s %s", iri, labels) + self.logger.warn("Term has undetermined type %s %s", iri, labels) term_details = OntologyTerm(iri, labels, definitions=definitions, synonyms=synonyms, parents=named_parents, children=children, instances=instances, restrictions=complex_parents, deprecated=is_deprecated, From e8c3f0255bf45868b65002ea88e5a4498b3bf893 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 12 Oct 2023 18:52:51 -0400 Subject: [PATCH 129/185] Add 'min_score' filter tests --- test/simple_tests.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/test/simple_tests.py b/test/simple_tests.py index b9c7db4..0a9e5e2 100644 --- a/test/simple_tests.py +++ b/test/simple_tests.py @@ -18,6 +18,7 @@ def setUpClass(cls): cls.EFO_URL = "https://github.com/EBISPOT/efo/releases/download/v3.57.0/efo.owl" cls.SOURCE_TERM_ID_COLUMN = "Source Term ID" cls.MAPPED_TERM_CURIE_COLUMN = "Mapped Term CURIE" + cls.MAPPING_SCORE_COLUMN = "Mapping Score" cls.TAGS_COLUMN = "Tags" def test_caching_ontology_from_url(self): @@ -70,9 +71,8 @@ def test_mapping_to_cached_ontology(self): def test_mapping_to_cached_efo_using_syntactic_mapper(self): # Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric print("Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric...") - df = text2term.map_terms(["asthma", "disease location", "food allergy"], "EFO", min_score=.8, - mapper=text2term.Mapper.JARO_WINKLER, excl_deprecated=True, use_cache=True, - term_type=OntologyTermType.ANY) + df = text2term.map_terms(["asthma", "disease location", "food allergy"], "EFO", use_cache=True, + mapper=text2term.Mapper.JARO_WINKLER, term_type=OntologyTermType.ANY) print(f"{df}\n") assert df.size > 0 @@ -179,6 +179,25 @@ def test_term_collector_iri_limit_properties_only(self): terms = efo_term_collector.get_ontology_terms(base_iris=[efo_base_iri], term_type=OntologyTermType.PROPERTY) assert len(terms) == expected_nr_properties_with_efo_iri + def test_mapping_with_min_score_filter(self): + min_score = 0.6 + search_terms = ["asthma attack", "location"] + + print("Test mapping to cached EFO using Zooma mapper and min_score filter...") + df_zooma = text2term.map_terms(search_terms, target_ontology="EFO,NCIT", mapper=Mapper.ZOOMA, + term_type=OntologyTermType.ANY, min_score=min_score) + assert (df_zooma[self.MAPPING_SCORE_COLUMN] >= min_score).all() + + print("Test 
mapping to cached EFO using TFIDF similarity metric and min_score filter...") + df_tfidf = text2term.map_terms(search_terms, target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, + term_type=OntologyTermType.ANY, min_score=min_score) + assert (df_tfidf[self.MAPPING_SCORE_COLUMN] >= min_score).all() + + print("Test mapping to cached EFO using Levenshtein similarity metric and min_score filter...") + df_leven = text2term.map_terms(search_terms, target_ontology="EFO", use_cache=True, mapper=Mapper.LEVENSHTEIN, + term_type=OntologyTermType.ANY, min_score=min_score) + assert (df_leven[self.MAPPING_SCORE_COLUMN] >= min_score).all() + def drop_source_term_ids(self, df): # Unless specified, source term IDs are randomly generated UUIDs. We have to drop the ID column to be able to # get a meaningful diff between two dataframes. Otherwise, the dataframes would always differ because of the IDs From c44728468058cd0ba5f17695d427209f6e2bb9e4 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 12 Oct 2023 18:53:09 -0400 Subject: [PATCH 130/185] Fix min_score filter issue --- text2term/t2t.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/text2term/t2t.py b/text2term/t2t.py index 890088c..77ab182 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -212,8 +212,8 @@ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappi # Add tags, process, and filter df = _filter_mappings(mappings_df, min_score) if incl_unmapped: - df = _add_unmapped_terms(mappings_df, tags, source_terms, source_term_ids) - df = _add_tags_to_df(mappings_df, tags) + df = _add_unmapped_terms(df, tags, source_terms, source_term_ids) + df = _add_tags_to_df(df, tags) return df From 997e4a6513e0aee40e8d56db0a6aa0fd35f05882 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 12 Oct 2023 19:34:00 -0400 Subject: [PATCH 131/185] Add warning about mapping score when using bioportal mapper --- text2term/t2t.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/text2term/t2t.py b/text2term/t2t.py index 77ab182..b0ce36d 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -209,10 +209,17 @@ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappi raise ValueError("Unsupported mapper: " + mapper) LOGGER.info("...done (mapping time: %.2fs seconds)", time.time() - start) - # Add tags, process, and filter - df = _filter_mappings(mappings_df, min_score) + # Filter terms by the mapping score specified + if mapper == Mapper.BIOPORTAL: + LOGGER.warning("The BioPortal mapper does not return a 'mapping score' for its mappings, so the min_score " + "filter has no effect on BioPortal mappings. 
The mapping score is hardcoded to 1 by text2term.") + df = mappings_df + else: + df = _filter_mappings(mappings_df, min_score) + # Include in output data frame any input terms that did not meet min_score threshold if incl_unmapped: df = _add_unmapped_terms(df, tags, source_terms, source_term_ids) + # Add tags df = _add_tags_to_df(df, tags) return df From 66cc57394e2899f3d3cef6b0183bf9fb476ffe53 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 13 Oct 2023 11:19:32 -0400 Subject: [PATCH 132/185] Fix issue accessing df when mappings df is empty --- text2term/t2t.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/text2term/t2t.py b/text2term/t2t.py index b0ce36d..bf03965 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -269,7 +269,10 @@ def _filter_mappings(mappings_df, min_score): def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids): - mapped = pd.unique(mappings_df["Source Term"]) + if mappings_df.size == 0: + mapped = () + else: + mapped = pd.unique(mappings_df["Source Term"]) for (term, term_id) in zip(source_terms, source_terms_ids): if term not in mapped: non_mapping = TermMapping(term, term_id, "", "", 0) From 3c993e767164dd97ba774d496974fbc510fbaec2 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 13 Oct 2023 11:20:03 -0400 Subject: [PATCH 133/185] Add tests for include_unmapped feature --- test/simple_tests.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/simple_tests.py b/test/simple_tests.py index 0a9e5e2..7e65169 100644 --- a/test/simple_tests.py +++ b/test/simple_tests.py @@ -198,6 +198,16 @@ def test_mapping_with_min_score_filter(self): term_type=OntologyTermType.ANY, min_score=min_score) assert (df_leven[self.MAPPING_SCORE_COLUMN] >= min_score).all() + def test_include_unmapped_terms(self): + df = text2term.map_terms(["asthma", "margarita"], target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, + incl_unmapped=True, min_score=0.8) + assert df[self.TAGS_COLUMN].str.contains("unmapped").any() + + def test_include_unmapped_terms_when_no_mappings_are_returned(self): + df = text2term.map_terms(["mojito", "margarita"], target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, + incl_unmapped=True, min_score=0.8) + assert df[self.TAGS_COLUMN].str.contains("unmapped").any() + def drop_source_term_ids(self, df): # Unless specified, source term IDs are randomly generated UUIDs. We have to drop the ID column to be able to # get a meaningful diff between two dataframes. 
Otherwise, the dataframes would always differ because of the IDs From b3c44e5dc3fb9ccd442fb9ac014e83a2f5dd2b80 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Fri, 13 Oct 2023 13:18:12 -0400 Subject: [PATCH 134/185] Rename test to be clearer --- test/simple_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/simple_tests.py b/test/simple_tests.py index 7e65169..6c0f908 100644 --- a/test/simple_tests.py +++ b/test/simple_tests.py @@ -203,7 +203,7 @@ def test_include_unmapped_terms(self): incl_unmapped=True, min_score=0.8) assert df[self.TAGS_COLUMN].str.contains("unmapped").any() - def test_include_unmapped_terms_when_no_mappings_are_returned(self): + def test_include_unmapped_terms_when_mappings_df_is_empty(self): df = text2term.map_terms(["mojito", "margarita"], target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, incl_unmapped=True, min_score=0.8) assert df[self.TAGS_COLUMN].str.contains("unmapped").any() From ec71b2b0449edc210b374b1a0db24d6fda38312d Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Mon, 16 Oct 2023 09:58:11 -0400 Subject: [PATCH 135/185] Requirements and style Added myst-parser to the requirements, which should allow readthedocs to work. Also enlarges the sidebar on Sphinx --- docs/source/conf.py | 5 ++++- requirements.txt | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index d745823..5713476 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -25,4 +25,7 @@ html_theme = 'pyramid' html_static_path = ['_static'] -html_logo = "ccb_logo.jpg" +# html_logo = "ccb_logo.jpg" +html_theme_options = { + 'sidebarwidth': 280 +} diff --git a/requirements.txt b/requirements.txt index 98714ea..62e5bea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ bioregistry~=0.10.6 nltk~=3.8.1 rapidfuzz~=2.13.7 shortuuid~=1.0.11 +myst-parser~=2.0.0 \ No newline at end of file From b659a3ec5e85f2a124106ffd51d7807e3c342ddd Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Fri, 27 Oct 2023 10:59:58 -0400 Subject: [PATCH 136/185] Testing bug fixes and action Fixes bugs, including one that makes caching ineffective. Also adds a GitHub Action to automatically test on commit. 
--- .github/workflows/python-app.yml | 39 ++++++++++++++++++++ test/simple_tests.py | 10 +++-- text2term/__init__.py | 3 ++ text2term/config.py | 2 +- text2term/t2t.py | 12 ++++-- text2term/term_collector.py | 63 +++++++++++++++++--------------- text2term/term_mapping.py | 3 ++ 7 files changed, 94 insertions(+), 38 deletions(-) create mode 100644 .github/workflows/python-app.yml diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml new file mode 100644 index 0000000..7c281c0 --- /dev/null +++ b/.github/workflows/python-app.yml @@ -0,0 +1,39 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python application + +on: + push: + branches: [ "development" ] + pull_request: + branches: [ "main" ] + +permissions: + contents: read + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + python -m unittest test/simple_tests diff --git a/test/simple_tests.py b/test/simple_tests.py index 6c0f908..fc51d21 100644 --- a/test/simple_tests.py +++ b/test/simple_tests.py @@ -2,13 +2,12 @@ import unittest import pandas as pd import text2term -from term import OntologyTermType -from mapper import Mapper +from text2term import OntologyTermType +from text2term import Mapper from text2term import OntologyTermCollector pd.set_option('display.max_columns', None) - class Text2TermTestSuite(unittest.TestCase): @classmethod @@ -21,6 +20,11 @@ def setUpClass(cls): cls.MAPPING_SCORE_COLUMN = "Mapping Score" cls.TAGS_COLUMN = "Tags" + @classmethod + def tearDownClass(cls): + super(Text2TermTestSuite, cls).tearDownClass() + text2term.clear_cache() + def test_caching_ontology_from_url(self): # Test caching an ontology loaded from a URL print("Test caching an ontology loaded from a URL...") diff --git a/text2term/__init__.py b/text2term/__init__.py index ad9f676..49a3773 100644 --- a/text2term/__init__.py +++ b/text2term/__init__.py @@ -8,3 +8,6 @@ from .preprocess import preprocess_tagged_terms from .tagged_term import TaggedTerm from .term_collector import OntologyTermCollector +from .term_collector import filter_terms +from .term import OntologyTermType +from .term import OntologyTerm diff --git a/text2term/config.py b/text2term/config.py index 189c03b..73f980c 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "4.0.0" +VERSION = "4.1.0" diff --git a/text2term/t2t.py b/text2term/t2t.py index bf03965..92a3b7a 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -10,6 +10,7 @@ from text2term.mapper import Mapper from text2term.term import OntologyTermType from text2term.term_collector import OntologyTermCollector +from text2term.term_collector import filter_terms from text2term.term_graph_generator import 
TermGraphGenerator from text2term.bioportal_mapper import BioPortalAnnotatorMapper from text2term.syntactic_mapper import SyntacticMapper @@ -21,6 +22,8 @@ IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "] UNMAPPED_TAG = "unmapped" +OUTPUT_COLUMNS = ["Source Term", "Source Term ID", "Mapped Term Label", + "Mapped Term CURIE", "Mapped Term IRI", "Mapping Score", "Tags"] LOGGER = onto_utils.get_logger(__name__, level=logging.INFO) @@ -174,16 +177,16 @@ def _load_data(input_file_path, csv_column_names, separator): def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_type=OntologyTermType.CLASS): - term_collector = OntologyTermCollector(ontology_iri=ontology) if use_cache: pickle_file = os.path.join("cache", ontology, ontology + "-term-details.pickle") LOGGER.info(f"Loading cached ontology from: {pickle_file}") onto_terms_unfiltered = pickle.load(open(pickle_file, "rb")) - onto_terms = term_collector.filter_terms(onto_terms_unfiltered, iris, exclude_deprecated, term_type) + onto_terms = filter_terms(onto_terms_unfiltered, iris, exclude_deprecated, term_type) else: + term_collector = OntologyTermCollector(ontology_iri=ontology) onto_terms = term_collector.get_ontology_terms(base_iris=iris, exclude_deprecated=exclude_deprecated, term_type=term_type) - term_collector.close() + term_collector.close() LOGGER.info(f"Filtered ontology terms to those of type: {term_type}") if len(onto_terms) == 0: raise RuntimeError("Could not find any terms in the given ontology.") @@ -270,7 +273,8 @@ def _filter_mappings(mappings_df, min_score): def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids): if mappings_df.size == 0: - mapped = () + mapped = [] + mappings_df = pd.DataFrame(columns=OUTPUT_COLUMNS) else: mapped = pd.unique(mappings_df["Source Term"]) for (term, term_id) in zip(source_terms, source_terms_ids): diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 5d93db4..3d9671a 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -48,17 +48,7 @@ def get_ontology_terms(self, base_iris=(), exclude_deprecated=False, term_type=O return ontology_terms def filter_terms(self, onto_terms, iris=(), excl_deprecated=False, term_type=OntologyTermType.ANY): - filtered_onto_terms = {} - for base_iri, term in onto_terms.items(): - if type(iris) == str: - begins_with_iri = (iris == ()) or base_iri.startswith(iris) - else: - begins_with_iri = (iris == ()) or any(base_iri.startswith(iri) for iri in iris) - is_not_deprecated = (not excl_deprecated) or (not term.deprecated) - include = self._filter_term_type(term, term_type, True) - if begins_with_iri and is_not_deprecated and include: - filtered_onto_terms.update({base_iri: term}) - return filtered_onto_terms + return filter_terms(onto_terms, iris, exclude_deprecated, term_type) def _get_ontology_signature(self, ontology): signature = list(ontology.classes()) @@ -73,7 +63,7 @@ def _get_ontology_terms(self, term_list, ontology, exclude_deprecated, term_type ontology_terms = dict() for ontology_term in term_list: # Parse if should include ontology classes, properties, or both - include = self._filter_term_type(ontology_term, term_type, False) + include = _filter_term_type(ontology_term, term_type, False) if include and ontology_term is not Thing and ontology_term is not Nothing: if (exclude_deprecated and not deprecated[ontology_term]) or (not exclude_deprecated): iri = ontology_term.iri @@ -84,9 +74,9 @@ def _get_ontology_terms(self, term_list, ontology, exclude_deprecated, 
term_type instances = self._get_instances(ontology_term, ontology) definitions = self._get_definitions(ontology_term) is_deprecated = deprecated[ontology_term] == [True] - if self._filter_term_type(ontology_term, OntologyTermType.CLASS, False): + if _filter_term_type(ontology_term, OntologyTermType.CLASS, False): owl_term_type = OntologyTermType.CLASS - elif self._filter_term_type(ontology_term, OntologyTermType.PROPERTY, False): + elif _filter_term_type(ontology_term, OntologyTermType.PROPERTY, False): owl_term_type = OntologyTermType.PROPERTY else: owl_term_type = "undetermined" @@ -100,22 +90,6 @@ def _get_ontology_terms(self, term_list, ontology, exclude_deprecated, term_type self.logger.debug("Excluding deprecated ontology term: %s", ontology_term.iri) return ontology_terms - def _filter_term_type(self, ontology_term, term_type, cached): - if term_type == OntologyTermType.CLASS: - if cached: - return ontology_term.term_type == OntologyTermType.CLASS - else: - return isinstance(ontology_term, ThingClass) - elif term_type == OntologyTermType.PROPERTY: - if cached: - return ontology_term.term_type == OntologyTermType.PROPERTY - else: - return isinstance(ontology_term, PropertyClass) - elif term_type == OntologyTermType.ANY: - return True - else: - raise ValueError("Invalid term-type option. Acceptable term types are: 'class' or 'property' or 'any'") - def _get_parents(self, ontology_term): parents = dict() # named/atomic superclasses except owl:Thing restrictions = dict() # restrictions are class expressions such as 'pancreatitis disease_has_location pancreas' @@ -401,3 +375,32 @@ def _log_ontology_metrics(self, ontology): self.logger.debug(" Object property count: %i", len(list(ontology.object_properties()))) self.logger.debug(" Data property count: %i", len(list(ontology.data_properties()))) self.logger.debug(" Annotation property count: %i", len(list(ontology.annotation_properties()))) + +def filter_terms(onto_terms, iris=(), excl_deprecated=False, term_type=OntologyTermType.ANY): + filtered_onto_terms = {} + for base_iri, term in onto_terms.items(): + if type(iris) == str: + begins_with_iri = (iris == ()) or base_iri.startswith(iris) + else: + begins_with_iri = (iris == ()) or any(base_iri.startswith(iri) for iri in iris) + is_not_deprecated = (not excl_deprecated) or (not term.deprecated) + include = _filter_term_type(term, term_type, True) + if begins_with_iri and is_not_deprecated and include: + filtered_onto_terms.update({base_iri: term}) + return filtered_onto_terms + +def _filter_term_type(ontology_term, term_type, cached): + if term_type == OntologyTermType.CLASS: + if cached: + return ontology_term.term_type == OntologyTermType.CLASS + else: + return isinstance(ontology_term, ThingClass) + elif term_type == OntologyTermType.PROPERTY: + if cached: + return ontology_term.term_type == OntologyTermType.PROPERTY + else: + return isinstance(ontology_term, PropertyClass) + elif term_type == OntologyTermType.ANY: + return True + else: + raise ValueError("Invalid term-type option. 
Acceptable term types are: 'class' or 'property' or 'any'") diff --git a/text2term/term_mapping.py b/text2term/term_mapping.py index 8da155c..bf3add7 100644 --- a/text2term/term_mapping.py +++ b/text2term/term_mapping.py @@ -55,6 +55,9 @@ def to_dict(self): self.MAPPING_SCORE: self.mapping_score } + def get_col_names(self): + return [SRC_TERM, SRC_TERM_ID, TGT_TERM_LBL, TGT_TERM_CURIE, TGT_TERM_IRI, MAPPING_SCORE] + def __eq__(self, other): if isinstance(other, TermMapping): return self.source_term == other.source_term and self.mapped_term_iri == other.mapped_term_iri From d9fbb9e29fa829642adca67341849b052420beb6 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Fri, 27 Oct 2023 11:02:54 -0400 Subject: [PATCH 137/185] Fix unsaved error Accidentally committed without saving file, fixes the commit. --- text2term/term_mapping.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/text2term/term_mapping.py b/text2term/term_mapping.py index bf3add7..8da155c 100644 --- a/text2term/term_mapping.py +++ b/text2term/term_mapping.py @@ -55,9 +55,6 @@ def to_dict(self): self.MAPPING_SCORE: self.mapping_score } - def get_col_names(self): - return [SRC_TERM, SRC_TERM_ID, TGT_TERM_LBL, TGT_TERM_CURIE, TGT_TERM_IRI, MAPPING_SCORE] - def __eq__(self, other): if isinstance(other, TermMapping): return self.source_term == other.source_term and self.mapped_term_iri == other.mapped_term_iri From 6ad92b143447cfd1dac2f1950472e066ff76887c Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Fri, 27 Oct 2023 11:07:07 -0400 Subject: [PATCH 138/185] Debugging GitHub action --- .github/workflows/python-app.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 7c281c0..d9b991f 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -27,7 +27,7 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install -r requirements.txt - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names @@ -36,4 +36,5 @@ jobs: flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest run: | - python -m unittest test/simple_tests + cd test + python -m unittest simple_tests From face918e1b80d1bd8eaea60d45019d7ff91a04ca Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Fri, 27 Oct 2023 11:09:31 -0400 Subject: [PATCH 139/185] Update python-app.yml Installing t2t locally for testing --- .github/workflows/python-app.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index d9b991f..b904574 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -28,6 +28,7 @@ jobs: python -m pip install --upgrade pip pip install flake8 pytest pip install -r requirements.txt + pip install -e . 
- name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names From 8e24de2c7483637ac798010d4aa9a2471cdac5be Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Mon, 30 Oct 2023 15:09:17 -0400 Subject: [PATCH 140/185] Debug Testing Action Debug Unit Testing Action --- .github/workflows/python-app.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index b904574..7ec39c6 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -29,12 +29,12 @@ jobs: pip install flake8 pytest pip install -r requirements.txt pip install -e . - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + # - name: Lint with flake8 + # run: | + # # stop the build if there are Python syntax errors or undefined names + # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest run: | cd test From 14640ba46e97267b369991318ede8d5c92d81777 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 1 Nov 2023 11:12:37 -0400 Subject: [PATCH 141/185] Update python-app.yml Debug Action --- .github/workflows/python-app.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 7ec39c6..d05a82c 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -26,15 +26,16 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8 pytest + pip --version pip install -r requirements.txt + pip install flake8 pytest pip install -e . - # - name: Lint with flake8 - # run: | - # # stop the build if there are Python syntax errors or undefined names - # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest run: | cd test From 0ddf1379fcb5965ae5d7754b08da87a024ce2dd9 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Mon, 6 Nov 2023 10:38:19 -0500 Subject: [PATCH 142/185] Update python-app.yml Adds print statement to determine python path --- .github/workflows/python-app.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index d05a82c..3f1b631 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -23,10 +23,11 @@ jobs: uses: actions/setup-python@v3 with: python-version: "3.10" + - name: show python path + run: python -c "import sys; print('\n'.join(sys.path))" - name: Install dependencies run: | python -m pip install --upgrade pip - pip --version pip install -r requirements.txt pip install flake8 pytest pip install -e . From 8804a3eecfa025690f6d5875917d0cf148091b60 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Mon, 6 Nov 2023 10:40:17 -0500 Subject: [PATCH 143/185] Update python-app.yml Fix syntax error --- .github/workflows/python-app.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 3f1b631..60a5534 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -24,7 +24,8 @@ jobs: with: python-version: "3.10" - name: show python path - run: python -c "import sys; print('\n'.join(sys.path))" + run: | + python -c "import sys; print('\n'.join(sys.path))" - name: Install dependencies run: | python -m pip install --upgrade pip From 431dace3e8168221cdae90642b36b7bb0c7413e5 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Mon, 6 Nov 2023 10:44:15 -0500 Subject: [PATCH 144/185] Update python-app.yml Continued debugging --- .github/workflows/python-app.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 60a5534..96f008a 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -31,6 +31,11 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install flake8 pytest + - name: Check package location + run: | + pip show pandas + - name: Install text2term + run: | pip install -e . - name: Lint with flake8 run: | From f683142d69f654f3c9b3db5b92f8c27066fbe3f3 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 8 Nov 2023 10:18:31 -0500 Subject: [PATCH 145/185] Update python-app.yml Debugging --- .github/workflows/python-app.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 96f008a..ca40803 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -36,6 +36,7 @@ jobs: pip show pandas - name: Install text2term run: | + python --version pip install -e . 
- name: Lint with flake8 run: | From a23451c2944cb95330b9b1ed9d69729f4cf79249 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 8 Nov 2023 10:24:04 -0500 Subject: [PATCH 146/185] Update python-app.yml Debugging --- .github/workflows/python-app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index ca40803..3029165 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -34,9 +34,9 @@ jobs: - name: Check package location run: | pip show pandas + pip install pandas - name: Install text2term run: | - python --version pip install -e . - name: Lint with flake8 run: | From 0788cc954e4c071a09d3f6a2712ece1032dcad58 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 8 Nov 2023 10:26:26 -0500 Subject: [PATCH 147/185] Update python-app.yml Continued debugging --- .github/workflows/python-app.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 3029165..128586f 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -28,13 +28,12 @@ jobs: python -c "import sys; print('\n'.join(sys.path))" - name: Install dependencies run: | - python -m pip install --upgrade pip + python -m pip install --upgrade pip requirements.txt pip install -r requirements.txt pip install flake8 pytest - name: Check package location run: | pip show pandas - pip install pandas - name: Install text2term run: | pip install -e . From 5176b576cb898bd2cb365bc264038f06534f44c9 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 8 Nov 2023 10:28:21 -0500 Subject: [PATCH 148/185] Update python-app.yml Throwing everything at the wall to see what sticks --- .github/workflows/python-app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 128586f..190d55f 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -28,7 +28,7 @@ jobs: python -c "import sys; print('\n'.join(sys.path))" - name: Install dependencies run: | - python -m pip install --upgrade pip requirements.txt + python -m pip install --upgrade pip -r requirements.txt pip install -r requirements.txt pip install flake8 pytest - name: Check package location From d65b547dcfa612562c764e429fb1765253224ac7 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 8 Nov 2023 10:41:20 -0500 Subject: [PATCH 149/185] Update python-app.yml Nothing sticks to the wall --- .github/workflows/python-app.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 190d55f..7d683f4 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -22,13 +22,13 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: "3.10" + python-version: "3.9" - name: show python path run: | python -c "import sys; print('\n'.join(sys.path))" - name: Install dependencies run: | - python -m pip install --upgrade pip -r requirements.txt + python -m pip install --upgrade pip pip install -r requirements.txt pip install flake8 pytest - name: Check package location From 427ef2d73b7f6dd5d82b5f5e8788ad6b86ec4ada Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 8 Nov 2023 10:46:05 -0500 Subject: [PATCH 150/185] Update python-app.yml Making the program fail just to see a new error message --- 
.github/workflows/python-app.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 7d683f4..1f9c0c9 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -22,7 +22,7 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: "3.9" + python-version: "3.10" - name: show python path run: | python -c "import sys; print('\n'.join(sys.path))" @@ -34,9 +34,9 @@ jobs: - name: Check package location run: | pip show pandas - - name: Install text2term - run: | - pip install -e . + # - name: Install text2term + # run: | + # pip install -e . - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names From c39a408e23f77928ae7bede21f3f6eafe9eba4c3 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 8 Nov 2023 13:08:13 -0500 Subject: [PATCH 151/185] Update python-app.yml Adding back the install --- .github/workflows/python-app.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 1f9c0c9..f81e0c2 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -26,6 +26,7 @@ jobs: - name: show python path run: | python -c "import sys; print('\n'.join(sys.path))" + PYTHONPATH - name: Install dependencies run: | python -m pip install --upgrade pip @@ -34,9 +35,9 @@ jobs: - name: Check package location run: | pip show pandas - # - name: Install text2term - # run: | - # pip install -e . + - name: Install text2term + run: | + pip install -e . - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names From 8b5cdd0af52a55594c7a6a04cd2c9ef60cbee56a Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 16 Nov 2023 16:24:56 -0500 Subject: [PATCH 152/185] Change default tags to empty list. Use open() to open pickle files. 
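For reference, a minimal sketch of the context-managed pickle loading this commit describes (the cache path and the "EFO" acronym below are illustrative, not part of the patch):

```python
import os
import pickle

# Hypothetical cached-ontology path; the real path is built from the ontology acronym.
pickle_file = os.path.join("cache", "EFO", "EFO-term-details.pickle")

# The 'with' block guarantees the file handle is closed even if pickle.load raises.
with open(pickle_file, "rb") as cached_ontology_pickle:
    onto_terms_unfiltered = pickle.load(cached_ontology_pickle)
```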
--- text2term/config.py | 2 +- text2term/preprocess.py | 2 +- text2term/t2t.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/text2term/config.py b/text2term/config.py index 73f980c..8c8f595 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "4.1.0" +VERSION = "4.1.1" diff --git a/text2term/preprocess.py b/text2term/preprocess.py index 2e97883..d16a036 100644 --- a/text2term/preprocess.py +++ b/text2term/preprocess.py @@ -111,7 +111,7 @@ def _blocklist_term(processed_terms, term, blocklist, blocklist_char, tagged=Fal return False -def _update_tagged_term(processed_terms, term, new_term, tags=()): +def _update_tagged_term(processed_terms, term, new_term, tags=[]): term.update_term(new_term) term.add_tags(tags) processed_terms.append(term) diff --git a/text2term/t2t.py b/text2term/t2t.py index 92a3b7a..fa247fc 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -180,8 +180,9 @@ def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_typ if use_cache: pickle_file = os.path.join("cache", ontology, ontology + "-term-details.pickle") LOGGER.info(f"Loading cached ontology from: {pickle_file}") - onto_terms_unfiltered = pickle.load(open(pickle_file, "rb")) - onto_terms = filter_terms(onto_terms_unfiltered, iris, exclude_deprecated, term_type) + with open(pickle_file, "rb") as cached_ontology_pickle: + onto_terms_unfiltered = pickle.load(cached_ontology_pickle) + onto_terms = filter_terms(onto_terms_unfiltered, iris, exclude_deprecated, term_type) else: term_collector = OntologyTermCollector(ontology_iri=ontology) onto_terms = term_collector.get_ontology_terms(base_iris=iris, exclude_deprecated=exclude_deprecated, From f95db93d34c673d251c0ecd5690b4df533026105 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Sat, 2 Dec 2023 16:33:52 -0500 Subject: [PATCH 153/185] Fix add_tags() in tagged_term --- text2term/tagged_term.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text2term/tagged_term.py b/text2term/tagged_term.py index 7891f63..db26dd8 100644 --- a/text2term/tagged_term.py +++ b/text2term/tagged_term.py @@ -12,7 +12,7 @@ def __repr__(self): return f" Date: Tue, 2 Jan 2024 15:40:41 -0500 Subject: [PATCH 154/185] Fix imports in onto_cache --- text2term/onto_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/text2term/onto_cache.py b/text2term/onto_cache.py index 614f912..204dcb0 100644 --- a/text2term/onto_cache.py +++ b/text2term/onto_cache.py @@ -3,8 +3,8 @@ import text2term import owlready2 import pandas as pd -from .term import OntologyTermType -from .mapper import Mapper +from text2term.term import OntologyTermType +from text2term.mapper import Mapper from shutil import rmtree CACHE_FOLDER = "cache" From 7fc84487eda05716b0c426719c86b0d9fd5c8b8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Gon=C3=A7alves?= Date: Wed, 7 Feb 2024 13:34:05 -0500 Subject: [PATCH 155/185] Update python-app.yml --- .github/workflows/python-app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index f81e0c2..2e42b70 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -26,7 +26,7 @@ jobs: - name: show python path run: | python -c "import sys; print('\n'.join(sys.path))" - PYTHONPATH + echo $PYTHONPATH - name: Install dependencies run: | python -m pip install --upgrade pip From 32485122ff1de55d0fec22151b5d5ea531854ca7 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Rafael=20Gon=C3=A7alves?= Date: Wed, 7 Feb 2024 13:40:35 -0500 Subject: [PATCH 156/185] Update python-app.yml --- .github/workflows/python-app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index f81e0c2..2e42b70 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -26,7 +26,7 @@ jobs: - name: show python path run: | python -c "import sys; print('\n'.join(sys.path))" - PYTHONPATH + echo $PYTHONPATH - name: Install dependencies run: | python -m pip install --upgrade pip From 07f22981b40dbc636dfb9a35152e4671570f6c54 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 13 Feb 2024 18:25:32 -0500 Subject: [PATCH 157/185] Fix typo in variable name, likely due to refactoring --- text2term/term_collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text2term/term_collector.py b/text2term/term_collector.py index 3d9671a..109d349 100644 --- a/text2term/term_collector.py +++ b/text2term/term_collector.py @@ -48,7 +48,7 @@ def get_ontology_terms(self, base_iris=(), exclude_deprecated=False, term_type=O return ontology_terms def filter_terms(self, onto_terms, iris=(), excl_deprecated=False, term_type=OntologyTermType.ANY): - return filter_terms(onto_terms, iris, exclude_deprecated, term_type) + return filter_terms(onto_terms, iris, excl_deprecated, term_type) def _get_ontology_signature(self, ontology): signature = list(ontology.classes()) From d8c7132ea8dcff14c2ab5f4a2ef7952592e0bce6 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Thu, 29 Feb 2024 10:24:44 -0500 Subject: [PATCH 158/185] Optimized filter mappings Changed the t2t._filter_mappings function to use vectorization instead of loops, thus making it faster. 
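For context, a small self-contained sketch (toy data, not part of the patch) contrasting the row-by-row filter with the vectorized boolean mask that replaces it:

```python
import pandas as pd

# Toy mappings table, for illustration only
mappings = pd.DataFrame({"Source Term": ["asthma", "margarita"],
                         "Mapping Score": [0.92, 0.41]})
min_score = 0.6

# Previous approach: rebuild the frame row by row (slow for large mapping tables)
filtered_slow = pd.DataFrame(columns=mappings.columns)
for _, row in mappings.iterrows():
    if row["Mapping Score"] >= min_score:
        filtered_slow.loc[len(filtered_slow.index)] = row

# Vectorized approach adopted by this patch: a single boolean mask over the column
filtered_fast = mappings.loc[mappings["Mapping Score"] >= min_score]
```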
--- .github/workflows/python-app.yml | 1 - text2term/t2t.py | 8 +++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 2e42b70..96f008a 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -26,7 +26,6 @@ jobs: - name: show python path run: | python -c "import sys; print('\n'.join(sys.path))" - echo $PYTHONPATH - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/text2term/t2t.py b/text2term/t2t.py index fa247fc..ca89c34 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -265,13 +265,11 @@ def _add_tags_to_df(df, tags): def _filter_mappings(mappings_df, min_score): - new_df = pd.DataFrame(columns=mappings_df.columns) - for index, row in mappings_df.iterrows(): - if row['Mapping Score'] >= min_score: - new_df.loc[len(new_df.index)] = row + if mappings_df.empty: + return mappings_df + new_df = mappings_df.loc[mappings_df["Mapping Score"] >= min_score] return new_df - def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids): if mappings_df.size == 0: mapped = [] From 1745eb3ee8c714b1225f40b3cc58c02e6cd9949d Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 6 Mar 2024 10:16:37 -0500 Subject: [PATCH 159/185] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 423af80..f4f249b 100644 --- a/.gitignore +++ b/.gitignore @@ -86,6 +86,7 @@ ipython_config.py # For PyPi upload make-pypi.sh +pypi_auth.txt # Cache should not be uploaded cache/ From 0b6498274c49b56ee87d8a3186d5e5ef6cc78d6f Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 6 Mar 2024 10:17:15 -0500 Subject: [PATCH 160/185] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 423af80..f4f249b 100644 --- a/.gitignore +++ b/.gitignore @@ -86,6 +86,7 @@ ipython_config.py # For PyPi upload make-pypi.sh +pypi_auth.txt # Cache should not be uploaded cache/ From be8027e024cc74e5d9dfacd0520c43923dbeca2e Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 6 Mar 2024 10:37:52 -0500 Subject: [PATCH 161/185] Update .gitignore and version number Update .gitignore for new local files and updates the version number --- .gitignore | 2 +- text2term/config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index f4f249b..aa2a1c4 100644 --- a/.gitignore +++ b/.gitignore @@ -86,7 +86,7 @@ ipython_config.py # For PyPi upload make-pypi.sh -pypi_auth.txt +.pypirc # Cache should not be uploaded cache/ diff --git a/text2term/config.py b/text2term/config.py index 8c8f595..546a837 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "4.1.1" +VERSION = "4.1.2" From ddf0af8ec830f3f225af2c14037d9a3696509b8e Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Thu, 7 Mar 2024 16:35:22 -0500 Subject: [PATCH 162/185] Some tests relied on cache that might not exist unless the whole test suite is executed --- test/simple_tests.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/test/simple_tests.py b/test/simple_tests.py index fc51d21..bbe8654 100644 --- a/test/simple_tests.py +++ b/test/simple_tests.py @@ -8,6 +8,7 @@ pd.set_option('display.max_columns', None) + class Text2TermTestSuite(unittest.TestCase): @classmethod @@ -51,6 +52,7 @@ def test_caching_ontology_set(self): assert len(caches) == nr_ontologies_in_registry def 
test_mapping_to_cached_ontology(self): + self.ensure_cache_exists("EFO", self.EFO_URL) # Test mapping a list of terms to EFO loaded from cache print("Test mapping a list of terms to EFO loaded from cache...") mappings_efo_cache = text2term.map_terms(["asthma", "disease location", "food allergy"], target_ontology="EFO", @@ -72,23 +74,25 @@ def test_mapping_to_cached_ontology(self): print(f"...{mappings_match}") assert mappings_match is True - def test_mapping_to_cached_efo_using_syntactic_mapper(self): + def test_mapping_to_cached_ontology_using_syntactic_mapper(self): + self.ensure_cache_exists("EFO", self.EFO_URL) # Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric - print("Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric...") + print("Test mapping a list of terms to cached ontology using Jaro-Winkler syntactic similarity metric...") df = text2term.map_terms(["asthma", "disease location", "food allergy"], "EFO", use_cache=True, mapper=text2term.Mapper.JARO_WINKLER, term_type=OntologyTermType.ANY) print(f"{df}\n") assert df.size > 0 - def test_mapping_to_efo_using_ontology_acronym(self): - # Test mapping a list of terms to EFO by specifying the ontology acronym, which gets resolved by bioregistry + def test_mapping_using_ontology_acronym(self): + # Test mapping a list of terms by specifying the target ontology acronym, which gets resolved by bioregistry print( - "Test mapping a list of terms to EFO by specifying the ontology acronym, which gets resolved by bioregistry") - df2 = text2term.map_terms(["contains", "asthma"], "EFO", term_type=OntologyTermType.CLASS) + "Test mapping a list of terms by specifying the ontology acronym, which gets resolved by bioregistry") + df2 = text2term.map_terms(["contains", "asthma"], "MONDO") print(f"{df2}\n") assert df2.size > 0 def test_mapping_tagged_terms(self): + self.ensure_cache_exists("EFO", self.EFO_URL) # Test mapping a dictionary of tagged terms to cached EFO, and include unmapped terms in the output print("Test mapping a dictionary of tagged terms to cached EFO, and include unmapped terms in the output...") df3 = text2term.map_terms( @@ -100,6 +104,7 @@ def test_mapping_tagged_terms(self): assert df3[self.TAGS_COLUMN].str.contains("measurement").any() def test_preprocessing_from_file(self): + self.ensure_cache_exists("EFO", self.EFO_URL) # Test processing tagged terms where the tags are provided in a file print("Test processing tagged terms where the tags are provided in a file...") tagged_terms = text2term.preprocess_tagged_terms("simple_preprocess.txt") @@ -119,8 +124,7 @@ def test_mapping_to_properties(self): # Test mapping a list of properties to EFO loaded from cache and restrict search to properties print("Test mapping a list of properties to EFO loaded from cache and restrict search to properties...") - if not text2term.cache_exists("EFO"): - text2term.cache_ontology(ontology_url=self.EFO_URL, ontology_acronym="EFO") + self.ensure_cache_exists("EFO", self.EFO_URL) df6 = text2term.map_terms(source_terms=["contains", "location"], target_ontology="EFO", use_cache=True, term_type=OntologyTermType.PROPERTY) print(f"{df6}\n") @@ -184,6 +188,7 @@ def test_term_collector_iri_limit_properties_only(self): assert len(terms) == expected_nr_properties_with_efo_iri def test_mapping_with_min_score_filter(self): + self.ensure_cache_exists("EFO", self.EFO_URL) min_score = 0.6 search_terms = ["asthma attack", "location"] @@ -203,11 +208,13 @@ def 
test_mapping_with_min_score_filter(self): assert (df_leven[self.MAPPING_SCORE_COLUMN] >= min_score).all() def test_include_unmapped_terms(self): + self.ensure_cache_exists("EFO", self.EFO_URL) df = text2term.map_terms(["asthma", "margarita"], target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, incl_unmapped=True, min_score=0.8) assert df[self.TAGS_COLUMN].str.contains("unmapped").any() def test_include_unmapped_terms_when_mappings_df_is_empty(self): + self.ensure_cache_exists("EFO", self.EFO_URL) df = text2term.map_terms(["mojito", "margarita"], target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, incl_unmapped=True, min_score=0.8) assert df[self.TAGS_COLUMN].str.contains("unmapped").any() @@ -222,6 +229,10 @@ def check_df_equals(self, df, expected_df): pd.testing.assert_frame_equal(df, expected_df, check_names=False, check_like=True) return True + def ensure_cache_exists(self, ontology_name, ontology_url): + if not text2term.cache_exists(ontology_name): + text2term.cache_ontology(ontology_url=ontology_url, ontology_acronym=ontology_name) + if __name__ == '__main__': unittest.main() From 6c4ff68cfe42b28d2952b0243f217192cdf80b3d Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Mon, 11 Mar 2024 11:47:38 -0400 Subject: [PATCH 163/185] Fix Testing Action Fixes the GitHub Action that automatically tests the module. Also fixes a "bug" in the testing suite introduced by errors in EFO --- .github/workflows/python-app.yml | 3 ++- test/simple_tests.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 96f008a..cf5010f 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -23,6 +23,7 @@ jobs: uses: actions/setup-python@v3 with: python-version: "3.10" + cache: 'pip' - name: show python path run: | python -c "import sys; print('\n'.join(sys.path))" @@ -30,7 +31,7 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install flake8 pytest + pip install flake8 pytest wheel - name: Check package location run: | pip show pandas diff --git a/test/simple_tests.py b/test/simple_tests.py index bbe8654..305281e 100644 --- a/test/simple_tests.py +++ b/test/simple_tests.py @@ -86,8 +86,8 @@ def test_mapping_to_cached_ontology_using_syntactic_mapper(self): def test_mapping_using_ontology_acronym(self): # Test mapping a list of terms by specifying the target ontology acronym, which gets resolved by bioregistry print( - "Test mapping a list of terms by specifying the ontology acronym, which gets resolved by bioregistry") - df2 = text2term.map_terms(["contains", "asthma"], "MONDO") + "Test mapping a list of terms to EFO by specifying the ontology acronym, which gets resolved by bioregistry") + df2 = text2term.map_terms(["contains", "asthma"], "MONDO", term_type=OntologyTermType.CLASS) print(f"{df2}\n") assert df2.size > 0 From f55f581114a8ab70aa09735b25e482f7a7ee8dc0 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 20 Mar 2024 10:05:42 -0400 Subject: [PATCH 164/185] Clean printing Cleans up unnecessary print statements from the GitHub testing action --- .github/workflows/python-app.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index cf5010f..e99f354 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -24,17 +24,11 @@ jobs: with: python-version: "3.10" cache: 'pip' - - name: show python path - run: | - python -c "import 
sys; print('\n'.join(sys.path))" - name: Install dependencies run: | python -m pip install --upgrade pip pip install -r requirements.txt pip install flake8 pytest wheel - - name: Check package location - run: | - pip show pandas - name: Install text2term run: | pip install -e . From f0236d96b74f4e518462a9a43dad18b5dfdba7ca Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 20 Mar 2024 13:30:59 -0400 Subject: [PATCH 165/185] Add actions to upload to PyPI Adds actions to upload to PyPI and test PyPI automatically upon release --- .../{python-app.yml => github_test.yml} | 3 +- .github/workflows/upload_pypi.yml | 45 ++++++++++++++++++ .github/workflows/upload_testpypi.yml | 47 +++++++++++++++++++ 3 files changed, 94 insertions(+), 1 deletion(-) rename .github/workflows/{python-app.yml => github_test.yml} (94%) create mode 100644 .github/workflows/upload_pypi.yml create mode 100644 .github/workflows/upload_testpypi.yml diff --git a/.github/workflows/python-app.yml b/.github/workflows/github_test.yml similarity index 94% rename from .github/workflows/python-app.yml rename to .github/workflows/github_test.yml index e99f354..947449f 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/github_test.yml @@ -1,7 +1,8 @@ # This workflow will install Python dependencies, run tests and lint with a single version of Python # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python -name: Python application +name: GitHub Unit Testing +run-name: Unit Testing on ${{ github.event_name }} on: push: diff --git a/.github/workflows/upload_pypi.yml b/.github/workflows/upload_pypi.yml new file mode 100644 index 0000000..eef2a4f --- /dev/null +++ b/.github/workflows/upload_pypi.yml @@ -0,0 +1,45 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Upload PyPI +run-name: Upload ${{ github.event.release.tag_name }} to PyPI + +on: + release: + types: [published] + +permissions: + contents: write + id-token: write + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + cache: 'pip' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install flake8 twine sdist wheel build + - name: Install text2term + run: | + pip install -e . + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Build dist/ + run: | + python -m build --sdist --wheel --no-isolation --outdir dist/ . 
+ - name: Upload to pypi + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/upload_testpypi.yml b/.github/workflows/upload_testpypi.yml new file mode 100644 index 0000000..82fec26 --- /dev/null +++ b/.github/workflows/upload_testpypi.yml @@ -0,0 +1,47 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Upload Test PyPI +run-name: Upload ${{ github.event.release.tag_name }} to Test PyPI + +on: + release: + types: [published] + +permissions: + contents: write + id-token: write + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + cache: 'pip' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install flake8 twine sdist wheel build + - name: Install text2term + run: | + pip install -e . + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Build dist/ + run: | + python -m build --sdist --wheel --no-isolation --outdir dist/ . + - name: Upload to pypi + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ From 77310cebf86c1d9848101eb96580c12a6da1f02d Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Mon, 25 Mar 2024 14:58:47 -0400 Subject: [PATCH 166/185] Update github_test.yml Adds a better description, but removes testing on pull in main to stop bug from occurring --- .github/workflows/github_test.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/github_test.yml b/.github/workflows/github_test.yml index 947449f..105b8a2 100644 --- a/.github/workflows/github_test.yml +++ b/.github/workflows/github_test.yml @@ -2,13 +2,11 @@ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python name: GitHub Unit Testing -run-name: Unit Testing on ${{ github.event_name }} +run-name: Unit Testing on ${{ github.event.push.head_commit.message }} on: push: branches: [ "development" ] - pull_request: - branches: [ "main" ] permissions: contents: read From 4c1fc2e8b0eaec57027d925be362c2529e381a4b Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Mon, 25 Mar 2024 15:10:24 -0400 Subject: [PATCH 167/185] Revert "Update github_test.yml" This reverts commit 77310cebf86c1d9848101eb96580c12a6da1f02d. 
--- .github/workflows/github_test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/github_test.yml b/.github/workflows/github_test.yml index 105b8a2..947449f 100644 --- a/.github/workflows/github_test.yml +++ b/.github/workflows/github_test.yml @@ -2,11 +2,13 @@ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python name: GitHub Unit Testing -run-name: Unit Testing on ${{ github.event.push.head_commit.message }} +run-name: Unit Testing on ${{ github.event_name }} on: push: branches: [ "development" ] + pull_request: + branches: [ "main" ] permissions: contents: read From 88a66807700be042f9ba7d5045cf11a23d6254e6 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Thu, 28 Mar 2024 10:43:12 -0400 Subject: [PATCH 168/185] Update requirements.txt Fixes typo in requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 62e5bea..cf8334d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,4 @@ bioregistry~=0.10.6 nltk~=3.8.1 rapidfuzz~=2.13.7 shortuuid~=1.0.11 -myst-parser~=2.0.0 \ No newline at end of file +myst_parser~=2.0.0 \ No newline at end of file From 1727e805bf2afec32e959c7df05d9f8e381e6eb7 Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Wed, 1 May 2024 11:59:46 -0400 Subject: [PATCH 169/185] Update CLI Updates the Command Line Interface to reflect recent changes and fix bugs --- text2term/__main__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/text2term/__main__.py b/text2term/__main__.py index df9863b..9560fac 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -17,7 +17,7 @@ "'all' to search all ontologies") parser.add_argument("-o", "--output", required=False, type=str, default="", help="Path to desired output file for the mappings (default=current working directory)") - parser.add_argument("-m", "--mapper", required=False, type=str, default=Mapper.TFIDF, + parser.add_argument("-m", "--mapper", required=False, type=str, default="tfidf", help="Method used to compare source terms with ontology terms. 
One of: " + str(Mapper.list()) + " (default=tfidf)") parser.add_argument("-csv", "--csv_input", required=False, type=str, default=(), @@ -39,8 +39,10 @@ help="Save vis.js graphs representing the neighborhood of each ontology term (default=False)") parser.add_argument("-c", "--store_in_cache", required=False, type=str, default="", help="Store the target ontology into local cache under acronym") - parser.add_argument("-type", "--term_type", required=False, type=str, default=OntologyTermType.CLASS, + parser.add_argument("-type", "--term_type", required=False, type=str, default="class", help="Define whether to return ontology classes, properties, or both") + parser.add_argument('-u', "--incl_unmapped", required=False, default=False, action="store_true", + help="Include all unmapped terms in the output") arguments = parser.parse_args() if not os.path.exists(arguments.source): @@ -62,4 +64,4 @@ excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings, min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs, save_mappings=True, separator=arguments.separator, use_cache=cache_exists(target), - term_type=arguments.term_type) + term_type=arguments.term_type, incl_unmapped=arguments.incl_unmapped) From 5fd3481e1b2e92a8be027585f78ba1e6823617ee Mon Sep 17 00:00:00 2001 From: Jason Payne Date: Thu, 2 May 2024 11:05:56 -0400 Subject: [PATCH 170/185] Update README Updates the READMEs to change the examples to MONDO links instead of EFO links, since those often contain bugs --- README-UI.md | 2 +- README.md | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/README-UI.md b/README-UI.md index a096884..7846136 100644 --- a/README-UI.md +++ b/README-UI.md @@ -9,7 +9,7 @@ The following information pertains to the text2term UI, which is written [here]( - npm >= 8.0.0 - Python >= 3.9.0 - pip >= 21.0.0 -- text2term >= 1.1.0 +- text2term >= 4.1.2 **\*** These are the versions I have that work; while I know Python 3.9 or higher is necessary, the others may not strictly require the listed versions. 
diff --git a/README.md b/README.md index f936def..feaf75d 100644 --- a/README.md +++ b/README.md @@ -13,40 +13,40 @@ pip install text2term import text2term import pandas -df1 = text2term.map_terms("test/unstruct_terms.txt", "http://www.ebi.ac.uk/efo/efo.owl") -df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://www.ebi.ac.uk/efo/efo.owl") -df3 = text2term.map_terms({"asthma":"disease", "acute bronchitis":["disease", "lungs"]}, "http://www.ebi.ac.uk/efo/efo.owl") +df1 = text2term.map_terms("test/unstruct_terms.txt", "http://purl.obolibrary.org/obo/mondo.owl") +df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://purl.obolibrary.org/obo/mondo.owl") +df3 = text2term.map_terms({"asthma":"disease", "acute bronchitis":["disease", "lungs"]}, "http://purl.obolibrary.org/obo/mondo.owl") ``` Below is an example of caching, assuming the same imports as above: ```python -text2term.cache_ontology("http://www.ebi.ac.uk/efo/efo.owl", "EFO") -df1 = text2term.map_terms("test/unstruct_terms.txt", "EFO", use_cache=True) -df2 = text2term.map_terms(["asthma", "acute bronchitis"], "EFO", use_cache=True) -text2term.clear_cache("EFO") +text2term.cache_ontology("http://purl.obolibrary.org/obo/mondo.owl", "MONDO") +df1 = text2term.map_terms("test/unstruct_terms.txt", "MONDO", use_cache=True) +df2 = text2term.map_terms(["asthma", "acute bronchitis"], "MONDO", use_cache=True) +text2term.clear_cache("MONDO") ``` ### Command Line The basic use of the tool requires a `source` file containing a list of terms to map to the given `target` ontology: -`python text2term -s test/unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl` +`python text2term -s test/unstruct_terms.txt -t http://purl.obolibrary.org/obo/mondo.owl` Specify an output file where the mappings should be saved using `-o`: -`python text2term -s test/unstruct_terms.txt -t efo.owl -o /Documents/my-mappings.csv` +`python text2term -s test/unstruct_terms.txt -t mondo.owl -o /Documents/my-mappings.csv` Set the minimum acceptable similarity score for mapping each given term to an ontology term using `-min`: -`python text2term -s test/unstruct_terms.txt -t efo.owl -min 0.8` +`python text2term -s test/unstruct_terms.txt -t mondo.owl -min 0.8` The mapped terms returned will have been determined to be 0.8 similar to their source terms in a 0-1 scale. Exclude deprecated ontology terms (declared as such via *owl:deprecated true*) using `-d`: -`python text2term -s test/unstruct_terms.txt -t efo.owl -d` +`python text2term -s test/unstruct_terms.txt -t mondo.owl -d` Limit search to only terms whose IRIs start with any IRI given in a list specified using `-iris`: -`python text2term.py -s test/unstruct_terms.txt -t efo.owl -iris http://www.ebi.ac.uk/efo/EFO,http://purl.obolibrary.org/obo/HP` +`python text2term.py -s test/unstruct_terms.txt -t mondo.owl -iris http://purl.obolibrary.org/obo/mondo.owl,http://purl.obolibrary.org/obo/HP` Here, because EFO reuses terms from other ontologies such as HP and GO, the HP terms would be included but the GO terms would be excluded. 
Use the cache on the command line, first by flagging it, then in the future using the acronym: -`python text2term -s test/unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl -c EFO` +`python text2term -s test/unstruct_terms.txt -t http://purl.obolibrary.org/obo/mondo.owl -c MONDO` Then, after running this, the following command is equivalent: -`python text2term -s test/unstruct_terms.txt -t EFO` +`python text2term -s test/unstruct_terms.txt -t MONDO` ## Programmatic Usage The tool can be executed in Python with the `map_terms` function: From d2f7efc70d3c73447524b9d2f4801e4bc331146e Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 3 Jun 2024 16:57:36 -0400 Subject: [PATCH 171/185] Parameterize ngram length. Ensure inputs are strings closes #49 --- text2term/config.py | 2 +- text2term/tfidf_mapper.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/text2term/config.py b/text2term/config.py index 546a837..aa1adbb 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "4.1.2" +VERSION = "4.1.3" diff --git a/text2term/tfidf_mapper.py b/text2term/tfidf_mapper.py index c90c7f9..f8e4f07 100644 --- a/text2term/tfidf_mapper.py +++ b/text2term/tfidf_mapper.py @@ -17,7 +17,7 @@ def __init__(self, target_ontology_terms): self.target_ontology_terms = target_ontology_terms self.target_labels, self.target_terms = self._get_target_labels_terms(target_ontology_terms) - def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3): + def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3, ngram_length=3): """ Main mapping function. Default settings return only the top candidate for every source string. :param source_terms: List of source terms to be mapped with ontology terms @@ -25,9 +25,10 @@ def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3): :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned :param min_score: The lower-bound threshold for keeping a candidate term mapping, between 0-1. 
Default set to 0, so consider all candidates + :param ngram_length: The gram length n for the string tokenizer """ source_terms_norm = onto_utils.normalize_list(source_terms) - vectorizer = self._tokenize(source_terms_norm, self.target_labels) + vectorizer = self._tokenize(source_terms_norm, self.target_labels, n=ngram_length) results_mtx = self._sparse_dot_top(vectorizer, source_terms_norm, self.target_labels, min_score) results_df = self._get_mappings(results_mtx, max_mappings, source_terms, source_terms_ids, self.target_terms) return results_df @@ -80,9 +81,15 @@ def _get_target_labels_terms(self, ontology_terms): target_labels, target_terms = [], [] for term in ontology_terms.values(): for label in term.labels: - target_labels.append(label) - target_terms.append(term) + if not isinstance(label, str): + self.logger.debug(f"ontology term label {label} is not a string") + else: + target_labels.append(label) + target_terms.append(term) for synonym in term.synonyms: - target_labels.append(synonym) - target_terms.append(term) + if not isinstance(synonym, str): + self.logger.debug(f"ontology term synonym {synonym} is not a string") + else: + target_labels.append(synonym) + target_terms.append(term) return target_labels, target_terms From 3e38e9ecbc160d3332f0bcd9e88d0420454a8c2e Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 3 Jun 2024 18:15:57 -0400 Subject: [PATCH 172/185] Add timers to post-mapping filters --- text2term/onto_utils.py | 2 +- text2term/t2t.py | 24 ++++++++++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py index 9cbd9ac..d0bc45b 100644 --- a/text2term/onto_utils.py +++ b/text2term/onto_utils.py @@ -11,7 +11,7 @@ STOP_WORDS = {'in', 'the', 'any', 'all', 'for', 'and', 'or', 'dx', 'on', 'fh', 'tx', 'only', 'qnorm', 'w', 'iqb', 's', 'ds', 'rd', 'rdgwas', 'ICD', 'excluded', 'excluding', 'unspecified', 'certain', 'also', 'undefined', 'ordinary', 'least', 'squares', 'FINNGEN', 'elsewhere', 'more', 'excluded', 'classified', 'classifeid', - 'unspcified', 'unspesified', 'specified', 'acquired', 'combined', 'unspeficied', 'elsewhere', 'not', 'by', + 'unspcified', 'unspesified', 'specified', 'acquired', 'combined', 'unspeficied', 'elsewhere', 'by', 'strict', 'wide', 'definition', 'definitions', 'confirmed', 'chapter', 'chapters', 'controls', 'characterized', 'main', 'diagnosis', 'hospital', 'admissions', 'other', 'resulting', 'from'} diff --git a/text2term/t2t.py b/text2term/t2t.py index ca89c34..a2e27a4 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -23,7 +23,7 @@ IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "] UNMAPPED_TAG = "unmapped" OUTPUT_COLUMNS = ["Source Term", "Source Term ID", "Mapped Term Label", - "Mapped Term CURIE", "Mapped Term IRI", "Mapping Score", "Tags"] + "Mapped Term CURIE", "Mapped Term IRI", "Mapping Score", "Tags"] LOGGER = onto_utils.get_logger(__name__, level=logging.INFO) @@ -217,15 +217,26 @@ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappi if mapper == Mapper.BIOPORTAL: LOGGER.warning("The BioPortal mapper does not return a 'mapping score' for its mappings, so the min_score " "filter has no effect on BioPortal mappings. 
The mapping score is hardcoded to 1 by text2term.") - df = mappings_df else: - df = _filter_mappings(mappings_df, min_score) + LOGGER.debug("Filtering mappings by their score...") + start_filter = time.time() + mappings_df = _filter_mappings(mappings_df, min_score) + LOGGER.debug("...done (filtering time: %.2fs seconds)", time.time() - start_filter) + # Include in output data frame any input terms that did not meet min_score threshold if incl_unmapped: - df = _add_unmapped_terms(df, tags, source_terms, source_term_ids) + LOGGER.debug("Adding unmapped terms...") + start_unmapped = time.time() + mappings_df = _add_unmapped_terms(mappings_df, tags, source_terms, source_term_ids) + LOGGER.debug("...done (adding unmapped time: %.2fs seconds)", time.time() - start_unmapped) + # Add tags - df = _add_tags_to_df(df, tags) - return df + if not mappings_df.empty: + LOGGER.debug("Adding tags...") + start_tagging = time.time() + mappings_df = _add_tags_to_df(mappings_df, tags) + LOGGER.debug("...done (adding tags time: %.2fs seconds)", time.time() - start_tagging) + return mappings_df # Takes in the tags and source terms and processes them accordingly @@ -270,6 +281,7 @@ def _filter_mappings(mappings_df, min_score): new_df = mappings_df.loc[mappings_df["Mapping Score"] >= min_score] return new_df + def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids): if mappings_df.size == 0: mapped = [] From 8e2f5949854755cb127059b137ef80196e3c90a3 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 4 Jun 2024 10:50:06 -0400 Subject: [PATCH 173/185] Bump version needed to publish in pypi due to deleted v4.1.3 --- text2term/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text2term/config.py b/text2term/config.py index aa1adbb..60e3b39 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "4.1.3" +VERSION = "4.1.4" From 06318d7f155773b5c463d2cd138e851b217d9d7c Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 4 Jun 2024 18:55:28 -0400 Subject: [PATCH 174/185] Update README.md --- README.md | 275 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 155 insertions(+), 120 deletions(-) diff --git a/README.md b/README.md index feaf75d..a93b5f9 100644 --- a/README.md +++ b/README.md @@ -7,49 +7,99 @@ Install package using **pip**: ``` pip install text2term ``` -## Examples -### Programmatic +## Basic Examples + +
+ Examples of Programmatic Use + +text2term supports mapping strings specified in different input formats: + ```python import text2term -import pandas -df1 = text2term.map_terms("test/unstruct_terms.txt", "http://purl.obolibrary.org/obo/mondo.owl") -df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://purl.obolibrary.org/obo/mondo.owl") -df3 = text2term.map_terms({"asthma":"disease", "acute bronchitis":["disease", "lungs"]}, "http://purl.obolibrary.org/obo/mondo.owl") +# map strings in a list to an ontology specified by its URL +dfl = text2term.map_terms(source_terms=["asthma", "acute bronchitis"], + target_ontology="http://purl.obolibrary.org/obo/mondo.owl") + +# map strings listed in a file 'test/unstruct_terms.txt' to an ontology specified by its URL +dff = text2term.map_terms(source_terms="test/unstruct_terms.txt", + target_ontology="http://purl.obolibrary.org/obo/mondo.owl") + +# map strings in a dictionary with associated tags to an ontology specified by its URL +dfd = text2term.map_terms(source_terms={"asthma":"disease", "acute bronchitis":["disease", "lung"]}, + target_ontology="http://purl.obolibrary.org/obo/mondo.owl") ``` -Below is an example of caching, assuming the same imports as above: + +text2term supports caching an ontology for repeated use: ```python -text2term.cache_ontology("http://purl.obolibrary.org/obo/mondo.owl", "MONDO") -df1 = text2term.map_terms("test/unstruct_terms.txt", "MONDO", use_cache=True) -df2 = text2term.map_terms(["asthma", "acute bronchitis"], "MONDO", use_cache=True) -text2term.clear_cache("MONDO") +# cache ontology and give it a name for use later on +mondo = text2term.cache_ontology(ontology_url="http://purl.obolibrary.org/obo/mondo.owl", + ontology_acronym="MONDO") + +# now map strings to the cached ontology by specifying as `target_ontology` the name chosen above and the flag `use_cache=True` +dfc = text2term.map_terms(source_terms=["asthma", "acute bronchitis"], target_ontology="MONDO", use_cache=True) + +# or more succinctly, use the OntologyCache object `mondo` +dfo = mondo.map_terms(source_terms=["asthma", "acute bronchitis"]) ``` +
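As a follow-up sketch (not part of the README diff), the frames returned above are ordinary pandas DataFrames whose columns include "Source Term", "Mapped Term Label", "Mapped Term CURIE", "Mapped Term IRI" and "Mapping Score", so standard pandas operations apply; this assumes `dfl` from the example above:

```python
# Keep only high-confidence mappings and view the key output columns
high_confidence = dfl[dfl["Mapping Score"] >= 0.8]
print(high_confidence[["Source Term", "Mapped Term Label",
                       "Mapped Term CURIE", "Mapping Score"]])
```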
-### Command Line -The basic use of the tool requires a `source` file containing a list of terms to map to the given `target` ontology: -`python text2term -s test/unstruct_terms.txt -t http://purl.obolibrary.org/obo/mondo.owl` -Specify an output file where the mappings should be saved using `-o`: -`python text2term -s test/unstruct_terms.txt -t mondo.owl -o /Documents/my-mappings.csv` +
+ Examples of Command Line Interface Use + +To show a help message describing all arguments type into a terminal: +```shell +python text2term --help +``` +The basic use of text2term requires a `source` file containing the terms to map to a given `target` ontology: +```shell +python text2term -s test/unstruct_terms.txt -t http://purl.obolibrary.org/obo/mondo.owl +``` + +--- +Map to a local ontology and specify an output file where the mappings should be saved using `-o`: +```shell +python text2term -s test/unstruct_terms.txt -t test/mondo.owl -o test/mymappings.csv +``` + +--- Set the minimum acceptable similarity score for mapping each given term to an ontology term using `-min`: -`python text2term -s test/unstruct_terms.txt -t mondo.owl -min 0.8` +```shell +python text2term -s test/unstruct_terms.txt -t test/mondo.owl -min 0.8 +``` The mapped terms returned will have been determined to be 0.8 similar to their source terms in a 0-1 scale. +--- Exclude deprecated ontology terms (declared as such via *owl:deprecated true*) using `-d`: -`python text2term -s test/unstruct_terms.txt -t mondo.owl -d` +```shell +python text2term -s test/unstruct_terms.txt -t test/mondo.owl -d +``` +--- Limit search to only terms whose IRIs start with any IRI given in a list specified using `-iris`: -`python text2term.py -s test/unstruct_terms.txt -t mondo.owl -iris http://purl.obolibrary.org/obo/mondo.owl,http://purl.obolibrary.org/obo/HP` -Here, because EFO reuses terms from other ontologies such as HP and GO, the HP terms would be included but the GO terms would be excluded. +```shell +python text2term.py -s test/unstruct_terms.txt -t test/mondo.owl -iris http://purl.obolibrary.org/obo/mondo,http://identifiers.org/hgnc +``` +While MONDO uses terms from other ontologies such as CHEBI and Uberon, the tool only considers terms whose IRIs start either with "http://purl.obolibrary.org/obo/mondo" or "http://identifiers.org/hgnc". + +--- +Cache an ontology for repeated use, by first running the tool as usual while instructing it to cache the ontology using `-c `: +```shell +python text2term -s test/unstruct_terms.txt -t http://purl.obolibrary.org/obo/mondo.owl -c MONDO +``` + +Now the ontology is cached and we can refer to it as the target ontology using the name given beforehand: +```shell +python text2term -s test/unstruct_terms.txt -t MONDO +``` + +
-Use the cache on the command line, first by flagging it, then in the future using the acronym: -`python text2term -s test/unstruct_terms.txt -t http://purl.obolibrary.org/obo/mondo.owl -c MONDO` -Then, after running this, the following command is equivalent: -`python text2term -s test/unstruct_terms.txt -t MONDO` ## Programmatic Usage -The tool can be executed in Python with the `map_terms` function: +After installing and importing to a Python environment, the main function is `map_terms`: ```python text2term.map_terms(source_terms, @@ -67,148 +117,124 @@ text2term.map_terms(source_terms, use_cache=False, term_type=OntologyTermType.CLASS, incl_unmapped=False) - ``` -NOTE: As of 3.0.0, the former three functions (`map_file`, `map_terms`, `map_tagged_terms`) have been condensed into one function. Users can now change the name of any function in old code to `map_terms` and it reads the input context to maintain the functionality of each one. - -### Arguments -For `map_terms`, the first argument can be any of the following: 1) a string that specifies a path to a file containing the terms to be mapped, 2) a list of the terms to be mapped, or 3) a dictionary where the keys are the terms to be mapped, and values can be a list of tags or a list of TaggedTerm objects (see below). -Currently, the tags do not affect the mapping in any way, but they are added to the output dataframe at the end of the process. The exception is the Ignore tag, which causes the term to not be mapped at all, but still be outputted in the results if the incl_unmapped argument is True (see below). +The function returns a pandas `DataFrame` containing the generated ontology mappings. -All other arguments are the same, and have the same functionality: +
+ Argument Details -`target_ontology` : str - Path or URL or acronym of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, - provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies. When the target ontology has been previously cached, provide the ontology name that was used to cache it. - As of version 2.3.0, it is possible to specify ontology acronyms as the `target_ontology` (eg "EFO" or "CL"), which is achieved using [bioregistry](https://bioregistry.io) to retrieve URLs for those acronyms. +`source_terms`**_—Strings to be mapped to an ontology_**, which can be specified as a: +1. list of strings +2. string containing a file path +3. dictionary of terms and associated tags, where each key is a term and the value is a list of tags +4. list of `TaggedTerm` objects + - Tags do not affect the mapping, they are simply added to the output dataframe + - If a term is tagged with "Ignore", text2term will not map it + - Unmapped terms can still be included in the output if `incl_unmapped` is True -`base_iris` : tuple - Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: - ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') +`target_ontology`:str—Path, URL or name of 'target' ontology to map the source terms to +: Ontology names can be given as values to `target_ontology` (eg "EFO" or "CL")--text2term uses [bioregistry](https://bioregistry.io) to get URLs for such names. +: When using BioPortal or Zooma, this should be a comma-separated list of ontology acronyms (eg 'EFO,HPO') or **'all'** to search all ontologies. +: When the target ontology has been cached, this should be the ontology name given when it was first cached. -`csv_column` : tuple - Allows the user to specify a column to map if a csv is passed in as the input file. Ignored if the input is not a file path. +`base_iris`:_tuple_—Map only to ontology terms whose IRIs start with one of the strings given in this tuple -`source_terms_ids` : tuple - Collection of identifiers for the given source terms - WARNING: While this is still available for the tagged term function, it is worth noting that dictionaries do not necessarily preserve order, so it is not recommended. If using the TaggedTerm object, the source terms can be attached there to guarantee order. +`excl_deprecated`:_bool_—Exclude ontology terms stated as deprecated via `owl:deprecated true` -`excl_deprecated` : bool - Exclude ontology terms stated as deprecated via `owl:deprecated true` +`source_terms_ids`:_tuple_—Collection of identifiers for the given source terms -`mapper` : mapper.Mapper - Method used to compare source terms with ontology terms. One of: levenshtein, jaro, jarowinkler, jaccard, fuzzy, tfidf, zooma, bioportal - These can be initialized by invoking mapper.Mapper e.g. `mapper.Mapper.TFIDF` +`csv_column`:_tuple_—Specify the name of the column containing the terms to map, when the input file is a table. 
Optionally provide a second column name, containing the respective term identifiers -`max_mappings` : int - Maximum number of top-ranked mappings returned per source term +`separator`:_str_—Character that separates columns when input is a table (eg '\t' for TSV) -`min_score` : float - Minimum similarity score [0,1] for the mappings (1=exact match) +`mapper`:mapper.Mapper—Method used to compare source terms with ontology terms + : One of levenshtein, jaro, jarowinkler, jaccard, fuzzy, tfidf, zooma, bioportal -`output_file` : str - Path to desired output file for the mappings +`max_mappings`:_int_—Maximum number of top-ranked mappings returned per source term -`save_graphs` : bool - Save vis.js graphs representing the neighborhood of each ontology term +`min_score`:_float_—Minimum similarity score [0,1] for the mappings (1=exact match) -`save_mappings` : bool - Save the generated mappings to a file (specified by `output_file`) +`save_mappings`:_bool_—Save the generated mappings to a file (specified by `output_file`) -`seperator` : str - Character that separates the source term values if a file input is given. Ignored if the input is not a file path. +`output_file`:_str_—Path to desired output file for the mappings dataframe -`use_cache` : bool - Use the cache for the ontology. More details are below. +`save_graphs`:_bool_—Save vis.js graphs representing the neighborhood of each ontology term -`term_type` : term.OntologyTermType - Specifies whether to map to ontology classes, properties or any of the two. Possible values are ['class', 'property', 'any']. +`use_cache`:_bool_—Use the cache for the ontology -`incl_unmapped` : bool - Include all unmapped terms in the output. If something has been tagged 'Ignore' (see below) or falls below the `min_score` threshold, it is included without a mapped term at the end of the output data frame. +`term_type`:_term.OntologyTermType_—Specifies whether to map to ontology classes, properties or any of the two. Possible values are ['class', 'property', 'any'] -All default values, if they exist, can be seen above. +`incl_unmapped`:_bool_—Include unmapped terms in the output. If a term has been tagged 'Ignore' or has less than the `min_score`, it is included in the output data frame -### Return Value -Both functions return the same value: +
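Putting several of the arguments above together, a hedged usage sketch (the ontology URL, score threshold and argument combination are illustrative only):

```python
import text2term
from text2term.mapper import Mapper
from text2term.term import OntologyTermType

df = text2term.map_terms(
    source_terms=["asthma", "acute bronchitis"],
    target_ontology="http://purl.obolibrary.org/obo/mondo.owl",
    mapper=Mapper.TFIDF,               # TF-IDF-based mapper (the default)
    max_mappings=3,                    # keep up to 3 candidate mappings per term
    min_score=0.8,                     # drop mappings scoring below 0.8
    excl_deprecated=True,              # skip terms stated as owl:deprecated true
    term_type=OntologyTermType.CLASS,  # map to ontology classes only
    incl_unmapped=True)                # keep unmapped inputs, tagged "unmapped"
```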
-`df` : Data frame containing the generated ontology mappings +
+ Ontology Caching -### Ontology Caching -As of version 1.1.0, users can cache ontologies that they want to use regularly or quickly. Programmatically, there are two steps to using the cache: creating the cache, then accessing it. First, the user can cache ontologies using either of two functions: +text2term supports caching ontologies for faster or repeated mapping to the same ontology. An ontology can be cached using the function: ```python cache_ontology(ontology_url, ontology_acronym="", base_iris=()) ``` +This caches a single ontology from a URL or file path, and takes an optional acronym that will be used to reference the cached ontology later. If no acronym is given, the URL is used as the name. +It is also possible to cache multiple ontologies, whose names and URLs are specified in a table formatted as such `acronym,version,url`. An example is provided in [resources/ontologies.csv](https://github.com/ccb-hms/ontology-mapper/blob/main/text2term/resources/ontologies.csv): ```python cache_ontology_set(ontology_registry_path) ``` -The first of these will cache a single ontology from a URL or file path, with it being referenced by an acronym that will be used to reference it later. If no acronym is given, it will use the URL as the cache name. An example can be found above. -The second function allows the user to cache several ontologies at once by referencing a CSV file of the format: -`acronym,version,url`. An example is provided in `resources/ontologies.csv` +Once an ontology has been cached by either function, it is stored in a cache folder locally, and thus can be referenced even in different Python instances. Users can leverage the cache by using the assigned acronym as the value for the `target_ontology` argument, and setting the `use_cache` argument to `True`. -Once an ontology has been cached by either function, it is stored in a cache folder locally, and thus can be referenced even in different Python instances. -As of version 2.3.0, the `cache_ontology` function also returns an object that can be used to call any of the `map` functions, as well as `clear_cache` and `cache_exists`. These have the same arguments, except `ontology_target` is not specified and there is no `use_cache` option, as it is always True. +To clear the ontology cache, the following function can be used: -NOTE: Due to how ontologies are processed in memory, `cache_ontology_set` must be used to cache multiple ontologies in a single Python instance. If `cache_ontology` is used multiple times in one instance, the behavior is undefined and may cause visible or invisible errors. +```python +text2term.clear_cache(ontology_acronym='') +``` -After an ontology is cached, the user can access the cache by using the assigned acronym in the place of `target_ontology` and setting the `use_cache` flag to `True`. -To clear the cache, one can call: -`clear_cache(ontology_acronym='')` If no arguments are specified, the entire cache will be cleared. Otherwise, only the ontology with the given acronym will be cleared. -Finally, `cache_exists(ontology_acronym='')` is a simple function that returns `True` if the given acronym exists in the cache, and `False` otherwise. It is worth noting that while ontology URLs can repeat, acronyms must be distinct in a given environment. +Finally, `cache_exists(ontology_acronym='')` is a simple function that returns `True` if the given acronym exists in the cache, and `False` otherwise. 
+ +**_Notes_** +- The `cache_ontology` function returns an object that can be used to directly call the `map_terms` function, as well as `clear_cache` and `cache_exists`. These have the same arguments, except `ontology_target` is no longer specified and there is no `use_cache` option, since it is always True. +- While ontology URLs can be repeatedly used, acronyms must be distinct in a given environment. + +
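A short sketch of the cache-or-reuse pattern described above, mirroring the `ensure_cache_exists` helper used in the test suite (the MONDO URL and acronym are just examples):

```python
import text2term

acronym = "MONDO"
url = "http://purl.obolibrary.org/obo/mondo.owl"

# Cache the ontology only on the first run; later runs reuse the local copy.
if not text2term.cache_exists(acronym):
    text2term.cache_ontology(ontology_url=url, ontology_acronym=acronym)

df = text2term.map_terms(["asthma", "acute bronchitis"],
                         target_ontology=acronym, use_cache=True)

# Remove this ontology from the cache when it is no longer needed.
text2term.clear_cache(acronym)
```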
+ +
+ Input Preprocessing -### Input Preprocessing -As of version 1.2.0, text2term includes regex-based preprocessing functionality for input terms. Specifically, these functions take the input terms and a collection of (user-defined) regular expressions, then match each term to each regular expression to simplify the input term. +text2term includes regular expression-based preprocessing functionality for input terms. There are functions that take the input terms and a collection of (user-defined) regular expressions, then match each term to each regular expression to simplify the input term. -Like the "map" functions above, the two functions differ on whether the input is a file or a list of strings: ```python -preprocess_terms(terms, template_path, output_file='', blocklist_path='', blocklist_char='', rem_duplicates=False) +preprocess_terms(terms, template_path, output_file='', blocklist_path='', + blocklist_char='', rem_duplicates=False) ``` +This returns a dictionary where the keys are the original terms and the values are the preprocessed terms. + ```python -preprocess_tagged_terms(file_path, template_path='', blocklist_path='', blocklist_char='', rem_duplicates=False, separator=';:;') +preprocess_tagged_terms(file_path, template_path='', blocklist_path='', + blocklist_char='', rem_duplicates=False, separator=';:;') ``` -In all cases, the regex templates and blocklist must be stored in a newline-separated file. If an output file is specified, the preprocessed strings are written to that file and the list of preprocessed strings is returned. +This returns a list of `TaggedTerm` objects. -The blocklist functionality allows the user to specify another regex file. If any terms match any regex in blocklist, they are removed from the terms, or, if a blocklist character is specified, replaced with that character for placeholding. -NOTE: As of version 2.1.0, the arguments were changed to "blocklist" from "blacklist". Backwards compatibility is currently supported, but will likely be discontinued at the next major release. +The regex templates file `template_path` and the blocklist `blocklist_path` must each be a newline-separated file. If an output file is specified, the preprocessed strings are written to that file. -The Remove Duplicates `rem_duplicates` functionality will remove all duplicate terms after processing, if set to `True`. -WARNING: Removing duplicates at any point does not guarantee which original term is kept. This is particularly important if original terms have different tags, so user caution is advised. +The blocklist functionality allows specifying another file with regular expressions that, when terms match any such regex in the blocklist, they are removed from the list of terms to map. Alternatively, if a blocklist character is specified, the input is replaced with that character. -The function `preprocess_terms()` returns a dictionary where the keys are the original terms and the values are the preprocessed terms. -The `preprocess_tagged_terms()` function returns a list of TaggedTerm items with the following function contracts: -```python -def __init__(self, term=None, tags=[], original_term=None, source_term_id=None) -def add_tags(self, new_tags) -def update_term(self, term) -def update_source_term_id(self, source_term_id) -def get_original_term(self) -def get_term(self) -def get_tags(self) -def get_source_term_id(self) -``` -As mentioned in the mapping section above, this can then be passed directly to `map_terms`, allowing for easy programmatic usage. 
Note that this allows multiple of the same preprocessed term with different tags.

-**Note on NA values in input**: As of v2.0.3, when the input to text2term is a table file, any rows that contain `NA` values in the specified term column, or in the term ID column (if provided), will be ignored.
+When the input to text2term is a table, any rows that contain `NA` values in the specified term column, or in the term ID column (if provided), will be ignored.

-### Tag Usage
-As of 3.0.0, some tags have additional functionality that is added when attached to a term:
+If an ignore tag `"ignore"` or `"Ignore"` is added to a term, that term will not be mapped to any terms in the ontology. It will only be included in the output if the `incl_unmapped` argument is True. The following values are regarded as ignore tags: `"ignore", "Ignore"`.

-IGNORE:
-   If an ignore tag is added to a term, that term will not be mapped to any terms in the ontology. It will only be included in the output if the `incl_unmapped` argument is True. Here are the following values that count as ignore tags:
-```python
-   IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "]
-```
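As a concrete illustration of the preprocessing flow described above, a small sketch might look like this (assuming both functions are importable from the top-level `text2term` package; the file names are placeholders rather than files shipped with the tool):

```python
import text2term

# Simplify a list of raw strings using user-defined regex templates and a blocklist
processed = text2term.preprocess_terms(["asthma attack", "acute bronchitis (disorder)"],
                                        template_path="templates.txt",
                                        blocklist_path="blocklist.txt",
                                        rem_duplicates=True)
print(processed)  # dict: original term -> preprocessed term

# Preprocess a file of tagged terms and map the resulting TaggedTerm objects directly
tagged_terms = text2term.preprocess_tagged_terms("tagged_terms.txt",
                                                 template_path="templates.txt")
df = text2term.map_terms(source_terms=tagged_terms,
                         target_ontology="http://purl.obolibrary.org/obo/mondo.owl")
```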
## Command Line Usage -After installation, execute the tool from a command line as follows: +After installing, execute the tool from a command line as follows: -`python text2term -s SOURCE -t TARGET [-o OUTPUT] [-m MAPPER] [-csv CSV_INPUT] [-top TOP_MAPPINGS] [-min MIN_SCORE] [-iris BASE_IRIS] [-d EXCL_DEPRECATED] [-g SAVE_TERM_GRAPHS]` +`python text2term [-h] -s SOURCE -t TARGET [-o OUTPUT] [-m MAPPER] [-csv CSV_INPUT] [-sep SEPARATOR] [-top TOP_MAPPINGS] [-min MIN_SCORE] [-iris BASE_IRIS] [-d] [-g] [-c STORE_IN_CACHE] [-type TERM_TYPE] [-u]` To display a help message with descriptions of tool arguments do: @@ -219,7 +245,8 @@ To display a help message with descriptions of tool arguments do: `-t TARGET` Path or URL of 'target' ontology to map source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write `'all'` to search all ontologies. -### Optional arguments +
+ Optional arguments `-o OUTPUT` Path to desired output file for the mappings. @@ -227,14 +254,22 @@ To display a help message with descriptions of tool arguments do: `-csv CSV_INPUT` Indicates a CSV format input—follow with the name of the column containing terms to map, optionally followed by the name of the column containing identifiers for the terms (eg 'my terms,my term ids'). +`-sep SEPARATOR`, Specifies the cell separator to be used when reading a table + `-top TOP_MAPPINGS` Maximum number of top-ranked mappings returned per source term. `-min MIN_SCORE` Minimum similarity score [0,1] for the mappings (1=exact match). `-iris BASE_IRIS` Map only to ontology terms whose IRIs start with a value given in this comma-separated list (eg 'http://www.ebi.ac.uk/efo,http://purl.obolibrary.org/obo/HP)'). -`-d EXCL_DEPRECATED` Exclude ontology terms stated as deprecated via `owl:deprecated true`. +`-d` Exclude ontology terms stated as deprecated via `owl:deprecated true`. + +`-g` Save [vis.js](https://visjs.org) graphs representing the neighborhood of each ontology term. + +`-c STORE_IN_CACHE` Cache the target ontology using the name given here. + +`-type TERM_TYPE` Specify whether to map to ontology classes, properties, or both -`-g SAVE_TERM_GRAPHS` Save [vis.js](https://visjs.org) graphs representing the neighborhood of each ontology term. +`-u` Include all unmapped terms in the output -`-c STORE_IN_CACHE` Using this flag followed by the acronym the ontology should be stored as, the program will same the target ontology to the cache. After that, referencing the acronym in `target` will reference the cache. Examples are above. +
From 289e90616edc6b9b84bf8a56d37faee13c00ffc5 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 4 Jun 2024 19:03:40 -0400 Subject: [PATCH 175/185] Update README.md --- README.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index a93b5f9..2ad6ef7 100644 --- a/README.md +++ b/README.md @@ -123,7 +123,7 @@ The function returns a pandas `DataFrame` containing the generated ontology mapp
Argument Details -`source_terms`**_—Strings to be mapped to an ontology_**, which can be specified as a: +`source_terms`—Strings to be mapped to an ontology, which can be specified as a: 1. list of strings 2. string containing a file path 3. dictionary of terms and associated tags, where each key is a term and the value is a list of tags @@ -132,39 +132,39 @@ The function returns a pandas `DataFrame` containing the generated ontology mapp - If a term is tagged with "Ignore", text2term will not map it - Unmapped terms can still be included in the output if `incl_unmapped` is True -`target_ontology`:str—Path, URL or name of 'target' ontology to map the source terms to +`target_ontology`—Path, URL or name of 'target' ontology to map the source terms to : Ontology names can be given as values to `target_ontology` (eg "EFO" or "CL")--text2term uses [bioregistry](https://bioregistry.io) to get URLs for such names. : When using BioPortal or Zooma, this should be a comma-separated list of ontology acronyms (eg 'EFO,HPO') or **'all'** to search all ontologies. : When the target ontology has been cached, this should be the ontology name given when it was first cached. -`base_iris`:_tuple_—Map only to ontology terms whose IRIs start with one of the strings given in this tuple +`base_iris`—Map only to ontology terms whose IRIs start with one of the strings given in this tuple -`excl_deprecated`:_bool_—Exclude ontology terms stated as deprecated via `owl:deprecated true` +`excl_deprecated`—Exclude ontology terms stated as deprecated via `owl:deprecated true` -`source_terms_ids`:_tuple_—Collection of identifiers for the given source terms +`source_terms_ids`—Collection of identifiers for the given source terms -`csv_column`:_tuple_—Specify the name of the column containing the terms to map, when the input file is a table. Optionally provide a second column name, containing the respective term identifiers +`csv_column`—Specify the name of the column containing the terms to map, when the input file is a table. Optionally provide a second column name, containing the respective term identifiers -`separator`:_str_—Character that separates columns when input is a table (eg '\t' for TSV) +`separator`—Character that separates columns when input is a table (eg '\t' for TSV) -`mapper`:mapper.Mapper—Method used to compare source terms with ontology terms +`mapper`—Method used to compare source terms with ontology terms : One of levenshtein, jaro, jarowinkler, jaccard, fuzzy, tfidf, zooma, bioportal -`max_mappings`:_int_—Maximum number of top-ranked mappings returned per source term +`max_mappings`—Maximum number of top-ranked mappings returned per source term -`min_score`:_float_—Minimum similarity score [0,1] for the mappings (1=exact match) +`min_score`—Minimum similarity score [0,1] for the mappings (1=exact match) -`save_mappings`:_bool_—Save the generated mappings to a file (specified by `output_file`) +`save_mappings`—Save the generated mappings to a file (specified by `output_file`) -`output_file`:_str_—Path to desired output file for the mappings dataframe +`output_file`—Path to desired output file for the mappings dataframe -`save_graphs`:_bool_—Save vis.js graphs representing the neighborhood of each ontology term +`save_graphs`—Save vis.js graphs representing the neighborhood of each ontology term -`use_cache`:_bool_—Use the cache for the ontology +`use_cache`—Use the cache for the ontology -`term_type`:_term.OntologyTermType_—Specifies whether to map to ontology classes, properties or any of the two. 
Possible values are ['class', 'property', 'any'] +`term_type`—Specifies whether to map to ontology classes, properties or any of the two. Possible values are ['class', 'property', 'any'] -`incl_unmapped`:_bool_—Include unmapped terms in the output. If a term has been tagged 'Ignore' or has less than the `min_score`, it is included in the output data frame +`incl_unmapped`—Include unmapped terms in the output. If a term has been tagged 'Ignore' or has less than the `min_score`, it is included in the output data frame
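Putting several of these arguments together, a call might look like the following sketch (the output path is an arbitrary example, not a required location):

```python
import text2term

df = text2term.map_terms(source_terms="test/unstruct_terms.txt",
                         target_ontology="http://purl.obolibrary.org/obo/mondo.owl",
                         excl_deprecated=True,   # skip terms stated as owl:deprecated
                         max_mappings=1,         # keep only the top-ranked mapping per term
                         min_score=0.8,          # drop low-similarity mappings
                         incl_unmapped=True,     # keep unmapped terms in the output
                         save_mappings=True,
                         output_file="mappings.csv")
print(df.head())
```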
From 669823a06df750070f37a6349d99538a0841605f Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 4 Jun 2024 20:35:40 -0400 Subject: [PATCH 176/185] Update formatting of collapsible headers in README --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2ad6ef7..feafcff 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ pip install text2term ## Basic Examples
- Examples of Programmatic Use + Examples of Programmatic Use text2term supports mapping strings specified in different input formats: @@ -46,7 +46,7 @@ dfo = mondo.map_terms(source_terms=["asthma", "acute bronchitis"])
- Examples of Command Line Interface Use + Examples of Command Line Interface Use To show a help message describing all arguments type into a terminal: ```shell @@ -201,7 +201,7 @@ Finally, `cache_exists(ontology_acronym='')` is a simple function that returns `
- Input Preprocessing +

Input Preprocessing

text2term includes regular expression-based preprocessing functionality for input terms. There are functions that take the input terms and a collection of (user-defined) regular expressions, then match each term to each regular expression to simplify the input term. @@ -230,7 +230,7 @@ If an ignore tag `"ignore"` or `"Ignore"` is added to a term, that term will not
-## Command Line Usage +## Command Line Interface Usage After installing, execute the tool from a command line as follows: @@ -246,7 +246,7 @@ To display a help message with descriptions of tool arguments do: `-t TARGET` Path or URL of 'target' ontology to map source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write `'all'` to search all ontologies.
- Optional arguments + Optional arguments `-o OUTPUT` Path to desired output file for the mappings. From 2e27644e5d99f3405c4df5c938136d86dba8d3a5 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 4 Jun 2024 20:40:10 -0400 Subject: [PATCH 177/185] Update formatting of collapsible headers in README --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index feafcff..3c0bcd9 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ pip install text2term ## Basic Examples
- Examples of Programmatic Use + Examples of Programmatic Use text2term supports mapping strings specified in different input formats: @@ -46,7 +46,7 @@ dfo = mondo.map_terms(source_terms=["asthma", "acute bronchitis"])
- Examples of Command Line Interface Use + Examples of Command Line Interface Use To show a help message describing all arguments type into a terminal: ```shell @@ -121,7 +121,7 @@ text2term.map_terms(source_terms, The function returns a pandas `DataFrame` containing the generated ontology mappings.
- Argument Details +

Argument Details

`source_terms`—Strings to be mapped to an ontology, which can be specified as a: 1. list of strings @@ -169,7 +169,7 @@ The function returns a pandas `DataFrame` containing the generated ontology mapp
- Ontology Caching +

Ontology Caching

text2term supports caching ontologies for faster or repeated mapping to the same ontology. An ontology can be cached using the function: @@ -201,7 +201,7 @@ Finally, `cache_exists(ontology_acronym='')` is a simple function that returns `
-

Input Preprocessing

+

Input Preprocessing

text2term includes regular expression-based preprocessing functionality for input terms. There are functions that take the input terms and a collection of (user-defined) regular expressions, then match each term to each regular expression to simplify the input term. @@ -246,7 +246,7 @@ To display a help message with descriptions of tool arguments do: `-t TARGET` Path or URL of 'target' ontology to map source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write `'all'` to search all ontologies.
- Optional arguments + Optional arguments `-o OUTPUT` Path to desired output file for the mappings. From 275181deec7a5fa9a3f207eb6b6960e805e7e6fa Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Tue, 4 Jun 2024 20:49:11 -0400 Subject: [PATCH 178/185] Update formatting of collapsible headers in README --- README.md | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 3c0bcd9..d6e3397 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,9 @@ pip install text2term ## Basic Examples
- Examples of Programmatic Use + Examples of Programmatic Use +### Examples of Programmatic Use text2term supports mapping strings specified in different input formats: ```python @@ -46,8 +47,9 @@ dfo = mondo.map_terms(source_terms=["asthma", "acute bronchitis"])
- Examples of Command Line Interface Use + Examples of Command Line Interface Use +### Examples of Command Line Interface Use To show a help message describing all arguments type into a terminal: ```shell python text2term --help @@ -121,7 +123,9 @@ text2term.map_terms(source_terms, The function returns a pandas `DataFrame` containing the generated ontology mappings.
-

Argument Details

+ Argument Details + +### Argument Details `source_terms`—Strings to be mapped to an ontology, which can be specified as a: 1. list of strings @@ -169,8 +173,9 @@ The function returns a pandas `DataFrame` containing the generated ontology mapp
-

Ontology Caching

+ Ontology Caching +### Ontology Caching text2term supports caching ontologies for faster or repeated mapping to the same ontology. An ontology can be cached using the function: ```python @@ -201,8 +206,9 @@ Finally, `cache_exists(ontology_acronym='')` is a simple function that returns `
-

Input Preprocessing

+ Input Preprocessing +### Input Preprocessing text2term includes regular expression-based preprocessing functionality for input terms. There are functions that take the input terms and a collection of (user-defined) regular expressions, then match each term to each regular expression to simplify the input term. ```python @@ -240,13 +246,15 @@ To display a help message with descriptions of tool arguments do: `python text2term -h` or `python text2term --help` -### Required arguments +### Required Arguments `-s SOURCE` Input file containing 'source' terms to map to ontology terms (list of terms or CSV file). `-t TARGET` Path or URL of 'target' ontology to map source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write `'all'` to search all ontologies.
- Optional arguments + Optional Arguments + +### Optional Arguments `-o OUTPUT` Path to desired output file for the mappings. From 133be1308aeb9531115530c92091c8476f8eeac3 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 5 Jun 2024 10:32:33 -0400 Subject: [PATCH 179/185] Add description of mappers (closes #43) --- README.md | 69 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index d6e3397..2534e5e 100644 --- a/README.md +++ b/README.md @@ -10,44 +10,64 @@ pip install text2term ## Basic Examples
- Examples of Programmatic Use + Examples of Programmatic Mapping -### Examples of Programmatic Use -text2term supports mapping strings specified in different input formats: +### Examples of Programmatic Mapping +text2term supports mapping strings specified in multiple input formats. In the first example, we map strings in a list to an ontology specified by its URL: ```python -import text2term - -# map strings in a list to an ontology specified by its URL +import text2term dfl = text2term.map_terms(source_terms=["asthma", "acute bronchitis"], target_ontology="http://purl.obolibrary.org/obo/mondo.owl") +``` -# map strings listed in a file 'test/unstruct_terms.txt' to an ontology specified by its URL +There is also support for file-based input, for example a file containing a list of strings: +```python dff = text2term.map_terms(source_terms="test/unstruct_terms.txt", target_ontology="http://purl.obolibrary.org/obo/mondo.owl") +``` + +or a table where we can specify the column of terms to map and the table value separator: +```python +dff = text2term.map_terms(source_terms="test/some_table.tsv", + csv_columns=('diseases','optional_ids'), separator="\t", + target_ontology="http://purl.obolibrary.org/obo/mondo.owl") +``` -# map strings in a dictionary with associated tags to an ontology specified by its URL +Finally it is possible map strings in a dictionary with associated tags that are preserved in the output: +```python dfd = text2term.map_terms(source_terms={"asthma":"disease", "acute bronchitis":["disease", "lung"]}, target_ontology="http://purl.obolibrary.org/obo/mondo.owl") ``` -text2term supports caching an ontology for repeated use: +
+ +
+ Examples of Programmatic Caching + +### Examples of Programmatic Caching +text2term supports caching an ontology for repeated use. The next example caches an ontology and gives it a name for use later on ```python -# cache ontology and give it a name for use later on mondo = text2term.cache_ontology(ontology_url="http://purl.obolibrary.org/obo/mondo.owl", ontology_acronym="MONDO") +``` -# now map strings to the cached ontology by specifying as `target_ontology` the name chosen above and the flag `use_cache=True` -dfc = text2term.map_terms(source_terms=["asthma", "acute bronchitis"], target_ontology="MONDO", use_cache=True) +Now we can map strings to the cached ontology by specifying as `target_ontology` the name chosen above and the flag `use_cache=True` -# or more succinctly, use the OntologyCache object `mondo` +```python +dfc = text2term.map_terms(source_terms=["asthma", "acute bronchitis"], + target_ontology="MONDO", use_cache=True) +``` + +More succinctly, we can use the returned `OntologyCache` object `mondo` as such: +```python dfo = mondo.map_terms(source_terms=["asthma", "acute bronchitis"]) ```
- Examples of Command Line Interface Use + Examples of Command Line Interface Use ### Examples of Command Line Interface Use To show a help message describing all arguments type into a terminal: @@ -281,3 +301,24 @@ To display a help message with descriptions of tool arguments do: `-u` Include all unmapped terms in the output
+ + +## Supported Mappers + +The mapping score associated with each mapping is indicative of how similar an input term is to an ontology term (via its labels or synonyms). The mapping/similarity scores generated by text2term are the result of applying one of the following "mappers": + +TF-IDF-based mapper +: [TF-IDF](https://en.wikipedia.org/wiki/Tf–idf), a statistical measure often used in information retrieval, measures how important a word is to a document in a corpus of documents. We first generate TF-IDF-based vectors of the source terms and of labels and synonyms of ontology terms. Then we compute the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) between vectors to determine how similar a source term is to a target term (label or synonym). + +BioPortal Web API-based mapper +: uses an interface to the [BioPortal Annotator](https://bioportal.bioontology.org/annotator) that we built to allow mapping terms to ontologies in the [BioPortal](https://bioportal.bioontology.org) repository. To use it, make sure to specify the target ontology name(s) as they appear in BioPortal. + +: _Note_: there are no confidence scores associated with BioPortal annotations, so we decided to set the mapping score of all mappings to 1. + +Zooma Web API-based mapper +: uses a [Zooma](https://www.ebi.ac.uk/spot/zooma/) interface that we built to allow mapping terms to ontologies in the [Ontology Lookup Service (OLS)](https://www.ebi.ac.uk/ols4) repository. To use it, make sure to specify the target ontology name(s) as they appear in OLS. + +Syntactic distance-based mappers +: text2term provides support for commonly used and popular syntactic (edit) distance metrics. Specifically, we implemented support for Levenshtein, Jaro, Jaro-Winkler, Jaccard, and Indel metrics. We use the [nltk](https://pypi.org/project/nltk/) package to compute Jaccard distances, and [rapidfuzz](https://pypi.org/project/rapidfuzz/) for all others. + +_Note_: syntactic distance-based mappers and Web API-based mappers perform slowly (much slower than the TF-IDF mapper). The former because they do pairwise comparisons between each input string and each ontology term label/synonym. In the Web API-based approaches there are networking and API load overheads. \ No newline at end of file From ae99a2ac53bdff2c875de58a0191eeed69e2527e Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 5 Jun 2024 12:24:06 -0400 Subject: [PATCH 180/185] Update README.md --- README.md | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 2534e5e..3f273dc 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ dfd = text2term.map_terms(source_terms={"asthma":"disease", "acute bronchitis":[ Examples of Programmatic Caching ### Examples of Programmatic Caching -text2term supports caching an ontology for repeated use. The next example caches an ontology and gives it a name for use later on +text2term supports caching an ontology for repeated use. Here we cache an ontology and give it a name for later use: ```python mondo = text2term.cache_ontology(ontology_url="http://purl.obolibrary.org/obo/mondo.owl", ontology_acronym="MONDO") @@ -107,7 +107,7 @@ python text2term.py -s test/unstruct_terms.txt -t test/mondo.owl -iris http://pu While MONDO uses terms from other ontologies such as CHEBI and Uberon, the tool only considers terms whose IRIs start either with "http://purl.obolibrary.org/obo/mondo" or "http://identifiers.org/hgnc". 
--- -Cache an ontology for repeated use, by first running the tool as usual while instructing it to cache the ontology using `-c `: +Cache an ontology for repeated use by running the tool while instructing it to cache the ontology via `-c `: ```shell python text2term -s test/unstruct_terms.txt -t http://purl.obolibrary.org/obo/mondo.owl -c MONDO ``` @@ -157,9 +157,14 @@ The function returns a pandas `DataFrame` containing the generated ontology mapp - Unmapped terms can still be included in the output if `incl_unmapped` is True `target_ontology`—Path, URL or name of 'target' ontology to map the source terms to -: Ontology names can be given as values to `target_ontology` (eg "EFO" or "CL")--text2term uses [bioregistry](https://bioregistry.io) to get URLs for such names. -: When using BioPortal or Zooma, this should be a comma-separated list of ontology acronyms (eg 'EFO,HPO') or **'all'** to search all ontologies. -: When the target ontology has been cached, this should be the ontology name given when it was first cached. + +> [!TIP] +> Ontology names can be given as values to `target_ontology` e.g. "EFO" or "CL"--text2term uses [bioregistry](https://bioregistry.io) to get URLs for such names. +> +> Similarly, when the target ontology has been cached, enter the name used upon caching. + +> [!NOTE] +> When using BioPortal or Zooma, this should be a comma-separated list of ontology acronyms (eg 'EFO,HPO') or **'all'** to search all ontologies. `base_iris`—Map only to ontology terms whose IRIs start with one of the strings given in this tuple @@ -171,8 +176,7 @@ The function returns a pandas `DataFrame` containing the generated ontology mapp `separator`—Character that separates columns when input is a table (eg '\t' for TSV) -`mapper`—Method used to compare source terms with ontology terms - : One of levenshtein, jaro, jarowinkler, jaccard, fuzzy, tfidf, zooma, bioportal +`mapper`—Method used to compare source terms with ontology terms. One of `levenshtein, jaro, jarowinkler, jaccard, fuzzy, tfidf, zooma, bioportal` (see [Supported Mappers](#supported-mappers)) `max_mappings`—Maximum number of top-ranked mappings returned per source term @@ -307,18 +311,22 @@ To display a help message with descriptions of tool arguments do: The mapping score associated with each mapping is indicative of how similar an input term is to an ontology term (via its labels or synonyms). The mapping/similarity scores generated by text2term are the result of applying one of the following "mappers": -TF-IDF-based mapper -: [TF-IDF](https://en.wikipedia.org/wiki/Tf–idf), a statistical measure often used in information retrieval, measures how important a word is to a document in a corpus of documents. We first generate TF-IDF-based vectors of the source terms and of labels and synonyms of ontology terms. Then we compute the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) between vectors to determine how similar a source term is to a target term (label or synonym). +**TF-IDF-based mapper**—[TF-IDF](https://en.wikipedia.org/wiki/Tf–idf) is a statistical measure often used in information retrieval that measures how important a word is to a document in a corpus of documents. We first generate TF-IDF-based vectors of the source terms and of labels and synonyms of ontology terms. Then we compute the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) between vectors to determine how similar a source term is to a target term (label or synonym). 
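The intuition can be illustrated with a small standalone sketch; scikit-learn is used here purely for illustration (this is not the text2term code itself), and character trigrams are just one reasonable tokenization:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

source_terms = ["asthma", "acute bronchitis"]
ontology_labels = ["asthma", "bronchitis", "lung disease", "acute bronchitis"]

# Fit one TF-IDF model over both sets so they share a single n-gram vocabulary
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 3))
vectorizer.fit(source_terms + ontology_labels)

src_vectors = vectorizer.transform(source_terms)
tgt_vectors = vectorizer.transform(ontology_labels)

# Row i holds the similarity of source term i to every ontology label
scores = cosine_similarity(src_vectors, tgt_vectors)
for term, row in zip(source_terms, scores):
    best = row.argmax()
    print(f"'{term}' -> '{ontology_labels[best]}' (score = {row[best]:.2f})")
```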
+ +**BioPortal Web API-based mapper**—uses an interface to the [BioPortal Annotator](https://bioportal.bioontology.org/annotator) that we built to allow mapping terms to ontologies in the [BioPortal](https://bioportal.bioontology.org) repository. + +> [!IMPORTANT] +> Make sure to specify the target ontology name(s) as they appear in BioPortal -BioPortal Web API-based mapper -: uses an interface to the [BioPortal Annotator](https://bioportal.bioontology.org/annotator) that we built to allow mapping terms to ontologies in the [BioPortal](https://bioportal.bioontology.org) repository. To use it, make sure to specify the target ontology name(s) as they appear in BioPortal. +> [!WARNING] +> there are no confidence scores associated with BioPortal annotations, so we decided to set the mapping score of all mappings to 1 -: _Note_: there are no confidence scores associated with BioPortal annotations, so we decided to set the mapping score of all mappings to 1. +**Zooma Web API-based mapper**—uses a [Zooma](https://www.ebi.ac.uk/spot/zooma/) interface that we built to allow mapping terms to ontologies in the [Ontology Lookup Service (OLS)](https://www.ebi.ac.uk/ols4) repository. -Zooma Web API-based mapper -: uses a [Zooma](https://www.ebi.ac.uk/spot/zooma/) interface that we built to allow mapping terms to ontologies in the [Ontology Lookup Service (OLS)](https://www.ebi.ac.uk/ols4) repository. To use it, make sure to specify the target ontology name(s) as they appear in OLS. +> [!IMPORTANT] +> Make sure to specify the target ontology name(s) as they appear in OLS -Syntactic distance-based mappers -: text2term provides support for commonly used and popular syntactic (edit) distance metrics. Specifically, we implemented support for Levenshtein, Jaro, Jaro-Winkler, Jaccard, and Indel metrics. We use the [nltk](https://pypi.org/project/nltk/) package to compute Jaccard distances, and [rapidfuzz](https://pypi.org/project/rapidfuzz/) for all others. +**Syntactic distance-based mappers**—text2term provides support for commonly used and popular syntactic (edit) distance metrics: Levenshtein, Jaro, Jaro-Winkler, Jaccard, and Indel. We use the [nltk](https://pypi.org/project/nltk/) package to compute Jaccard distances and [rapidfuzz](https://pypi.org/project/rapidfuzz/) to compute all others. -_Note_: syntactic distance-based mappers and Web API-based mappers perform slowly (much slower than the TF-IDF mapper). The former because they do pairwise comparisons between each input string and each ontology term label/synonym. In the Web API-based approaches there are networking and API load overheads. \ No newline at end of file +> [!NOTE] +> Syntactic distance-based mappers and Web API-based mappers perform slowly (much slower than the TF-IDF mapper). The former because they do pairwise comparisons between each input string and each ontology term label/synonym. In the Web API-based approaches there are networking and API load overheads. 
\ No newline at end of file From c6384b4eaa95371fde61a4a5e9b89e1477bd151f Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 5 Jun 2024 12:43:31 -0400 Subject: [PATCH 181/185] Fix some tooltip markup --- README.md | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 3f273dc..bd850d4 100644 --- a/README.md +++ b/README.md @@ -159,12 +159,9 @@ The function returns a pandas `DataFrame` containing the generated ontology mapp `target_ontology`—Path, URL or name of 'target' ontology to map the source terms to > [!TIP] -> Ontology names can be given as values to `target_ontology` e.g. "EFO" or "CL"--text2term uses [bioregistry](https://bioregistry.io) to get URLs for such names. -> -> Similarly, when the target ontology has been cached, enter the name used upon caching. +> Ontology names can be given as values to `target_ontology` e.g. "EFO" or "CL"--text2term uses [bioregistry](https://bioregistry.io) to get URLs for such names. Similarly, when the target ontology has been cached, enter the name used upon caching. -> [!NOTE] -> When using BioPortal or Zooma, this should be a comma-separated list of ontology acronyms (eg 'EFO,HPO') or **'all'** to search all ontologies. +When using BioPortal or Zooma, this should be a comma-separated list of ontology acronyms (eg 'EFO,HPO') or **'all'** to search all ontologies. `base_iris`—Map only to ontology terms whose IRIs start with one of the strings given in this tuple @@ -190,7 +187,7 @@ The function returns a pandas `DataFrame` containing the generated ontology mapp `use_cache`—Use the cache for the ontology -`term_type`—Specifies whether to map to ontology classes, properties or any of the two. Possible values are ['class', 'property', 'any'] +`term_type`—Specifies whether to map to ontology classes, properties or both. Possible values are `class, property, any` `incl_unmapped`—Include unmapped terms in the output. If a term has been tagged 'Ignore' or has less than the `min_score`, it is included in the output data frame @@ -223,9 +220,11 @@ text2term.clear_cache(ontology_acronym='') If no arguments are specified, the entire cache will be cleared. Otherwise, only the ontology with the given acronym will be cleared. Finally, `cache_exists(ontology_acronym='')` is a simple function that returns `True` if the given acronym exists in the cache, and `False` otherwise. -**_Notes_** -- The `cache_ontology` function returns an object that can be used to directly call the `map_terms` function, as well as `clear_cache` and `cache_exists`. These have the same arguments, except `ontology_target` is no longer specified and there is no `use_cache` option, since it is always True. -- While ontology URLs can be repeatedly used, acronyms must be distinct in a given environment. +> [!TIP] +> The `cache_ontology` function returns an object that can be used to directly call the `map_terms` function, as well as `clear_cache` and `cache_exists`. These have the same arguments, except `ontology_target` is no longer specified and there is no `use_cache` option, since it is always True. + +> [!NOTE] +> While ontology URLs can be repeatedly used, acronyms must be distinct in a given environment.
@@ -286,7 +285,7 @@ To display a help message with descriptions of tool arguments do: `-csv CSV_INPUT` Indicates a CSV format input—follow with the name of the column containing terms to map, optionally followed by the name of the column containing identifiers for the terms (eg 'my terms,my term ids'). -`-sep SEPARATOR`, Specifies the cell separator to be used when reading a table +`-sep SEPARATOR` Specifies the cell separator to be used when reading a table `-top TOP_MAPPINGS` Maximum number of top-ranked mappings returned per source term. @@ -309,22 +308,19 @@ To display a help message with descriptions of tool arguments do: ## Supported Mappers -The mapping score associated with each mapping is indicative of how similar an input term is to an ontology term (via its labels or synonyms). The mapping/similarity scores generated by text2term are the result of applying one of the following "mappers": +The mapping score of each mapping is indicative of how similar an input term is to an ontology term (via its labels or synonyms). The mapping/similarity scores generated by text2term are the result of applying one of the following "mappers": **TF-IDF-based mapper**—[TF-IDF](https://en.wikipedia.org/wiki/Tf–idf) is a statistical measure often used in information retrieval that measures how important a word is to a document in a corpus of documents. We first generate TF-IDF-based vectors of the source terms and of labels and synonyms of ontology terms. Then we compute the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) between vectors to determine how similar a source term is to a target term (label or synonym). **BioPortal Web API-based mapper**—uses an interface to the [BioPortal Annotator](https://bioportal.bioontology.org/annotator) that we built to allow mapping terms to ontologies in the [BioPortal](https://bioportal.bioontology.org) repository. -> [!IMPORTANT] -> Make sure to specify the target ontology name(s) as they appear in BioPortal - > [!WARNING] -> there are no confidence scores associated with BioPortal annotations, so we decided to set the mapping score of all mappings to 1 +> There are no scores associated with BioPortal annotations, so the score of all mappings is always 1 **Zooma Web API-based mapper**—uses a [Zooma](https://www.ebi.ac.uk/spot/zooma/) interface that we built to allow mapping terms to ontologies in the [Ontology Lookup Service (OLS)](https://www.ebi.ac.uk/ols4) repository. > [!IMPORTANT] -> Make sure to specify the target ontology name(s) as they appear in OLS +> When using the BioPortal or Zooma interfaces, make sure to specify the target ontology name(s) as they appear in BioPortal or OLS, respectively **Syntactic distance-based mappers**—text2term provides support for commonly used and popular syntactic (edit) distance metrics: Levenshtein, Jaro, Jaro-Winkler, Jaccard, and Indel. We use the [nltk](https://pypi.org/project/nltk/) package to compute Jaccard distances and [rapidfuzz](https://pypi.org/project/rapidfuzz/) to compute all others. 
From 38503b37a46576c157a57a2d07a184a173b64066 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 5 Jun 2024 12:49:33 -0400 Subject: [PATCH 182/185] Update README.md --- README.md | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index bd850d4..feac454 100644 --- a/README.md +++ b/README.md @@ -46,13 +46,13 @@ dfd = text2term.map_terms(source_terms={"asthma":"disease", "acute bronchitis":[ Examples of Programmatic Caching ### Examples of Programmatic Caching -text2term supports caching an ontology for repeated use. Here we cache an ontology and give it a name for later use: +text2term supports caching an ontology for repeated use. Here we cache an ontology and give it a name: ```python mondo = text2term.cache_ontology(ontology_url="http://purl.obolibrary.org/obo/mondo.owl", ontology_acronym="MONDO") ``` -Now we can map strings to the cached ontology by specifying as `target_ontology` the name chosen above and the flag `use_cache=True` +The given name acts as a reference. Now we can map strings to the cached ontology by specifying as `target_ontology` the name specified above and the flag `use_cache=True` ```python dfc = text2term.map_terms(source_terms=["asthma", "acute bronchitis"], @@ -156,10 +156,7 @@ The function returns a pandas `DataFrame` containing the generated ontology mapp - If a term is tagged with "Ignore", text2term will not map it - Unmapped terms can still be included in the output if `incl_unmapped` is True -`target_ontology`—Path, URL or name of 'target' ontology to map the source terms to - -> [!TIP] -> Ontology names can be given as values to `target_ontology` e.g. "EFO" or "CL"--text2term uses [bioregistry](https://bioregistry.io) to get URLs for such names. Similarly, when the target ontology has been cached, enter the name used upon caching. +`target_ontology`—Path, URL or name of 'target' ontology to map the source terms to. Ontology names can be given as values to `target_ontology` e.g. "EFO" or "CL"--text2term uses [bioregistry](https://bioregistry.io) to get URLs for such names. Similarly, when the target ontology has been cached, enter the name used upon caching. When using BioPortal or Zooma, this should be a comma-separated list of ontology acronyms (eg 'EFO,HPO') or **'all'** to search all ontologies. @@ -187,7 +184,7 @@ When using BioPortal or Zooma, this should be a comma-separated list of ontology `use_cache`—Use the cache for the ontology -`term_type`—Specifies whether to map to ontology classes, properties or both. Possible values are `class, property, any` +`term_type`—Specifies whether to map to ontology classes, properties or both. One of `class, property, any` `incl_unmapped`—Include unmapped terms in the output. If a term has been tagged 'Ignore' or has less than the `min_score`, it is included in the output data frame @@ -220,11 +217,9 @@ text2term.clear_cache(ontology_acronym='') If no arguments are specified, the entire cache will be cleared. Otherwise, only the ontology with the given acronym will be cleared. Finally, `cache_exists(ontology_acronym='')` is a simple function that returns `True` if the given acronym exists in the cache, and `False` otherwise. -> [!TIP] -> The `cache_ontology` function returns an object that can be used to directly call the `map_terms` function, as well as `clear_cache` and `cache_exists`. These have the same arguments, except `ontology_target` is no longer specified and there is no `use_cache` option, since it is always True. 
- -> [!NOTE] -> While ontology URLs can be repeatedly used, acronyms must be distinct in a given environment. +**_Notes:_** +- The `cache_ontology` function returns an object that can be used to directly call the `map_terms` function, as well as `clear_cache` and `cache_exists`. These have the same arguments, except `ontology_target` is no longer specified and there is no `use_cache` option, since it is always True. +- While ontology URLs can be repeatedly used, acronyms must be distinct in a given environment.
From 0300f2b824d42a5cf79a444f6e9df62a125a36dd Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 5 Jun 2024 12:55:28 -0400 Subject: [PATCH 183/185] Update README.md --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index feac454..ac35740 100644 --- a/README.md +++ b/README.md @@ -156,9 +156,9 @@ The function returns a pandas `DataFrame` containing the generated ontology mapp - If a term is tagged with "Ignore", text2term will not map it - Unmapped terms can still be included in the output if `incl_unmapped` is True -`target_ontology`—Path, URL or name of 'target' ontology to map the source terms to. Ontology names can be given as values to `target_ontology` e.g. "EFO" or "CL"--text2term uses [bioregistry](https://bioregistry.io) to get URLs for such names. Similarly, when the target ontology has been cached, enter the name used upon caching. +`target_ontology`—Path, URL or name of 'target' ontology to map the source terms to. Ontology names can be given as values to `target_ontology` e.g. "EFO" or "CL"—text2term uses [bioregistry](https://bioregistry.io) to get URLs for such names. Similarly, when the target ontology has been cached, enter the name used upon caching. -When using BioPortal or Zooma, this should be a comma-separated list of ontology acronyms (eg 'EFO,HPO') or **'all'** to search all ontologies. +When using the BioPortal or Zooma interfaces, the value for `target_ontology` should be a comma-separated list of ontology acronyms (eg 'EFO,HPO') or **'all'** to search all ontologies. `base_iris`—Map only to ontology terms whose IRIs start with one of the strings given in this tuple @@ -303,21 +303,21 @@ To display a help message with descriptions of tool arguments do: ## Supported Mappers -The mapping score of each mapping is indicative of how similar an input term is to an ontology term (via its labels or synonyms). The mapping/similarity scores generated by text2term are the result of applying one of the following "mappers": +The mapping score of each mapping indicates how similar an input term is to an ontology term (via its labels or synonyms). The mapping scores generated by text2term are the result of applying one of the following _mappers_: **TF-IDF-based mapper**—[TF-IDF](https://en.wikipedia.org/wiki/Tf–idf) is a statistical measure often used in information retrieval that measures how important a word is to a document in a corpus of documents. We first generate TF-IDF-based vectors of the source terms and of labels and synonyms of ontology terms. Then we compute the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) between vectors to determine how similar a source term is to a target term (label or synonym). -**BioPortal Web API-based mapper**—uses an interface to the [BioPortal Annotator](https://bioportal.bioontology.org/annotator) that we built to allow mapping terms to ontologies in the [BioPortal](https://bioportal.bioontology.org) repository. +**Syntactic distance-based mappers**—text2term provides support for commonly used and popular syntactic (edit) distance metrics: Levenshtein, Jaro, Jaro-Winkler, Jaccard, and Indel. We use the [nltk](https://pypi.org/project/nltk/) package to compute Jaccard distances and [rapidfuzz](https://pypi.org/project/rapidfuzz/) to compute all others. 
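For a rough sense of what these metrics return, here is a standalone sketch (again not the text2term code path; the trigram tokenization used for the Jaccard comparison is an assumption made for the example):

```python
from rapidfuzz.distance import Levenshtein, JaroWinkler
from nltk.metrics.distance import jaccard_distance

source, label = "asthma", "asthma attack"

# rapidfuzz similarities are normalized to [0, 1], where 1 means an exact match
print(Levenshtein.normalized_similarity(source, label))
print(JaroWinkler.normalized_similarity(source, label))

# Jaccard similarity computed over sets of character trigrams
def trigrams(text):
    return {text[i:i + 3] for i in range(len(text) - 2)}

print(1 - jaccard_distance(trigrams(source), trigrams(label)))
```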
+ +**BioPortal Web API-based mapper**—uses an interface to the [BioPortal Annotator](https://bioportal.bioontology.org/annotator) that we built to allow mapping terms in bulk to ontologies in the [BioPortal](https://bioportal.bioontology.org) repository. > [!WARNING] > There are no scores associated with BioPortal annotations, so the score of all mappings is always 1 -**Zooma Web API-based mapper**—uses a [Zooma](https://www.ebi.ac.uk/spot/zooma/) interface that we built to allow mapping terms to ontologies in the [Ontology Lookup Service (OLS)](https://www.ebi.ac.uk/ols4) repository. +**Zooma Web API-based mapper**—uses a [Zooma](https://www.ebi.ac.uk/spot/zooma/) interface that we built to allow mapping terms in bulk to ontologies in the [Ontology Lookup Service (OLS)](https://www.ebi.ac.uk/ols4) repository. > [!IMPORTANT] -> When using the BioPortal or Zooma interfaces, make sure to specify the target ontology name(s) as they appear in BioPortal or OLS, respectively - -**Syntactic distance-based mappers**—text2term provides support for commonly used and popular syntactic (edit) distance metrics: Levenshtein, Jaro, Jaro-Winkler, Jaccard, and Indel. We use the [nltk](https://pypi.org/project/nltk/) package to compute Jaccard distances and [rapidfuzz](https://pypi.org/project/rapidfuzz/) to compute all others. +> When using the BioPortal or Zooma interfaces, make sure to specify the target ontology name(s) as they appear in BioPortal or OLS, respectively > [!NOTE] -> Syntactic distance-based mappers and Web API-based mappers perform slowly (much slower than the TF-IDF mapper). The former because they do pairwise comparisons between each input string and each ontology term label/synonym. In the Web API-based approaches there are networking and API load overheads. \ No newline at end of file +> Syntactic distance-based mappers and Web API-based mappers perform slowly (much slower than the TF-IDF mapper). The former because they do pairwise comparisons between each input string and each ontology term label/synonym. In the Web API-based approaches there are networking and API load overheads \ No newline at end of file From fad6ca2112636fc416abf5ec3e9c9aadd0fbac82 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 5 Jun 2024 12:57:57 -0400 Subject: [PATCH 184/185] Update README.md --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index ac35740..f78e361 100644 --- a/README.md +++ b/README.md @@ -265,34 +265,34 @@ To display a help message with descriptions of tool arguments do: `python text2term -h` or `python text2term --help` ### Required Arguments -`-s SOURCE` Input file containing 'source' terms to map to ontology terms (list of terms or CSV file). +`-s SOURCE` Input file containing 'source' terms to map to ontology terms (list of terms or CSV file) -`-t TARGET` Path or URL of 'target' ontology to map source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write `'all'` to search all ontologies. +`-t TARGET` Path or URL of 'target' ontology to map source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write `'all'` to search all ontologies
Optional Arguments ### Optional Arguments -`-o OUTPUT` Path to desired output file for the mappings. +`-o OUTPUT` Path to desired output file for the mappings -`-m MAPPER` Method used to compare source terms with ontology terms. One of: *levenshtein, jaro, jarowinkler, jaccard, indel, fuzzy, tfidf, zooma, bioportal*. +`-m MAPPER` Method used to compare source terms with ontology terms. One of: *levenshtein, jaro, jarowinkler, jaccard, indel, fuzzy, tfidf, zooma, bioportal* -`-csv CSV_INPUT` Indicates a CSV format input—follow with the name of the column containing terms to map, optionally followed by the name of the column containing identifiers for the terms (eg 'my terms,my term ids'). +`-csv CSV_INPUT` Indicates a CSV format input—follow with the name of the column containing terms to map, optionally followed by the name of the column containing identifiers for the terms (eg 'my terms,my term ids') `-sep SEPARATOR` Specifies the cell separator to be used when reading a table -`-top TOP_MAPPINGS` Maximum number of top-ranked mappings returned per source term. +`-top TOP_MAPPINGS` Maximum number of top-ranked mappings returned per source term -`-min MIN_SCORE` Minimum similarity score [0,1] for the mappings (1=exact match). +`-min MIN_SCORE` Minimum similarity score [0,1] for the mappings (1=exact match) -`-iris BASE_IRIS` Map only to ontology terms whose IRIs start with a value given in this comma-separated list (eg 'http://www.ebi.ac.uk/efo,http://purl.obolibrary.org/obo/HP)'). +`-iris BASE_IRIS` Map only to ontology terms whose IRIs start with a value given in this comma-separated list (eg 'http://www.ebi.ac.uk/efo,http://purl.obolibrary.org/obo/HP)') -`-d` Exclude ontology terms stated as deprecated via `owl:deprecated true`. +`-d` Exclude ontology terms stated as deprecated via `owl:deprecated true` -`-g` Save [vis.js](https://visjs.org) graphs representing the neighborhood of each ontology term. +`-g` Save [vis.js](https://visjs.org) graphs representing the neighborhood of each ontology term -`-c STORE_IN_CACHE` Cache the target ontology using the name given here. +`-c STORE_IN_CACHE` Cache the target ontology using the name given here `-type TERM_TYPE` Specify whether to map to ontology classes, properties, or both From 37e260574a496c3b5239c3d7f6d4ec6813098413 Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Wed, 5 Jun 2024 14:34:31 -0400 Subject: [PATCH 185/185] Remove collapsible headers they don't render properly in GitHub Pages --- README.md | 35 ++++------------------------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index f78e361..d8c474b 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,6 @@ pip install text2term ``` ## Basic Examples -
- Examples of Programmatic Mapping - ### Examples of Programmatic Mapping text2term supports mapping strings specified in multiple input formats. In the first example, we map strings in a list to an ontology specified by its URL: @@ -40,10 +37,6 @@ dfd = text2term.map_terms(source_terms={"asthma":"disease", "acute bronchitis":[ target_ontology="http://purl.obolibrary.org/obo/mondo.owl") ``` -
- -
- Examples of Programmatic Caching ### Examples of Programmatic Caching text2term supports caching an ontology for repeated use. Here we cache an ontology and give it a name: @@ -63,12 +56,8 @@ More succinctly, we can use the returned `OntologyCache` object `mondo` as such: ```python dfo = mondo.map_terms(source_terms=["asthma", "acute bronchitis"]) ``` -
-
- Examples of Command Line Interface Use - ### Examples of Command Line Interface Use To show a help message describing all arguments type into a terminal: ```shell @@ -117,8 +106,6 @@ Now the ontology is cached and we can refer to it as the target ontology using t python text2term -s test/unstruct_terms.txt -t MONDO ``` -
- ## Programmatic Usage After installing and importing to a Python environment, the main function is `map_terms`: @@ -142,9 +129,6 @@ text2term.map_terms(source_terms, ``` The function returns a pandas `DataFrame` containing the generated ontology mappings. -
- Argument Details - ### Argument Details `source_terms`—Strings to be mapped to an ontology, which can be specified as a: @@ -188,10 +172,6 @@ When using the BioPortal or Zooma interfaces, the value for `target_ontology` sh `incl_unmapped`—Include unmapped terms in the output. If a term has been tagged 'Ignore' or has less than the `min_score`, it is included in the output data frame -
- -
- Ontology Caching ### Ontology Caching text2term supports caching ontologies for faster or repeated mapping to the same ontology. An ontology can be cached using the function: @@ -217,14 +197,12 @@ text2term.clear_cache(ontology_acronym='') If no arguments are specified, the entire cache will be cleared. Otherwise, only the ontology with the given acronym will be cleared. Finally, `cache_exists(ontology_acronym='')` is a simple function that returns `True` if the given acronym exists in the cache, and `False` otherwise. -**_Notes:_** -- The `cache_ontology` function returns an object that can be used to directly call the `map_terms` function, as well as `clear_cache` and `cache_exists`. These have the same arguments, except `ontology_target` is no longer specified and there is no `use_cache` option, since it is always True. -- While ontology URLs can be repeatedly used, acronyms must be distinct in a given environment. +> [!NOTE] +> The `cache_ontology` function returns an object that can be used to directly call the `map_terms` function, as well as `clear_cache` and `cache_exists`. These have the same arguments, except `ontology_target` is no longer specified and there is no `use_cache` option, since it is always True. -
+> [!CAUTION] +> While ontology URLs can be repeatedly used, acronyms must be distinct in a given environment. -
- Input Preprocessing ### Input Preprocessing text2term includes regular expression-based preprocessing functionality for input terms. There are functions that take the input terms and a collection of (user-defined) regular expressions, then match each term to each regular expression to simplify the input term. @@ -252,7 +230,6 @@ When the input to text2term is a table, any rows that contain `NA` values in the If an ignore tag `"ignore"` or `"Ignore"` is added to a term, that term will not be mapped to any terms in the ontology. It will only be included in the output if the `incl_unmapped` argument is True. The following values are regarded as ignore tags: `"ignore", "Ignore". -
## Command Line Interface Usage @@ -269,8 +246,6 @@ To display a help message with descriptions of tool arguments do: `-t TARGET` Path or URL of 'target' ontology to map source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write `'all'` to search all ontologies -
- Optional Arguments ### Optional Arguments @@ -298,8 +273,6 @@ To display a help message with descriptions of tool arguments do: `-u` Include all unmapped terms in the output -
- ## Supported Mappers