diff --git a/.github/workflows/github_test.yml b/.github/workflows/github_test.yml new file mode 100644 index 0000000..947449f --- /dev/null +++ b/.github/workflows/github_test.yml @@ -0,0 +1,45 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: GitHub Unit Testing +run-name: Unit Testing on ${{ github.event_name }} + +on: + push: + branches: [ "development" ] + pull_request: + branches: [ "main" ] + +permissions: + contents: read + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + cache: 'pip' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install flake8 pytest wheel + - name: Install text2term + run: | + pip install -e . + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + cd test + python -m unittest simple_tests diff --git a/.github/workflows/upload_pypi.yml b/.github/workflows/upload_pypi.yml new file mode 100644 index 0000000..eef2a4f --- /dev/null +++ b/.github/workflows/upload_pypi.yml @@ -0,0 +1,45 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Upload PyPI +run-name: Upload ${{ github.event.release.tag_name }} to PyPI + +on: + release: + types: [published] + +permissions: + contents: write + id-token: write + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + cache: 'pip' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install flake8 twine sdist wheel build + - name: Install text2term + run: | + pip install -e . + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Build dist/ + run: | + python -m build --sdist --wheel --no-isolation --outdir dist/ . 
+ - name: Upload to pypi + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/upload_testpypi.yml b/.github/workflows/upload_testpypi.yml new file mode 100644 index 0000000..82fec26 --- /dev/null +++ b/.github/workflows/upload_testpypi.yml @@ -0,0 +1,47 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Upload Test PyPI +run-name: Upload ${{ github.event.release.tag_name }} to Test PyPI + +on: + release: + types: [published] + +permissions: + contents: write + id-token: write + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + cache: 'pip' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install flake8 twine sdist wheel build + - name: Install text2term + run: | + pip install -e . + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Build dist/ + run: | + python -m build --sdist --wheel --no-isolation --outdir dist/ . + - name: Upload to pypi + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ diff --git a/.gitignore b/.gitignore index c67a9c3..aa2a1c4 100644 --- a/.gitignore +++ b/.gitignore @@ -84,6 +84,13 @@ ipython_config.py # pyenv .python-version +# For PyPi upload +make-pypi.sh +.pypirc + +# Cache should not be uploaded +cache/ + # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
# However, in case of collaboration, if having platform-specific dependencies or dependencies @@ -130,5 +137,4 @@ dmypy.json # Other .idea -.DS_Store -test/* \ No newline at end of file +.DS_Store \ No newline at end of file diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..8d044c5 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,32 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.11" + # You can also specify other tool versions: + # nodejs: "19" + # rust: "1.64" + # golang: "1.19" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/source/conf.py + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: requirements.txt \ No newline at end of file diff --git a/LICENSE b/LICENSE index d54e365..c39619b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2021 ccb-hms +Copyright (c) 2022 Center for Computational Biomedicine, Harvard Medical School Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README-UI.md b/README-UI.md new file mode 100644 index 0000000..7846136 --- /dev/null +++ b/README-UI.md @@ -0,0 +1,90 @@ +# ontology-mapper-ui +The following information pertains to the text2term UI, which is written [here](https://github.com/ccb-hms/ontology-mapper-ui) and runs online [here](https://text2term.hms.harvard.edu/). It supports fewer features than the base package does, but provides a user interface for non-programmers. + +### Running Locally via Node + Python + +##### Requirements + +- Node >= 16.0.0 +- npm >= 8.0.0 +- Python >= 3.9.0 +- pip >= 21.0.0 +- text2term >= 4.1.2 + +**\*** These are the versions I have that work; while I know Python 3.9 or higher is necessary, the others may not strictly require the listed versions. + +**\*\*** If you are running this locally on Google Chrome, you will likely run into issues with CORS (Cross-Origin Requests) that I have been unable to completely resolve. I would recommend using a different browser, using the Docker method, or finding some way to disable CORS on Chrome while running this. + +#### Instructions + +##### Initial Setup + +When first cloned, run the command: + + +``` +npm install +``` + +to install all necessary packages for the React frontend. + +Next, go into the `flask-api` folder (perhaps by running `cd flask-api`) and run + +``` +pip install -r requirements-flask.txt +``` + +to install necessary packages for the Flask api. + +##### Running + +To run, make sure you are in the root of the repository and run, in two separate command line instances, the command + +``` +npm start +``` + +to start the front-end, which can be seen at `localhost:3000`, and the command + +``` +npm run flask-api +``` + +to start the back-end, which can be interacted with at `localhost:5000`. 
+ +### Running Locally via Docker + +#### Requirements + +- Docker + +#### Instructions + +##### Initial Setup + +Before running, make sure you have the latest version of the repository built by running the command + +``` +docker-compose build +``` + +Docker should build two images: + +- `ontology-mapper-api`: the Flask backend API +- `ontology-mapper-client`: the React frontend + +##### Running + +To run the website, run the command: + +``` +docker-compose up +``` + +Docker should build two containers corresponding to the two images. + +In a browser, navigate to `localhost:8602` to see the front-end. + +### Acknowledgements + +Initial setup of React and Flask and Dockerization aided by an [article series](https://blog.miguelgrinberg.com/post/how-to-dockerize-a-react-flask-project) by Miguel Grinberg. \ No newline at end of file diff --git a/README.md b/README.md index f407474..d8c474b 100644 --- a/README.md +++ b/README.md @@ -1,45 +1,296 @@ -# text2term Ontology Mapper +# *text2term* ontology mapper +A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in ontologies. -A tool for mapping (uncontrolled) terms to ontology terms to facilitate semantic integration. +## Installation +Install package using **pip**: -## Usage +``` +pip install text2term +``` +## Basic Examples -`text2term.py -s SOURCE -t TARGET [-o OUTPUT] [-top TOP_MAPPINGS] [-min MIN_SCORE] [-iris BASE_IRIS] [-d EXCL_DEPRECATED] [-i INCL_INDIVIDUALS]` +### Examples of Programmatic Mapping +text2term supports mapping strings specified in multiple input formats. In the first example, we map strings in a list to an ontology specified by its URL: -### Required arguments -`-s SOURCE` Input file containing list of 'source' terms to map to ontology terms (one per line). +```python +import text2term +dfl = text2term.map_terms(source_terms=["asthma", "acute bronchitis"], + target_ontology="http://purl.obolibrary.org/obo/mondo.owl") +``` -`-t TARGET` Path or URL of 'target' ontology to map the source terms to. +There is also support for file-based input, for example a file containing a list of strings: +```python +dff = text2term.map_terms(source_terms="test/unstruct_terms.txt", + target_ontology="http://purl.obolibrary.org/obo/mondo.owl") +``` -### Optional arguments +or a table where we can specify the column of terms to map and the table value separator: +```python +dff = text2term.map_terms(source_terms="test/some_table.tsv", + csv_columns=('diseases','optional_ids'), separator="\t", + target_ontology="http://purl.obolibrary.org/obo/mondo.owl") +``` -`-o OUTPUT` Path to desired output file for the mappings. +Finally it is possible map strings in a dictionary with associated tags that are preserved in the output: +```python +dfd = text2term.map_terms(source_terms={"asthma":"disease", "acute bronchitis":["disease", "lung"]}, + target_ontology="http://purl.obolibrary.org/obo/mondo.owl") +``` -`-top TOP_MAPPINGS` Maximum number of top-ranked mappings returned per source term. -`-min MIN_SCORE` Minimum score [0,1] for the mappings (0=dissimilar, 1=exact match). +### Examples of Programmatic Caching +text2term supports caching an ontology for repeated use. Here we cache an ontology and give it a name: +```python +mondo = text2term.cache_ontology(ontology_url="http://purl.obolibrary.org/obo/mondo.owl", + ontology_acronym="MONDO") +``` -`-iris BASE_IRIS` Map only to terms whose IRIs start with any IRI given in this comma-separated list. +The given name acts as a reference. 
Now we can map strings to the cached ontology by specifying as `target_ontology` the name specified above and the flag `use_cache=True` -`-d EXCL_DEPRECATED` Exclude terms stated as deprecated via owl:deprecated. +```python +dfc = text2term.map_terms(source_terms=["asthma", "acute bronchitis"], + target_ontology="MONDO", use_cache=True) +``` -`-i INCL_INDIVIDUALS` Include ontology individuals in addition to classes. +More succinctly, we can use the returned `OntologyCache` object `mondo` as such: +```python +dfo = mondo.map_terms(source_terms=["asthma", "acute bronchitis"]) +``` -## Examples -The basic use of the tool requires a `source` file containing a list of terms to map to the given `target` ontology: -`python text2term.py -s unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl` +### Examples of Command Line Interface Use +To show a help message describing all arguments type into a terminal: +```shell +python text2term --help +``` -Specify an output file where the mappings should be saved using `-o`: -`python text2term.py -s unstruct_terms.txt -t efo.owl -o /Documents/my-mappings.csv` +The basic use of text2term requires a `source` file containing the terms to map to a given `target` ontology: +```shell +python text2term -s test/unstruct_terms.txt -t http://purl.obolibrary.org/obo/mondo.owl +``` +--- +Map to a local ontology and specify an output file where the mappings should be saved using `-o`: +```shell +python text2term -s test/unstruct_terms.txt -t test/mondo.owl -o test/mymappings.csv +``` + +--- Set the minimum acceptable similarity score for mapping each given term to an ontology term using `-min`: -`python text2term.py -s unstruct_terms.txt -t efo.owl -min 0.8` +```shell +python text2term -s test/unstruct_terms.txt -t test/mondo.owl -min 0.8 +``` The mapped terms returned will have been determined to be 0.8 similar to their source terms in a 0-1 scale. +--- Exclude deprecated ontology terms (declared as such via *owl:deprecated true*) using `-d`: -`python text2term.py -s unstruct_terms.txt -t efo.owl -d` +```shell +python text2term -s test/unstruct_terms.txt -t test/mondo.owl -d +``` +--- Limit search to only terms whose IRIs start with any IRI given in a list specified using `-iris`: -`python text2term.py -s unstruct_terms.txt -t efo.owl -iris http://www.ebi.ac.uk/efo/EFO,http://purl.obolibrary.org/obo/HP` -Here, because EFO reuses terms from other ontologies such as HP and GO, the HP terms would be included but the GO terms would be excluded. +```shell +python text2term.py -s test/unstruct_terms.txt -t test/mondo.owl -iris http://purl.obolibrary.org/obo/mondo,http://identifiers.org/hgnc +``` +While MONDO uses terms from other ontologies such as CHEBI and Uberon, the tool only considers terms whose IRIs start either with "http://purl.obolibrary.org/obo/mondo" or "http://identifiers.org/hgnc". 
+
+---
+Cache an ontology for repeated use by running the tool while instructing it to cache the ontology via `-c`:
+```shell
+python text2term -s test/unstruct_terms.txt -t http://purl.obolibrary.org/obo/mondo.owl -c MONDO
+```
+
+Now the ontology is cached and we can refer to it as the target ontology using the name given beforehand:
+```shell
+python text2term -s test/unstruct_terms.txt -t MONDO
+```
+
+
+## Programmatic Usage
+After installing and importing to a Python environment, the main function is `map_terms`:
+
+```python
+text2term.map_terms(source_terms,
+                    target_ontology='http://some.ontology/v1.owl',
+                    base_iris=(),
+                    csv_columns=(),
+                    excl_deprecated=False,
+                    max_mappings=3,
+                    mapper=Mapper.TFIDF,
+                    min_score=0.3,
+                    output_file='',
+                    save_graphs=False,
+                    save_mappings=False,
+                    separator=',',
+                    use_cache=False,
+                    term_type=OntologyTermType.CLASS,
+                    incl_unmapped=False)
+```
+The function returns a pandas `DataFrame` containing the generated ontology mappings.
+
+### Argument Details
+
+`source_terms`—Strings to be mapped to an ontology, which can be specified as a:
+1. list of strings
+2. string containing a file path
+3. dictionary of terms and associated tags, where each key is a term and the value is a list of tags
+4. list of `TaggedTerm` objects
+   - Tags do not affect the mapping, they are simply added to the output dataframe
+   - If a term is tagged with "Ignore", text2term will not map it
+   - Unmapped terms can still be included in the output if `incl_unmapped` is True
+
+`target_ontology`—Path, URL or name of 'target' ontology to map the source terms to. Ontology names can be given as values to `target_ontology` e.g. "EFO" or "CL"—text2term uses [bioregistry](https://bioregistry.io) to get URLs for such names. Similarly, when the target ontology has been cached, enter the name used upon caching.
+
+When using the BioPortal or Zooma interfaces, the value for `target_ontology` should be a comma-separated list of ontology acronyms (e.g. 'EFO,HPO') or **'all'** to search all ontologies.
+
+`base_iris`—Map only to ontology terms whose IRIs start with one of the strings given in this tuple
+
+`excl_deprecated`—Exclude ontology terms stated as deprecated via `owl:deprecated true`
+
+`source_terms_ids`—Collection of identifiers for the given source terms
+
+`csv_columns`—Specify the name of the column containing the terms to map, when the input file is a table. Optionally provide a second column name containing the respective term identifiers
+
+`separator`—Character that separates columns when input is a table (e.g. '\t' for TSV)
+
+`mapper`—Method used to compare source terms with ontology terms. One of `levenshtein, jaro, jarowinkler, jaccard, fuzzy, tfidf, zooma, bioportal` (see [Supported Mappers](#supported-mappers))
+
+`max_mappings`—Maximum number of top-ranked mappings returned per source term
+
+`min_score`—Minimum similarity score [0,1] for the mappings (1=exact match)
+
+`save_mappings`—Save the generated mappings to a file (specified by `output_file`)
+
+`output_file`—Path to desired output file for the mappings dataframe
+
+`save_graphs`—Save vis.js graphs representing the neighborhood of each ontology term
+
+`use_cache`—Use the cache for the ontology
+
+`term_type`—Specifies whether to map to ontology classes, properties or both. One of `class, property, any`
+
+`incl_unmapped`—Include unmapped terms in the output. If a term has been tagged 'Ignore' or none of its mappings reach the `min_score` threshold, it is still included in the output data frame.
+
+
+### Ontology Caching
+text2term supports caching ontologies for faster or repeated mapping to the same ontology. An ontology can be cached using the function:
+
+```python
+cache_ontology(ontology_url, ontology_acronym="", base_iris=())
+```
+This caches a single ontology from a URL or file path, and takes an optional acronym that will be used to reference the cached ontology later. If no acronym is given, the URL is used as the name.
+
+It is also possible to cache multiple ontologies, whose names and URLs are specified in a table with the format `acronym,version,url`. An example is provided in [resources/ontologies.csv](https://github.com/ccb-hms/ontology-mapper/blob/main/text2term/resources/ontologies.csv):
+```python
+cache_ontology_set(ontology_registry_path)
+```
+
+Once an ontology has been cached by either function, it is stored in a cache folder locally, and thus can be referenced even in different Python instances. Users can leverage the cache by using the assigned acronym as the value for the `target_ontology` argument, and setting the `use_cache` argument to `True`.
+
+To clear the ontology cache, the following function can be used:
+
+```python
+text2term.clear_cache(ontology_acronym='')
+```
+
+If no arguments are specified, the entire cache will be cleared. Otherwise, only the ontology with the given acronym will be cleared.
+Finally, `cache_exists(ontology_acronym='')` is a simple function that returns `True` if the given acronym exists in the cache, and `False` otherwise.
+
+> [!NOTE]
+> The `cache_ontology` function returns an object that can be used to directly call the `map_terms` function, as well as `clear_cache` and `cache_exists`. These have the same arguments, except `target_ontology` is no longer specified and there is no `use_cache` option, since it is always True.
+
+> [!CAUTION]
+> While ontology URLs can be repeatedly used, acronyms must be distinct in a given environment.
+
+
+### Input Preprocessing
+text2term includes regular expression-based preprocessing functionality for input terms. There are functions that take the input terms and a collection of (user-defined) regular expressions, then match each term to each regular expression to simplify the input term.
+
+```python
+preprocess_terms(terms, template_path, output_file='', blocklist_path='',
+                 blocklist_char='', rem_duplicates=False)
+```
+This returns a dictionary where the keys are the original terms and the values are the preprocessed terms.
+
+```python
+preprocess_tagged_terms(file_path, template_path='', blocklist_path='',
+                        blocklist_char='', rem_duplicates=False, separator=';:;')
+```
+
+This returns a list of `TaggedTerm` objects.
+
+The regex templates file `template_path` and the blocklist `blocklist_path` must each be a newline-separated file. If an output file is specified, the preprocessed strings are written to that file.
+
+The blocklist functionality allows specifying a file of regular expressions; input terms that match any regex in the blocklist are removed from the list of terms to map. Alternatively, if a blocklist character is specified, matching terms are replaced with that character.
+
+The `rem_duplicates` option removes all duplicate terms after processing, if set to `True`.
+
+When the input to text2term is a table, any rows that contain `NA` values in the specified term column, or in the term ID column (if provided), will be ignored.
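+
+For example, here is a minimal sketch that chains preprocessing and mapping together, using the bundled `test/simple_preprocess.txt` file from this repository (the target ontology URL is only illustrative):
+```python
+import text2term
+
+# each line of the input file has the form "term;:;tag1,tag2" (default separator ';:;')
+tagged_terms = text2term.preprocess_tagged_terms("test/simple_preprocess.txt")
+
+# the returned TaggedTerm objects can be passed directly to map_terms;
+# their tags are carried over into the output dataframe
+df = text2term.map_terms(tagged_terms,
+                         target_ontology="http://purl.obolibrary.org/obo/mondo.owl",
+                         incl_unmapped=True)
+```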
+
+If an ignore tag (`"ignore"` or `"Ignore"`) is added to a term, that term will not be mapped to any terms in the ontology. It will only be included in the output if the `incl_unmapped` argument is True.
+
+
+## Command Line Interface Usage
+
+After installing, execute the tool from a command line as follows:
+
+`python text2term [-h] -s SOURCE -t TARGET [-o OUTPUT] [-m MAPPER] [-csv CSV_INPUT] [-sep SEPARATOR] [-top TOP_MAPPINGS] [-min MIN_SCORE] [-iris BASE_IRIS] [-d] [-g] [-c STORE_IN_CACHE] [-type TERM_TYPE] [-u]`
+
+To display a help message with descriptions of tool arguments, do:
+
+`python text2term -h` or `python text2term --help`
+
+### Required Arguments
+`-s SOURCE` Input file containing 'source' terms to map to ontology terms (list of terms or CSV file)
+
+`-t TARGET` Path or URL of 'target' ontology to map source terms to. When the chosen mapper is BioPortal or Zooma, provide a comma-separated list of acronyms (e.g. 'EFO,HPO') or write `'all'` to search all ontologies
+
+
+### Optional Arguments
+
+`-o OUTPUT` Path to desired output file for the mappings
+
+`-m MAPPER` Method used to compare source terms with ontology terms. One of: *levenshtein, jaro, jarowinkler, jaccard, indel, fuzzy, tfidf, zooma, bioportal*
+
+`-csv CSV_INPUT` Indicates a CSV format input—follow with the name of the column containing terms to map, optionally followed by the name of the column containing identifiers for the terms (e.g. 'my terms,my term ids')
+
+`-sep SEPARATOR` Specifies the cell separator to be used when reading a table
+
+`-top TOP_MAPPINGS` Maximum number of top-ranked mappings returned per source term
+
+`-min MIN_SCORE` Minimum similarity score [0,1] for the mappings (1=exact match)
+
+`-iris BASE_IRIS` Map only to ontology terms whose IRIs start with a value given in this comma-separated list (e.g. 'http://www.ebi.ac.uk/efo,http://purl.obolibrary.org/obo/HP')
+
+`-d` Exclude ontology terms stated as deprecated via `owl:deprecated true`
+
+`-g` Save [vis.js](https://visjs.org) graphs representing the neighborhood of each ontology term
+
+`-c STORE_IN_CACHE` Cache the target ontology using the name given here
+
+`-type TERM_TYPE` Specify whether to map to ontology classes, properties, or both
+
+`-u` Include all unmapped terms in the output
+
+
+## Supported Mappers
+
+The mapping score of each mapping indicates how similar an input term is to an ontology term (via its labels or synonyms). The mapping scores generated by text2term are the result of applying one of the following _mappers_:
+
+**TF-IDF-based mapper**—[TF-IDF](https://en.wikipedia.org/wiki/Tf–idf) is a statistical measure often used in information retrieval that measures how important a word is to a document in a corpus of documents. We first generate TF-IDF-based vectors of the source terms and of labels and synonyms of ontology terms. Then we compute the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) between vectors to determine how similar a source term is to a target term (label or synonym).
+
+**Syntactic distance-based mappers**—text2term provides support for commonly used and popular syntactic (edit) distance metrics: Levenshtein, Jaro, Jaro-Winkler, Jaccard, and Indel. We use the [nltk](https://pypi.org/project/nltk/) package to compute Jaccard distances and [rapidfuzz](https://pypi.org/project/rapidfuzz/) to compute all others.
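+
+As a brief sketch of how to select one of these metrics programmatically (reusing the inputs from the basic examples above; any of the metric names listed for the `mapper` argument would work the same way):
+```python
+import text2term
+from text2term import Mapper
+
+# score mappings with Levenshtein edit distance instead of the default TF-IDF mapper
+df_lev = text2term.map_terms(source_terms=["asthma", "acute bronchitis"],
+                             target_ontology="http://purl.obolibrary.org/obo/mondo.owl",
+                             mapper=Mapper.LEVENSHTEIN,
+                             min_score=0.8)
+```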
+ +**BioPortal Web API-based mapper**—uses an interface to the [BioPortal Annotator](https://bioportal.bioontology.org/annotator) that we built to allow mapping terms in bulk to ontologies in the [BioPortal](https://bioportal.bioontology.org) repository. + +> [!WARNING] +> There are no scores associated with BioPortal annotations, so the score of all mappings is always 1 + +**Zooma Web API-based mapper**—uses a [Zooma](https://www.ebi.ac.uk/spot/zooma/) interface that we built to allow mapping terms in bulk to ontologies in the [Ontology Lookup Service (OLS)](https://www.ebi.ac.uk/ols4) repository. + +> [!IMPORTANT] +> When using the BioPortal or Zooma interfaces, make sure to specify the target ontology name(s) as they appear in BioPortal or OLS, respectively + +> [!NOTE] +> Syntactic distance-based mappers and Web API-based mappers perform slowly (much slower than the TF-IDF mapper). The former because they do pairwise comparisons between each input string and each ontology term label/synonym. In the Web API-based approaches there are networking and API load overheads \ No newline at end of file diff --git a/curie2iri.py b/curie2iri.py deleted file mode 100644 index 830d36f..0000000 --- a/curie2iri.py +++ /dev/null @@ -1,85 +0,0 @@ -import datetime -import logging -import ssl -import sys -import urllib.request -from urllib.error import HTTPError -import pandas as pd -import onto_utils - -ssl._create_default_https_context = ssl._create_stdlib_context - - -class OntoTag2Iri: - - def __init__(self): - self.logger = onto_utils.get_logger(__name__, logging.INFO) - - def get_iris(self, source_tags, resolve_iri): - iri_mappings = [] - for source_tag in source_tags: - source_tag, iri, iri_resolves = self.get_iri(source_tag, resolve_iri) - iri_mappings.append((source_tag, iri, iri_resolves)) - return iri_mappings - - def get_iri(self, source_tag, resolve_iri): - iri = source_tag - iri_resolves = False - if len(source_tag) > 0 and source_tag != "NA": - if ":" in source_tag: - iri = self.remove_whitespace(iri) - onto_name = iri.split(":")[0] - term_name = iri.replace(":", "_") - full_iri = self._get_iri(onto_name, term_name) - iri = full_iri if len(full_iri) > 0 else iri - elif "_" in source_tag: - iri = self.remove_whitespace(iri) - ont_name = iri.split("_")[0] - full_iri = self._get_iri(ont_name, iri) - iri = full_iri if len(full_iri) > 0 else iri - if source_tag != iri: - iri_resolves = self.resolves(iri) if resolve_iri else iri_resolves - else: - self.logger.info("Unable to find suitable IRI for: %s", source_tag) - return source_tag, iri, iri_resolves - - def _get_iri(self, ont_name, term_name): - iri = '' - if ont_name in onto_utils.ONTOLOGY_IRIS: - if ont_name == 'ORPHA': - iri = onto_utils.ONTOLOGY_IRIS.get(ont_name) + term_name.replace('ORPHA_', 'Orphanet_') - elif ont_name == 'SNOMED' or ont_name == 'OMIM': - iri = onto_utils.ONTOLOGY_IRIS.get(ont_name) + term_name.replace(ont_name + '_', '') - else: - iri = onto_utils.ONTOLOGY_IRIS.get(ont_name) + term_name - return iri - - def remove_whitespace(self, string): - return string.replace(' ', '') - - def resolves(self, iri): - resolves = False - try: - status_code = urllib.request.urlopen(iri).getcode() - resolves = status_code == 200 - except HTTPError as err: - self.logger.debug(err) - if not resolves: - self.logger.info("IRI does not resolve: %s", iri) - return resolves - - def get_iris_df_for_file(self, input_file, resolve_iri): - iris_file = self.get_iris(onto_utils.parse_list_file(input_file), resolve_iri=resolve_iri) - out_col_names = 
['source_tag', 'target_iri', 'iri_resolves'] - return pd.DataFrame(iris_file, columns=out_col_names) - - -if __name__ == "__main__": - tag2iri = OntoTag2Iri() - if len(sys.argv) > 1: - input_tag_list_file = sys.argv[1] - output_file = "tag2iri-" + datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") + ".csv" - output_df = tag2iri.get_iris_df_for_file(input_tag_list_file, resolve_iri=True) - output_df.to_csv(output_file, index=False) - else: - print("Provide input file with tags to convert to IRIs") diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..32bb245 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/_static/ccb_logo.jpg b/docs/source/_static/ccb_logo.jpg new file mode 100644 index 0000000..422182b Binary files /dev/null and b/docs/source/_static/ccb_logo.jpg differ diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..5713476 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,31 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'text2term' +copyright = '2023, Harvard Medical School' +author = 'Rafael Goncalves and Jason Payne' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ["myst_parser"] + +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'pyramid' +html_static_path = ['_static'] +# html_logo = "ccb_logo.jpg" +html_theme_options = { + 'sidebarwidth': 280 +} diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..46ed444 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,28 @@ +.. text2term documentation master file, created by + sphinx-quickstart on Tue Jul 11 10:34:29 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +.. .. image:: ccb_logo.jpg +.. :alt: CCB's Logo +.. :scale: 50 % +.. :align: left + +Text2term +===================================== + +.. toctree:: + :maxdepth: 2 +.. include:: ../../README.md + :parser: myst_parser.sphinx_ +.. include:: ../../README-UI.md + :parser: myst_parser.sphinx_ + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` + + diff --git a/onto_utils.py b/onto_utils.py deleted file mode 100644 index 50b05bd..0000000 --- a/onto_utils.py +++ /dev/null @@ -1,98 +0,0 @@ -import logging -import re -import sys -import bioregistry -from owlready2 import * -from gensim.parsing import strip_non_alphanum, strip_multiple_whitespaces - -STOP_WORDS = {'in', 'the', 'any', 'all', 'for', 'and', 'or', 'dx', 'on', 'fh', 'tx', 'only', 'qnorm', 'w', 'iqb', - 'ds', 'rd', 'rdgwas', 'average', 'weekly', 'monthly', 'daily'} - - -def normalize_list(token_list): - normalized_token_list = [] - for token in token_list: - normalized_token_list.append(normalize(token)) - return normalized_token_list - - -def normalize(token): - """ - Normalizes a given string by converting to lower case, removing non-word characters, stop words, white space - :param token: Text to be normalized - :return: Normalized string - """ - token = re.sub(r"[\(\[].*?[\)\]]", "", token) # remove text within parenthesis/brackets - token = strip_non_alphanum(token).lower() - token = token.replace("_", " ") - token = " ".join(w for w in token.split() if w not in STOP_WORDS) - token = strip_multiple_whitespaces(token) - return token - - -def curie_from_iri(iri): - return bioregistry.curie_from_iri(iri) - - -def label_from_iri(iri): - if "#" in iri: - return iri.split("#")[1] - else: - return iri.rsplit('/', 1)[1] - - -def remove_quotes(text): - text = text.replace("\"", "") - text = text.replace("\'", "") - return text - - -def get_logger(name, level): - formatter = logging.Formatter("%(asctime)s %(levelname)s [%(name)s]: %(message)s", "%Y-%m-%d %H:%M:%S") - logger = logging.getLogger(name) - logger.setLevel(level=level) - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setFormatter(formatter) - 
logger.addHandler(console_handler) - logger.propagate = False - return logger - - -def parse_list_file(file_path): - file = open(file_path) - lines = file.read().splitlines() - return lines - - -def get_ontology_from_labels(term_labels): - onto = owlready2.get_ontology("http://ccb.harvard.edu/t2t/") - onto.metadata.comment.append("Created dynamically using text2term") - onto.metadata.comment.append(datetime.datetime.now()) - for term_label in term_labels: - with onto: - new_class = types.new_class(term_label, (Thing,)) - new_class.label = term_label - return onto - - -OBO_BASE_IRI = "http://purl.obolibrary.org/obo/" -BIOPORTAL_BASE_IRI = "http://purl.bioontology.org/ontology/" -ORPHANET_IRI = "http://www.orpha.net/ORDO/" -ONTOLOGY_IRIS = {"EFO": "http://www.ebi.ac.uk/efo/", - "Orphanet": ORPHANET_IRI, - "ORPHA": ORPHANET_IRI, - "MONDO": OBO_BASE_IRI, - "HP": OBO_BASE_IRI, - "UBERON": OBO_BASE_IRI, - "GO": OBO_BASE_IRI, - "DOID": OBO_BASE_IRI, - "CHEBI": OBO_BASE_IRI, - "OMIT": OBO_BASE_IRI, - "NCIT": OBO_BASE_IRI, - "MAXO": OBO_BASE_IRI, - "DRON": OBO_BASE_IRI, - "OAE": OBO_BASE_IRI, - "CIDO": OBO_BASE_IRI, - "OMIM": BIOPORTAL_BASE_IRI + "OMIM/", - "PATO": OBO_BASE_IRI, - "SNOMED": "http://snomed.info/id/"} diff --git a/requirements.txt b/requirements.txt index 7162637..cf8334d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,16 @@ -Owlready2~=0.31 +Owlready2~=0.44 argparse~=1.4.0 -pandas~=1.2.4 -numpy>=1.20.0 -gensim~=4.0.1 -scipy~=1.6.3 -sklearn~=0.0 -scikit-learn~=0.24.2 -setuptools~=47.1.0 -jellyfish~=0.8.9 -requests~=2.27.1 -thefuzz~=0.19.0 -tqdm~=4.62.3 -sparse_dot_topn~=0.3.1 -bioregistry~=0.4.46 +pandas~=2.0.3 +numpy~=1.24.2 +gensim~=4.3.0 +scipy~=1.10.1 +scikit-learn~=1.2.1 +setuptools~=68.2.2 +requests~=2.31.0 +tqdm~=4.66.1 +sparse_dot_topn~=0.3.4 +bioregistry~=0.10.6 +nltk~=3.8.1 +rapidfuzz~=2.13.7 +shortuuid~=1.0.11 +myst_parser~=2.0.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 2bb4d93..7292f02 100644 --- a/setup.py +++ b/setup.py @@ -1,30 +1,33 @@ from setuptools import setup, find_packages +from text2term.config import VERSION - +description = 'A tool for mapping free-text descriptions of (biomedical) entities to controlled terms in ontologies' long_description = open('README.md').read() with open('requirements.txt') as f: requirements = f.read().splitlines() -version = '0.1' - setup( - name='text2term ontology mapper', - version=version, + name='text2term', + version=VERSION, install_requires=requirements, packages=find_packages(), include_package_data=True, url='https://github.com/ccb-hms/ontology-mapper', license='MIT', - description='A tool for mapping (uncontrolled) terms to ontology terms to facilitate semantic integration', + description=description, long_description=long_description, long_description_content_type='text/markdown', + author='Center for Computational Biomedicine, Harvard Medical School', + author_email='rafael_goncalves@hms.harvard.edu', classifiers=[ - 'Development Status :: 3 - Alpha', + 'Development Status :: 4 - Beta', 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.9', 'Topic :: Scientific/Engineering' ], + python_requires=">=3.9", ) diff --git a/term.py b/term.py deleted file mode 100644 index 7257410..0000000 --- a/term.py +++ /dev/null @@ -1,112 +0,0 @@ -"""Provides OntologyTerm class""" - -import onto_utils -from owlready2 import Thing, ThingClass -from term_graph 
import OntologyTermGraph, Node, Edge - - -class OntologyTerm: - """ - Represents an ontology class or individual. In the case of an individual 'children' is always empty and - 'parents' specifies the individual's types. - """ - def __init__(self, iri, labels, synonyms, definition, ontology_iri, parents=(), children=(), instances=()): - self._iri = iri - self._labels = labels - self._synonyms = synonyms - self._definition = definition - self._ontology_iri = ontology_iri - self._parents = parents - self._children = children - self._instances = instances - - @property - def iri(self): - return self._iri - - @property - def labels(self): - return self._labels - - @property - def synonyms(self): - return self._synonyms - - @property - def definition(self): - return self._definition - - @property - def ontology_iri(self): - return self._ontology_iri - - @property - def parents(self): - return self._parents - - @property - def children(self): - return self._children - - @property - def instances(self): - return self._instances - - def graph(self): - """ Build and return a graph representing the neighborhood of an ontology term. """ - nodes, edges = set(), set() - nodes.add(Node(self.iri, self.label)) - self._add_superclasses(nodes, edges) - self._add_subclasses(self.children, nodes, edges) - self._add_instances(self.instances, nodes, edges) - return OntologyTermGraph(self.iri, nodes, edges) - - def _add_superclasses(self, nodes, edges): - for parent in self.parents: - self._add_node(parent, nodes) - edges.add(Edge(self.iri, parent.iri, Edge.IS_A)) - self._add_ancestors(parent, nodes, edges) - - def _add_ancestors(self, node, nodes, edges): - for ancestor in node.is_a: - if ancestor is not Thing and isinstance(ancestor, ThingClass): - self._add_node(ancestor, nodes) - edges.add(Edge(node.iri, ancestor.iri, Edge.IS_A)) - self._add_ancestors(ancestor, nodes, edges) - - def _add_children(self, term_list, edge_type, nodes, edges): - for term in term_list: - self._add_node(term, nodes) - edges.add(Edge(term.iri, self.iri, edge_type)) - - def _add_subclasses(self, subclasses, nodes, edges): - self._add_children(subclasses, Edge.IS_A, nodes, edges) - - def _add_instances(self, instances, nodes, edges): - self._add_children(instances, Edge.INSTANCE_OF, nodes, edges) - - def _add_node(self, term, term_set): - if len(term.label) == 0: - label = onto_utils.label_from_iri(term.iri) - else: - label = term.label[0] - term_set.add(Node(term.iri, label)) - - @property - def label(self): - """Return a single label for this term""" - return next(iter(self.labels)) - - def __eq__(self, other): - if isinstance(other, OntologyTerm): - return self._iri == other._iri - return False - - def __hash__(self): - return hash(str(self._iri)) - - def __str__(self): - return "Ontology Term: " + self.iri + ", Labels: " + str(self.labels) + ", Synonyms: " + \ - str(self.synonyms) + ", Definition: " + str(self.definition) + ", Parents: " + str(self.parents) + \ - ", Children: " + str(self.children) + ", Instances: " + str(self.instances) + ", Term graph: " + \ - str(self.graph().graph_dict()) diff --git a/term_collector.py b/term_collector.py deleted file mode 100644 index cca29fa..0000000 --- a/term_collector.py +++ /dev/null @@ -1,229 +0,0 @@ -"""Provides OntologyTermCollector class""" - -import logging -import onto_utils -from owlready2 import * -from term import OntologyTerm - - -class OntologyTermCollector: - - def __init__(self, ontology_iri): - """" - :param ontology_iri: IRI of the ontology (e.g., path of ontology document 
in the local file system, URL) - """ - self.logger = onto_utils.get_logger(__name__, logging.INFO) - self.ontology_iri = ontology_iri - - def get_ontology_terms(self, base_iris=(), use_reasoning=False, exclude_deprecated=True, include_individuals=False): - """ - Collect the terms described in the ontology at the specified IRI - :param base_iris: Limit ontology term collection to terms whose IRIs start with any IRI given in this tuple - :param use_reasoning: Use a reasoner to compute inferred class hierarchy and individual types - :param exclude_deprecated: Exclude ontology terms stated as deprecated using owl:deprecated 'true' - :param include_individuals: Include OWL ontology individuals in addition to ontology classes - :return: Collection of ontology terms in the specified ontology - """ - ontology = self._load_ontology(self.ontology_iri) - if use_reasoning: - self._classify_ontology(ontology) - self.logger.info("Collecting ontology term details...") - start = time.time() - ontology_terms = [] - if len(base_iris) > 0: - for iri in base_iris: - query = iri + "*" - self.logger.info("...Collecting terms with IRIs starting in: " + iri) - iris = list(default_world.search(iri=query)) - ontology_terms.extend(self._get_ontology_terms(iris, ontology, exclude_deprecated)) - else: - ontology_terms = self._get_ontology_terms(ontology.classes(), ontology, exclude_deprecated) - if include_individuals: - ontology_terms.extend(self._get_ontology_terms(ontology.individuals(), ontology, exclude_deprecated)) - end = time.time() - self.logger.info("...done: collected %i ontology terms (collection time: %.2fs)", len(ontology_terms), end-start) - return ontology_terms - - def _get_ontology_terms(self, term_list, ontology, exclude_deprecated): - ontology_terms = [] - for ontology_term in term_list: - if not isinstance(ontology_term, PropertyClass) and ontology_term is not Thing and ontology_term is not Nothing: - if (exclude_deprecated and not deprecated[ontology_term]) or (not exclude_deprecated): - labels = self._get_labels(ontology_term) - synonyms = self._get_synonyms(ontology_term) - parents = self._get_parents(ontology_term) - children = self._get_children(ontology_term, ontology) - instances = self._get_instances(ontology_term, ontology) - definition = self._get_definition(ontology_term) - term_details = OntologyTerm(ontology_term.iri, labels, synonyms, definition, ontology.base_iri, - parents=parents, children=children, instances=instances) - ontology_terms.append(term_details) - else: - self.logger.debug("Excluding deprecated ontology term: %s", ontology_term.iri) - return ontology_terms - - def _get_parents(self, ontology_term): - parents = set() # named/atomic superclasses except owl:Thing - try: - all_parents = ontology_term.is_a # obtain all (direct and indirect) parents of this entity - for parent in all_parents: - # exclude OWL restrictions and owl:Thing and Self - if isinstance(parent, ThingClass) and parent is not Thing and parent is not ontology_term: - parents.add(parent) - except AttributeError as err: - self.logger.debug(err) - return parents - - def _get_children(self, ontology_term, ontology): - children = set() - try: - children = set(ontology.get_children_of(ontology_term)) - except AttributeError as err: - self.logger.debug(err) - return children - - def _get_instances(self, ontology_term, ontology): - instances = set() - try: - instances = set(ontology.get_instances_of(ontology_term)) - except AttributeError as err: - self.logger.debug(err) - return instances - - def 
_get_labels(self, ontology_term): - """ - Collect the labels of the given ontology term both given by rdfs:label and skos:prefLabel - :param ontology_term: Ontology term - :return: Collection of labels of the ontology term - """ - labels = set() - for rdfs_label in self._get_rdfs_labels(ontology_term): - labels.add(rdfs_label) - for skos_label in self._get_skos_pref_labels(ontology_term): - labels.add(skos_label) - if len(labels) == 0: - label_from_iri = onto_utils.label_from_iri(ontology_term.iri) - self.logger.info("Ontology term %s has no labels (rdfs:label or skos:prefLabel). " - "Using a label based on the term IRI: %s", ontology_term.iri, label_from_iri) - labels.add(label_from_iri) - self.logger.debug("Collected %i labels and synonyms for %s", len(labels), ontology_term) - return labels - - def _get_synonyms(self, ontology_term): - """ - Collect the synonyms of the given ontology term - :param ontology_term: Ontology term - :return: Collection of synonyms of the ontology term - """ - synonyms = set() - for synonym in self._get_obo_exact_synonyms(ontology_term): - synonyms.add(synonym) - for nci_synonym in self._get_nci_synonyms(ontology_term): - synonyms.add(nci_synonym) - self.logger.debug("Collected %i synonyms for %s", len(synonyms), ontology_term) - return synonyms - - def _get_rdfs_labels(self, ontology_term): - """ - Collect labels of the given term that are specified using the standard rdfs:label annotation property - :param ontology_term: Ontology term to collect labels from - :return: Collection of RDFS labels - """ - rdfs_labels = [] - try: - for rdfs_label in ontology_term.label: - rdfs_labels.append(rdfs_label) - except AttributeError as err: - self.logger.debug(err) - return rdfs_labels - - def _get_skos_pref_labels(self, ontology_term): - """ - Collect labels of the given term that are specified using the skos:prefLabel annotation property - :param ontology_term: Ontology term to collect labels from - :return: Collection of SKOS preferred labels - """ - skos_labels = [] - try: - for skos_pref_label in ontology_term.prefLabel: - skos_labels.append(skos_pref_label) - except AttributeError as err: - self.logger.debug(err) - return skos_labels - - def _get_obo_exact_synonyms(self, ontology_term): - """ - Collect synonyms of the given term that are specified using the annotation property used by DOID, MONDO, EFO, - HPO, and other OBO ontologies: . - :param ontology_term: Ontology term to collect synonyms from - :return: Collection of synonyms - """ - synonyms = [] - try: - for synonym in ontology_term.hasExactSynonym: - synonyms.append(synonym) - except AttributeError as err: - self.logger.debug(err) - return synonyms - - def _get_nci_synonyms(self, ontology_term): - """ - Collect synonyms of the given term that are specified using the NCI Thesaurus annotation property: - . 
- :param ontology_term: Ontology term to collect synonyms from - :return: Collection of synonyms - """ - nci_synonyms = [] - try: - for synonym in ontology_term.P90: - nci_synonyms.append(synonym) - except AttributeError as err: - self.logger.debug(err) - return nci_synonyms - - def _get_definition(self, ontology_term): - """ - Get the definition (if one exists) of the given term as specified using the skos:definition annotation property - :param ontology_term: Ontology term to collect definition of - :return: String value of the skos:definition annotation property assertion on the given term - """ - definition = "" - try: - definition = ontology_term.definition - except AttributeError as err: - self.logger.debug(err) - return definition - - def _load_ontology(self, ontology_iri): - """ - Load the ontology at the specified IRI. - :param ontology_iri: IRI of the ontology (e.g., path of ontology document in the local file system, URL) - :return: Ontology document - """ - self.logger.info("Loading ontology %s...", ontology_iri) - start = time.time() - ontology = get_ontology(ontology_iri).load() - end = time.time() - self._log_ontology_metrics(ontology) - self.logger.info("done (loading time: %.2fs)", end-start) - return ontology - - def _classify_ontology(self, ontology): - """ - Perform reasoning over the given ontology (consistency checking and classification) - :param ontology: ontology instance - """ - self.logger.info("Reasoning over ontology...") - start = time.time() - with ontology: # entailments will be added to this ontology - sync_reasoner(infer_property_values=True) - end = time.time() - self.logger.info("done (reasoning time: %.2fs)", end - start) - - def _log_ontology_metrics(self, ontology): - self.logger.debug(" Ontology IRI: %s", ontology.base_iri) - self.logger.debug(" Class count: %i", len(list(ontology.classes()))) - self.logger.debug(" Individual count: %i", len(list(ontology.individuals()))) - self.logger.debug(" Object property count: %i", len(list(ontology.object_properties()))) - self.logger.debug(" Data property count: %i", len(list(ontology.data_properties()))) - self.logger.debug(" Annotation property count: %i", len(list(ontology.annotation_properties()))) diff --git a/test/simple_preprocess.txt b/test/simple_preprocess.txt new file mode 100644 index 0000000..fdd7467 --- /dev/null +++ b/test/simple_preprocess.txt @@ -0,0 +1,3 @@ +asthma;:;disease +acute bronchitis;:;important,tags +colon disease diff --git a/test/simple_tests.py b/test/simple_tests.py new file mode 100644 index 0000000..305281e --- /dev/null +++ b/test/simple_tests.py @@ -0,0 +1,238 @@ +import os +import unittest +import pandas as pd +import text2term +from text2term import OntologyTermType +from text2term import Mapper +from text2term import OntologyTermCollector + +pd.set_option('display.max_columns', None) + + +class Text2TermTestSuite(unittest.TestCase): + + @classmethod + def setUpClass(cls): + super(Text2TermTestSuite, cls).setUpClass() + print("Setting up test suite global variables...") + cls.EFO_URL = "https://github.com/EBISPOT/efo/releases/download/v3.57.0/efo.owl" + cls.SOURCE_TERM_ID_COLUMN = "Source Term ID" + cls.MAPPED_TERM_CURIE_COLUMN = "Mapped Term CURIE" + cls.MAPPING_SCORE_COLUMN = "Mapping Score" + cls.TAGS_COLUMN = "Tags" + + @classmethod + def tearDownClass(cls): + super(Text2TermTestSuite, cls).tearDownClass() + text2term.clear_cache() + + def test_caching_ontology_from_url(self): + # Test caching an ontology loaded from a URL + print("Test caching an ontology loaded from 
a URL...") + efo_cache = text2term.cache_ontology(ontology_url=self.EFO_URL, ontology_acronym="EFO") + print(f"Cache exists: {efo_cache.cache_exists()}\n") + assert efo_cache.cache_exists() is True + + print("Test using the returned ontology cache object to map a list of terms...") + mappings_efo_cache = efo_cache.map_terms(["asthma", "disease location", "food allergy"], + term_type=OntologyTermType.ANY) + assert mappings_efo_cache.size > 0 + + def test_caching_ontology_from_acronym(self): + # Test caching an ontology by resolving its acronym using bioregistry + print("Test caching an ontology by resolving its acronym using bioregistry...") + clo_cache = text2term.cache_ontology(ontology_url="CLO", ontology_acronym="CLO") + print(f"Cache exists: {clo_cache.cache_exists()}\n") + assert clo_cache.cache_exists() is True + + def test_caching_ontology_set(self): + nr_ontologies_in_registry = 8 + # Test caching the set of ontologies specified in resources/ontologies.csv + caches = text2term.cache_ontology_set(os.path.join("..", "text2term", "resources", "ontologies.csv")) + assert len(caches) == nr_ontologies_in_registry + + def test_mapping_to_cached_ontology(self): + self.ensure_cache_exists("EFO", self.EFO_URL) + # Test mapping a list of terms to EFO loaded from cache + print("Test mapping a list of terms to EFO loaded from cache...") + mappings_efo_cache = text2term.map_terms(["asthma", "disease location", "food allergy"], target_ontology="EFO", + use_cache=True, term_type=OntologyTermType.ANY) + print(f"{mappings_efo_cache}\n") + assert mappings_efo_cache.size > 0 + + # Test mapping a list of terms to EFO loaded from a URL + print("Test mapping a list of terms to EFO loaded from a URL...") + mappings_efo_url = text2term.map_terms(["asthma", "disease location", "food allergy"], + target_ontology=self.EFO_URL, term_type=OntologyTermType.ANY) + print(f"{mappings_efo_url}\n") + assert mappings_efo_url.size > 0 + + # Test that mapping to cached ontology is the same as to ontology loaded from its URL + print("Test that mapping to cached ontology is the same as to ontology loaded from its URL...") + mappings_match = self.check_df_equals(self.drop_source_term_ids(mappings_efo_cache), + self.drop_source_term_ids(mappings_efo_url)) + print(f"...{mappings_match}") + assert mappings_match is True + + def test_mapping_to_cached_ontology_using_syntactic_mapper(self): + self.ensure_cache_exists("EFO", self.EFO_URL) + # Test mapping a list of terms to cached EFO using Jaro-Winkler syntactic similarity metric + print("Test mapping a list of terms to cached ontology using Jaro-Winkler syntactic similarity metric...") + df = text2term.map_terms(["asthma", "disease location", "food allergy"], "EFO", use_cache=True, + mapper=text2term.Mapper.JARO_WINKLER, term_type=OntologyTermType.ANY) + print(f"{df}\n") + assert df.size > 0 + + def test_mapping_using_ontology_acronym(self): + # Test mapping a list of terms by specifying the target ontology acronym, which gets resolved by bioregistry + print( + "Test mapping a list of terms to EFO by specifying the ontology acronym, which gets resolved by bioregistry") + df2 = text2term.map_terms(["contains", "asthma"], "MONDO", term_type=OntologyTermType.CLASS) + print(f"{df2}\n") + assert df2.size > 0 + + def test_mapping_tagged_terms(self): + self.ensure_cache_exists("EFO", self.EFO_URL) + # Test mapping a dictionary of tagged terms to cached EFO, and include unmapped terms in the output + print("Test mapping a dictionary of tagged terms to cached EFO, and include unmapped 
terms in the output...") + df3 = text2term.map_terms( + {"asthma": "disease", "allergy": ["ignore", "response"], "protein level": ["measurement"], + "isdjfnsdfwd": None}, target_ontology="EFO", excl_deprecated=True, use_cache=True, incl_unmapped=True) + print(f"{df3}\n") + assert df3.size > 0 + assert df3[self.TAGS_COLUMN].str.contains("disease").any() + assert df3[self.TAGS_COLUMN].str.contains("measurement").any() + + def test_preprocessing_from_file(self): + self.ensure_cache_exists("EFO", self.EFO_URL) + # Test processing tagged terms where the tags are provided in a file + print("Test processing tagged terms where the tags are provided in a file...") + tagged_terms = text2term.preprocess_tagged_terms("simple_preprocess.txt") + df4 = text2term.map_terms(tagged_terms, target_ontology="EFO", use_cache=True, incl_unmapped=True) + print(f"{df4}\n") + assert df4.size > 0 + assert df4[self.TAGS_COLUMN].str.contains("disease").any() + assert df4[self.TAGS_COLUMN].str.contains("important").any() + + def test_mapping_to_properties(self): + # Test mapping a list of properties to EFO loaded from a URL and restrict search to properties + print("Test mapping a list of properties to EFO loaded from a URL and restrict search to properties...") + df5 = text2term.map_terms(source_terms=["contains", "location"], target_ontology=self.EFO_URL, + term_type=OntologyTermType.PROPERTY) + print(f"{df5}\n") + assert df5.size > 0 + + # Test mapping a list of properties to EFO loaded from cache and restrict search to properties + print("Test mapping a list of properties to EFO loaded from cache and restrict search to properties...") + self.ensure_cache_exists("EFO", self.EFO_URL) + df6 = text2term.map_terms(source_terms=["contains", "location"], target_ontology="EFO", use_cache=True, + term_type=OntologyTermType.PROPERTY) + print(f"{df6}\n") + assert df6.size > 0 + + # Test that mapping to properties in cached ontology is the same as to ontology loaded from its URL + properties_df_match = self.check_df_equals(self.drop_source_term_ids(df5), self.drop_source_term_ids(df6)) + print(f"Properties match: {properties_df_match}") + assert properties_df_match is True + + def test_mapping_zooma_ontologies(self): + # Test mapping a list of terms to multiple ontologies using the Zooma mapper + print("Test mapping a list of terms to multiple ontologies using the Zooma mapper...") + df_zooma = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", + mapper=Mapper.ZOOMA, term_type=OntologyTermType.ANY) + print(f"{df_zooma}\n") + assert df_zooma.size > 0 + assert df_zooma[self.MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() + assert df_zooma[self.MAPPED_TERM_CURIE_COLUMN].str.contains("NCIT:").any() + + def test_mapping_bioportal_ontologies(self): + # Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper + print("Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper...") + df_bioportal = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", + mapper=Mapper.BIOPORTAL, term_type=OntologyTermType.ANY) + print(f"{df_bioportal}\n") + assert df_bioportal.size > 0 + assert df_bioportal[self.MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() + assert df_bioportal[self.MAPPED_TERM_CURIE_COLUMN].str.contains("NCIT:").any() + + def test_term_collector(self): + expected_nr_efo_terms = 50867 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = 
efo_term_collector.get_ontology_terms() + assert len(terms) == expected_nr_efo_terms + + def test_term_collector_classes_only(self): + expected_nr_efo_classes = 50643 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms(term_type=OntologyTermType.CLASS) + assert len(terms) == expected_nr_efo_classes + + def test_term_collector_properties_only(self): + expected_nr_efo_properties = 224 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms(term_type=OntologyTermType.PROPERTY) + assert len(terms) == expected_nr_efo_properties + + def test_term_collector_iri_limit(self): + efo_base_iri = "http://www.ebi.ac.uk/efo/" + expected_nr_terms_with_efo_iri = 17383 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms(base_iris=[efo_base_iri], term_type=OntologyTermType.ANY) + assert len(terms) == expected_nr_terms_with_efo_iri + + def test_term_collector_iri_limit_properties_only(self): + efo_base_iri = "http://www.ebi.ac.uk/efo/" + expected_nr_properties_with_efo_iri = 29 + efo_term_collector = OntologyTermCollector(ontology_iri=self.EFO_URL) + terms = efo_term_collector.get_ontology_terms(base_iris=[efo_base_iri], term_type=OntologyTermType.PROPERTY) + assert len(terms) == expected_nr_properties_with_efo_iri + + def test_mapping_with_min_score_filter(self): + self.ensure_cache_exists("EFO", self.EFO_URL) + min_score = 0.6 + search_terms = ["asthma attack", "location"] + + print("Test mapping to cached EFO using Zooma mapper and min_score filter...") + df_zooma = text2term.map_terms(search_terms, target_ontology="EFO,NCIT", mapper=Mapper.ZOOMA, + term_type=OntologyTermType.ANY, min_score=min_score) + assert (df_zooma[self.MAPPING_SCORE_COLUMN] >= min_score).all() + + print("Test mapping to cached EFO using TFIDF similarity metric and min_score filter...") + df_tfidf = text2term.map_terms(search_terms, target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, + term_type=OntologyTermType.ANY, min_score=min_score) + assert (df_tfidf[self.MAPPING_SCORE_COLUMN] >= min_score).all() + + print("Test mapping to cached EFO using Levenshtein similarity metric and min_score filter...") + df_leven = text2term.map_terms(search_terms, target_ontology="EFO", use_cache=True, mapper=Mapper.LEVENSHTEIN, + term_type=OntologyTermType.ANY, min_score=min_score) + assert (df_leven[self.MAPPING_SCORE_COLUMN] >= min_score).all() + + def test_include_unmapped_terms(self): + self.ensure_cache_exists("EFO", self.EFO_URL) + df = text2term.map_terms(["asthma", "margarita"], target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, + incl_unmapped=True, min_score=0.8) + assert df[self.TAGS_COLUMN].str.contains("unmapped").any() + + def test_include_unmapped_terms_when_mappings_df_is_empty(self): + self.ensure_cache_exists("EFO", self.EFO_URL) + df = text2term.map_terms(["mojito", "margarita"], target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, + incl_unmapped=True, min_score=0.8) + assert df[self.TAGS_COLUMN].str.contains("unmapped").any() + + def drop_source_term_ids(self, df): + # Unless specified, source term IDs are randomly generated UUIDs. We have to drop the ID column to be able to + # get a meaningful diff between two dataframes. 
Otherwise, the dataframes would always differ because of the IDs + return df.drop(self.SOURCE_TERM_ID_COLUMN, axis=1) + + def check_df_equals(self, df, expected_df): + # Use pandas::assert_frame_equal function to determine if two data frames are equal + pd.testing.assert_frame_equal(df, expected_df, check_names=False, check_like=True) + return True + + def ensure_cache_exists(self, ontology_name, ontology_url): + if not text2term.cache_exists(ontology_name): + text2term.cache_ontology(ontology_url=ontology_url, ontology_acronym=ontology_name) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_pypi.py b/test/test_pypi.py new file mode 100644 index 0000000..6d04fe2 --- /dev/null +++ b/test/test_pypi.py @@ -0,0 +1,46 @@ +import os +import sys +import text2term +from text2term.term import OntologyTermType +from contextlib import contextmanager + + +def main(): + try: + with suppress_stdout(): + # Simple set up and testing + text2term.map_terms(["fever", "headache"], + "https://github.com/EBISPOT/efo/releases/download/current/efo.owl") + text2term.cache_ontology("https://github.com/EBISPOT/efo/releases/download/current/efo.owl", "EFO") + text2term.map_terms(["fever", "headache"], "EFO", use_cache=True) + text2term.map_terms(["fever", "headache"], "EFO", base_iris=("http://www.ebi.ac.uk/efo",), + mapper=text2term.mapper.Mapper.LEVENSHTEIN, max_mappings=4, use_cache=True) + + # Properties and classes tests + text2term.map_terms(["fever", "headache"], "EFO", term_type=OntologyTermType.CLASS, use_cache=True) + text2term.map_terms(["contains", "location"], "EFO", term_type=OntologyTermType.PROPERTY, use_cache=True) + text2term.map_terms(["fever", "contains"], "EFO", term_type=OntologyTermType.ANY, use_cache=True) + + # Clear cache and set down + text2term.clear_cache("EFO") + except: + print("ERROR") + + +# From https://stackoverflow.com/questions/2125702/how-to-suppress-console-output-in-python +@contextmanager +def suppress_stdout(): + with open(os.devnull, "w") as devnull: + old_stdout = sys.stdout + old_stderr = sys.stderr + sys.stdout = devnull + sys.stderr = devnull + try: + yield + finally: + sys.stdout = old_stdout + sys.stderr = old_stderr + + +if __name__ == '__main__': + main() diff --git a/test/unstruct_terms.txt b/test/unstruct_terms.txt new file mode 100644 index 0000000..2dc3f97 --- /dev/null +++ b/test/unstruct_terms.txt @@ -0,0 +1,2 @@ +asthma +acute bronchitis \ No newline at end of file diff --git a/text2term.py b/text2term.py deleted file mode 100644 index 2217021..0000000 --- a/text2term.py +++ /dev/null @@ -1,61 +0,0 @@ -import argparse -import datetime -import json -import os -import sys -import onto_utils -from term_collector import OntologyTermCollector -from tfidf_mapper import TFIDFMapper - - -def get_arguments(): - timestamp = datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") - output_file_name = "t2t-out-" + timestamp + ".csv" - parser = argparse.ArgumentParser(description="A tool to map unstructured terms to ontology terms") - parser.add_argument("-s", "--source", required=True, type=str, - help="Input file containing list of 'source' terms to map to ontology terms (one per line)") - parser.add_argument("-t", "--target", required=True, type=str, - help="Path or URL of 'target' ontology to map the source terms to") - parser.add_argument("-o", "--output", required=False, type=str, default=output_file_name, - help="Path to desired output file for the mappings (default=current working directory)") - parser.add_argument("-top", "--top_mappings", 
required=False, type=int, default=3, - help="Maximum number of top-ranked mappings returned per source term (default=3)") - parser.add_argument("-min", "--min_score", required=False, type=float, default=0.5, - help="Minimum score [0,1] for the mappings (0=dissimilar, 1=exact match; default=0.5)") - parser.add_argument("-iris", "--base_iris", required=False, type=str, default=(), - help="Map only to terms whose IRIs start with any IRI given in this comma-separated list") - parser.add_argument("-d", "--excl_deprecated", required=False, default=False, action="store_true", - help="Exclude terms stated as deprecated via owl:deprecated") - parser.add_argument("-i", "--incl_individuals", required=False, default=False, action="store_true", - help="Include ontology individuals in addition to classes") - arguments = parser.parse_args() - - source_file, target_file, out_file = arguments.source, arguments.target, arguments.output - if not os.path.exists(source_file): - parser.error("The file '{}' does not exist".format(source_file)) - sys.exit(1) - - # create output directories if needed - if os.path.dirname(out_file): - os.makedirs(os.path.dirname(out_file), exist_ok=True) - - iris = arguments.base_iris - if len(iris) > 0: - iris = tuple(iris.split(',')) - return source_file, target_file, out_file, arguments.top_mappings, arguments.min_score, iris, \ - arguments.excl_deprecated, arguments.incl_individuals - - -if __name__ == "__main__": - input_file, target_ontology, output_file, max_mappings, min_score, base_iris, excl_deprecated, incl_individuals = get_arguments() - source_terms = onto_utils.parse_list_file(input_file) - term_collector = OntologyTermCollector(target_ontology) - onto_terms = term_collector.get_ontology_terms(base_iris=base_iris, - exclude_deprecated=excl_deprecated, - include_individuals=incl_individuals) - if len(onto_terms) > 0: - mapper = TFIDFMapper(onto_terms) - mappings_df, term_graphs = mapper.map(source_terms, max_mappings=max_mappings, min_score=min_score) - mappings_df.to_csv(output_file, index=False) - with open(output_file + "-term-graphs.json", 'w') as json_file: - json.dump(term_graphs, json_file, indent=2) diff --git a/text2term/__init__.py b/text2term/__init__.py new file mode 100644 index 0000000..49a3773 --- /dev/null +++ b/text2term/__init__.py @@ -0,0 +1,13 @@ +from .t2t import map_terms +from .t2t import cache_ontology +from .onto_cache import cache_ontology_set +from .onto_cache import cache_exists +from .onto_cache import clear_cache +from .mapper import Mapper +from .preprocess import preprocess_terms +from .preprocess import preprocess_tagged_terms +from .tagged_term import TaggedTerm +from .term_collector import OntologyTermCollector +from .term_collector import filter_terms +from .term import OntologyTermType +from .term import OntologyTerm diff --git a/text2term/__main__.py b/text2term/__main__.py new file mode 100644 index 0000000..9560fac --- /dev/null +++ b/text2term/__main__.py @@ -0,0 +1,67 @@ +import argparse +import os +import sys +from t2t import map_terms, cache_ontology +from onto_cache import cache_exists +from mapper import Mapper +from term import OntologyTermType + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='A tool for mapping free-text descriptions of (biomedical) ' + 'entities to controlled terms in an ontology') + parser.add_argument("-s", "--source", required=True, type=str, + help="Input file containing 'source' terms to map to ontology terms: list of terms or CSV file") + parser.add_argument("-t", 
"--target", required=True, type=str, + help="Path or URL of 'target' ontology to map source terms to. When the chosen mapper is " + "BioPortal or Zooma, provide a comma-separated list of acronyms (eg 'EFO,HPO') or write " + "'all' to search all ontologies") + parser.add_argument("-o", "--output", required=False, type=str, default="", + help="Path to desired output file for the mappings (default=current working directory)") + parser.add_argument("-m", "--mapper", required=False, type=str, default="tfidf", + help="Method used to compare source terms with ontology terms. One of: " + str(Mapper.list()) + + " (default=tfidf)") + parser.add_argument("-csv", "--csv_input", required=False, type=str, default=(), + help="Specifies that the input is a CSV file—This should be followed by the name of the column " + "that contains the terms to map, optionally followed by the name of the column that " + "contains identifiers for the terms (eg 'my_terms,my_term_ids')") + parser.add_argument("-sep", "--separator", required=False, type=str, default=',', + help="Specifies the cell separator to be used when reading a non-comma-separated tabular file") + parser.add_argument("-top", "--top_mappings", required=False, type=int, default=3, + help="Maximum number of top-ranked mappings returned per source term (default=3)") + parser.add_argument("-min", "--min_score", required=False, type=float, default=0.5, + help="Minimum similarity score [0,1] for the mappings (1=exact match; default=0.5)") + parser.add_argument("-iris", "--base_iris", required=False, type=str, default=(), + help="Map only to ontology terms whose IRIs start with a value given in this comma-separated " + "list (eg 'http://www.ebi.ac.uk/efo,http://purl.obolibrary.org/obo/HP)')") + parser.add_argument("-d", "--excl_deprecated", required=False, default=False, action="store_true", + help="Exclude ontology terms stated as deprecated via `owl:deprecated true` (default=False)") + parser.add_argument("-g", "--save_term_graphs", required=False, default=False, action="store_true", + help="Save vis.js graphs representing the neighborhood of each ontology term (default=False)") + parser.add_argument("-c", "--store_in_cache", required=False, type=str, default="", + help="Store the target ontology into local cache under acronym") + parser.add_argument("-type", "--term_type", required=False, type=str, default="class", + help="Define whether to return ontology classes, properties, or both") + parser.add_argument('-u', "--incl_unmapped", required=False, default=False, action="store_true", + help="Include all unmapped terms in the output") + + arguments = parser.parse_args() + if not os.path.exists(arguments.source): + parser.error("The file '{}' does not exist".format(arguments.source)) + sys.exit(1) + mapper = Mapper(arguments.mapper) + iris = arguments.base_iris + if len(iris) > 0: + iris = tuple(iris.split(',')) + csv_columns = arguments.csv_input + if len(csv_columns) > 0: + csv_columns = tuple(csv_columns.split(',')) + target = arguments.target + acronym = arguments.store_in_cache + if acronym != "": + cache_ontology(target, acronym, iris) + target = acronym + map_terms(arguments.source, target, output_file=arguments.output, csv_columns=csv_columns, + excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings, + min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs, + save_mappings=True, separator=arguments.separator, use_cache=cache_exists(target), + term_type=arguments.term_type, 
incl_unmapped=arguments.incl_unmapped) diff --git a/text2term/bioportal_mapper.py b/text2term/bioportal_mapper.py new file mode 100644 index 0000000..2e08bf0 --- /dev/null +++ b/text2term/bioportal_mapper.py @@ -0,0 +1,89 @@ +"""Provides BioPortalAnnotatorMapper class""" + +import json +import logging +import time +import requests +from text2term.term_mapping import TermMapping, TermMappingCollection +from text2term import onto_utils + + +class BioPortalAnnotatorMapper: + + def __init__(self, bp_api_key): + """ + :param bp_api_key: BioPortal API key + """ + self.logger = onto_utils.get_logger(__name__, logging.INFO) + self.url = "http://data.bioontology.org/annotator" + self.bp_api_key = bp_api_key + + def map(self, source_terms, source_terms_ids, ontologies, max_mappings=3, api_params=()): + """ + Find and return ontology mappings through the BioPortal Annotator Web service + :param source_terms: Collection of source terms to map to target ontologies + :param source_terms_ids: List of identifiers for the given source terms + :param ontologies: Comma-separated list of ontology acronyms (eg 'HP,EFO') or 'all' to search all ontologies. + The ontology names accepted must match the names used in BioPortal. Here are some known ontologies: + GO, UBERON, "CL" for Cell Ontology, MESH, SNOMEDCT, FMA, NCIT, EFO, DOID, MONDO, "PR" for Protein Ontology, + "HP" for Human Phenotype Ontology + :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned + :param api_params: Additional BioPortal Annotator-specific parameters to include in the request + """ + mappings = [] + for term, term_id in zip(source_terms, source_terms_ids): + mappings.extend(self._map_term(term, term_id, ontologies, max_mappings, api_params)) + return TermMappingCollection(mappings).mappings_df() + + def _map_term(self, source_term, source_term_id, ontologies, max_mappings, api_params): + params = { + "text": onto_utils.normalize(source_term), + "longest_only": "true", + "expand_mappings": "true", + "ontologies": ontologies + } + if len(api_params) > 0: + params.update(api_params) + self.logger.debug("API parameters: " + str(params)) + mappings = [] + self.logger.debug("Searching for ontology terms to match: " + source_term) + response = self._do_get_request(self.url, params=params) + if response is not None: + self.logger.debug("...found " + str(len(response)) + " mappings") + for mapping in response: + if len(mappings) < max_mappings: + mappings.append(self._mapping_details(source_term, source_term_id, mapping)) + return mappings + + def _mapping_details(self, source_term, source_term_id, mapping): + ann_class = mapping["annotatedClass"] + term_iri = ann_class["@id"] + term_link_bp = ann_class["links"]["self"] + term_label = self.get_term_details(term_link_bp) + return TermMapping(source_term, source_term_id, term_label, term_iri, 1) + + def get_term_details(self, term_iri): + response = self._do_get_request(term_iri) + term_label = "" + if response is not None: + term_label = onto_utils.remove_quotes(response["prefLabel"]) + return term_label + + def _do_get_request(self, request_url, params=None): + headers = { + "Authorization": "apiKey token=" + self.bp_api_key, + } + response = requests.get(request_url, params=params, headers=headers, verify=True) + if response.ok: + json_resp = json.loads(response.content) + if len(json_resp) > 0: + return json_resp + else: + self.logger.info("Empty response for input: " + request_url + " with parameters " + str(params)) + elif response.status_code 
== 429: # API is throttling requests + self.logger.info(response.reason + ". Status code: " + str(response.status_code) + ". Waiting 15 seconds.") + time.sleep(15) + return self._do_get_request(request_url, params) + else: + json_resp = json.loads(response.content) + self.logger.error(response.reason + ":" + request_url + ". " + json_resp["errors"][0]) diff --git a/text2term/config.py b/text2term/config.py new file mode 100644 index 0000000..60e3b39 --- /dev/null +++ b/text2term/config.py @@ -0,0 +1 @@ +VERSION = "4.1.4" diff --git a/text2term/mapper.py b/text2term/mapper.py new file mode 100644 index 0000000..dd92d57 --- /dev/null +++ b/text2term/mapper.py @@ -0,0 +1,20 @@ +"""Provides Mapper enum""" + +from enum import Enum + + +class Mapper(str, Enum): + """ Enumeration of "mappers" (ie string similarity metrics and Web APIs) available """ + LEVENSHTEIN = 'levenshtein' + JARO = 'jaro' + JARO_WINKLER = 'jarowinkler' + JACCARD = 'jaccard' + INDEL = 'indel' + FUZZY = 'fuzzy' + TFIDF = 'tfidf' + ZOOMA = 'zooma' + BIOPORTAL = 'bioportal' + + @classmethod + def list(cls): + return list(map(lambda c: c.value, cls)) diff --git a/text2term/onto_cache.py b/text2term/onto_cache.py new file mode 100644 index 0000000..204dcb0 --- /dev/null +++ b/text2term/onto_cache.py @@ -0,0 +1,73 @@ +import os +import sys +import text2term +import owlready2 +import pandas as pd +from text2term.term import OntologyTermType +from text2term.mapper import Mapper +from shutil import rmtree + +CACHE_FOLDER = "cache" + +""" +CACHING FUNCTIONS -- Public +""" + + +# Caches many ontologies from a csv +def cache_ontology_set(ontology_registry_path): + registry = pd.read_csv(ontology_registry_path) + cache_set = {} + for index, row in registry.iterrows(): + try: + cache = text2term.cache_ontology(row.url, row.acronym) + cache_set.update({row.acronym: cache}) + except Exception as err: + err_message = "Could not cache ontology " + row.acronym + " due to error: " + str(err) + sys.stderr.write(err_message) + owlready2.default_world.ontologies.clear() + return cache_set + + +# Will check if an acronym exists in the cache +def cache_exists(ontology_acronym=''): + return os.path.exists(os.path.join(CACHE_FOLDER, ontology_acronym)) + + +# Clears the cache +def clear_cache(ontology_acronym=''): + cache_dir = CACHE_FOLDER + if ontology_acronym != '': + cache_dir = os.path.join(CACHE_FOLDER, ontology_acronym) + # Is equivalent to: rm -r cache_dir + try: + rmtree(cache_dir) + sys.stderr.write("Cache has been cleared successfully\n") + except OSError as error: + sys.stderr.write("Cache cannot be removed:") + sys.stderr.write(str(error)) + + +# Class that is returned to run +class OntologyCache: + def __init__(self, ontology_acronym): + self.acronym = ontology_acronym + self.ontology = os.path.join(CACHE_FOLDER, ontology_acronym) + + def map_terms(self, source_terms, base_iris=(), excl_deprecated=False, max_mappings=3, min_score=0.3, + mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), + term_type=OntologyTermType.CLASS): + return text2term.map_terms(source_terms, self.acronym, base_iris=base_iris, + excl_deprecated=excl_deprecated, max_mappings=max_mappings, min_score=min_score, + mapper=mapper, output_file=output_file, save_graphs=save_graphs, + save_mappings=save_mappings, source_terms_ids=source_terms_ids, use_cache=True, + term_type=term_type) + + def clear_cache(self): + clear_cache(self.acronym) + + def cache_exists(self): + return cache_exists(self.acronym) + + def acronym(self): + 
return self.acronym diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py new file mode 100644 index 0000000..d0bc45b --- /dev/null +++ b/text2term/onto_utils.py @@ -0,0 +1,184 @@ +import logging +import pandas as pd +import bioregistry +import shortuuid +from owlready2 import * +from gensim.parsing import strip_non_alphanum, strip_multiple_whitespaces + + +BASE_IRI = "http://ccb.hms.harvard.edu/t2t/" + +STOP_WORDS = {'in', 'the', 'any', 'all', 'for', 'and', 'or', 'dx', 'on', 'fh', 'tx', 'only', 'qnorm', 'w', 'iqb', 's', + 'ds', 'rd', 'rdgwas', 'ICD', 'excluded', 'excluding', 'unspecified', 'certain', 'also', 'undefined', + 'ordinary', 'least', 'squares', 'FINNGEN', 'elsewhere', 'more', 'excluded', 'classified', 'classifeid', + 'unspcified', 'unspesified', 'specified', 'acquired', 'combined', 'unspeficied', 'elsewhere', 'by', + 'strict', 'wide', 'definition', 'definitions', 'confirmed', 'chapter', 'chapters', 'controls', + 'characterized', 'main', 'diagnosis', 'hospital', 'admissions', 'other', 'resulting', 'from'} + +TEMPORAL_WORDS = {'age', 'time', 'times', 'date', 'initiation', 'cessation', 'progression', 'duration', 'early', 'late', + 'later', 'trimester'} + +QUANTITY_WORDS = {'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'frequently', 'per', 'hour', 'day', 'week', 'month', + 'year', 'years', 'total', 'quantity', 'amount', 'level', 'levels', 'volume', 'count', 'counts', + 'percentage', 'abundance', 'proportion', 'content', 'average', 'prevalence', 'mean', 'ratio'} + + +def normalize_list(token_list): + normalized_token_list = [] + for token in token_list: + normalized_token_list.append(normalize(token)) + return normalized_token_list + + +def normalize(token): + """ + Normalizes a given string by converting to lower case, removing non-word characters, stop words, white space + :param token: Text to be normalized + :return: Normalized string + """ + token = strip_non_alphanum(token).lower() + token = token.replace("_", " ") + token = " ".join(w for w in token.split() if w not in STOP_WORDS) + token = strip_multiple_whitespaces(token) + return token + + +def remove_quotes(string): + string = string.replace("\"", "") + string = string.replace("\'", "") + return string + + +def remove_whitespace(string): + return string.replace(' ', '') + + +def curie_from_iri(iri): + curie = bioregistry.curie_from_iri(iri) + if curie is None: + sys.stderr.write("Error obtaining CURIE for IRI: " + iri) + return "" + else: + return curie.upper() + + +def label_from_iri(iri): + if "#" in iri: + return iri.split("#")[1] + else: + return iri.rsplit('/', 1)[1] + + +def iri_from_tag(source_tag): + iri = source_tag + if len(source_tag) > 0 and source_tag != "NA": + iri = remove_whitespace(iri) + if ":" in source_tag: + onto_name = iri.split(":")[0] + term_name = iri.replace(":", "_") + full_iri = _get_iri(onto_name, term_name) + iri = full_iri if len(full_iri) > 0 else iri + elif "_" in source_tag: + onto_name = iri.split("_")[0] + full_iri = _get_iri(onto_name, iri) + iri = full_iri if len(full_iri) > 0 else iri + return iri + + +def _get_iri(ont_name, term_name): + iri = '' + if ont_name in ONTOLOGY_IRIS: + if ont_name == 'ORPHA': + iri = ONTOLOGY_IRIS.get(ont_name) + term_name.replace('ORPHA_', 'Orphanet_') + elif ont_name == 'SNOMED' or ont_name == 'OMIM': + iri = ONTOLOGY_IRIS.get(ont_name) + term_name.replace(ont_name + '_', '') + else: + iri = ONTOLOGY_IRIS.get(ont_name) + term_name + return iri + + +def get_logger(name, level=logging.INFO): + formatter = logging.Formatter("%(asctime)s 
%(levelname)s [%(name)s]: %(message)s", "%Y-%m-%d %H:%M:%S") + logger = logging.getLogger(name) + logger.setLevel(level=level) + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(formatter) + if not logger.hasHandlers(): + logger.addHandler(console_handler) + logger.propagate = False + return logger + + +def parse_list_file(file_path): + file = open(file_path) + lines = file.read().splitlines() + file.close() + return lines + + +def parse_csv_file(file_path, term_column_name, term_id_column_name, separator=','): + data = pd.read_csv(file_path, sep=separator, engine='python') + data = data.dropna(subset=[term_column_name, term_id_column_name]) + if term_column_name not in data.columns: + raise ValueError("Could not find specified column name for input terms: " + term_column_name) + terms = data[term_column_name].values + if term_id_column_name not in data.columns: + term_ids = generate_iris(len(terms)) + elif data[term_id_column_name].isnull().values.all(): + term_ids = generate_iris(len(terms)) + else: + term_ids = data[term_id_column_name].values + return terms, term_ids + + +def parse_tsv_file(file_path, term_column_name, term_id_column_name): + return parse_csv_file(file_path, term_column_name, term_id_column_name, separator="\t") + + +def get_ontology_from_labels(term_labels): + onto_iri = BASE_IRI + "Ontology-" + generate_uuid() + onto = owlready2.get_ontology(onto_iri) + onto.metadata.comment.append("Created dynamically using text2term") + onto.metadata.comment.append(datetime.datetime.now()) + for term_label in term_labels: + with onto: + new_term_iri = generate_iri() + new_term = types.new_class(new_term_iri, (Thing,)) + new_term.label = term_label + return onto + + +def generate_uuid(): + return str(shortuuid.ShortUUID().random(length=10)) + + +def generate_iri(): + return BASE_IRI + "R" + generate_uuid() + + +def generate_iris(quantity): + return [generate_iri() for _ in range(quantity)] + + +OBO_BASE_IRI = "http://purl.obolibrary.org/obo/" +BIOPORTAL_BASE_IRI = "http://purl.bioontology.org/ontology/" +ORPHANET_IRI = "http://www.orpha.net/ORDO/" +ONTOLOGY_IRIS = {"EFO": "http://www.ebi.ac.uk/efo/", + "Orphanet": ORPHANET_IRI, + "ORPHA": ORPHANET_IRI, + "CL": OBO_BASE_IRI, + "MONDO": OBO_BASE_IRI, + "HP": OBO_BASE_IRI, + "UBERON": OBO_BASE_IRI, + "GO": OBO_BASE_IRI, + "DOID": OBO_BASE_IRI, + "CHEBI": OBO_BASE_IRI, + "OMIT": OBO_BASE_IRI, + "NCIT": OBO_BASE_IRI, + "MAXO": OBO_BASE_IRI, + "DRON": OBO_BASE_IRI, + "OAE": OBO_BASE_IRI, + "CIDO": OBO_BASE_IRI, + "OMIM": BIOPORTAL_BASE_IRI + "OMIM/", + "PATO": OBO_BASE_IRI, + "SNOMED": "http://snomed.info/id/"} diff --git a/text2term/preprocess.py b/text2term/preprocess.py new file mode 100644 index 0000000..d16a036 --- /dev/null +++ b/text2term/preprocess.py @@ -0,0 +1,142 @@ +import re +from .tagged_term import TaggedTerm + + +## Tags should be stored with their terms in the same line, delineated by ";:;" +## ex: Age when diagnosed with (.*) ;:; age,diagnosis +## "Age when diagnosed with cancer" becomes: {"cancer", ["age", "diagnosis"]} +def preprocess_tagged_terms(file_path, template_path="", blocklist_path="", + blocklist_char='', rem_duplicates=False, separator=";:;"): + # Separate tags from the terms, put in TaggedTerm and add to list + raw_terms = _get_values(file_path) + terms = [] + for raw_term in raw_terms: + separated = raw_term.split(separator) + try: + tags = separated[1].split(",") + term = TaggedTerm(original_term=separated[0], tags=tags) + except IndexError: + term = 
TaggedTerm(original_term=raw_term) + terms.append(term) + + # Seperate tags from templates, store together in dictionary + templates = {} + if template_path != "": + raw_templates = _get_values(template_path) + for raw_template in raw_templates: + separated = raw_template.split(separator) + try: + tags = separated[1].split(",") + regex_term = re.compile(separated[0]) + templates[regex_term] = tags + except IndexError: + regex_term = re.compile(raw_template) + templates[regex_term] = [] + templates[re.compile("(.*)")] = [] + + # Create the blocklist, if it exists + blocklist = [] + if blocklist_path != "": + blocklist_strings = _get_values(blocklist_path) + blocklist = _make_regex_list(blocklist_strings) + + processed_terms = [] + for term in terms: + if _blocklist_term(processed_terms, term, blocklist, blocklist_char, tagged=True): + continue + for template, term_tags in templates.items(): + match = template.fullmatch(term.get_original_term()) + if match: + combined_matches = ' '.join(map(str, match.groups())) + if combined_matches: + _update_tagged_term(processed_terms, term, combined_matches, term_tags) + break + + if rem_duplicates: + processed_terms = _remove_duplicates(processed_terms) + + return processed_terms + + +def preprocess_terms(terms, template_path, output_file="", blocklist_path="", blocklist_char='', rem_duplicates=False): + if isinstance(terms, str): + terms = _get_values(terms) # if 'terms' is a string, we assume it is a filepath + # Form the templates as regular expressions + template_strings = [] + if template_path != "": + template_strings = _get_values(template_path) + template_strings.append("(.*)") + templates = _make_regex_list(template_strings) + + # Create the blocklist, if it exists + blocklist = [] + if blocklist_path != "": + blocklist_strings = _get_values(blocklist_path) + blocklist = _make_regex_list(blocklist_strings) + + # Checks all terms against each blocklist then template + processed_terms = {} + for term in terms: + if _blocklist_term(processed_terms, term, blocklist, blocklist_char): + continue + for template in templates: + match = template.fullmatch(term) + if match: + combined_matches = ' '.join(map(str, match.groups())) + if combined_matches: + processed_terms[term] = combined_matches + break + + if rem_duplicates: + processed_terms = _remove_duplicates(processed_terms) + + if output_file != "": + with open(output_file, 'w') as fp: + fp.write('\n'.join(processed_terms.values())) + return processed_terms + + +## Note: Because Python Dictionaries and Lists are passed by reference (sort of), updating the +## dictionary/list here will update the dictionary in the caller +def _blocklist_term(processed_terms, term, blocklist, blocklist_char, tagged=False): + for banned in blocklist: + match = banned.fullmatch(term if type(term) is not TaggedTerm else term.get_original_term()) + if match: + if blocklist_char != '': + if tagged: + _update_tagged_term(processed_terms, term, blocklist_char) + else: + processed_terms[term] = blocklist_char + return True + return False + + +def _update_tagged_term(processed_terms, term, new_term, tags=[]): + term.update_term(new_term) + term.add_tags(tags) + processed_terms.append(term) + + +def _get_values(path): + return open(path).read().splitlines() + + +def _make_regex_list(strings): + regexes = [] + for string in strings: + regexes.append(re.compile(string)) + return regexes + + +def _remove_duplicates(terms): + if type(terms) is dict: + temp = {val : key for key, val in terms.items()} + final = {val : key for key, 
val in temp.items()} + else: + temp = [] + final = [] + for term in terms: + if term.get_term() not in temp: + temp.append(term.get_term()) + final.append(term) + return final diff --git a/text2term/resources/ontologies.csv b/text2term/resources/ontologies.csv new file mode 100644 index 0000000..910acbd --- /dev/null +++ b/text2term/resources/ontologies.csv @@ -0,0 +1,9 @@ +acronym,version,url +CL,2023-09-21,https://github.com/obophenotype/cell-ontology/releases/download/v2023-09-21/cl.owl +EFO,3.57.0,https://github.com/EBISPOT/efo/releases/download/v3.57.0/efo.owl +FOODON,0.6.0,https://github.com/FoodOntology/foodon/raw/v0.6.0/foodon.owl +GO,2023-07-27,http://purl.obolibrary.org/obo/go/releases/2023-07-27/go.owl +HPO,2023-09-01,https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2023-09-01/hp.owl +MONDO,2023-09-12,https://github.com/monarch-initiative/mondo/releases/download/v2023-08-02/mondo.owl +NCIT,2022-08-19,https://github.com/NCI-Thesaurus/thesaurus-obo-edition/releases/download/v2022-08-19/ncit.owl +UBERON,2023-09-05,https://github.com/obophenotype/uberon/releases/download/v2023-09-05/uberon.owl \ No newline at end of file diff --git a/text2term/syntactic_mapper.py b/text2term/syntactic_mapper.py new file mode 100644 index 0000000..5316303 --- /dev/null +++ b/text2term/syntactic_mapper.py @@ -0,0 +1,125 @@ +"""Provides SyntacticMapper class""" + +import logging +import nltk +import rapidfuzz +from tqdm import tqdm +from text2term import onto_utils +from text2term.mapper import Mapper +from text2term.term_mapping import TermMapping, TermMappingCollection + + +class SyntacticMapper: + + def __init__(self, target_ontology_terms): + """ + :param target_ontology_terms: Collection of ontology terms to be mapped against + """ + self.logger = onto_utils.get_logger(__name__, logging.INFO) + self.target_ontology_terms = target_ontology_terms + + def map(self, source_terms, source_terms_ids, mapper=Mapper.JARO_WINKLER, max_mappings=3): + """ + :param source_terms: List of source terms to be mapped with ontology terms + :param source_terms_ids: List of identifiers for the given source terms + :param mapper: Mapping method to be used for matching + :param max_mappings: Maximum number of (top scoring) ontology term mappings that should be returned + """ + mappings = [] + for term, term_id in tqdm(zip(source_terms, source_terms_ids), total=len(source_terms)): + matches = self._map(term, term_id, mapper, max_mappings) + mappings.extend(matches) + return TermMappingCollection(mappings).mappings_df() + + def _map(self, source_term, source_term_id, mapper, max_matches=3): + self.logger.debug("Matching %s...", source_term) + term_matches = [] + for term in self.target_ontology_terms.values(): + highest_similarity = 0.0 + for target_name in self._term_names(term): + similarity = self.compare(source_term, target_name, mapper) + self.logger.debug("%s -> %s (%.2f)", source_term, target_name, similarity) + if similarity > highest_similarity: + highest_similarity = similarity + term_matches.append(TermMapping(source_term, source_term_id, term.label, term.iri, highest_similarity)) + matches_sorted = sorted(term_matches, key=lambda x: x.mapping_score, reverse=True) + del matches_sorted[max_matches:] + return matches_sorted + + def _term_names(self, ontology_term): + lbls_syns = [] + lbls_syns.extend(ontology_term.labels) + lbls_syns.extend(ontology_term.synonyms) + return lbls_syns + + def compare(self, s1, s2, mapper): + """ + Compare the given strings s1 and s2 with respect to the 
specified mapping method + :param s1: source string + :param s2: target string + :param mapper: Mapping method to be used + """ + if mapper == Mapper.LEVENSHTEIN: + return self.compare_levenshtein(s1, s2) + elif mapper == Mapper.JARO: + return self.compare_jaro(s1, s2) + elif mapper == Mapper.JARO_WINKLER: + return self.compare_jarowinkler(s1, s2) + elif mapper == Mapper.INDEL: + return self.compare_indel(s1, s2) + elif mapper == Mapper.FUZZY: + return self.compare_fuzzy_ratio(s1, s2) + elif mapper == Mapper.JACCARD: + return self.compare_jaccard(s1, s2) + else: + raise ValueError("Unsupported mapping method: " + str(mapper)) + + def compare_levenshtein(self, s1, s2): + """ + Calculates the normalized Levenshtein distance between s1 and s2. + :return similarity between s1 and s2 as a float between 0 and 1 + """ + similarity = rapidfuzz.string_metric.normalized_levenshtein(s1, s2)/100 + return similarity + + def compare_jaro(self, s1, s2): + """ + Calculates the Jaro similarity between s1 and s2. + :return similarity between s1 and s2 as a float between 0 and 1 + """ + similarity = rapidfuzz.string_metric.jaro_similarity(s1, s2)/100 + return similarity + + def compare_jarowinkler(self, s1, s2): + """ + Calculates the Jaro-Winkler similarity between s1 and s2. + :return similarity between s1 and s2 as a float between 0 and 1 + """ + similarity = rapidfuzz.string_metric.jaro_winkler_similarity(s1, s2)/100 + return similarity + + def compare_indel(self, s1, s2): + """ + Calculates the normalized Indel distance between s1 and s2. + See: https://maxbachmann.github.io/RapidFuzz/Usage/fuzz.html#ratio + :return similarity between s1 and s2 as a float between 0 and 1 + """ + similarity = rapidfuzz.fuzz.ratio(s1, s2)/100 + return similarity + + def compare_fuzzy_ratio(self, s1, s2): + """ + Calculates a weighted ratio between s1 and s2 based on rapidfuzz's fuzzy ratio algorithms. + See: https://maxbachmann.github.io/RapidFuzz/Usage/fuzz.html#wratio + :return similarity between s1 and s2 as a float between 0 and 1 + """ + similarity = rapidfuzz.fuzz.WRatio(s1, s2)/100 + return similarity + + def compare_jaccard(self, s1, s2): + """ + Calculates a Jaccard-based similarity between s1 and s2. 
+ :return similarity between s1 and s2 as a float between 0 and 1 + """ + similarity = 1-nltk.jaccard_distance(set(s1), set(s2)) + return similarity diff --git a/text2term/t2t.py b/text2term/t2t.py new file mode 100644 index 0000000..a2e27a4 --- /dev/null +++ b/text2term/t2t.py @@ -0,0 +1,346 @@ +import os +import json +import pickle +import logging +import datetime +import time +import pandas as pd +from text2term import onto_utils +from text2term import onto_cache +from text2term.mapper import Mapper +from text2term.term import OntologyTermType +from text2term.term_collector import OntologyTermCollector +from text2term.term_collector import filter_terms +from text2term.term_graph_generator import TermGraphGenerator +from text2term.bioportal_mapper import BioPortalAnnotatorMapper +from text2term.syntactic_mapper import SyntacticMapper +from text2term.tfidf_mapper import TFIDFMapper +from text2term.zooma_mapper import ZoomaMapper +from text2term.config import VERSION +from text2term.tagged_term import TaggedTerm +from text2term.term_mapping import TermMapping + +IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "] +UNMAPPED_TAG = "unmapped" +OUTPUT_COLUMNS = ["Source Term", "Source Term ID", "Mapped Term Label", + "Mapped Term CURIE", "Mapped Term IRI", "Mapping Score", "Tags"] + +LOGGER = onto_utils.get_logger(__name__, level=logging.INFO) + + +def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, + min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, + source_terms_ids=(), separator=',', use_cache=False, term_type=OntologyTermType.CLASS, + incl_unmapped=False): + """ + Maps the terms in the given list to the specified target ontology. + + Parameters + ---------- + source_terms : str or list or dict + Path to file containing the terms to map to. Or list of terms to map to an ontology. Or dictionary containing + tagged terms, where the keys are the source terms and the values are tags attached to those terms + target_ontology : str + Filepath or URL of 'target' ontology to map the source terms to. When the chosen mapper is BioPortal or Zooma, + provide a comma-separated list of ontology acronyms (eg 'EFO,HPO') or write 'all' to search all ontologies. + When the target ontology has been previously cached, provide the ontology name as used when it was cached + base_iris : tuple + Map only to ontology terms whose IRIs start with one of the strings given in this tuple, for example: + ('http://www.ebi.ac.uk/efo','http://purl.obolibrary.org/obo/HP') + csv_columns : tuple + Name of column containing the terms to map, optionally followed by another column name containing the term IDs, + for example: ('disease', 'disease_identifier') + source_terms_ids : tuple + Collection of identifiers for the given source terms + excl_deprecated : bool + Exclude ontology terms stated as deprecated via `owl:deprecated true` + mapper : mapper.Mapper + Method used to compare source terms with ontology terms. 
One of: levenshtein, jaro, jarowinkler, jaccard, + fuzzy, tfidf, zooma, bioportal + max_mappings : int + Maximum number of top-ranked mappings returned per source term + min_score : float + Minimum similarity score [0,1] for the mappings (1=exact match) + output_file : str + Path to desired output file for the mappings + save_graphs : bool + Save vis.js graphs representing the neighborhood of each ontology term + save_mappings : bool + Save the generated mappings to a file (specified by `output_file`) + separator : str + Symbol used to separate columns in the input table (eg ',' or '\t' for csv or tsv, respectively) + use_cache : bool + Use a previously cached ontology + term_type : OntologyTermType + The type(s) of ontology terms to map to, which can be 'class' or 'property' or 'any' + incl_unmapped : bool + Include unmapped terms in the output data frame + + Returns + ---------- + df + Data frame containing the generated ontology mappings + """ + # Parse the possible source terms options and tags + source_terms, source_term_ids, tags = _parse_source_terms(source_terms, source_terms_ids, csv_columns, separator) + # Create source term IDs if they are not provided + if len(source_terms_ids) != len(source_terms): + if len(source_terms_ids) > 0: + LOGGER.warning(f"The number of Source Term IDs provided ({len(source_terms_ids)}) is different than the " + f"number of Source Terms ({len(source_terms)}). New Source Term IDs will be used instead.") + source_terms_ids = onto_utils.generate_iris(len(source_terms)) + # Create the output file + if output_file == '': + timestamp = datetime.datetime.now().strftime("%d-%m-%YT%H-%M-%S") + output_file = "t2t-mappings-" + timestamp + ".csv" + # Load the ontology for either Zooma, Bioportal, or directly + if mapper in {Mapper.ZOOMA, Mapper.BIOPORTAL}: + target_terms = '' if target_ontology.lower() == 'all' else target_ontology + else: + target_terms = _load_ontology(target_ontology, base_iris, excl_deprecated, use_cache, term_type) + # Run the mapper + LOGGER.info(f"Mapping {len(source_terms)} source terms to {target_ontology}") + mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score, tags, + incl_unmapped) + mappings_df["Mapping Score"] = mappings_df["Mapping Score"].astype(float).round(decimals=3) + if save_mappings: + _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris, + excl_deprecated, max_mappings, term_type, source_terms, incl_unmapped) + if save_graphs: + _save_graphs(target_terms, output_file) + return mappings_df + + +# Caches a single ontology +def cache_ontology(ontology_url, ontology_acronym="", base_iris=()): + if ontology_acronym == "": + ontology_acronym = ontology_url + ontology_terms = _load_ontology(ontology_url, base_iris, exclude_deprecated=False, term_type=OntologyTermType.ANY) + cache_dir = os.path.join("cache", ontology_acronym) + LOGGER.info(f"Caching ontology {ontology_url} to: {cache_dir}") + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + _serialize_ontology(ontology_terms, ontology_acronym, cache_dir) + _save_graphs(ontology_terms, output_file=os.path.join(cache_dir, ontology_acronym)) + ontology_terms.clear() + return onto_cache.OntologyCache(ontology_acronym) + + +""" +PRIVATE/HELPER FUNCTIONS +""" + + +# Parses the source terms and returns what is to be mapped, the term ids, and the tags +def _parse_source_terms(source_terms, source_terms_ids=(), csv_columns=(), separator=','): + # If source_terms is a string, we assume it is a 
file location + if isinstance(source_terms, str): + terms, source_terms_ids = _load_data(source_terms, csv_columns, separator) + tags = dict.fromkeys(terms) + # If source_terms is a dictionary, the keys are terms and the values are tags + elif isinstance(source_terms, dict): + terms = list(source_terms.keys()) + tags = source_terms + # Otherwise, it is a list of either TaggedTerms or strings + elif isinstance(source_terms[0], TaggedTerm): + terms = [] + source_terms_id_list = [] + for tagged_term in source_terms: + terms.append(tagged_term.get_term()) + if tagged_term.get_source_term_id() is None: + source_terms_id_list.append(tagged_term.get_source_term_id()) + source_terms_ids = source_terms_id_list + tags = source_terms + else: + terms = source_terms + tags = dict.fromkeys(terms) + return terms, source_terms_ids, tags + + +def _serialize_ontology(ontology_terms, ontology_acronym, cache_dir): + with open(os.path.join(cache_dir, ontology_acronym + "-term-details.pickle"), 'wb+') as out_file: + pickle.dump(ontology_terms, out_file) + + +def _load_data(input_file_path, csv_column_names, separator): + if len(csv_column_names) >= 1: + term_id_col_name = "" + if len(csv_column_names) == 2: + term_id_col_name = csv_column_names[1] + terms, term_ids = onto_utils.parse_csv_file(input_file_path, separator=separator, + term_column_name=csv_column_names[0], + term_id_column_name=term_id_col_name) + else: + terms = onto_utils.parse_list_file(input_file_path) + term_ids = onto_utils.generate_iris(len(terms)) + return terms, term_ids + + +def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_type=OntologyTermType.CLASS): + if use_cache: + pickle_file = os.path.join("cache", ontology, ontology + "-term-details.pickle") + LOGGER.info(f"Loading cached ontology from: {pickle_file}") + with open(pickle_file, "rb") as cached_ontology_pickle: + onto_terms_unfiltered = pickle.load(cached_ontology_pickle) + onto_terms = filter_terms(onto_terms_unfiltered, iris, exclude_deprecated, term_type) + else: + term_collector = OntologyTermCollector(ontology_iri=ontology) + onto_terms = term_collector.get_ontology_terms(base_iris=iris, exclude_deprecated=exclude_deprecated, + term_type=term_type) + term_collector.close() + LOGGER.info(f"Filtered ontology terms to those of type: {term_type}") + if len(onto_terms) == 0: + raise RuntimeError("Could not find any terms in the given ontology.") + return onto_terms + + +def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score, tags, incl_unmapped): + to_map, tags = _process_tags(source_terms, tags) + start = time.time() + if mapper == Mapper.TFIDF: + term_mapper = TFIDFMapper(ontology_terms) + mappings_df = term_mapper.map(to_map, source_term_ids, max_mappings=max_mappings, min_score=min_score) + elif mapper == Mapper.ZOOMA: + term_mapper = ZoomaMapper() + mappings_df = term_mapper.map(to_map, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + elif mapper == Mapper.BIOPORTAL: + term_mapper = BioPortalAnnotatorMapper("8f0cbe43-2906-431a-9572-8600d3f4266e") + mappings_df = term_mapper.map(to_map, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) + elif mapper in {Mapper.LEVENSHTEIN, Mapper.JARO, Mapper.JARO_WINKLER, Mapper.INDEL, Mapper.FUZZY, Mapper.JACCARD}: + term_mapper = SyntacticMapper(ontology_terms) + mappings_df = term_mapper.map(to_map, source_term_ids, mapper, max_mappings=max_mappings) + else: + raise ValueError("Unsupported mapper: " + mapper) + 
LOGGER.info("...done (mapping time: %.2fs seconds)", time.time() - start) + + # Filter terms by the mapping score specified + if mapper == Mapper.BIOPORTAL: + LOGGER.warning("The BioPortal mapper does not return a 'mapping score' for its mappings, so the min_score " + "filter has no effect on BioPortal mappings. The mapping score is hardcoded to 1 by text2term.") + else: + LOGGER.debug("Filtering mappings by their score...") + start_filter = time.time() + mappings_df = _filter_mappings(mappings_df, min_score) + LOGGER.debug("...done (filtering time: %.2fs seconds)", time.time() - start_filter) + + # Include in output data frame any input terms that did not meet min_score threshold + if incl_unmapped: + LOGGER.debug("Adding unmapped terms...") + start_unmapped = time.time() + mappings_df = _add_unmapped_terms(mappings_df, tags, source_terms, source_term_ids) + LOGGER.debug("...done (adding unmapped time: %.2fs seconds)", time.time() - start_unmapped) + + # Add tags + if not mappings_df.empty: + LOGGER.debug("Adding tags...") + start_tagging = time.time() + mappings_df = _add_tags_to_df(mappings_df, tags) + LOGGER.debug("...done (adding tags time: %.2fs seconds)", time.time() - start_tagging) + return mappings_df + + +# Takes in the tags and source terms and processes them accordingly +def _process_tags(source_terms, tags): + to_map = [] + # IGNORE TAGS SECTION + for term in source_terms: + if isinstance(tags, dict): + term_tags = tags[term] + else: + for tag in tags: + if tag.get_term() == term: + term_tags = tag.get_tags() + break + if isinstance(term_tags, list): + if not any(tag in IGNORE_TAGS for tag in term_tags): + to_map.append(term) + else: + if term_tags not in IGNORE_TAGS: + to_map.append(term) + return to_map, tags + + +def _add_tags_to_df(df, tags): + if isinstance(tags, dict): + for key, value in tags.items(): + if isinstance(value, list): + to_store = ','.join(value) + else: + to_store = str(value) + df.loc[df['Source Term'] == key, "Tags"] = to_store + else: + for term in tags: + to_store = ','.join(term.get_tags()) + df.loc[df['Source Term'] == term.get_term(), "Tags"] = to_store + return df + + +def _filter_mappings(mappings_df, min_score): + if mappings_df.empty: + return mappings_df + new_df = mappings_df.loc[mappings_df["Mapping Score"] >= min_score] + return new_df + + +def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids): + if mappings_df.size == 0: + mapped = [] + mappings_df = pd.DataFrame(columns=OUTPUT_COLUMNS) + else: + mapped = pd.unique(mappings_df["Source Term"]) + for (term, term_id) in zip(source_terms, source_terms_ids): + if term not in mapped: + non_mapping = TermMapping(term, term_id, "", "", 0) + _add_tag(tags, term, UNMAPPED_TAG, ignore=True) + mappings_df.loc[len(mappings_df.index)] = non_mapping.to_dict() + return mappings_df + + +def _add_tag(tags, term, to_add, ignore=False): + if isinstance(tags, dict): + new_tags = tags.get(term, []) + if new_tags is None: + new_tags = [] + if not (ignore and any(tag in IGNORE_TAGS for tag in new_tags)): + if isinstance(new_tags, list): + new_tags.append(to_add) + elif new_tags != "": + new_tags = [new_tags, to_add] + else: + new_tags = [to_add] + tags[term] = new_tags + else: + for tagged_term in tags: + check_ignore = not ignore and not any(tagged_term.has_tag(tag) for tag in IGNORE_TAGS) + if tagged_term.get_term() == term and check_ignore: + tagged_term.add_tags([to_add]) + + +def _save_mappings(mappings, output_file, min_score, mapper, target_ontology, base_iris, + excl_deprecated, 
max_mappings, term_type, source_terms, incl_unmapped): + if os.path.dirname(output_file): # create output directories if needed + os.makedirs(os.path.dirname(output_file), exist_ok=True) + with open(output_file, "a") as f: + f.write("# Timestamp: %s\n" % datetime.datetime.now()) + f.write("# Target Ontology: %s\n" % target_ontology) + f.write("# text2term version: %s\n" % VERSION) + f.write("# Minimum Score: %.2f\n" % min_score) + f.write("# Mapper: %s\n" % mapper.value) + f.write("# Base IRIs: %s\n" % (base_iris,)) + f.write("# Max Mappings: %d\n" % max_mappings) + f.write("# Term Type: %s\n" % term_type) + f.write("# Deprecated Terms ") + f.write("Excluded\n" if excl_deprecated else "Included\n") + f.write("# Unmapped Terms ") + f.write("Excluded\n" if not incl_unmapped else "Included\n") + writestring = "# Of " + str(len(source_terms)) + " entries, " + str(len(pd.unique(mappings["Source Term ID"]))) + writestring += " were mapped to " + str( + len(pd.unique(mappings["Mapped Term IRI"]))) + " unique terms\n" + f.write(writestring) + mappings.to_csv(output_file, index=False, mode='a') + + +def _save_graphs(terms, output_file): + term_graphs = TermGraphGenerator(terms).graphs_dicts() + with open(output_file + "-term-graphs.json", 'w') as json_file: + json.dump(term_graphs, json_file, indent=2) diff --git a/text2term/tagged_term.py b/text2term/tagged_term.py new file mode 100644 index 0000000..db26dd8 --- /dev/null +++ b/text2term/tagged_term.py @@ -0,0 +1,40 @@ +"""Provides TaggedTerm class""" + + +class TaggedTerm: + def __init__(self, term=None, tags=(), original_term=None, source_term_id=None): + self.term = term + self.tags = tags + self.original_term = original_term + self.source_term_id = source_term_id + + def __repr__(self): + return f" 0: + for iri in base_iris: + iri = iri.strip() + query = iri + "*" + self.logger.info("...collecting terms with IRIs starting in: " + iri) + iris = list(default_world.search(iri=query)) + ontology_terms = ontology_terms | self._get_ontology_terms(iris, self.ontology, exclude_deprecated, + term_type) + else: + ontology_signature = self._get_ontology_signature(self.ontology) + ontology_terms = self._get_ontology_terms(ontology_signature, self.ontology, exclude_deprecated, term_type) + end = time.time() + self.logger.info("...done: collected %i ontology terms (collection time: %.2fs)", len(ontology_terms), + end - start) + return ontology_terms + + def filter_terms(self, onto_terms, iris=(), excl_deprecated=False, term_type=OntologyTermType.ANY): + return filter_terms(onto_terms, iris, excl_deprecated, term_type) + + def _get_ontology_signature(self, ontology): + signature = list(ontology.classes()) + signature.extend(list(ontology.properties())) + # owlready2::ontology.classes() does not include classes in imported ontologies; we need to explicitly add them + for imported_ontology in ontology.imported_ontologies: + signature.extend(list(imported_ontology.classes())) + signature.extend(list(imported_ontology.properties())) + return signature + + def _get_ontology_terms(self, term_list, ontology, exclude_deprecated, term_type): + ontology_terms = dict() + for ontology_term in term_list: + # Parse if should include ontology classes, properties, or both + include = _filter_term_type(ontology_term, term_type, False) + if include and ontology_term is not Thing and ontology_term is not Nothing: + if (exclude_deprecated and not deprecated[ontology_term]) or (not exclude_deprecated): + iri = ontology_term.iri + labels = self._get_labels(ontology_term) + 
synonyms = self._get_synonyms(ontology_term) + named_parents, complex_parents = self._get_parents(ontology_term) + children = self._get_children(ontology_term, ontology) + instances = self._get_instances(ontology_term, ontology) + definitions = self._get_definitions(ontology_term) + is_deprecated = deprecated[ontology_term] == [True] + if _filter_term_type(ontology_term, OntologyTermType.CLASS, False): + owl_term_type = OntologyTermType.CLASS + elif _filter_term_type(ontology_term, OntologyTermType.PROPERTY, False): + owl_term_type = OntologyTermType.PROPERTY + else: + owl_term_type = "undetermined" + self.logger.warn("Term has undetermined type %s %s", iri, labels) + term_details = OntologyTerm(iri, labels, definitions=definitions, synonyms=synonyms, + parents=named_parents, children=children, instances=instances, + restrictions=complex_parents, deprecated=is_deprecated, + term_type=owl_term_type) + ontology_terms[iri] = term_details + else: + self.logger.debug("Excluding deprecated ontology term: %s", ontology_term.iri) + return ontology_terms + + def _get_parents(self, ontology_term): + parents = dict() # named/atomic superclasses except owl:Thing + restrictions = dict() # restrictions are class expressions such as 'pancreatitis disease_has_location pancreas' + try: + all_parents = ontology_term.is_a # obtain direct parents of this entity + for parent in all_parents: + # exclude owl:Thing and Self + if parent is not Thing and parent is not ontology_term: + if isinstance(parent, ThingClass): # get named parents (i.e. classes with IRIs) + self._add_named_parent(parent, parents) + elif isinstance(parent, And): # get conjuncts and add them to the respective structures + for conjunct in parent.Classes: + if isinstance(conjunct, ThingClass): # if conjunct is a named class, add it to parents dict + self._add_named_parent(conjunct, parents) + else: + self._add_complex_parent(conjunct, restrictions) + elif isinstance(parent, Restriction): # get complex parents, i.e. 
restrictions or class expressions + self._add_complex_parent(parent, restrictions) + except (AttributeError, ValueError) as err: + self.logger.debug(err) + return parents, restrictions + + def _add_named_parent(self, parent, parents): + if len(parent.label) > 0: + parents.update({parent.iri: parent.label[0]}) + else: + parents.update({parent.iri: onto_utils.label_from_iri(parent.iri)}) + + def _add_complex_parent(self, parent, restrictions): + property_iri = parent.property.iri + if isinstance(parent.value, ThingClass): # the filler is a named term (i.e., it has an IRI) + value = parent.value.iri + else: # the filler is another complex class expression + value = parent.value + if property_iri in restrictions.keys(): + current_restrictions = restrictions[property_iri] + current_restrictions.add(value) + restrictions.update({property_iri: current_restrictions}) + else: + restrictions.update({property_iri: str(value)}) + + def _get_children(self, ontology_term, ontology): + children = dict() + try: + for child in ontology.get_children_of(ontology_term): + if len(child.iri) > 0: + if len(child.label) > 0: + children.update({child.iri: child.label[0]}) + else: + children.update({child.iri: onto_utils.label_from_iri(child.iri)}) + except (TypeError, AttributeError, ValueError) as err: + self.logger.debug(err) + return children + + def _get_instances(self, ontology_term, ontology): + instances = dict() + try: + for instance in ontology.get_instances_of(ontology_term): + if len(instance.iri) > 0: + if len(instance.label) > 0: + instances.update({instance.iri: instance.label[0]}) + else: + instances.update({instance.iri: onto_utils.label_from_iri(instance.iri)}) + except AttributeError as err: + self.logger.debug(err) + return instances + + def _get_labels(self, ontology_term): + """ + Collect the labels of the given ontology term both given by rdfs:label and skos:prefLabel + :param ontology_term: Ontology term + :return: Collection of labels of the ontology term + """ + labels = set() + for rdfs_label in self._get_rdfs_labels(ontology_term): + labels.add(rdfs_label) + for skos_label in self._get_skos_pref_labels(ontology_term): + labels.add(skos_label) + if len(labels) == 0: + label_from_iri = onto_utils.label_from_iri(ontology_term.iri) + self.logger.debug("...ontology term %s has no labels (rdfs:label or skos:prefLabel). " + "Using a label based on the term IRI: %s", ontology_term.iri, label_from_iri) + labels.add(label_from_iri) + self.logger.debug("...collected %i labels and synonyms for %s", len(labels), ontology_term) + return labels + + def _get_synonyms(self, ontology_term, include_related_synonyms=False, include_broad_synonyms=False): + """ + Collect the synonyms of the given ontology term + :param ontology_term: Ontology term + :param include_broad_synonyms: true if broad (i.e. 
more generic) synonyms should be included, false otherwise + :return: Collection of synonyms of the ontology term + """ + synonyms = set() + for synonym in self._get_obo_exact_synonyms(ontology_term): + synonyms.add(synonym) + for nci_synonym in self._get_nci_synonyms(ontology_term): + synonyms.add(nci_synonym) + for efo_alt_term in self._get_efo_alt_terms(ontology_term): + synonyms.add(efo_alt_term) + if include_related_synonyms: + for synonym in self._get_obo_related_synonyms(ontology_term): + synonyms.add(synonym) + if include_broad_synonyms: + for synonym in self._get_obo_broad_synonyms(ontology_term): + synonyms.add(synonym) + self.logger.debug("...collected %i synonyms for %s", len(synonyms), ontology_term) + return synonyms + + def _get_rdfs_labels(self, ontology_term): + """ + Collect labels of the given term that are specified using the standard rdfs:label annotation property + :param ontology_term: Ontology term to collect labels from + :return: Collection of RDFS labels + """ + rdfs_labels = [] + try: + for rdfs_label in ontology_term.label: + rdfs_labels.append(rdfs_label) + except (AttributeError, ValueError) as err: + self.logger.debug(err) + return rdfs_labels + + def _get_skos_pref_labels(self, ontology_term): + """ + Collect labels of the given term that are specified using the skos:prefLabel annotation property + :param ontology_term: Ontology term to collect labels from + :return: Collection of SKOS preferred labels + """ + skos_labels = [] + try: + for skos_pref_label in ontology_term.prefLabel: + skos_labels.append(skos_pref_label) + except AttributeError as err: + self.logger.debug(err) + return skos_labels + + def _get_efo_alt_terms(self, ontology_term): + efo_alt_terms = [] + try: + for efo_alt_term in ontology_term.alternative_term: + efo_alt_terms.append(efo_alt_term) + except AttributeError as err: + self.logger.debug(err) + return efo_alt_terms + + def _get_obo_exact_synonyms(self, ontology_term): + """ + Collect exact synonyms of the given term that are specified using the annotation property: + <http://www.geneontology.org/formats/oboInOwl#hasExactSynonym>. + :param ontology_term: Ontology term to collect exact synonyms from + :return: Collection of exact synonyms + """ + synonyms = [] + try: + for synonym in ontology_term.hasExactSynonym: + if hasattr(synonym, 'iri'): + synonym = synonym.iri + synonyms.append(synonym) + except AttributeError as err: + self.logger.debug(err) + return synonyms + + def _get_obo_related_synonyms(self, ontology_term): + """ + Collect related synonyms of the given term that are specified using the annotation property: + <http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym>. + :param ontology_term: Ontology term to collect related synonyms from + :return: Collection of related synonyms + """ + synonyms = [] + try: + for synonym in ontology_term.hasRelatedSynonym: + if hasattr(synonym, 'iri'): + synonym = synonym.iri + synonyms.append(synonym) + except AttributeError as err: + self.logger.debug(err) + return synonyms + + def _get_obo_broad_synonyms(self, ontology_term): + """ + Collect broad synonyms of the given term that are specified using the annotation property: + <http://www.geneontology.org/formats/oboInOwl#hasBroadSynonym>.
+ :param ontology_term: Ontology term to collect broad synonyms from + :return: Collection of broad synonyms + """ + synonyms = [] + try: + for synonym in ontology_term.hasBroadSynonym: + if hasattr(synonym, 'iri'): + synonym = synonym.iri + synonyms.append(synonym) + except AttributeError as err: + self.logger.debug(err) + return synonyms + + def _get_nci_synonyms(self, ontology_term): + """ + Collect synonyms of the given term that are specified using the NCI Thesaurus annotation property: + <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P90>. + :param ontology_term: Ontology term to collect synonyms from + :return: Collection of synonyms + """ + nci_synonyms = [] + try: + for synonym in ontology_term.P90: + nci_synonyms.append(synonym) + except AttributeError as err: + self.logger.debug(err) + return nci_synonyms + + def _get_definitions(self, ontology_term): + """ + Get definitions (if any exist) of the given term as specified using either the skos:definition annotation + property or the IAO_0000115 ('definition') property + :param ontology_term: Ontology term to collect definitions of + :return: Set of term definition strings + """ + definitions = set() + for definition in self._get_skos_definition(ontology_term): + definitions.add(definition) + for definition in self._get_iao_definition(ontology_term): + definitions.add(definition) + return definitions + + def _get_iao_definition(self, ontology_term): + definition = "" + try: + definition = ontology_term.IAO_0000115 + except AttributeError as err: + self.logger.debug(err) + return definition + + def _get_skos_definition(self, ontology_term): + definition = "" + try: + definition = ontology_term.definition + except AttributeError as err: + self.logger.debug(err) + return definition + + def _load_ontology(self, ontology_iri): + """ + Load the ontology at the specified IRI.
+ :param ontology_iri: IRI of the ontology (e.g., path of ontology document in the local file system, URL) + :return: Ontology document + """ + self.logger.info("Loading ontology %s...", ontology_iri) + start = time.time() + owl_link = bioregistry.get_owl_download(ontology_iri) + if owl_link is not None: + ontology_iri = owl_link + ontology = get_ontology(ontology_iri).load() + end = time.time() + self._log_ontology_metrics(ontology) + self.logger.info("...done (ontology loading time: %.2fs)", end - start) + return ontology + + def _classify_ontology(self, ontology): + """ + Perform reasoning over the given ontology (consistency checking and classification) + :param ontology: ontology instance + """ + self.logger.info("Reasoning over ontology...") + start = time.time() + with ontology: # entailments will be added to this ontology + sync_reasoner(infer_property_values=True) + end = time.time() + self.logger.info("...done (reasoning time: %.2fs)", end - start) + + def close(self): + # when multiple ontologies are loaded with owlready2, and they reference the same ontology term (IRI), a lookup + # for that IRI returns the term from the first ontology loaded -> need to unload previously loaded ontologies + try: + self.ontology.destroy() + except Exception as err: + self.logger.debug("Unable to destroy ontology: %s", err) + + def _log_ontology_metrics(self, ontology): + self.logger.debug(" Ontology IRI: %s", ontology.base_iri) + self.logger.debug(" Class count: %i", len(list(ontology.classes()))) + self.logger.debug(" Object property count: %i", len(list(ontology.object_properties()))) + self.logger.debug(" Data property count: %i", len(list(ontology.data_properties()))) + self.logger.debug(" Annotation property count: %i", len(list(ontology.annotation_properties()))) + +def filter_terms(onto_terms, iris=(), excl_deprecated=False, term_type=OntologyTermType.ANY): + filtered_onto_terms = {} + for base_iri, term in onto_terms.items(): + if isinstance(iris, str): + begins_with_iri = (iris == ()) or base_iri.startswith(iris) + else: + begins_with_iri = (iris == ()) or any(base_iri.startswith(iri) for iri in iris) + is_not_deprecated = (not excl_deprecated) or (not term.deprecated) + include = _filter_term_type(term, term_type, True) + if begins_with_iri and is_not_deprecated and include: + filtered_onto_terms.update({base_iri: term}) + return filtered_onto_terms + +def _filter_term_type(ontology_term, term_type, cached): + if term_type == OntologyTermType.CLASS: + if cached: + return ontology_term.term_type == OntologyTermType.CLASS + else: + return isinstance(ontology_term, ThingClass) + elif term_type == OntologyTermType.PROPERTY: + if cached: + return ontology_term.term_type == OntologyTermType.PROPERTY + else: + return isinstance(ontology_term, PropertyClass) + elif term_type == OntologyTermType.ANY: + return True + else: + raise ValueError("Invalid term-type option. Acceptable term types are: 'class', 'property' or 'any'") diff --git a/term_graph.py b/text2term/term_graph.py similarity index 96% rename from term_graph.py rename to text2term/term_graph.py index 29347be..b3f168d 100644 --- a/term_graph.py +++ b/text2term/term_graph.py @@ -1,7 +1,7 @@ -"""Provides OntologyTermGraph, Node and Edge classes""" +"""Provides TermGraph, Node and Edge classes""" -class OntologyTermGraph: +class TermGraph: """ Represents a graph of the neighborhood of an ontology term. The graph includes all (direct and indirect) superclasses and all direct subclasses.
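Note: the ontology loading and classification steps above lean on bioregistry and owlready2. The following is a minimal usage sketch of that flow, not part of the patch; the 'efo' prefix, the fallback URL and the print statement are illustrative assumptions.

import bioregistry
from owlready2 import get_ontology, sync_reasoner

owl_link = bioregistry.get_owl_download("efo")  # resolve a registry prefix to an OWL download URL, if one is known
ontology_iri = owl_link if owl_link is not None else "https://www.ebi.ac.uk/efo/efo.owl"
ontology = get_ontology(ontology_iri).load()  # accepts a URL or a local file path
with ontology:  # entailments computed by the reasoner are added to this ontology
    sync_reasoner(infer_property_values=True)  # bundled HermiT reasoner; needs a Java runtime
print("Classes:", len(list(ontology.classes())))

On large ontologies the reasoning step typically takes much longer than parsing, which the separate loading-time and reasoning-time logs above make visible.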
@@ -23,7 +23,7 @@ def nodes(self): def edges(self): return self._edges - def graph_dict(self): + def as_dict(self): graph = { "iri": self.term_iri, "nodes": self._nodes_dict(), diff --git a/text2term/term_graph_generator.py b/text2term/term_graph_generator.py new file mode 100644 index 0000000..c2a061b --- /dev/null +++ b/text2term/term_graph_generator.py @@ -0,0 +1,71 @@ +"""Provides TermGraphGenerator class""" + +from text2term import onto_utils +from text2term.term_graph import TermGraph, Node, Edge + + +class TermGraphGenerator: + + def __init__(self, terms): + self._terms = terms + self._logger = onto_utils.get_logger(__name__) + + def graph(self, term): + """ Build and return a graph representing the neighborhood of an ontology term. """ + nodes, edges = set(), set() + nodes.add(Node(term.iri, term.label)) + self._add_superclasses(term, nodes, edges) + self._add_subclasses(term, term.children, nodes, edges) + self._add_instances(term, term.instances, nodes, edges) + return TermGraph(term.iri, nodes, edges) + + def _add_superclasses(self, term, nodes, edges): + parents = term.parents + for parent_iri in parents: + self._add_node(parent_iri, parents[parent_iri], nodes) + edges.add(Edge(term.iri, parent_iri, Edge.IS_A)) + self._add_ancestors(parent_iri, nodes, edges) + + def _add_ancestors(self, node_iri, nodes, edges): + if node_iri in self._terms: + ancestors = self._terms[node_iri].parents + for ancestor_iri in ancestors: + self._add_node(ancestor_iri, ancestors[ancestor_iri], nodes) + edges.add(Edge(node_iri, ancestor_iri, Edge.IS_A)) + self._add_ancestors(ancestor_iri, nodes, edges) + else: + self._logger.debug("Unable to get ancestor term %s from the ontology term details dictionary " + "(possibly filtered out through the `base_iris` option)", node_iri) + + def _add_children(self, term, children, edge_type, nodes, edges): + for child_iri in children: + self._add_node(child_iri, children[child_iri], nodes) + edges.add(Edge(child_iri, term.iri, edge_type)) + + def _add_subclasses(self, term, subclasses, nodes, edges): + self._add_children(term, subclasses, Edge.IS_A, nodes, edges) + + def _add_instances(self, term, instances, nodes, edges): + self._add_children(term, instances, Edge.INSTANCE_OF, nodes, edges) + + def _add_node(self, term_iri, term_label, nodes): + if len(term_iri) > 0: + if isinstance(term_label, list) and len(term_label) > 0: + label = term_label[0] + elif isinstance(term_label, str): + label = term_label + else: + label = onto_utils.label_from_iri(term_iri) + if label is not None and len(label) > 0: + nodes.add(Node(term_iri, label)) + else: + self._logger.debug("Label is null or empty for term " + term_iri) + else: + self._logger.debug("The given term has no IRI") + + def graphs_dicts(self): + """Convenience function to get a list of all term graphs' dictionary representations""" + graph_dicts = [] + for term in self._terms.values(): + graph_dicts.append(self.graph(term).as_dict()) + return graph_dicts diff --git a/term_mapping.py b/text2term/term_mapping.py similarity index 52% rename from term_mapping.py rename to text2term/term_mapping.py index 0a9ace1..8da155c 100644 --- a/term_mapping.py +++ b/text2term/term_mapping.py @@ -1,21 +1,32 @@ """Provides TermMapping and TermMappingCollection classes""" import pandas as pd +from text2term import onto_utils class TermMapping: - - def __init__(self, source_term, mapped_term_label, mapped_term_iri, mapped_ontology_iri, mapping_score): + SRC_TERM = "Source Term" + SRC_TERM_ID = "Source Term ID" + TGT_TERM_LBL = "Mapped 
Term Label" + TGT_TERM_CURIE = "Mapped Term CURIE" + TGT_TERM_IRI = "Mapped Term IRI" + MAPPING_SCORE = "Mapping Score" + + def __init__(self, source_term, source_term_id, mapped_term_label, mapped_term_iri, mapping_score): self._source_term = source_term + self._source_term_id = source_term_id self._mapped_term_label = mapped_term_label self._mapped_term_iri = mapped_term_iri - self._mapped_ontology_iri = mapped_ontology_iri self._mapping_score = mapping_score @property def source_term(self): return self._source_term + @property + def source_term_id(self): + return self._source_term_id + @property def mapped_term_label(self): return self._mapped_term_label @@ -25,8 +36,10 @@ def mapped_term_iri(self): return self._mapped_term_iri @property - def mapped_ontology_iri(self): - return self._mapped_ontology_iri + def mapped_term_curie(self): + if self.mapped_term_iri == "": + return "" + return onto_utils.curie_from_iri(self.mapped_term_iri) @property def mapping_score(self): @@ -34,11 +47,12 @@ def mapping_score(self): def to_dict(self): return { - 'Source Term': self.source_term, - 'Mapped Term Label': self.mapped_term_label, - 'Mapped Term IRI': self.mapped_term_iri, - 'Mapped Ontology IRI': self.mapped_ontology_iri, - 'Mapping Score': self.mapping_score + self.SRC_TERM_ID: self.source_term_id, + self.SRC_TERM: self.source_term, + self.TGT_TERM_LBL: self.mapped_term_label, + self.TGT_TERM_CURIE: self.mapped_term_curie, + self.TGT_TERM_IRI: self.mapped_term_iri, + self.MAPPING_SCORE: self.mapping_score } def __eq__(self, other): @@ -47,8 +61,7 @@ def __eq__(self, other): return False def __str__(self): - return "Mapping: " + self.source_term + " -> " + self._mapped_term_label + \ - " (" + self.mapped_term_iri + ")" + return self.source_term + " -> " + self._mapped_term_label + " (" + self.mapped_term_iri + ")" class TermMappingCollection: diff --git a/tfidf_mapper.py b/text2term/tfidf_mapper.py similarity index 55% rename from tfidf_mapper.py rename to text2term/tfidf_mapper.py index c49dee3..f8e4f07 100644 --- a/tfidf_mapper.py +++ b/text2term/tfidf_mapper.py @@ -1,11 +1,10 @@ """Provides TFIDFMapper class""" import logging -import time -import onto_utils import sparse_dot_topn as ct from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer -from term_mapping import TermMapping, TermMappingCollection +from text2term import onto_utils +from text2term.term_mapping import TermMapping, TermMappingCollection class TFIDFMapper: @@ -15,25 +14,24 @@ def __init__(self, target_ontology_terms): :param target_ontology_terms: Collection of ontology terms to be mapped against """ self.logger = onto_utils.get_logger(__name__, logging.INFO) + self.target_ontology_terms = target_ontology_terms self.target_labels, self.target_terms = self._get_target_labels_terms(target_ontology_terms) - def map(self, source_terms, max_mappings=3, min_score=0.3): + def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3, ngram_length=3): """ Main mapping function. Default settings return only the top candidate for every source string. :param source_terms: List of source terms to be mapped with ontology terms + :param source_terms_ids: List of identifiers for the given source terms :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned :param min_score: The lower-bound threshold for keeping a candidate term mapping, between 0-1. 
Default set to 0, so consider all candidates + :param ngram_length: The gram length n for the string tokenizer """ - self.logger.info("Mapping %i source terms...", len(source_terms)) - start = time.time() - source_terms = onto_utils.normalize_list(source_terms) - vectorizer = self._tokenize(source_terms, self.target_labels) - results_mtx = self._sparse_dot_top(vectorizer, source_terms, self.target_labels, min_score) - results_df, term_graphs = self._get_mappings(results_mtx, max_mappings, source_terms, self.target_terms) - end = time.time() - self.logger.info('done (mapping time: %.2fs seconds)', end-start) - return results_df, term_graphs + source_terms_norm = onto_utils.normalize_list(source_terms) + vectorizer = self._tokenize(source_terms_norm, self.target_labels, n=ngram_length) + results_mtx = self._sparse_dot_top(vectorizer, source_terms_norm, self.target_labels, min_score) + results_df = self._get_mappings(results_mtx, max_mappings, source_terms, source_terms_ids, self.target_terms) + return results_df def _tokenize(self, source_terms, target_labels, analyzer='char_wb', n=3): """ @@ -47,42 +45,51 @@ def _tokenize(self, source_terms, target_labels, analyzer='char_wb', n=3): # Create count vectorizer and fit it on both lists to get vocabulary count_vectorizer = CountVectorizer(analyzer=analyzer, ngram_range=(n, n)) vocabulary = count_vectorizer.fit(source_terms + target_labels).vocabulary_ - # Create tf-idf vectorizer return TfidfVectorizer(vocabulary=vocabulary, analyzer=analyzer, ngram_range=(n, n)) def _sparse_dot_top(self, vectorizer, source_terms, target_labels, min_score): src_mtx = vectorizer.fit_transform(source_terms).tocsr() tgt_mtx = vectorizer.fit_transform(target_labels).transpose().tocsr() - return ct.awesome_cossim_topn(src_mtx, tgt_mtx, ntop=20, lower_bound=min_score) + # 'ntop' specifies the maximum number of labels/synonyms that should be considered + # multiple labels/synonyms in the 'ntop' matches may be from the same ontology term + return ct.awesome_cossim_topn(src_mtx, tgt_mtx, ntop=50, lower_bound=min_score) - def _get_mappings(self, results_mtx, max_mappings, source_terms, target_terms): + def _get_mappings(self, results_mtx, max_mappings, source_terms, source_terms_ids, target_terms): """ Build and return dataframe for mapping results along with term graphs for the obtained mappings """ coo_mtx = results_mtx.tocoo() - mapping_list = [] - mapping_graph_list = [] - last_source_string = "" - candidate_target_terms = set() + mappings = [] + last_source_term = "" + top_mappings = set() for row, col, score in zip(coo_mtx.row, coo_mtx.col, coo_mtx.data): source_term = source_terms[row] + source_term_id = source_terms_ids[row] onto_term = target_terms[col] - if source_term == last_source_string: - if len(candidate_target_terms) == max_mappings: + self.logger.debug("Source term: %s maps to %s (%f)", source_term, onto_term.label, score) + if source_term == last_source_term: + if len(top_mappings) == max_mappings: continue else: - last_source_string = source_term - candidate_target_terms.clear() - if onto_term.iri not in candidate_target_terms: - mapping = TermMapping(source_term, onto_term.label, onto_term.iri, onto_term.ontology_iri, score) - mapping_list.append(mapping) - mapping_graph_list.append(onto_term.graph().graph_dict()) - candidate_target_terms.add(onto_term.iri) - return TermMappingCollection(mapping_list).mappings_df(), mapping_graph_list + last_source_term = source_term + top_mappings.clear() + if onto_term.iri not in top_mappings: + 
mappings.append(TermMapping(source_term, source_term_id, onto_term.label, onto_term.iri, score)) + top_mappings.add(onto_term.iri) + return TermMappingCollection(mappings).mappings_df() def _get_target_labels_terms(self, ontology_terms): """Get lists of labels and terms to enable retrieving terms from their labels""" target_labels, target_terms = [], [] - for term in ontology_terms: + for term in ontology_terms.values(): for label in term.labels: - target_labels.append(label) - target_terms.append(term) + if not isinstance(label, str): + self.logger.debug(f"ontology term label {label} is not a string") + else: + target_labels.append(label) + target_terms.append(term) + for synonym in term.synonyms: + if not isinstance(synonym, str): + self.logger.debug(f"ontology term synonym {synonym} is not a string") + else: + target_labels.append(synonym) + target_terms.append(term) return target_labels, target_terms diff --git a/zooma_mapper.py b/text2term/zooma_mapper.py similarity index 55% rename from zooma_mapper.py rename to text2term/zooma_mapper.py index 196882c..8f72377 100644 --- a/zooma_mapper.py +++ b/text2term/zooma_mapper.py @@ -2,10 +2,9 @@ import json import logging -import time import requests -import onto_utils -from term_mapping import TermMapping, TermMappingCollection +from text2term import onto_utils +from text2term.term_mapping import TermMappingCollection, TermMapping class ZoomaMapper: @@ -14,20 +13,30 @@ def __init__(self): self.logger = onto_utils.get_logger(__name__, logging.INFO) self.url = "http://www.ebi.ac.uk/spot/zooma/v2/api/services/annotate" - def map(self, source_terms, ontologies, max_mappings=3): - self.logger.info("Mapping %i source terms against ontologies: %s", len(source_terms), ontologies) - start = time.time() + def map(self, source_terms, source_terms_ids, ontologies, max_mappings=3, api_params=()): + """ + Find and return ontology mappings through the Zooma Web service + :param source_terms: Collection of source terms to map to target ontologies + :param source_terms_ids: List of identifiers for the given source terms + :param ontologies: Comma-separated list of ontology acronyms (eg 'HP,EFO') or 'all' to search all ontologies + :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned + :param api_params: Additional Zooma API-specific parameters to include in the request + """ mappings = [] - for term in source_terms: - mappings.extend(self._map_term(term, ontologies, max_mappings)) - self.logger.info('done (mapping time: %.2fs seconds)', time.time()-start) + for term, term_id in zip(source_terms, source_terms_ids): + mappings.extend(self._map_term(term, term_id, ontologies, max_mappings, api_params)) return TermMappingCollection(mappings).mappings_df() - def _map_term(self, source_term, ontologies, max_mappings): + def _map_term(self, source_term, source_term_id, ontologies, max_mappings, api_params): + # see https://www.ebi.ac.uk/spot/zooma/docs/api for details of API parameters + # If 'required:[none]' is specified, Zooma will search the OLS without looking into the datasources. 
params = { - "propertyValue": source_term, - "ontologies": ontologies + "propertyValue": onto_utils.normalize(source_term), + "filter": "required:[none],ontologies:[" + ontologies + "]" } + if len(api_params) > 0: + params.update(api_params) + self.logger.debug("API parameters: " + str(params)) mappings = [] self.logger.debug("Searching for ontology terms to match: " + source_term) response = self._do_get_request(self.url, params=params) @@ -35,10 +44,10 @@ def _map_term(self, source_term, ontologies, max_mappings): self.logger.debug("...found " + str(len(response)) + " mappings") for mapping in response: if len(mappings) < max_mappings: - mappings.append(self._mapping_details(source_term, mapping)) + mappings.append(self._mapping_details(source_term, source_term_id, mapping)) return mappings - def _mapping_details(self, text, mapping_response): + def _mapping_details(self, source_term, source_term_id, mapping_response): # get ontology term label ann_class = mapping_response["annotatedProperty"] term_label = ann_class["propertyValue"] @@ -47,11 +56,8 @@ def _mapping_details(self, text, mapping_response): tags = mapping_response["semanticTags"] term_iri = tags[0] - ontology_iri = "" # TODO: Get Ontology IRI - - # get mapping confidence score mapping_score = self._mapping_score(mapping_response["confidence"]) - return TermMapping(text, term_label, term_iri, ontology_iri, mapping_score) + return TermMapping(source_term, source_term_id, term_label, term_iri, mapping_score) def _mapping_score(self, confidence): """Represent numerically the mapping confidence categories returned by Zooma (high, good, medium or low)"""
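Note: the reworked TermMapping carries a source term identifier and derives a CURIE from the mapped term IRI via onto_utils.curie_from_iri. A small usage sketch, not part of the patch; the term, identifier and IRI values are made up for illustration.

from text2term.term_mapping import TermMapping

mapping = TermMapping("asthma", "term-1", "asthma", "http://www.ebi.ac.uk/efo/EFO_0000270", 1.0)
row = mapping.to_dict()
# keys: 'Source Term ID', 'Source Term', 'Mapped Term Label', 'Mapped Term CURIE', 'Mapped Term IRI', 'Mapping Score'
print(row[TermMapping.TGT_TERM_CURIE])  # CURIE derived from the IRI, e.g. 'EFO:0000270'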
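Note: the TF-IDF mapper builds one character-n-gram vocabulary over source terms and target labels/synonyms, vectorizes both lists with it, and keeps the top-scoring cosine matches above min_score. A condensed sketch of that core, not part of the patch; the toy strings are illustrative, and a sparse_dot_topn release that exposes awesome_cossim_topn (as used in the patch) is assumed.

import sparse_dot_topn as ct
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

source_terms = ["asthma", "muscle cramp"]  # illustrative inputs
target_labels = ["asthma", "cramp", "bronchial disease"]
count_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(3, 3))
vocabulary = count_vectorizer.fit(source_terms + target_labels).vocabulary_  # shared n-gram vocabulary
vectorizer = TfidfVectorizer(vocabulary=vocabulary, analyzer='char_wb', ngram_range=(3, 3))
src_mtx = vectorizer.fit_transform(source_terms).tocsr()
tgt_mtx = vectorizer.fit_transform(target_labels).transpose().tocsr()
results_mtx = ct.awesome_cossim_topn(src_mtx, tgt_mtx, ntop=50, lower_bound=0.3)  # top candidates per source term
coo_mtx = results_mtx.tocoo()
for row, col, score in zip(coo_mtx.row, coo_mtx.col, coo_mtx.data):
    print(source_terms[row], "->", target_labels[col], round(score, 3))

Because several of the top-matching labels/synonyms can belong to the same ontology term, the mapper deduplicates candidates by term IRI before building the results dataframe.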
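Note: the Zooma mapper ultimately issues a GET request like the one below (the patch wraps it in a _do_get_request helper). A bare-bones sketch, not part of the patch; the property value and ontology filter are illustrative, and requests is called directly here.

import requests

url = "http://www.ebi.ac.uk/spot/zooma/v2/api/services/annotate"
params = {
    "propertyValue": "asthma",
    "filter": "required:[none],ontologies:[EFO,HP]"  # search the OLS directly, restricted to these ontologies
}
annotations = requests.get(url, params=params).json()
for annotation in annotations:
    label = annotation["annotatedProperty"]["propertyValue"]
    term_iri = annotation["semanticTags"][0]
    print(label, term_iri, annotation["confidence"])

Each annotation's textual confidence (high, good, medium or low) is what _mapping_score converts into a numeric mapping score.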