diff --git a/.github/workflows/indigo-ci.yaml b/.github/workflows/indigo-ci.yaml index 5229c3b8f3..1dbcc50372 100644 --- a/.github/workflows/indigo-ci.yaml +++ b/.github/workflows/indigo-ci.yaml @@ -643,7 +643,7 @@ jobs: run: pip install ${GITHUB_WORKSPACE}/dist/epam.indigo-*manylinux1_x86_64.whl working-directory: bingo/bingo-elastic/python - name: Install dev dependencies - run: pip install -r requirements_dev.txt + run: pip install ".[dev]" working-directory: bingo/bingo-elastic/python - name: Setup elasticsearch run: docker run -d -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" -e "indices.query.bool.max_clause_count=4096" docker.elastic.co/elasticsearch/elasticsearch:7.16.2 @@ -652,7 +652,7 @@ jobs: - name: Build wheel working-directory: bingo/bingo-elastic/python run: | - pip install . + pip install ".[async]" python setup.py bdist_wheel cp dist/*.whl ${GITHUB_WORKSPACE}/dist/ - name: Run pylint diff --git a/bingo/bingo-elastic/python/.pylintrc b/bingo/bingo-elastic/python/.pylintrc index 32fd57594f..8e662f33b8 100644 --- a/bingo/bingo-elastic/python/.pylintrc +++ b/bingo/bingo-elastic/python/.pylintrc @@ -6,3 +6,5 @@ disable= R0903, # too-few-public-methods E1136, # unsubscriptable-object R0801, # duplicate-code + W0621, # redefined-outer-name + W0613, # unused-argument diff --git a/bingo/bingo-elastic/python/README.md b/bingo/bingo-elastic/python/README.md index 9f14e775cf..f3409d01b2 100644 --- a/bingo/bingo-elastic/python/README.md +++ b/bingo/bingo-elastic/python/README.md @@ -1,4 +1,4 @@ -## Bingo API for using with Elasticsearch +## Bingo API for using with Elasticsearch **IN DEVELOPMENT** @@ -25,6 +25,15 @@ Install dependency using pip pip install bingo-elastic ``` +Install async version + +``` +pip install bingo-elastic[async] +``` + +bingo-elastic async version supports all the same methods to index and search +molecules as sync. To use async version, just instantiate `AsyncElasticRepository` + #### Elastisearch installation @@ -41,21 +50,51 @@ Something simple could be done as following: docker run -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" -e "indices.query.bool.max_clause_count=4096" docker.elastic.co/elasticsearch/elasticsearch:7.15.1 ``` -### Usage +### Usage #### Create ElasticRepository +Sync + ``` -repository = ElasticRepository(host="127.0.0.1", port=9200) +repository = ElasticRepository(IndexName.BINGO_MOLECULE, host="127.0.0.1", port=9200) ``` -Other customisations like SSL, custom number of shards/replicas, refresh interval, and many more are supported +Async + +``` +repository = AsyncElasticRepository(IndexName.BINGO_MOLECULE, host="127.0.0.1", port=9200) +... +repository.close() +``` + +Async version also supports async context manager to auto close connections: + +``` +async with AsyncElasticRepository(IndexName.BINGO_MOLECULE, host="127.0.0.1", port=9200) as rep: + ... +``` + +Using with FastAPI: + +``` +app = FastAPI() +rep = AsyncElasticRepository(IndexName.BINGO_MOLECULE, host="127.0.0.1", port=9200) + +# This gets called once the app is shutting down. +@app.on_event("shutdown") +async def app_shutdown(): + await rep.close() +``` + + +Other customisations like SSL, custom number of shards/replicas, refresh interval, and many more are supported by `ElasticRepository` and `AsyncElasticRepository` #### Read Indigo records from file IndigoRecord can be created from IndigoObject. - -Full usage example: + +Full usage example: ``` from bingo_elastic.model.record import IndigoRecord from indigo import Indigo @@ -75,8 +114,8 @@ cml = helpers.iterate_cml("compounds.cml") smi = helpers.iterate_smiles("compounds.smi") ``` -Also function `helpers.iterate_file(file: Path)` is available. This function -selects correct iterate function by file extension. The `file` argument must +Also function `helpers.iterate_file(file: Path)` is available. This function +selects correct iterate function by file extension. The `file` argument must be `pathlib.Path` instance ``` @@ -89,30 +128,58 @@ sdf = helpers.iterate_file(Path("compounds.sdf")) #### Index records into Elasticsearch -Full usage example: +Full usage example sync: ``` from bingo_elastic.model import helpers +from bingo_elastic.elastic import, ElasticRepository IndexName from pathlib import Path -repository = ElasticRepository(host="127.0.0.1", port=9200) +repository = ElasticRepository(IndexName.BINGO_MOLECULE, host="127.0.0.1", port=9200) sdf = helpers.iterate_file(Path("compounds.sdf")) -repository.index_records(sdf); +repository.index_records(sdf) +``` + +Full usage example async: + +_Async indexing and search requires event loop created_ + +``` +import asyncio +from bingo_elastic.model import helpers +from bingo_elastic.elastic import AsyncElasticRepository, IndexName +from pathlib import Path + +async def index_compounds(): + repository = AsyncElasticRepository(IndexName.BINGO_MOLECULE, host="127.0.0.1", port=9200) + sdf = helpers.iterate_file(Path("compounds.sdf")) + await repository.index_records(sdf) + +asyncio.run(index_compounds) + ``` *CAVEAT*: Elasticsearch doesn't have strict notion of commit, so records might appear in the index later on Read more about it here - https://www.elastic.co/guide/en/elasticsearch/reference/master/index-modules.html#index-refresh-interval-setting -For indexing one record the the method `ElasticRepository.index_record` can be used +For indexing one record the the method `ElasticRepository.index_record` can be used #### Retrieve similar records from Elasticsearch +Sync: ``` from bingo_elastic.predicates import SimilarityMatch alg = SimilarityMatch(target, 0.9) similar_records = repository.filter(similarity=alg, limit=20) ``` +Async: +``` +from bingo_elastic.predicates import SimilarityMatch +alg = SimilarityMatch(target, 0.9) +similar_records = await repository.filter(similarity=alg, limit=20) +``` + In this case we requested top-20 most similar molecules compared to `target` based on Tanimoto similarity metric Supported similarity algorithms: @@ -122,26 +189,38 @@ Supported similarity algorithms: #### Find exact records from Elasticsearch +Sync: ``` exact_records = repository.filter(exact=target, limit=20) ``` -In this case we requested top-20 candidate molecules with exact same fingerprint to `target`. -`target` should be an instance of `IndigoRecord` class. - +Async: +``` +exact_records = await repository.filter(exact=target, limit=20) +``` +In this case we requested top-20 candidate molecules with exact same fingerprint to `target`. +`target` should be an instance of `IndigoRecord` class. #### Subsctructure match of the records from Elasticsearch +Sync: ``` submatch_records = repository.filter(substructure=target) ``` +Async: +``` +submatch_records = await repository.filter(substructure=target) +``` + In this case we requested top-10 candidate molecules with exact same fingerprint to `target`. #### Custom fields for molecule records +_Async protocol exact same, just don't forget to `await`_ + Indexing records with custom fields ``` diff --git a/bingo/bingo-elastic/python/bingo_elastic/elastic.py b/bingo/bingo-elastic/python/bingo_elastic/elastic.py index c3c315023e..300f80ddc7 100644 --- a/bingo/bingo-elastic/python/bingo_elastic/elastic.py +++ b/bingo/bingo-elastic/python/bingo_elastic/elastic.py @@ -1,9 +1,27 @@ from enum import Enum -from typing import Any, Dict, Generator, List, Optional, Tuple, Union +from typing import ( + Any, + AsyncGenerator, + Dict, + Generator, + List, + Optional, + Tuple, + Type, + TypeVar, + Union, +) from elasticsearch import Elasticsearch from elasticsearch.exceptions import NotFoundError, RequestError from elasticsearch.helpers import streaming_bulk + +try: + from elasticsearch import AsyncElasticsearch + from elasticsearch.helpers import async_streaming_bulk +except ImportError: + pass + from indigo import Indigo # type: ignore from bingo_elastic.model.record import ( @@ -14,6 +32,8 @@ from bingo_elastic.queries import BaseMatch, query_factory from bingo_elastic.utils import PostprocessType +ElasticRepositoryT = TypeVar("ElasticRepositoryT") + class IndexName(Enum): BINGO_MOLECULE = "bingo-molecules" @@ -46,7 +66,104 @@ def elastic_repository_reaction(*args, **kwargs): return ElasticRepository(IndexName.BINGO_REACTION, *args, **kwargs) -class ElasticRepository: +def get_client( + *, + client_type: Type[ElasticRepositoryT], + host: Union[str, List[str]] = "localhost", + port: int = 9200, + scheme: str = "", + http_auth: Optional[Tuple[str]] = None, + ssl_context: Any = None, + request_timeout: int = 60, + retry_on_timeout: bool = True, +) -> ElasticRepositoryT: + arguments = { + "port": port, + "scheme": "https" if scheme == "https" else "http", + "request_timeout": request_timeout, + "retry_on_timeout": retry_on_timeout, + } + if isinstance(host, str): + arguments["host"] = host + else: + arguments["hosts"] = host + + if http_auth: + arguments["http_auth"] = http_auth + + if ssl_context: + arguments["ssl_context"] = ssl_context + + return client_type(**arguments) # type: ignore + + +index_body = { + "mappings": { + "properties": { + "sim_fingerprint": {"type": "keyword", "similarity": "boolean"}, + "sim_fingerprint_len": {"type": "integer"}, + "sub_fingerprint": {"type": "keyword", "similarity": "boolean"}, + "sub_fingerprint_len": {"type": "integer"}, + "cmf": {"type": "binary"}, + } + } +} + + +def check_index_exception(err_: RequestError) -> None: + if not isinstance(err_.info, dict): + raise err_ + cause = err_.info.get("error", {}).get("root_cause", []) + if ( + len(cause) == 1 + and cause[0].get("type") == "resource_already_exists_exception" + ): + return + raise err_ + + +def create_index(index_name: str, el_client: Elasticsearch) -> None: + try: + el_client.indices.create(index=index_name, body=index_body) + except RequestError as err_: + check_index_exception(err_) + + +async def a_create_index( + index_name: str, el_client: "AsyncElasticsearch" +) -> None: + try: + await el_client.indices.create(index=index_name, body=index_body) + except RequestError as err_: + check_index_exception(err_) + + +def prepare( + index_name: str, records: Generator[IndigoRecord, None, None] +) -> Generator[Dict, None, None]: + for record in records: + if get_index_name(record).value != index_name: + raise ValueError( + f"Index {index_name} doesn't support store value " + f"of type {type(record)}" + ) + yield record.as_dict() + + +def response_to_records( + res, index_name, postprocess_actions +) -> Generator[IndigoRecord, None, None]: + indigo_session = Indigo() + for el_response in res.get("hits", {}).get("hits", []): + record = get_record_by_index(el_response, index_name) + for action_fn in postprocess_actions: + record = action_fn(record, indigo_session) # type: ignore + if not record: + continue + yield record + + +class AsyncElasticRepository: def __init__( self, index_name: IndexName, @@ -69,37 +186,107 @@ def __init__( :param timeout: :param retry_on_timeout: """ - arguments = { - "port": port, - "scheme": "https" if scheme == "https" else "http", - "request_timeout": request_timeout, - "retry_on_timeout": retry_on_timeout, - } - if isinstance(host, str): - arguments["host"] = host - else: - arguments["hosts"] = host + self.index_name = index_name.value - if http_auth: - arguments["http_auth"] = http_auth + self.el_client = get_client( + client_type=AsyncElasticsearch, + host=host, + port=port, + scheme=scheme, + http_auth=http_auth, + ssl_context=ssl_context, + request_timeout=request_timeout, + retry_on_timeout=retry_on_timeout, + ) - if ssl_context: - arguments["ssl_context"] = ssl_context + async def index_record(self, record: IndigoRecord): + def gen(): + yield record - self.index_name = index_name.value + return await self.index_records(gen(), chunk_size=1) + + async def index_records(self, records: Generator, chunk_size: int = 500): + await a_create_index(self.index_name, self.el_client) + # pylint: disable=unused-variable + async for is_ok, action in async_streaming_bulk( + self.el_client, + prepare(self.index_name, records), + index=self.index_name, + chunk_size=chunk_size, + ): + pass + + async def filter( + self, + similarity: Union[BaseMatch] = None, + exact: IndigoRecord = None, + substructure: IndigoRecord = None, + limit=10, + **kwargs, + ) -> AsyncGenerator[IndigoRecord, None]: + + # actions needed to be called on elastic_search result + postprocess_actions: PostprocessType = [] + + query = compile_query( + similarity=similarity, + exact=exact, + substructure=substructure, + limit=limit, + postprocess_actions=postprocess_actions, + **kwargs, + ) + res = await self.el_client.search(index=self.index_name, body=query) + for record in response_to_records( + res, self.index_name, postprocess_actions + ): + yield record - self.el_client = Elasticsearch(**arguments) # type: ignore + async def close(self) -> None: + await self.el_client.close() - def __prepare( - self, records: Generator[IndigoRecord, None, None] - ) -> Generator[Dict, None, None]: - for record in records: - if get_index_name(record).value != self.index_name: - raise ValueError( - f"Index {self.index_name} doesn't support store value " - f"of type {type(record)}" - ) - yield record.as_dict() + async def __aenter__(self, *args, **kwargs) -> "AsyncElasticRepository": + return self + + async def __aexit__(self, *args, **kwargs) -> None: + await self.close() + + +class ElasticRepository: + def __init__( + self, + index_name: IndexName, + *, + host: Union[str, List[str]] = "localhost", + port: int = 9200, + scheme: str = "", + http_auth: Optional[Tuple[str]] = None, + ssl_context: Any = None, + request_timeout: int = 60, + retry_on_timeout: bool = True, + ) -> None: + """ + :param index_name: use function get_index_name for setting this argument + :param host: host or list of hosts + :param port: + :param scheme: http or https + :param http_auth: + :param ssl_context: + :param timeout: + :param retry_on_timeout: + """ + self.index_name = index_name.value + + self.el_client = get_client( + client_type=Elasticsearch, + host=host, + port=port, + scheme=scheme, + http_auth=http_auth, + ssl_context=ssl_context, + request_timeout=request_timeout, + retry_on_timeout=retry_on_timeout, + ) def index_record(self, record: IndigoRecord): def gen(): @@ -108,46 +295,16 @@ def gen(): return self.index_records(gen(), chunk_size=1) def index_records(self, records: Generator, chunk_size: int = 500): - self.create_index() + create_index(self.index_name, self.el_client) # pylint: disable=unused-variable for is_ok, action in streaming_bulk( self.el_client, - self.__prepare(records), + prepare(self.index_name, records), index=self.index_name, chunk_size=chunk_size, ): pass - def create_index(self) -> None: - body = { - "mappings": { - "properties": { - "sim_fingerprint": { - "type": "keyword", - "similarity": "boolean", - }, - "sim_fingerprint_len": {"type": "integer"}, - "sub_fingerprint": { - "type": "keyword", - "similarity": "boolean", - }, - "sub_fingerprint_len": {"type": "integer"}, - "cmf": {"type": "binary"}, - } - } - } - try: - self.el_client.indices.create(index=self.index_name, body=body) - except RequestError as err_: - assert isinstance(err_.info, dict) - cause = err_.info.get("error", {}).get("root_cause", []) - if ( - len(cause) == 1 - and cause[0].get("type") == "resource_already_exists_exception" - ): - return - raise err_ - def delete_all_records(self): try: self.el_client.indices.delete(index=self.index_name) @@ -166,7 +323,7 @@ def filter( # actions needed to be called on elastic_search result postprocess_actions: PostprocessType = [] - query = self.__compile( + query = compile_query( similarity=similarity, exact=exact, substructure=substructure, @@ -175,53 +332,48 @@ def filter( **kwargs, ) res = self.el_client.search(index=self.index_name, body=query) - indigo_session = Indigo() - for el_response in res.get("hits", {}).get("hits", []): - record = get_record_by_index(el_response, self.index_name) - for action_fn in postprocess_actions: - record = action_fn(record, indigo_session) # type: ignore - if not record: - continue - yield record + yield from response_to_records( + res, self.index_name, postprocess_actions + ) - # pylint: disable=no-self-use,too-many-arguments - def __compile( - self, - similarity: BaseMatch = None, - exact: IndigoRecord = None, - substructure: IndigoRecord = None, - limit: int = 10, - postprocess_actions: PostprocessType = None, - **kwargs, - ) -> Dict: - - query = { - "size": limit, - "_source": { - "includes": ["*"], - "excludes": [ - "sim_fingerprint", - "sim_fingerprint_len", - "sub_fingerprint_len", - "sub_fingerprint", - ], - }, - } - if similarity and substructure: - raise AttributeError( - "similarity and substructure search " "is not supported" - ) - if similarity: - similarity.compile(query, postprocess_actions) - elif exact: - query_factory("exact", exact).compile(query, postprocess_actions) - elif substructure: - query_factory("substructure", substructure).compile( - query, postprocess_actions - ) +# pylint: disable=too-many-arguments +def compile_query( + similarity: BaseMatch = None, + exact: IndigoRecord = None, + substructure: IndigoRecord = None, + limit: int = 10, + postprocess_actions: PostprocessType = None, + **kwargs, +) -> Dict: + + query = { + "size": limit, + "_source": { + "includes": ["*"], + "excludes": [ + "sim_fingerprint", + "sim_fingerprint_len", + "sub_fingerprint_len", + "sub_fingerprint", + ], + }, + } + if similarity and substructure: + raise AttributeError( + "similarity and substructure search " "is not supported" + ) + + if similarity: + similarity.compile(query, postprocess_actions) + elif exact: + query_factory("exact", exact).compile(query, postprocess_actions) + elif substructure: + query_factory("substructure", substructure).compile( + query, postprocess_actions + ) - for key, value in kwargs.items(): - query_factory(key, value).compile(query) + for key, value in kwargs.items(): + query_factory(key, value).compile(query) - return query + return query diff --git a/bingo/bingo-elastic/python/requirements_dev.txt b/bingo/bingo-elastic/python/requirements_dev.txt deleted file mode 100644 index 7d7bd416e8..0000000000 --- a/bingo/bingo-elastic/python/requirements_dev.txt +++ /dev/null @@ -1,4 +0,0 @@ -pylint -pytest -wheel -elasticsearch==7.16.2 diff --git a/bingo/bingo-elastic/python/setup.py b/bingo/bingo-elastic/python/setup.py index 39a0959a45..9ccaa6c447 100644 --- a/bingo/bingo-elastic/python/setup.py +++ b/bingo/bingo-elastic/python/setup.py @@ -34,10 +34,18 @@ }, download_url="https://pypi.org/project/bingo_elastic", python_requires=">=3.7", - packages=[ - "bingo_elastic", - "bingo_elastic.model", - ], + packages=["bingo_elastic", "bingo_elastic.model"], install_requires=["epam.indigo==1.7.0-beta", "elasticsearch==7.16.2"], + extras_require={ + "async": ["elasticsearch[async]==7.16.2"], + "dev": [ + "pylint", + "pytest", + "wheel", + "black", + "pytest-asyncio", + "mypy", + ], + }, classifiers=[_f for _f in CLASSIFIERS.split("\n") if _f], ) diff --git a/bingo/bingo-elastic/python/tests/conftest.py b/bingo/bingo-elastic/python/tests/conftest.py index d59deabdd1..058cb1dcd9 100644 --- a/bingo/bingo-elastic/python/tests/conftest.py +++ b/bingo/bingo-elastic/python/tests/conftest.py @@ -5,7 +5,11 @@ import pytest from indigo import Indigo # type: ignore -from bingo_elastic.elastic import ElasticRepository, IndexName +from bingo_elastic.elastic import ( + AsyncElasticRepository, + ElasticRepository, + IndexName, +) from bingo_elastic.model.helpers import iterate_file, load_reaction from bingo_elastic.model.record import IndigoRecordMolecule @@ -43,6 +47,26 @@ def elastic_repository_reaction() -> ElasticRepository: ) +@pytest.fixture +def a_elastic_repository_molecule() -> Callable[[], AsyncElasticRepository]: + def wraped(): + return AsyncElasticRepository( + IndexName.BINGO_MOLECULE, host="127.0.0.1", port=9200 + ) + + return wraped + + +@pytest.fixture +def a_elastic_repository_reaction() -> Callable[[], AsyncElasticRepository]: + def wraped(): + return AsyncElasticRepository( + IndexName.BINGO_REACTION, host="127.0.0.1", port=9200 + ) + + return wraped + + @pytest.fixture(autouse=True) def clear_index( elastic_repository_molecule: ElasticRepository, @@ -54,8 +78,7 @@ def clear_index( @pytest.fixture def loaded_sdf( - elastic_repository_molecule: ElasticRepository, - resource_loader, + elastic_repository_molecule: ElasticRepository, resource_loader ) -> IndigoRecordMolecule: resource = resource_loader("molecules/rand_queries_small.sdf") sdf = iterate_file(Path(resource)) diff --git a/bingo/bingo-elastic/python/tests/model/test_helpers.py b/bingo/bingo-elastic/python/tests/model/test_helpers.py index 4a1df7c8c4..85d663f4af 100644 --- a/bingo/bingo-elastic/python/tests/model/test_helpers.py +++ b/bingo/bingo-elastic/python/tests/model/test_helpers.py @@ -1,6 +1,6 @@ from pathlib import Path -import bingo_elastic.model.helpers as helpers +from bingo_elastic.model import helpers from bingo_elastic.model.record import ( IndigoRecordMolecule, IndigoRecordReaction, diff --git a/bingo/bingo-elastic/python/tests/model/test_record.py b/bingo/bingo-elastic/python/tests/model/test_record.py index 1fe222deab..6764e91238 100644 --- a/bingo/bingo-elastic/python/tests/model/test_record.py +++ b/bingo/bingo-elastic/python/tests/model/test_record.py @@ -58,12 +58,9 @@ def test_create_reaction( resource_loader("reactions/rheadb/58029.rxn") ) indigo_reaction = IndigoRecordReaction(indigo_object=reaction) - test_smiles = set( - [ - reactant.canonicalSmiles() - for reactant in reaction.iterateReactants() - ] - ) + test_smiles = { + reactant.canonicalSmiles() for reactant in reaction.iterateReactants() + } count_reactants = reaction.countReactants() count_products = reaction.countProducts() assert isinstance(indigo_reaction, IndigoRecordReaction) diff --git a/bingo/bingo-elastic/python/tests/test_elastic.py b/bingo/bingo-elastic/python/tests/test_elastic.py index c3aafc8f58..af5a05ed03 100644 --- a/bingo/bingo-elastic/python/tests/test_elastic.py +++ b/bingo/bingo-elastic/python/tests/test_elastic.py @@ -1,10 +1,15 @@ import time from pathlib import Path +from typing import Callable import pytest from indigo import Indigo # type: ignore -from bingo_elastic.elastic import ElasticRepository +from bingo_elastic.elastic import ( + AsyncElasticRepository, + ElasticRepository, + IndexName, +) from bingo_elastic.model.helpers import iterate_file from bingo_elastic.model.record import IndigoRecordMolecule, as_iob from bingo_elastic.queries import ( @@ -15,10 +20,11 @@ WildcardQuery, ) +AsyncRepositoryT = Callable[[], AsyncElasticRepository] + def test_create_index( - elastic_repository_molecule: ElasticRepository, - resource_loader, + elastic_repository_molecule: ElasticRepository, resource_loader ): sdf = iterate_file( Path(resource_loader("molecules/rand_queries_small.sdf")) @@ -26,6 +32,28 @@ def test_create_index( elastic_repository_molecule.index_records(sdf, chunk_size=10) +@pytest.mark.asyncio +async def test_a_create_index( + a_elastic_repository_molecule: AsyncRepositoryT, resource_loader +): + sdf = iterate_file( + Path(resource_loader("molecules/rand_queries_small.sdf")) + ) + async with a_elastic_repository_molecule() as rep: + await rep.index_records(sdf, chunk_size=10) + + +@pytest.mark.asyncio +async def test_a_cm_create_index(resource_loader): + sdf = iterate_file( + Path(resource_loader("molecules/rand_queries_small.sdf")) + ) + async with AsyncElasticRepository( + IndexName.BINGO_MOLECULE, host="127.0.0.1", port=9200 + ) as elastic_rep: + await elastic_rep.index_records(sdf, chunk_size=10) + + def test_similarity_matches( elastic_repository_molecule: ElasticRepository, indigo_fixture: Indigo, @@ -43,6 +71,29 @@ def test_similarity_matches( ) +@pytest.mark.asyncio +async def test_a_similarity_matches( + a_elastic_repository_molecule: AsyncRepositoryT, + indigo_fixture: Indigo, + loaded_sdf: IndigoRecordMolecule, +): + for sim_alg in [ + TanimotoSimilarityMatch(loaded_sdf, 0.9), + EuclidSimilarityMatch(loaded_sdf, 0.9), + TverskySimilarityMatch(loaded_sdf, 0.9, 0.5, 0.5), + ]: + async with a_elastic_repository_molecule() as rep: + result = rep.filter(similarity=sim_alg) + async for mol in result: + assert ( + loaded_sdf.as_indigo_object( + indigo_fixture + ).canonicalSmiles() + == mol.as_indigo_object(indigo_fixture).canonicalSmiles() + ) + break + + def test_exact_match( elastic_repository_molecule: ElasticRepository, indigo_fixture: Indigo, @@ -55,10 +106,28 @@ def test_exact_match( ) -def test_filter_by_name( - elastic_repository_molecule: ElasticRepository, +@pytest.mark.asyncio +async def test_a_exact_match( + a_elastic_repository_molecule: AsyncRepositoryT, indigo_fixture: Indigo, loaded_sdf: IndigoRecordMolecule, +): + async with a_elastic_repository_molecule() as rep: + result = rep.filter(exact=loaded_sdf) + async for mol in result: + assert ( + loaded_sdf.as_indigo_object(indigo_fixture).canonicalSmiles() + == mol.as_indigo_object(indigo_fixture).canonicalSmiles() + ) + break + + +@pytest.mark.asyncio +async def test_filter_by_name( + elastic_repository_molecule: ElasticRepository, + a_elastic_repository_molecule: AsyncRepositoryT, + indigo_fixture: Indigo, + loaded_sdf: IndigoRecordMolecule, # pylint: disable=unused-argument resource_loader, ): mol = indigo_fixture.loadMoleculeFromFile( @@ -68,10 +137,22 @@ def test_filter_by_name( IndigoRecordMolecule(indigo_object=mol) ) time.sleep(1) + + ################################################################## + + # Sync test result = elastic_repository_molecule.filter(name="Composition1") for item in result: assert item.name == "Composition1" + # Async test + async with a_elastic_repository_molecule() as rep: + async for item in rep.filter(name="Composition1"): + assert item.name == "Composition1" + + ################################################################## + + # Sync test result = elastic_repository_molecule.filter( similarity=TanimotoSimilarityMatch( IndigoRecordMolecule(indigo_object=mol), 0.1 @@ -83,6 +164,21 @@ def test_filter_by_name( i += 1 assert i == 10 + # Async test + async with a_elastic_repository_molecule() as rep: + i = 0 + async for _ in rep.filter( + similarity=TanimotoSimilarityMatch( + IndigoRecordMolecule(indigo_object=mol), 0.1 + ) + ): + i += 1 + assert i == 10 + + ################################################################## + + # Sync test + result = elastic_repository_molecule.filter( similarity=TanimotoSimilarityMatch( IndigoRecordMolecule(indigo_object=mol), 0.1 @@ -93,6 +189,16 @@ def test_filter_by_name( for item in result: assert item.name == "Composition1" + # Async test + async with a_elastic_repository_molecule() as rep: + async for item in rep.filter( + similarity=TanimotoSimilarityMatch( + IndigoRecordMolecule(indigo_object=mol), 0.1 + ), + name="Composition1", + ): + assert item.name == "Composition1" + def test_substructure_search( elastic_repository_molecule: ElasticRepository, @@ -107,6 +213,23 @@ def test_substructure_search( ) +@pytest.mark.asyncio +async def test_a_substructure_search( + a_elastic_repository_molecule: AsyncRepositoryT, + indigo_fixture: Indigo, + loaded_sdf: IndigoRecordMolecule, +): + async with a_elastic_repository_molecule() as rep: + result = rep.filter(substructure=loaded_sdf) + async for item in result: + assert ( + item.as_indigo_object(indigo_fixture).canonicalSmiles() + == loaded_sdf.as_indigo_object( + indigo_fixture + ).canonicalSmiles() + ) + + def test_range_search( elastic_repository_molecule: ElasticRepository, indigo_fixture: Indigo, @@ -124,6 +247,29 @@ def test_range_search( assert i == 10 +@pytest.mark.asyncio +async def test_a_range_search( + a_elastic_repository_molecule: AsyncRepositoryT, + indigo_fixture: Indigo, + resource_loader, +): + async with a_elastic_repository_molecule() as rep: + for i, item in enumerate( + iterate_file( + Path(resource_loader("molecules/rand_queries_small.sdf")) + ) + ): + item.ind_number = i # type: ignore + await rep.index_record(item) + + async with a_elastic_repository_molecule() as rep: + result = rep.filter(ind_number=RangeQuery(1, 10)) + i = 0 + async for _ in result: + i += 1 + assert i == 10 + + def test_wildcard_search( elastic_repository_molecule: ElasticRepository, indigo_fixture: Indigo, @@ -142,6 +288,24 @@ def test_wildcard_search( assert item.name == "Composition1" +@pytest.mark.asyncio +async def test_a_wildcard_search( + a_elastic_repository_molecule: AsyncRepositoryT, + indigo_fixture: Indigo, + loaded_sdf: IndigoRecordMolecule, + resource_loader, +): + mol = indigo_fixture.loadMoleculeFromFile( + resource_loader("molecules/composition1.mol") + ) + async with a_elastic_repository_molecule() as rep: + rep.index_record(IndigoRecordMolecule(indigo_object=mol)) + async with a_elastic_repository_molecule() as rep: + result = rep.filter(name=WildcardQuery("Comp*")) + async for item in result: + assert item.name == "Composition1" + + def test_custom_fields( elastic_repository_molecule: ElasticRepository, indigo_fixture: Indigo, @@ -161,7 +325,33 @@ def test_custom_fields( PUBCHEM_IUPAC_INCHIKEY="RDHQFKQIGNGIED-UHFFFAOYSA-N" ) for item in result: - assert item.PUBCHEM_IUPAC_INCHIKEY == "RDHQFKQIGNGIED-UHFFFAOYSA-N" # type: ignore + iupac_inch = item.PUBCHEM_IUPAC_INCHIKEY # type: ignore + assert iupac_inch == "RDHQFKQIGNGIED-UHFFFAOYSA-N" + + +@pytest.mark.asyncio +async def test_a_custom_fields( + a_elastic_repository_molecule: AsyncRepositoryT, + indigo_fixture: Indigo, + loaded_sdf: IndigoRecordMolecule, + resource_loader, +): + mol = indigo_fixture.loadMoleculeFromFile( + resource_loader("molecules/composition1.mol") + ) + rec = IndigoRecordMolecule( + indigo_object=mol, PUBCHEM_IUPAC_INCHIKEY="RDHQFKQIGNGIED-UHFFFAOYSA-N" + ) + async with a_elastic_repository_molecule() as rep: + rep.index_record(rec) + + async with a_elastic_repository_molecule() as rep: + result = rep.filter( + PUBCHEM_IUPAC_INCHIKEY="RDHQFKQIGNGIED-UHFFFAOYSA-N" + ) + async for item in result: + iupac_inch = item.PUBCHEM_IUPAC_INCHIKEY # type: ignore + assert iupac_inch == "RDHQFKQIGNGIED-UHFFFAOYSA-N" def test_search_empty_fingerprint( @@ -190,6 +380,35 @@ def test_search_empty_fingerprint( next(result).as_indigo_object(indigo_fixture).canonicalSmiles() +@pytest.mark.asyncio +async def test_a_search_empty_fingerprint( + a_elastic_repository_molecule: AsyncRepositoryT, + indigo_fixture: Indigo, + resource_loader, +): + async with a_elastic_repository_molecule() as rep: + for smile in ["[H][H]", "[H][F]"]: + rec = IndigoRecordMolecule( + indigo_object=indigo_fixture.loadMolecule(smile), + skip_errors=True, + ) + await rep.index_record(rec) + + async with a_elastic_repository_molecule() as rep: + result = rep.filter( + exact=IndigoRecordMolecule( + indigo_object=indigo_fixture.loadMolecule("[H][H]"), + skip_errors=True, + ) + ) + + async for mol in result: + assert ( + "[H][H]" + == mol.as_indigo_object(indigo_fixture).canonicalSmiles() + ) + + def test_similaririty_matches_reactions( elastic_repository_reaction: ElasticRepository, loaded_rxns, @@ -234,3 +453,49 @@ def test_similaririty_matches_reactions( as_iob(found_reaction, indigo_fixture).countReactants() == reaction.countReactants() ) + + +@pytest.mark.asyncio +async def test_a_similaririty_matches_reactions( + a_elastic_repository_reaction: AsyncRepositoryT, + loaded_rxns, + resource_loader, + indigo_fixture, +) -> None: + + reaction = indigo_fixture.loadReactionFromFile( + resource_loader("reactions/rheadb/50353.rxn") + ) + + reaction_rec = IndigoRecordMolecule(indigo_object=reaction) + + async with a_elastic_repository_reaction() as rep: + async for found_reaction in rep.filter( + similarity=TanimotoSimilarityMatch(reaction_rec, 0.99) + ): + assert ( + as_iob(found_reaction, indigo_fixture).countReactants() + == reaction.countReactants() + ) + + async for found_reaction in rep.filter( + similarity=EuclidSimilarityMatch(reaction_rec, 0.99) + ): + assert ( + as_iob(found_reaction, indigo_fixture).countReactants() + == reaction.countReactants() + ) + + async for found_reaction in rep.filter( + similarity=TverskySimilarityMatch(reaction_rec, 0.99) + ): + assert ( + as_iob(found_reaction, indigo_fixture).countReactants() + == reaction.countReactants() + ) + + async for found_reaction in rep.filter(exact=reaction_rec): + assert ( + as_iob(found_reaction, indigo_fixture).countReactants() + == reaction.countReactants() + )