From 143bf7085b7f2064403f5a3c793c192077f10a4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Thu, 6 Jul 2023 07:56:59 +0000 Subject: [PATCH 01/52] example changed --- src/connectors/__init__.py | 3 + src/connectors/abstract/resource_connector.py | 10 ++- src/connectors/example/example_connector.py | 18 ++++- .../example/example_dataset_connector.py | 65 ++++++++++++++++++- .../example/resources/publications.json | 2 +- .../huggingface_dataset_connector.py | 8 ++- .../openml/openml_dataset_connector.py | 2 + .../zenodo/zenodo_dataset_connector.py | 2 + src/database/setup.py | 2 +- .../example/test_example_connector.py | 25 ++++++- .../test_huggingface_dataset_connector.py | 4 ++ .../openml/test_openml_dataset_connector.py | 8 ++- .../zenodo/test_get_datasets_zenodo.py | 2 + 13 files changed, 136 insertions(+), 15 deletions(-) diff --git a/src/connectors/__init__.py b/src/connectors/__init__.py index 795f243b..f4dddbdc 100644 --- a/src/connectors/__init__.py +++ b/src/connectors/__init__.py @@ -13,6 +13,8 @@ from .abstract.resource_connector import ResourceConnector # noqa:F401 from .example.example_connector import ExampleConnector from .example.example_dataset_connector import ExampleDatasetConnector + +""" from .huggingface.huggingface_dataset_connector import HuggingFaceDatasetConnector from .openml.openml_dataset_connector import OpenMlDatasetConnector from .zenodo.zenodo_dataset_connector import ZenodoDatasetConnector @@ -25,6 +27,7 @@ ZenodoDatasetConnector(), ) } +""" _path_example_resources = pathlib.Path(__file__).parent / "example" / "resources" diff --git a/src/connectors/abstract/resource_connector.py b/src/connectors/abstract/resource_connector.py index ff096b3d..706d026a 100644 --- a/src/connectors/abstract/resource_connector.py +++ b/src/connectors/abstract/resource_connector.py @@ -1,4 +1,5 @@ import abc +from datetime import datetime from typing import Generic, TypeVar, Iterator from sqlmodel import SQLModel @@ -27,8 +28,13 @@ def platform_name(self) -> PlatformName: pass @abc.abstractmethod - def fetch_all( - self, limit: int | None = None + def retry(self, id: str) -> SQLModel | ResourceWithRelations[SQLModel]: + """Retrieve information of the resource identified by id""" + pass + + @abc.abstractmethod + def fetch( + self, from_incl: datetime | None = None, to_excl: datetime | None = None ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel]]: """Retrieve information of all resources""" pass diff --git a/src/connectors/example/example_connector.py b/src/connectors/example/example_connector.py index 31cb4309..956de247 100644 --- a/src/connectors/example/example_connector.py +++ b/src/connectors/example/example_connector.py @@ -1,3 +1,4 @@ +from datetime import datetime import json import pathlib from typing import Iterator, TypeVar @@ -29,9 +30,22 @@ def resource_class(self) -> type[RESOURCE]: def platform_name(self) -> PlatformName: return PlatformName.example - def fetch_all(self, limit: int | None = None) -> Iterator[RESOURCE]: + def retry(self, id: str) -> RESOURCE: + """Retrieve information of the resource identified by id""" with open(self.json_path) as f: json_data = json.load(f) pydantic_class = resource_create(self.resource_class) - for json_item in json_data[:limit]: + for json_item in json_data: + if json_item.get("platform_identifier") == id: + return pydantic_class(**json_item) + raise Exception("No resource associated with the id") + return + + def fetch( + self, from_incl: datetime | None = None, to_excl: 
datetime | None = None + ) -> Iterator[RESOURCE]: + with open(self.json_path) as f: + json_data = json.load(f) + pydantic_class = resource_create(self.resource_class) + for json_item in json_data: yield pydantic_class(**json_item) diff --git a/src/connectors/example/example_dataset_connector.py b/src/connectors/example/example_dataset_connector.py index d126a3cd..c1f7cafe 100644 --- a/src/connectors/example/example_dataset_connector.py +++ b/src/connectors/example/example_dataset_connector.py @@ -1,3 +1,4 @@ +from datetime import datetime import typing # noqa:F401 (flake8 raises incorrect 'Module imported but unused' error) from connectors import ResourceConnector @@ -17,8 +18,66 @@ def resource_class(self) -> type[Dataset]: def platform_name(self) -> PlatformName: return PlatformName.example - def fetch_all( - self, limit: int | None = None + def retry(self, id: str) -> ResourceWithRelations[Dataset]: + """Retrieve information of the resource identified by id""" + pydantic_class = resource_create(Dataset) + pydantic_class_publication = resource_create(Publication) + datasets = [ + ResourceWithRelations[Dataset]( + resource=pydantic_class( + name="Higgs", + platform="openml", + description="Higgs dataset", + same_as="non-existing-url/1", + platform_identifier="42769", + alternate_names=[], + citations=[], + distributions=[], + is_part=[], + has_parts=[], + keywords=["keyword1", "keyword2"], + measured_values=[], + ), + related_resources={ + "citations": [ + pydantic_class_publication( + title=( + "Searching for exotic particles in high-energy physics with deep " + "learning" + ), + doi="2", + platform="example", + platform_identifier="2", + datasets=[], + ) + ] + }, + ), + ResourceWithRelations[Dataset]( + resource=pydantic_class( + name="porto-seguro", + platform="openml", + description="Porto seguro dataset", + same_as="non-existing-url/2", + platform_identifier="42742", + alternate_names=[], + citations=[], + distributions=[], + is_part=[], + has_parts=[], + keywords=[], + measured_values=[], + ) + ), + ] + for dataset in datasets: + if dataset.resource.platform_identifier == id: + return dataset + raise Exception("No resource associated with the id") + return None + + def fetch( + self, from_incl: datetime | None = None, to_excl: datetime | None = None ) -> typing.Iterator[ResourceWithRelations[Dataset]]: pydantic_class = resource_create(Dataset) pydantic_class_publication = resource_create(Publication) @@ -69,4 +128,4 @@ def fetch_all( measured_values=[], ) ), - ][:limit] + ] diff --git a/src/connectors/example/resources/publications.json b/src/connectors/example/resources/publications.json index 427d19d2..b3e1501f 100644 --- a/src/connectors/example/resources/publications.json +++ b/src/connectors/example/resources/publications.json @@ -1,7 +1,7 @@ [ { "platform": "example", - "platform_identifier": 1, + "platform_identifier": "1", "title": "The Art of Fiction", "doi": "10.1234/567890", "creators": "Jane Smith", diff --git a/src/connectors/huggingface/huggingface_dataset_connector.py b/src/connectors/huggingface/huggingface_dataset_connector.py index dec85919..5eb05036 100644 --- a/src/connectors/huggingface/huggingface_dataset_connector.py +++ b/src/connectors/huggingface/huggingface_dataset_connector.py @@ -1,3 +1,4 @@ +""" import itertools import logging import typing @@ -27,9 +28,11 @@ def platform_name(self) -> PlatformName: @staticmethod def _get(url: str, dataset_id: str) -> typing.List[typing.Dict[str, typing.Any]]: - """ +""" +""" Perform a GET request and raise an exception 
if the response code is not OK. - """ +""" +""" response = requests.get(url, params={"dataset": dataset_id}) response_json = response.json() if not response.ok: @@ -130,3 +133,4 @@ def fetch_all( logging.error( f"Error while fetching huggingface dataset with id {dataset.id}: " f"{str(e)}" ) +""" diff --git a/src/connectors/openml/openml_dataset_connector.py b/src/connectors/openml/openml_dataset_connector.py index 707aeb8b..fd96752a 100644 --- a/src/connectors/openml/openml_dataset_connector.py +++ b/src/connectors/openml/openml_dataset_connector.py @@ -2,6 +2,7 @@ This module knows how to load an OpenML object based on its AIoD implementation, and how to convert the OpenML response to some agreed AIoD format. """ +""" import logging from typing import Iterator @@ -107,3 +108,4 @@ def _as_int(v: str) -> int: if not as_float.is_integer(): raise ValueError(f"The input should be an integer, but was a float: {v}") return int(as_float) +""" diff --git a/src/connectors/zenodo/zenodo_dataset_connector.py b/src/connectors/zenodo/zenodo_dataset_connector.py index 84abdefd..c0d3e070 100644 --- a/src/connectors/zenodo/zenodo_dataset_connector.py +++ b/src/connectors/zenodo/zenodo_dataset_connector.py @@ -1,3 +1,4 @@ +""" from datetime import datetime import logging from typing import Iterator @@ -156,3 +157,4 @@ def fetch_all(self, limit: int | None = None) -> Iterator[Dataset]: sickle = Sickle("https://zenodo.org/oai2d") date = datetime(2000, 1, 1, 12, 0, 0) # this should be a paramater return self._retrieve_dataset_from_datetime(sickle, date, limit) +""" diff --git a/src/database/setup.py b/src/database/setup.py index 5d54445a..4a8a7441 100644 --- a/src/database/setup.py +++ b/src/database/setup.py @@ -86,7 +86,7 @@ def populate_database( # This is a temporary solution. After finishing the Connectors (so that they're # synchronizing), we will probably just perform a HTTP POST instead. 
- for item in connector.fetch_all(limit=limit): + for item in connector.fetch(): if isinstance(item, ResourceWithRelations): resource_create_instance = item.resource _create_or_fetch_related_objects(session, item) diff --git a/src/tests/connectors/example/test_example_connector.py b/src/tests/connectors/example/test_example_connector.py index 2434efcd..0a6f792c 100644 --- a/src/tests/connectors/example/test_example_connector.py +++ b/src/tests/connectors/example/test_example_connector.py @@ -17,10 +17,31 @@ "organisations", ], ) -def test_fetch_all_happy_path(datatype: str): +def test_fetch_happy_path(datatype: str): connector = connectors.example_connectors[datatype] - resources = list(connector.fetch_all(limit=None)) + resources = list(connector.fetch()) assert len(resources) >= 1 resource = resources[0] if hasattr(resource, "keywords"): # otherwise, only tested that connector can run assert set(resource.keywords) == {"keyword1", "keyword2"} + + +@pytest.mark.parametrize( + "datatype", + [ + "case_studies", + "computational_resources", + "educational_resources", + "events", + "presentations", + "projects", + "publications", + "news", + "organisations", + ], +) +def test_retry_happy_path(datatype: str): + connector = connectors.example_connectors[datatype] + resource = connector.retry("1") + if hasattr(resource, "keywords"): # otherwise, only tested that connector can run + assert set(resource.keywords) == {"keyword1", "keyword2"} diff --git a/src/tests/connectors/huggingface/test_huggingface_dataset_connector.py b/src/tests/connectors/huggingface/test_huggingface_dataset_connector.py index 6a943150..5cb75261 100644 --- a/src/tests/connectors/huggingface/test_huggingface_dataset_connector.py +++ b/src/tests/connectors/huggingface/test_huggingface_dataset_connector.py @@ -1,7 +1,9 @@ +""" import json import responses + import connectors from connectors.resource_with_relations import ResourceWithRelations from database.model.platform.platform_names import PlatformName @@ -44,6 +46,7 @@ def test_fetch_all_happy_path(): assert all(len(r.related_resources["citations"]) == 1 for r in resources_with_relations) + def mock_parquet(mocked_requests: responses.RequestsMock, dataset_id: str): filename = f"parquet_{dataset_id.replace('/', '_')}.json" path_split = path_test_resources() / "connectors" / "huggingface" / filename @@ -56,3 +59,4 @@ def mock_parquet(mocked_requests: responses.RequestsMock, dataset_id: str): json=response, status=status, ) +""" diff --git a/src/tests/connectors/openml/test_openml_dataset_connector.py b/src/tests/connectors/openml/test_openml_dataset_connector.py index 3f49fae2..20cbdfc1 100644 --- a/src/tests/connectors/openml/test_openml_dataset_connector.py +++ b/src/tests/connectors/openml/test_openml_dataset_connector.py @@ -1,3 +1,4 @@ +""" import json import responses @@ -66,9 +67,11 @@ def test_fetch_all_happy_path(): def mock_openml_responses(mocked_requests: responses.RequestsMock, platform_identifier: str): - """ +""" +""" Mocking requests to the OpenML dependency, so that we test only our own services - """ +""" +""" with open( path_test_resources() / "connectors" / "openml" / f"data_{platform_identifier}.json", "r", @@ -94,3 +97,4 @@ def mock_openml_responses(mocked_requests: responses.RequestsMock, platform_iden json=data_qualities_response, status=200, ) +""" diff --git a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py index 416b71b8..06274887 100644 --- 
a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py +++ b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py @@ -1,3 +1,4 @@ +""" from datetime import datetime import responses import connectors @@ -51,3 +52,4 @@ def mock_zenodo_responses(mocked_requests: responses.RequestsMock): body=records_list, status=200, ) +""" From d698a6acf7c5ea8ba28781cfe6dd31fff41ee63a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Thu, 6 Jul 2023 12:23:26 +0000 Subject: [PATCH 02/52] connector by date --- src/connectors/__init__.py | 4 ++-- .../{resource_connector.py => resource_connector_by_date.py} | 2 +- src/connectors/example/example_connector.py | 5 +++-- src/connectors/example/example_dataset_connector.py | 5 +++-- src/database/setup.py | 4 ++-- 5 files changed, 11 insertions(+), 9 deletions(-) rename src/connectors/abstract/{resource_connector.py => resource_connector_by_date.py} (94%) diff --git a/src/connectors/__init__.py b/src/connectors/__init__.py index f4dddbdc..abab5e91 100644 --- a/src/connectors/__init__.py +++ b/src/connectors/__init__.py @@ -10,7 +10,7 @@ from database.model.project.project import Project from database.model.publication.publication import Publication from database.model.organisation.organisation import Organisation -from .abstract.resource_connector import ResourceConnector # noqa:F401 +from .abstract.resource_connector_by_date import ResourceConnectorByDate # noqa:F401 from .example.example_connector import ExampleConnector from .example.example_dataset_connector import ExampleDatasetConnector @@ -60,4 +60,4 @@ "organisations": ExampleConnector( resource_class=Organisation, json_path=_path_example_resources / "organisations.json" ), -} # type: Dict[str, ResourceConnector] +} # type: Dict[str, ResourceConnectorByDate] diff --git a/src/connectors/abstract/resource_connector.py b/src/connectors/abstract/resource_connector_by_date.py similarity index 94% rename from src/connectors/abstract/resource_connector.py rename to src/connectors/abstract/resource_connector_by_date.py index 706d026a..1773c1ec 100644 --- a/src/connectors/abstract/resource_connector.py +++ b/src/connectors/abstract/resource_connector_by_date.py @@ -11,7 +11,7 @@ RESOURCE = TypeVar("RESOURCE", bound=SQLModel) -class ResourceConnector(abc.ABC, Generic[RESOURCE]): +class ResourceConnectorByDate(abc.ABC, Generic[RESOURCE]): """ For every platform that offers this resource, this ResourceConnector should be implemented. 
""" diff --git a/src/connectors/example/example_connector.py b/src/connectors/example/example_connector.py index 956de247..64f65238 100644 --- a/src/connectors/example/example_connector.py +++ b/src/connectors/example/example_connector.py @@ -4,8 +4,9 @@ from typing import Iterator, TypeVar from sqlmodel import SQLModel +from connectors.abstract.resource_connector_by_date import ResourceConnectorByDate + -from connectors import ResourceConnector from database.model.resource import resource_create from database.model.platform.platform_names import PlatformName @@ -13,7 +14,7 @@ RESOURCE = TypeVar("RESOURCE", bound=SQLModel) -class ExampleConnector(ResourceConnector[RESOURCE]): +class ExampleConnector(ResourceConnectorByDate[RESOURCE]): """ Creating hardcoded values example values based on json files """ diff --git a/src/connectors/example/example_dataset_connector.py b/src/connectors/example/example_dataset_connector.py index c1f7cafe..07fda2bf 100644 --- a/src/connectors/example/example_dataset_connector.py +++ b/src/connectors/example/example_dataset_connector.py @@ -1,7 +1,8 @@ from datetime import datetime import typing # noqa:F401 (flake8 raises incorrect 'Module imported but unused' error) -from connectors import ResourceConnector + +from connectors.abstract.resource_connector_by_date import ResourceConnectorByDate from connectors.resource_with_relations import ResourceWithRelations from database.model.dataset.dataset import Dataset from database.model.publication.publication import Publication @@ -9,7 +10,7 @@ from database.model.platform.platform_names import PlatformName -class ExampleDatasetConnector(ResourceConnector[Dataset]): +class ExampleDatasetConnector(ResourceConnectorByDate[Dataset]): @property def resource_class(self) -> type[Dataset]: return Dataset diff --git a/src/database/setup.py b/src/database/setup.py index 4a8a7441..0d052090 100644 --- a/src/database/setup.py +++ b/src/database/setup.py @@ -10,7 +10,7 @@ from sqlmodel import create_engine, Session, select, SQLModel import routers -from connectors import ResourceConnector +from connectors.abstract.resource_connector_by_date import ResourceConnectorByDate from connectors.resource_with_relations import ResourceWithRelations from database.model.dataset.dataset import Dataset from database.model.platform.platform import Platform @@ -62,7 +62,7 @@ def drop_or_create_database(url: str, delete_first: bool): def populate_database( engine: Engine, - connectors: List[ResourceConnector], + connectors: List[ResourceConnectorByDate], only_if_empty: bool = True, limit: int | None = None, ): From 5128a38adab95298471eb9339aa3ec91e26372cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 10 Jul 2023 10:46:24 +0000 Subject: [PATCH 03/52] zenodo conector --- src/connectors/__init__.py | 12 +- .../zenodo/zenodo_dataset_connector.py | 47 +++++-- .../zenodo/test_get_datasets_zenodo.py | 46 ++++++- .../resources/connectors/zenodo/dataset.json | 129 ++++++++++++++++++ src/uploader/hugging_face_uploader.py | 6 +- 5 files changed, 212 insertions(+), 28 deletions(-) create mode 100644 src/tests/resources/connectors/zenodo/dataset.json diff --git a/src/connectors/__init__.py b/src/connectors/__init__.py index abab5e91..2d350787 100644 --- a/src/connectors/__init__.py +++ b/src/connectors/__init__.py @@ -14,20 +14,20 @@ from .example.example_connector import ExampleConnector from .example.example_dataset_connector import ExampleDatasetConnector -""" -from 
.huggingface.huggingface_dataset_connector import HuggingFaceDatasetConnector -from .openml.openml_dataset_connector import OpenMlDatasetConnector + +# from .huggingface.huggingface_dataset_connector import HuggingFaceDatasetConnector +# from .openml.openml_dataset_connector import OpenMlDatasetConnector from .zenodo.zenodo_dataset_connector import ZenodoDatasetConnector dataset_connectors = { c.platform_name: c for c in ( - OpenMlDatasetConnector(), - HuggingFaceDatasetConnector(), + # OpenMlDatasetConnector(), + # HuggingFaceDatasetConnector(), ZenodoDatasetConnector(), ) } -""" + _path_example_resources = pathlib.Path(__file__).parent / "example" / "resources" diff --git a/src/connectors/zenodo/zenodo_dataset_connector.py b/src/connectors/zenodo/zenodo_dataset_connector.py index c0d3e070..8f9d2435 100644 --- a/src/connectors/zenodo/zenodo_dataset_connector.py +++ b/src/connectors/zenodo/zenodo_dataset_connector.py @@ -1,11 +1,11 @@ -""" from datetime import datetime import logging from typing import Iterator +import requests from sickle import Sickle import xmltodict -from connectors import ResourceConnector +from connectors.abstract.resource_connector_by_date import ResourceConnectorByDate from database.model.dataset.dataset import Dataset from database.model.general.keyword import Keyword from database.model.general.license import License @@ -14,7 +14,7 @@ DATE_FORMAT = "%Y-%m-%d" -class ZenodoDatasetConnector(ResourceConnector[Dataset]): +class ZenodoDatasetConnector(ResourceConnectorByDate[Dataset]): @property def resource_class(self) -> type[Dataset]: return Dataset @@ -23,6 +23,24 @@ def resource_class(self) -> type[Dataset]: def platform_name(self) -> PlatformName: return PlatformName.zenodo + def retry(self, id: str) -> Dataset: + """Retrieve information of the resource identified by id""" + record = requests.get(f"https://zenodo.org/api/records/{id}").json() + + creators_list = [item["name"] for item in record["metadata"]["creators"]] + creator = "; ".join(creators_list) # TODO change field to an array + return Dataset( + platform="zenodo", + platform_identifier=id, + date_published=record.get("created"), + name=record.get("metadata").get("title"), + description=record.get("metadata").get("description"), + creator=creator, + publisher="Zenodo", + license=License(name=record.get("metadata").get("license").get("id")), + keywords=[Keyword(name=k) for k in record.get("metadata").get("keywords")], + ) + def _get_record_dictionary(self, record): xml_string = record.raw xml_dict = xmltodict.parse(xml_string) @@ -132,29 +150,28 @@ def _get_resource_type(self, record): return None def _retrieve_dataset_from_datetime( - self, - sk: Sickle, - dt: datetime, - limit: int | None = None, + self, sk: Sickle, from_incl: datetime, to_excl: datetime | None = None ) -> Iterator[Dataset]: records = sk.ListRecords( **{ "metadataPrefix": "oai_datacite", - "from": dt.isoformat(), + "from": from_incl.isoformat(), } ) - counter = 0 + record = next(records, None) - while record and (limit is None or counter < limit): + last_date = None + while record and (to_excl is None or (last_date is not None and last_date < to_excl)): if self._get_resource_type(record) == "Dataset": dataset = self._dataset_from_record(record) if dataset is not None: - counter += 1 + last_date = dataset.date_published yield dataset record = next(records, None) - def fetch_all(self, limit: int | None = None) -> Iterator[Dataset]: + def fetch( + self, from_incl: datetime | None = None, to_excl: datetime | None = None + ) -> 
Iterator[Dataset]: sickle = Sickle("https://zenodo.org/oai2d") - date = datetime(2000, 1, 1, 12, 0, 0) # this should be a paramater - return self._retrieve_dataset_from_datetime(sickle, date, limit) -""" + date = from_incl if from_incl is not None else datetime(2000, 1, 1, 12, 0, 0) + return self._retrieve_dataset_from_datetime(sickle, date, to_excl=to_excl) diff --git a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py index 06274887..15c7ce0f 100644 --- a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py +++ b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py @@ -1,5 +1,4 @@ -""" -from datetime import datetime +import datetime import responses import connectors from database.model.platform.platform_names import PlatformName @@ -12,11 +11,11 @@ def read_file(path): return content -def test_fetch_all_happy_path(): +def test_fetch_happy_path(): connector = connectors.dataset_connectors[PlatformName.zenodo] with responses.RequestsMock() as mocked_requests: mock_zenodo_responses(mocked_requests) - datasets = list(connector.fetch_all()) + datasets = list(connector.fetch()) assert len(datasets) == 1 dataset = datasets[0] assert dataset.name == "THE FIELD'S MALL MASS SHOOTING: EMERGENCY MEDICAL SERVICES RESPONSE" @@ -25,7 +24,7 @@ def test_fetch_all_happy_path(): dataset.creator == "Hansen, Peter Martin; Alstrøm, henrik; Damm-Hejmdal, Anders; Mikkelsen, Søren" ) - assert dataset.date_published == datetime(2023, 5, 6) + assert dataset.date_published == datetime.datetime(2023, 5, 6) assert dataset.license.name == "https://creativecommons.org/licenses/by/4.0/legalcode" assert dataset.platform == "zenodo" assert dataset.platform_identifier == "zenodo.org:7961614" @@ -40,6 +39,42 @@ def test_fetch_all_happy_path(): } +def test_retry_happy_path(): + connector = connectors.dataset_connectors[PlatformName.zenodo] + with responses.RequestsMock() as mocked_requests: + with open(path_test_resources() / "connectors" / "zenodo" / "dataset.json", "r") as f: + dataset = f.read() + mocked_requests.add( + responses.GET, + "https://zenodo.org/api/records/7902672", # noqa E501 + body=dataset, + status=200, + ) + id = "7902672" + dataset = connector.retry(id) + assert dataset.name == "THE FIELD'S MALL MASS SHOOTING: EMERGENCY MEDICAL SERVICES RESPONSE" + assert dataset.description == "This is a description paragraph" + assert ( + dataset.creator + == "Hansen, Peter Martin; Alstrøm, henrik; Damm-Hejmdal, Anders; Mikkelsen, Søren; Rehn, Marius; Berlac, Peter Anthony" # noqa E501 + ) + assert dataset.date_published == datetime.datetime( + 2023, 5, 23, 7, 56, 17, 414652, tzinfo=datetime.timezone.utc + ) + assert dataset.license.name == "CC-BY-4.0" + assert dataset.platform == "zenodo" + assert dataset.platform_identifier == "7902672" + assert dataset.publisher == "Zenodo" + assert len(dataset.keywords) == 5 + assert {k.name for k in dataset.keywords} == { + "Mass casualty", + "Major incident", + "Management and leadership", + "Disaster", + "Mass shooting", + } + + def mock_zenodo_responses(mocked_requests: responses.RequestsMock): with open( path_test_resources() / "connectors" / "zenodo" / "list_records.xml", @@ -52,4 +87,3 @@ def mock_zenodo_responses(mocked_requests: responses.RequestsMock): body=records_list, status=200, ) -""" diff --git a/src/tests/resources/connectors/zenodo/dataset.json b/src/tests/resources/connectors/zenodo/dataset.json new file mode 100644 index 00000000..3f63ea9a --- /dev/null +++ 
b/src/tests/resources/connectors/zenodo/dataset.json @@ -0,0 +1,129 @@ +{ + "conceptdoi": "10.5281/zenodo.7902672", + "conceptrecid": "7902672", + "created": "2023-05-23T07:56:17.414652+00:00", + "doi": "10.5281/zenodo.7961614", + "files": [ + { + "bucket": "ec86c6d8-aa64-4eea-aa9b-b2ca0880a9c4", + "checksum": "md5:97f511d24f8867405a8f87afbc76939d", + "key": "FIELDS_CONFIDE_CHECLIST.docx", + "links": { + "self": "https://zenodo.org/api/files/ec86c6d8-aa64-4eea-aa9b-b2ca0880a9c4/FIELDS_CONFIDE_CHECLIST.docx" + }, + "size": 15600, + "type": "docx" + }, + { + "bucket": "ec86c6d8-aa64-4eea-aa9b-b2ca0880a9c4", + "checksum": "md5:18431d2ad50b7d82935a1dda6ee6db61", + "key": "FIELDS_UNITS_XL.xlsx", + "links": { + "self": "https://zenodo.org/api/files/ec86c6d8-aa64-4eea-aa9b-b2ca0880a9c4/FIELDS_UNITS_XL.xlsx" + }, + "size": 12369, + "type": "xlsx" + } + ], + "id": 7961614, + "links": { + "badge": "https://zenodo.org/badge/doi/10.5281/zenodo.7961614.svg", + "bucket": "https://zenodo.org/api/files/ec86c6d8-aa64-4eea-aa9b-b2ca0880a9c4", + "conceptbadge": "https://zenodo.org/badge/doi/10.5281/zenodo.7902672.svg", + "conceptdoi": "https://doi.org/10.5281/zenodo.7902672", + "doi": "https://doi.org/10.5281/zenodo.7961614", + "html": "https://zenodo.org/record/7961614", + "latest": "https://zenodo.org/api/records/7961614", + "latest_html": "https://zenodo.org/record/7961614", + "self": "https://zenodo.org/api/records/7961614" + }, + "metadata": { + "access_right": "open", + "access_right_category": "success", + "creators": [ + { + "affiliation": "Odense University Hospital", + "name": "Hansen, Peter Martin" + }, + { + "affiliation": "Copenhagen University Hopsital", + "name": "Alstrøm, henrik" + }, + { + "affiliation": "Copenhagen Emergency Medical Services", + "name": "Damm-Hejmdal, Anders" + }, + { + "affiliation": "Odense University Hospital", + "name": "Mikkelsen, Søren" + }, + { + "affiliation": "Oslo University Hospital", + "name": "Rehn, Marius" + }, + { + "affiliation": "Copenhagen University Hospital", + "name": "Berlac, Peter Anthony" + } + ], + "description": "This is a description paragraph", + "doi": "10.5281/zenodo.7961614", + "keywords": [ + "Major incident", + "Disaster", + "Mass shooting", + "Mass casualty", + "Management and leadership" + ], + "license": { + "id": "CC-BY-4.0" + }, + "publication_date": "2023-05-06", + "related_identifiers": [ + { + "identifier": "10.5281/zenodo.7902672", + "relation": "isVersionOf", + "scheme": "doi" + } + ], + "relations": { + "version": [ + { + "count": 2, + "index": 1, + "is_last": true, + "last_child": { + "pid_type": "recid", + "pid_value": "7961614" + }, + "parent": { + "pid_type": "recid", + "pid_value": "7902672" + } + } + ] + }, + "resource_type": { + "title": "Dataset", + "type": "dataset" + }, + "title": "THE FIELD'S MALL MASS SHOOTING: EMERGENCY MEDICAL SERVICES RESPONSE" + }, + "owners": [ + 543260 + ], + "revision": 4, + "stats": { + "downloads": 2.0, + "unique_downloads": 1.0, + "unique_views": 3.0, + "version_downloads": 3.0, + "version_unique_downloads": 2.0, + "version_unique_views": 29.0, + "version_views": 30.0, + "version_volume": 53543.0, + "views": 3.0, + "volume": 27969.0 + }, + "updated": "2023-05-25T02:28:52.350812+00:00" +} \ No newline at end of file diff --git a/src/uploader/hugging_face_uploader.py b/src/uploader/hugging_face_uploader.py index d46253b3..e6f7c70b 100644 --- a/src/uploader/hugging_face_uploader.py +++ b/src/uploader/hugging_face_uploader.py @@ -78,7 +78,11 @@ def _get_resource(self, engine: Engine, identifier: 
int) -> Dataset: with Session(engine) as session: query = ( session.query(Dataset) - .options(joinedload(Dataset.keywords), joinedload(Dataset.distributions)) + .options( + joinedload(Dataset.keywords), + joinedload(Dataset.distributions), + joinedload(Dataset.license), + ) .filter(Dataset.identifier == identifier) ) From f848db75a35c3eb5b1bb39b409b28ffdb8dd06b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Wed, 12 Jul 2023 13:30:50 +0000 Subject: [PATCH 04/52] openml connector --- .../abstract/resource_connector_by_id.py | 39 ++++++++++++++++ .../openml/openml_dataset_connector.py | 44 +++++++------------ .../openml/test_openml_dataset_connector.py | 30 +++++-------- 3 files changed, 66 insertions(+), 47 deletions(-) create mode 100644 src/connectors/abstract/resource_connector_by_id.py diff --git a/src/connectors/abstract/resource_connector_by_id.py b/src/connectors/abstract/resource_connector_by_id.py new file mode 100644 index 00000000..ba98c801 --- /dev/null +++ b/src/connectors/abstract/resource_connector_by_id.py @@ -0,0 +1,39 @@ +import abc +from typing import Generic, TypeVar, Iterator + +from sqlmodel import SQLModel + +from connectors.resource_with_relations import ResourceWithRelations +from database.model.platform.platform_names import PlatformName + + +RESOURCE = TypeVar("RESOURCE", bound=SQLModel) + + +class ResourceConnectorById(abc.ABC, Generic[RESOURCE]): + """ + For every platform that offers this resource, this ResourceConnector should be implemented. + """ + + @property + @abc.abstractmethod + def resource_class(self) -> type[RESOURCE]: + pass + + @property + @abc.abstractmethod + def platform_name(self) -> PlatformName: + """The platform of this connector""" + pass + + @abc.abstractmethod + def retry(self, id: int) -> SQLModel | ResourceWithRelations[SQLModel]: + """Retrieve information of the resource identified by id""" + pass + + @abc.abstractmethod + def fetch( + self, from_id: int | None = None, to_id: int | None = None + ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel]]: + """Retrieve information of all resources""" + pass diff --git a/src/connectors/openml/openml_dataset_connector.py b/src/connectors/openml/openml_dataset_connector.py index fd96752a..5dbbf44b 100644 --- a/src/connectors/openml/openml_dataset_connector.py +++ b/src/connectors/openml/openml_dataset_connector.py @@ -2,8 +2,7 @@ This module knows how to load an OpenML object based on its AIoD implementation, and how to convert the OpenML response to some agreed AIoD format. 
""" -""" -import logging + from typing import Iterator import dateutil.parser @@ -11,14 +10,15 @@ from fastapi import HTTPException from sqlmodel import SQLModel -from connectors.abstract.resource_connector import ResourceConnector + +from connectors.abstract.resource_connector_by_id import ResourceConnectorById from database.model.dataset.data_download import DataDownload from database.model.dataset.dataset import Dataset from database.model.resource import resource_create from database.model.platform.platform_names import PlatformName -class OpenMlDatasetConnector(ResourceConnector[Dataset]): +class OpenMlDatasetConnector(ResourceConnectorById[Dataset]): @property def resource_class(self) -> type[Dataset]: return Dataset @@ -27,8 +27,8 @@ def resource_class(self) -> type[Dataset]: def platform_name(self) -> PlatformName: return PlatformName.openml - def fetch(self, platform_identifier: str) -> SQLModel: - url_data = f"https://www.openml.org/api/v1/json/data/{platform_identifier}" + def retry(self, id: int) -> SQLModel: + url_data = f"https://www.openml.org/api/v1/json/data/{id}" response = requests.get(url_data) if not response.ok: code = response.status_code @@ -43,7 +43,7 @@ def fetch(self, platform_identifier: str) -> SQLModel: # Here we can format the response into some standardized way, maybe this includes some # dataset characteristics. These need to be retrieved separately from OpenML: - url_qual = f"https://www.openml.org/api/v1/json/data/qualities/{platform_identifier}" + url_qual = f"https://www.openml.org/api/v1/json/data/qualities/{id}" response = requests.get(url_qual) if not response.ok: msg = response.json()["error"]["message"] @@ -59,7 +59,7 @@ def fetch(self, platform_identifier: str) -> SQLModel: pydantic_class = resource_create(Dataset) return pydantic_class( platform=self.platform_name, - platform_identifier=platform_identifier, + platform_identifier=id, name=dataset_json["name"], same_as=url_data, description=dataset_json["description"], @@ -82,25 +82,14 @@ def fetch(self, platform_identifier: str) -> SQLModel: measured_values=[], ) - def fetch_all(self, limit: int | None = None) -> Iterator[SQLModel]: - url = "https://www.openml.org/api/v1/json/data/list" - if limit is not None: - url = f"{url}/limit/{limit}" - response = requests.get(url) - response_json = response.json() - if not response.ok: - msg = response_json["error"]["message"] - raise HTTPException( - status_code=response.status_code, - detail=f"Error while fetching data list from OpenML: '{msg}'.", - ) - for dataset_json in response_json["data"]["dataset"]: - try: - yield self.fetch(dataset_json["did"]) - except Exception as e: - logging.error( - f"Error while fetching openml dataset {dataset_json['did']}: '{str(e)}'" - ) + def fetch(self, from_id: int | None = None, to_id: int | None = None) -> Iterator[SQLModel]: + if from_id is None: + from_id = 1 + if to_id is None: + to_id = from_id + 10 + + for id in range(from_id, to_id): + yield self.retry(id) def _as_int(v: str) -> int: @@ -108,4 +97,3 @@ def _as_int(v: str) -> int: if not as_float.is_integer(): raise ValueError(f"The input should be an integer, but was a float: {v}") return int(as_float) -""" diff --git a/src/tests/connectors/openml/test_openml_dataset_connector.py b/src/tests/connectors/openml/test_openml_dataset_connector.py index 20cbdfc1..ad22fa15 100644 --- a/src/tests/connectors/openml/test_openml_dataset_connector.py +++ b/src/tests/connectors/openml/test_openml_dataset_connector.py @@ -1,15 +1,12 @@ -""" import json - import responses 
-import connectors -from database.model.platform.platform_names import PlatformName + +from connectors.openml.openml_dataset_connector import OpenMlDatasetConnector from tests.testutils.paths import path_test_resources OPENML_URL = "https://www.openml.org/api/v1/json" - - +""" def test_fetch_happy_path(): connector = connectors.dataset_connectors[PlatformName.openml] id_ = "2" @@ -49,29 +46,25 @@ def test_fetch_happy_path(): "uci", } +""" -def test_fetch_all_happy_path(): - connector = connectors.dataset_connectors[PlatformName.openml] + +def test_fetch_happy_path(): + connector = OpenMlDatasetConnector() with responses.RequestsMock() as mocked_requests: - with open(path_test_resources() / "connectors" / "openml" / "data_list.json", "r") as f: - response = json.load(f) - mocked_requests.add( - responses.GET, f"{OPENML_URL}/data/list/limit/3", json=response, status=200 - ) for i in range(2, 5): mock_openml_responses(mocked_requests, str(i)) - datasets = list(connector.fetch_all(limit=3)) + datasets = list(connector.fetch(2, 5)) assert len(datasets) == 3 assert {len(d.citations) for d in datasets} == {0} def mock_openml_responses(mocked_requests: responses.RequestsMock, platform_identifier: str): -""" -""" + """ Mocking requests to the OpenML dependency, so that we test only our own services -""" -""" + """ + with open( path_test_resources() / "connectors" / "openml" / f"data_{platform_identifier}.json", "r", @@ -97,4 +90,3 @@ def mock_openml_responses(mocked_requests: responses.RequestsMock, platform_iden json=data_qualities_response, status=200, ) -""" From acaf5d19a62c0917b1467ade4d185ae50a3c911d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Wed, 12 Jul 2023 13:40:11 +0000 Subject: [PATCH 05/52] remove connectors from api --- src/connectors/__init__.py | 14 -------- src/main.py | 35 ++++++++----------- .../example/test_example_connector.py | 6 ++-- .../zenodo/test_get_datasets_zenodo.py | 9 ++--- 4 files changed, 23 insertions(+), 41 deletions(-) diff --git a/src/connectors/__init__.py b/src/connectors/__init__.py index 2d350787..176b1176 100644 --- a/src/connectors/__init__.py +++ b/src/connectors/__init__.py @@ -15,20 +15,6 @@ from .example.example_dataset_connector import ExampleDatasetConnector -# from .huggingface.huggingface_dataset_connector import HuggingFaceDatasetConnector -# from .openml.openml_dataset_connector import OpenMlDatasetConnector -from .zenodo.zenodo_dataset_connector import ZenodoDatasetConnector - -dataset_connectors = { - c.platform_name: c - for c in ( - # OpenMlDatasetConnector(), - # HuggingFaceDatasetConnector(), - ZenodoDatasetConnector(), - ) -} - - _path_example_resources = pathlib.Path(__file__).parent / "example" / "resources" example_connectors = { diff --git a/src/main.py b/src/main.py index 890d6177..5939939f 100644 --- a/src/main.py +++ b/src/main.py @@ -6,20 +6,19 @@ """ import argparse import logging -from typing import Dict + import uvicorn from fastapi import Depends, FastAPI, HTTPException from fastapi.responses import HTMLResponse from pydantic import Json from sqlalchemy.engine import Engine -from starlette.status import HTTP_400_BAD_REQUEST, HTTP_501_NOT_IMPLEMENTED +from starlette.status import HTTP_501_NOT_IMPLEMENTED import connectors import routers from authentication import get_current_user from config import DB_CONFIG, KEYCLOAK_CONFIG -from database.model.platform.platform_names import PlatformName from database.setup import connect_to_database, populate_database @@ -33,13 +32,14 @@ def 
_parse_args() -> argparse.Namespace: choices=["no", "only-if-empty", "always"], help="Determines if the database is recreated.", ) + """ parser.add_argument( "--populate-datasets", default=[], nargs="+", choices=[p.name for p in PlatformName], help="Zero, one or more platforms with which the datasets should get populated.", - ) + )""" parser.add_argument( "--fill-with-examples", default=[], @@ -47,13 +47,7 @@ def _parse_args() -> argparse.Namespace: choices=connectors.example_connectors.keys(), help="Zero, one or more resources with which the database will have examples.", ) - parser.add_argument( - "--limit", - type=int, - default=None, - help="Limit the number of initial resources with which the database is populated, " - "per resource and per platform.", - ) + parser.add_argument( "--reload", action="store_true", @@ -79,8 +73,12 @@ def _engine(rebuild_db: str) -> Engine: return connect_to_database(db_url, delete_first=delete_before_create) +""" +****IMPORTANT**** +Connector will be removed from the api +***************** def _connector_from_platform_name(connector_type: str, connector_dict: Dict, platform_name: str): - """Get the connector from the connector_dict, identified by its platform name.""" + #Get the connector from the connector_dict, identified by its platform name. try: platform = PlatformName(platform_name) except ValueError: @@ -98,6 +96,8 @@ def _connector_from_platform_name(connector_type: str, connector_dict: Dict, pla raise HTTPException(status_code=HTTP_501_NOT_IMPLEMENTED, detail=msg) return connector +""" + def _connector_example_from_resource(resource): connector_dict = connectors.example_connectors @@ -158,20 +158,15 @@ def create_app() -> FastAPI: }, ) - dataset_connectors = [ - _connector_from_platform_name("dataset", connectors.dataset_connectors, platform_name) - for platform_name in args.populate_datasets - ] - examples_connectors = [ _connector_example_from_resource(resource) for resource in args.fill_with_examples ] - connectors_ = dataset_connectors + examples_connectors + engine = _engine(args.rebuild_db) - if len(connectors_) > 0: + if len(examples_connectors) > 0: populate_database( engine, - connectors=connectors_, + connectors=examples_connectors, only_if_empty=True, limit=args.limit, ) diff --git a/src/tests/connectors/example/test_example_connector.py b/src/tests/connectors/example/test_example_connector.py index 0a6f792c..a2da0b87 100644 --- a/src/tests/connectors/example/test_example_connector.py +++ b/src/tests/connectors/example/test_example_connector.py @@ -1,6 +1,6 @@ import pytest -import connectors +from connectors import example_connectors @pytest.mark.parametrize( @@ -18,7 +18,7 @@ ], ) def test_fetch_happy_path(datatype: str): - connector = connectors.example_connectors[datatype] + connector = example_connectors[datatype] resources = list(connector.fetch()) assert len(resources) >= 1 resource = resources[0] @@ -41,7 +41,7 @@ def test_fetch_happy_path(datatype: str): ], ) def test_retry_happy_path(datatype: str): - connector = connectors.example_connectors[datatype] + connector = example_connectors[datatype] resource = connector.retry("1") if hasattr(resource, "keywords"): # otherwise, only tested that connector can run assert set(resource.keywords) == {"keyword1", "keyword2"} diff --git a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py index 15c7ce0f..770715cb 100644 --- a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py +++ 
b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py @@ -1,7 +1,8 @@ import datetime import responses -import connectors -from database.model.platform.platform_names import PlatformName +from connectors.zenodo.zenodo_dataset_connector import ZenodoDatasetConnector + + from tests.testutils.paths import path_test_resources @@ -12,7 +13,7 @@ def read_file(path): def test_fetch_happy_path(): - connector = connectors.dataset_connectors[PlatformName.zenodo] + connector = ZenodoDatasetConnector() with responses.RequestsMock() as mocked_requests: mock_zenodo_responses(mocked_requests) datasets = list(connector.fetch()) @@ -40,7 +41,7 @@ def test_fetch_happy_path(): def test_retry_happy_path(): - connector = connectors.dataset_connectors[PlatformName.zenodo] + connector = ZenodoDatasetConnector() with responses.RequestsMock() as mocked_requests: with open(path_test_resources() / "connectors" / "zenodo" / "dataset.json", "r") as f: dataset = f.read() From d4fd6870014caddf17e182cae65468160b571dd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Thu, 13 Jul 2023 14:03:36 +0000 Subject: [PATCH 06/52] error on zenodo --- .../abstract/resource_connector_by_date.py | 3 +- src/connectors/record_error.py | 6 +++ .../zenodo/zenodo_dataset_connector.py | 40 +++++++++++++------ 3 files changed, 36 insertions(+), 13 deletions(-) create mode 100644 src/connectors/record_error.py diff --git a/src/connectors/abstract/resource_connector_by_date.py b/src/connectors/abstract/resource_connector_by_date.py index 1773c1ec..57c0b6c0 100644 --- a/src/connectors/abstract/resource_connector_by_date.py +++ b/src/connectors/abstract/resource_connector_by_date.py @@ -1,6 +1,7 @@ import abc from datetime import datetime from typing import Generic, TypeVar, Iterator +from connectors.record_error import RecordError from sqlmodel import SQLModel @@ -35,6 +36,6 @@ def retry(self, id: str) -> SQLModel | ResourceWithRelations[SQLModel]: @abc.abstractmethod def fetch( self, from_incl: datetime | None = None, to_excl: datetime | None = None - ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel]]: + ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]: """Retrieve information of all resources""" pass diff --git a/src/connectors/record_error.py b/src/connectors/record_error.py new file mode 100644 index 00000000..1d965155 --- /dev/null +++ b/src/connectors/record_error.py @@ -0,0 +1,6 @@ +class RecordError: + def __init__(self, platform, id, type, error): + self.platform = platform + self.id = id + self.type = type + self.error = error diff --git a/src/connectors/zenodo/zenodo_dataset_connector.py b/src/connectors/zenodo/zenodo_dataset_connector.py index 8f9d2435..a00c42f8 100644 --- a/src/connectors/zenodo/zenodo_dataset_connector.py +++ b/src/connectors/zenodo/zenodo_dataset_connector.py @@ -2,6 +2,7 @@ import logging from typing import Iterator import requests +from connectors.record_error import RecordError from sickle import Sickle import xmltodict @@ -55,7 +56,7 @@ def _bad_record_format(self, dataset_id, field): f"Error while fetching record info for dataset {dataset_id}: bad format {field}" ) - def _dataset_from_record(self, record_raw) -> Dataset | None: + def _dataset_from_record(self, record_raw) -> Dataset | RecordError: id_, record = self._get_record_dictionary(record_raw) if isinstance(record["creators"]["creator"], list): creators_list = [item["creatorName"] for item in record["creators"]["creator"]] @@ -64,13 +65,17 @@ def _dataset_from_record(self, 
record_raw) -> Dataset | None: creator = record["creators"]["creator"]["creatorName"] else: self._bad_record_format(id_, "creator") - return None + return RecordError( + id=id_, platform="zenodo", type="dataset", error="error decoding creator" + ) if isinstance(record["titles"]["title"], str): title = record["titles"]["title"] else: self._bad_record_format(id_, "title") - return None + return RecordError( + id=id_, platform="zenodo", type="dataset", error="error decoding title" + ) number_str = id_.rsplit("/", 1)[-1] idNumber = "".join(filter(str.isdigit, number_str)) same_as = f"https://zenodo.org/api/records/{idNumber}" @@ -84,7 +89,9 @@ def _dataset_from_record(self, record_raw) -> Dataset | None: description = description_raw["#text"] else: self._bad_record_format(id_, "description") - return None + return RecordError( + id=id_, platform="zenodo", type="dataset", error="error decoding description" + ) date_published = None date_raw = record["dates"]["date"] @@ -95,13 +102,17 @@ def _dataset_from_record(self, record_raw) -> Dataset | None: date_published = datetime.strptime(date_string, DATE_FORMAT) else: self._bad_record_format(id_, "date_published") - return None + return RecordError( + id=id_, platform="zenodo", type="dataset", error="error decoding date_published" + ) if isinstance(record["publisher"], str): publisher = record["publisher"] else: self._bad_record_format(id_, "publisher") - return None + return RecordError( + id=id_, platform="zenodo", type="dataset", error="error decoding publisher" + ) if isinstance(record["rightsList"]["rights"], list): license_ = record["rightsList"]["rights"][0]["@rightsURI"] @@ -109,7 +120,9 @@ def _dataset_from_record(self, record_raw) -> Dataset | None: license_ = record["rightsList"]["rights"]["@rightsURI"] else: self._bad_record_format(id_, "license") - return None + return RecordError( + id=id_, platform="zenodo", type="dataset", error="error decoding license" + ) keywords = [] if "subjects" in record: @@ -119,7 +132,9 @@ def _dataset_from_record(self, record_raw) -> Dataset | None: keywords = [item for item in record["subjects"]["subject"] if isinstance(item, str)] else: self._bad_record_format(id_, "keywords") - return None + return RecordError( + id=id_, platform="zenodo", type="dataset", error="error decoding keywords" + ) dataset = Dataset( platform="zenodo", @@ -147,11 +162,12 @@ def _get_resource_type(self, record): return xml_string[start:end] id_, _ = self._get_record_dictionary(record) logging.error(f"Error while getting the resource type of the record {id_}") + # Can return an RecordError Because we dont know the type return None def _retrieve_dataset_from_datetime( self, sk: Sickle, from_incl: datetime, to_excl: datetime | None = None - ) -> Iterator[Dataset]: + ) -> Iterator[Dataset | RecordError]: records = sk.ListRecords( **{ "metadataPrefix": "oai_datacite", @@ -164,14 +180,14 @@ def _retrieve_dataset_from_datetime( while record and (to_excl is None or (last_date is not None and last_date < to_excl)): if self._get_resource_type(record) == "Dataset": dataset = self._dataset_from_record(record) - if dataset is not None: + if not isinstance(dataset, RecordError): last_date = dataset.date_published - yield dataset + yield dataset record = next(records, None) def fetch( self, from_incl: datetime | None = None, to_excl: datetime | None = None - ) -> Iterator[Dataset]: + ) -> Iterator[Dataset | RecordError]: sickle = Sickle("https://zenodo.org/oai2d") date = from_incl if from_incl is not None else datetime(2000, 1, 1, 12, 0, 0) 
return self._retrieve_dataset_from_datetime(sickle, date, to_excl=to_excl) From 6eba04a3d8d3b0ef0129f3de5751ac09442cbef2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Thu, 13 Jul 2023 14:13:35 +0000 Subject: [PATCH 07/52] openml --- src/connectors/abstract/resource_connector_by_id.py | 3 ++- src/connectors/openml/openml_dataset_connector.py | 11 +++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/connectors/abstract/resource_connector_by_id.py b/src/connectors/abstract/resource_connector_by_id.py index ba98c801..384f80f5 100644 --- a/src/connectors/abstract/resource_connector_by_id.py +++ b/src/connectors/abstract/resource_connector_by_id.py @@ -5,6 +5,7 @@ from connectors.resource_with_relations import ResourceWithRelations from database.model.platform.platform_names import PlatformName +from connectors.record_error import RecordError RESOURCE = TypeVar("RESOURCE", bound=SQLModel) @@ -34,6 +35,6 @@ def retry(self, id: int) -> SQLModel | ResourceWithRelations[SQLModel]: @abc.abstractmethod def fetch( self, from_id: int | None = None, to_id: int | None = None - ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel]]: + ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]: """Retrieve information of all resources""" pass diff --git a/src/connectors/openml/openml_dataset_connector.py b/src/connectors/openml/openml_dataset_connector.py index 5dbbf44b..5b4dce2a 100644 --- a/src/connectors/openml/openml_dataset_connector.py +++ b/src/connectors/openml/openml_dataset_connector.py @@ -16,6 +16,7 @@ from database.model.dataset.dataset import Dataset from database.model.resource import resource_create from database.model.platform.platform_names import PlatformName +from connectors.record_error import RecordError class OpenMlDatasetConnector(ResourceConnectorById[Dataset]): @@ -82,14 +83,20 @@ def retry(self, id: int) -> SQLModel: measured_values=[], ) - def fetch(self, from_id: int | None = None, to_id: int | None = None) -> Iterator[SQLModel]: + def fetch( + self, from_id: int | None = None, to_id: int | None = None + ) -> Iterator[SQLModel | RecordError]: if from_id is None: from_id = 1 if to_id is None: to_id = from_id + 10 for id in range(from_id, to_id): - yield self.retry(id) + try: + dataset = self.retry(id) + yield dataset + except Exception as e: + return RecordError(id=id, platform="openml", type="dataset", error=e) def _as_int(v: str) -> int: From ac7e485e6d82585f53100ee3a8d3a590392f7046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Fri, 14 Jul 2023 09:26:55 +0000 Subject: [PATCH 08/52] delete retun --- src/connectors/example/example_dataset_connector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/connectors/example/example_dataset_connector.py b/src/connectors/example/example_dataset_connector.py index 07fda2bf..0a0380ad 100644 --- a/src/connectors/example/example_dataset_connector.py +++ b/src/connectors/example/example_dataset_connector.py @@ -75,7 +75,6 @@ def retry(self, id: str) -> ResourceWithRelations[Dataset]: if dataset.resource.platform_identifier == id: return dataset raise Exception("No resource associated with the id") - return None def fetch( self, from_incl: datetime | None = None, to_excl: datetime | None = None From 6a182df20e354cc41e6ac8c77da89c5f902542af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 17 Jul 2023 09:46:20 +0000 Subject: [PATCH 09/52] delete code duplication 
dataset example --- .../example/example_dataset_connector.py | 57 ++----------------- 1 file changed, 5 insertions(+), 52 deletions(-) diff --git a/src/connectors/example/example_dataset_connector.py b/src/connectors/example/example_dataset_connector.py index 0a0380ad..4db589c2 100644 --- a/src/connectors/example/example_dataset_connector.py +++ b/src/connectors/example/example_dataset_connector.py @@ -19,7 +19,7 @@ def resource_class(self) -> type[Dataset]: def platform_name(self) -> PlatformName: return PlatformName.example - def retry(self, id: str) -> ResourceWithRelations[Dataset]: + def retry(self, id_: str) -> ResourceWithRelations[Dataset]: """Retrieve information of the resource identified by id""" pydantic_class = resource_create(Dataset) pydantic_class_publication = resource_create(Publication) @@ -72,60 +72,13 @@ def retry(self, id: str) -> ResourceWithRelations[Dataset]: ), ] for dataset in datasets: - if dataset.resource.platform_identifier == id: + if dataset.resource.platform_identifier == id_: return dataset raise Exception("No resource associated with the id") def fetch( self, from_incl: datetime | None = None, to_excl: datetime | None = None ) -> typing.Iterator[ResourceWithRelations[Dataset]]: - pydantic_class = resource_create(Dataset) - pydantic_class_publication = resource_create(Publication) - yield from [ - ResourceWithRelations[Dataset]( - resource=pydantic_class( - name="Higgs", - platform="openml", - description="Higgs dataset", - same_as="non-existing-url/1", - platform_identifier="42769", - alternate_names=[], - citations=[], - distributions=[], - is_part=[], - has_parts=[], - keywords=["keyword1", "keyword2"], - measured_values=[], - ), - related_resources={ - "citations": [ - pydantic_class_publication( - title=( - "Searching for exotic particles in high-energy physics with deep " - "learning" - ), - doi="2", - platform="example", - platform_identifier="2", - datasets=[], - ) - ] - }, - ), - ResourceWithRelations[Dataset]( - resource=pydantic_class( - name="porto-seguro", - platform="openml", - description="Porto seguro dataset", - same_as="non-existing-url/2", - platform_identifier="42742", - alternate_names=[], - citations=[], - distributions=[], - is_part=[], - has_parts=[], - keywords=[], - measured_values=[], - ) - ), - ] + id_list = ["42769", "42742"] + for id_ in id_list: + yield self.retry(id_) From eaf8d852fe3e346b3351919c33de0bf553e76b08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 17 Jul 2023 09:47:11 +0000 Subject: [PATCH 10/52] change exception to Value error --- src/connectors/example/example_dataset_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectors/example/example_dataset_connector.py b/src/connectors/example/example_dataset_connector.py index 4db589c2..e9ddf78a 100644 --- a/src/connectors/example/example_dataset_connector.py +++ b/src/connectors/example/example_dataset_connector.py @@ -74,7 +74,7 @@ def retry(self, id_: str) -> ResourceWithRelations[Dataset]: for dataset in datasets: if dataset.resource.platform_identifier == id_: return dataset - raise Exception("No resource associated with the id") + raise ValueError("No resource associated with the id") def fetch( self, from_incl: datetime | None = None, to_excl: datetime | None = None From 3860b614bb0fb9e168527f49149bc50c4e3b7f91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 17 Jul 2023 09:47:51 +0000 Subject: [PATCH 11/52] change excepcion to 
value error --- src/connectors/example/example_connector.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/connectors/example/example_connector.py b/src/connectors/example/example_connector.py index 64f65238..4df83c63 100644 --- a/src/connectors/example/example_connector.py +++ b/src/connectors/example/example_connector.py @@ -39,8 +39,7 @@ def retry(self, id: str) -> RESOURCE: for json_item in json_data: if json_item.get("platform_identifier") == id: return pydantic_class(**json_item) - raise Exception("No resource associated with the id") - return + raise ValueError("No resource associated with the id") def fetch( self, from_incl: datetime | None = None, to_excl: datetime | None = None From 06ebd635e850bcd58a750283933e2ac6b03b614b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 17 Jul 2023 09:53:48 +0000 Subject: [PATCH 12/52] change id to id_ test zenodo --- src/tests/connectors/zenodo/test_get_datasets_zenodo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py index 770715cb..36950686 100644 --- a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py +++ b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py @@ -51,8 +51,8 @@ def test_retry_happy_path(): body=dataset, status=200, ) - id = "7902672" - dataset = connector.retry(id) + id_ = "7902672" + dataset = connector.retry(id_) assert dataset.name == "THE FIELD'S MALL MASS SHOOTING: EMERGENCY MEDICAL SERVICES RESPONSE" assert dataset.description == "This is a description paragraph" assert ( From daf0408f31d0ddcdff157b33888462e03e388cf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 17 Jul 2023 10:03:51 +0000 Subject: [PATCH 13/52] change a nmethod to static --- src/connectors/zenodo/zenodo_dataset_connector.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/connectors/zenodo/zenodo_dataset_connector.py b/src/connectors/zenodo/zenodo_dataset_connector.py index a00c42f8..06b1c0f5 100644 --- a/src/connectors/zenodo/zenodo_dataset_connector.py +++ b/src/connectors/zenodo/zenodo_dataset_connector.py @@ -42,14 +42,15 @@ def retry(self, id: str) -> Dataset: keywords=[Keyword(name=k) for k in record.get("metadata").get("keywords")], ) - def _get_record_dictionary(self, record): + @staticmethod + def _get_record_dictionary(record): xml_string = record.raw xml_dict = xmltodict.parse(xml_string) - id = xml_dict["record"]["header"]["identifier"] - if id.startswith("oai:"): - id = id.replace("oai:", "") + id_ = xml_dict["record"]["header"]["identifier"] + if id_.startswith("oai:"): + id_ = id_.replace("oai:", "") resource = xml_dict["record"]["metadata"]["oai_datacite"]["payload"]["resource"] - return id, resource + return id_, resource def _bad_record_format(self, dataset_id, field): logging.error( @@ -57,7 +58,7 @@ def _bad_record_format(self, dataset_id, field): ) def _dataset_from_record(self, record_raw) -> Dataset | RecordError: - id_, record = self._get_record_dictionary(record_raw) + id_, record = ZenodoDatasetConnector._get_record_dictionary(record_raw) if isinstance(record["creators"]["creator"], list): creators_list = [item["creatorName"] for item in record["creators"]["creator"]] creator = "; ".join(creators_list) # TODO change field to an array @@ -160,7 +161,7 @@ def _get_resource_type(self, record): end = xml_string.find('"', start) if 
end != -1: return xml_string[start:end] - id_, _ = self._get_record_dictionary(record) + id_, _ = ZenodoDatasetConnector._get_record_dictionary(record) logging.error(f"Error while getting the resource type of the record {id_}") # Can return an RecordError Because we dont know the type return None From 0869102074dbfa5f6001e8529789a3160775c9bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 17 Jul 2023 10:14:43 +0000 Subject: [PATCH 14/52] change var name --- src/connectors/zenodo/zenodo_dataset_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/connectors/zenodo/zenodo_dataset_connector.py b/src/connectors/zenodo/zenodo_dataset_connector.py index 06b1c0f5..61aca4a7 100644 --- a/src/connectors/zenodo/zenodo_dataset_connector.py +++ b/src/connectors/zenodo/zenodo_dataset_connector.py @@ -78,8 +78,8 @@ def _dataset_from_record(self, record_raw) -> Dataset | RecordError: id=id_, platform="zenodo", type="dataset", error="error decoding title" ) number_str = id_.rsplit("/", 1)[-1] - idNumber = "".join(filter(str.isdigit, number_str)) - same_as = f"https://zenodo.org/api/records/{idNumber}" + id_number = "".join(filter(str.isdigit, number_str)) + same_as = f"https://zenodo.org/api/records/{id_number}" description_raw = record["descriptions"]["description"] if isinstance(description_raw, list): From ea2fb14f8a376de8148a79a7dda9c2cc171eec3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 17 Jul 2023 10:15:36 +0000 Subject: [PATCH 15/52] remove comments --- src/main.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/src/main.py b/src/main.py index 5939939f..f53ef3c2 100644 --- a/src/main.py +++ b/src/main.py @@ -73,32 +73,6 @@ def _engine(rebuild_db: str) -> Engine: return connect_to_database(db_url, delete_first=delete_before_create) -""" -****IMPORTANT**** -Connector will be removed from the api -***************** -def _connector_from_platform_name(connector_type: str, connector_dict: Dict, platform_name: str): - #Get the connector from the connector_dict, identified by its platform name. - try: - platform = PlatformName(platform_name) - except ValueError: - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, - detail=f"platform " f"'{platform_name}' not recognized.", - ) - connector = connector_dict.get(platform, None) - if connector is None: - possibilities = ", ".join(f"`{c}`" for c in connectors.dataset_connectors.keys()) - msg = ( - f"No {connector_type} connector for platform '{platform_name}' available. 
Possible " - f"values: {possibilities}" - ) - raise HTTPException(status_code=HTTP_501_NOT_IMPLEMENTED, detail=msg) - return connector - -""" - - def _connector_example_from_resource(resource): connector_dict = connectors.example_connectors connector = connector_dict.get(resource, None) From 0344c826db1af9bab94e2700392affb03f586b4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 17 Jul 2023 13:41:52 +0000 Subject: [PATCH 16/52] example filter by time --- .../example/example_dataset_connector.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/connectors/example/example_dataset_connector.py b/src/connectors/example/example_dataset_connector.py index e9ddf78a..9d7916ee 100644 --- a/src/connectors/example/example_dataset_connector.py +++ b/src/connectors/example/example_dataset_connector.py @@ -38,6 +38,7 @@ def retry(self, id_: str) -> ResourceWithRelations[Dataset]: has_parts=[], keywords=["keyword1", "keyword2"], measured_values=[], + date_published=datetime.now(), ), related_resources={ "citations": [ @@ -68,6 +69,7 @@ def retry(self, id_: str) -> ResourceWithRelations[Dataset]: has_parts=[], keywords=[], measured_values=[], + date_published=datetime.now(), ) ), ] @@ -81,4 +83,14 @@ def fetch( ) -> typing.Iterator[ResourceWithRelations[Dataset]]: id_list = ["42769", "42742"] for id_ in id_list: - yield self.retry(id_) + dataset = self.retry(id_) + if from_incl is None: + from_incl = datetime.min + if to_excl is None: + to_excl = datetime.max + if ( + dataset.resource.date_published is not None + and dataset.resource.date_published > from_incl + and dataset.resource.date_published < to_excl + ): + yield dataset From 4f0a35899615a3a5fc00a74e6b7917a64d27b5a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 17 Jul 2023 13:43:07 +0000 Subject: [PATCH 17/52] change default example date dataset --- src/connectors/example/example_dataset_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/connectors/example/example_dataset_connector.py b/src/connectors/example/example_dataset_connector.py index 9d7916ee..f9dbdfa2 100644 --- a/src/connectors/example/example_dataset_connector.py +++ b/src/connectors/example/example_dataset_connector.py @@ -69,7 +69,7 @@ def retry(self, id_: str) -> ResourceWithRelations[Dataset]: has_parts=[], keywords=[], measured_values=[], - date_published=datetime.now(), + date_published=datetime.min, ) ), ] @@ -90,7 +90,7 @@ def fetch( to_excl = datetime.max if ( dataset.resource.date_published is not None - and dataset.resource.date_published > from_incl + and dataset.resource.date_published >= from_incl and dataset.resource.date_published < to_excl ): yield dataset From 14efe89cb43486391c0d32eda224649bf3958667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Fri, 21 Jul 2023 13:22:29 +0000 Subject: [PATCH 18/52] comments --- src/connectors/openml/openml_dataset_connector.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/connectors/openml/openml_dataset_connector.py b/src/connectors/openml/openml_dataset_connector.py index 5b4dce2a..c173dffb 100644 --- a/src/connectors/openml/openml_dataset_connector.py +++ b/src/connectors/openml/openml_dataset_connector.py @@ -20,6 +20,12 @@ class OpenMlDatasetConnector(ResourceConnectorById[Dataset]): + """ " + Openml orders its records with a numeric id in ascendent order but does not allow + gather them from a certain 
date. This is the reason why the ResourceConnectorById + is needed + """ + @property def resource_class(self) -> type[Dataset]: return Dataset From f16823f8066f9d8adb934010a54b831752a50059 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Fri, 21 Jul 2023 13:29:08 +0000 Subject: [PATCH 19/52] reduce code duplication --- src/connectors/abstract/resource_connector.py | 29 +++++++++++++++++++ .../abstract/resource_connector_by_date.py | 18 ++---------- .../abstract/resource_connector_by_id.py | 19 +++--------- 3 files changed, 36 insertions(+), 30 deletions(-) create mode 100644 src/connectors/abstract/resource_connector.py diff --git a/src/connectors/abstract/resource_connector.py b/src/connectors/abstract/resource_connector.py new file mode 100644 index 00000000..95142b18 --- /dev/null +++ b/src/connectors/abstract/resource_connector.py @@ -0,0 +1,29 @@ +import abc + +from typing import Generic, TypeVar + + +from sqlmodel import SQLModel + + +from database.model.platform.platform_names import PlatformName + + +RESOURCE = TypeVar("RESOURCE", bound=SQLModel) + + +class ResourceConnector(abc.ABC, Generic[RESOURCE]): + """ + For every platform that offers this resource, this ResourceConnector should be implemented. + """ + + @property + @abc.abstractmethod + def resource_class(self) -> type[RESOURCE]: + pass + + @property + @abc.abstractmethod + def platform_name(self) -> PlatformName: + """The platform of this connector""" + pass diff --git a/src/connectors/abstract/resource_connector_by_date.py b/src/connectors/abstract/resource_connector_by_date.py index 57c0b6c0..f7c31d2c 100644 --- a/src/connectors/abstract/resource_connector_by_date.py +++ b/src/connectors/abstract/resource_connector_by_date.py @@ -1,33 +1,21 @@ import abc from datetime import datetime -from typing import Generic, TypeVar, Iterator +from typing import Generic, Iterator, TypeVar +from connectors.abstract.resource_connector import ResourceConnector from connectors.record_error import RecordError from sqlmodel import SQLModel from connectors.resource_with_relations import ResourceWithRelations -from database.model.platform.platform_names import PlatformName - RESOURCE = TypeVar("RESOURCE", bound=SQLModel) -class ResourceConnectorByDate(abc.ABC, Generic[RESOURCE]): +class ResourceConnectorByDate(ResourceConnector, Generic[RESOURCE]): """ For every platform that offers this resource, this ResourceConnector should be implemented. 
""" - @property - @abc.abstractmethod - def resource_class(self) -> type[RESOURCE]: - pass - - @property - @abc.abstractmethod - def platform_name(self) -> PlatformName: - """The platform of this connector""" - pass - @abc.abstractmethod def retry(self, id: str) -> SQLModel | ResourceWithRelations[SQLModel]: """Retrieve information of the resource identified by id""" diff --git a/src/connectors/abstract/resource_connector_by_id.py b/src/connectors/abstract/resource_connector_by_id.py index 384f80f5..22dcfcc1 100644 --- a/src/connectors/abstract/resource_connector_by_id.py +++ b/src/connectors/abstract/resource_connector_by_id.py @@ -1,32 +1,21 @@ import abc -from typing import Generic, TypeVar, Iterator +from typing import Generic, Iterator, TypeVar from sqlmodel import SQLModel +from connectors.abstract.resource_connector import ResourceConnector from connectors.resource_with_relations import ResourceWithRelations -from database.model.platform.platform_names import PlatformName -from connectors.record_error import RecordError +from connectors.record_error import RecordError RESOURCE = TypeVar("RESOURCE", bound=SQLModel) -class ResourceConnectorById(abc.ABC, Generic[RESOURCE]): +class ResourceConnectorById(ResourceConnector, Generic[RESOURCE]): """ For every platform that offers this resource, this ResourceConnector should be implemented. """ - @property - @abc.abstractmethod - def resource_class(self) -> type[RESOURCE]: - pass - - @property - @abc.abstractmethod - def platform_name(self) -> PlatformName: - """The platform of this connector""" - pass - @abc.abstractmethod def retry(self, id: int) -> SQLModel | ResourceWithRelations[SQLModel]: """Retrieve information of the resource identified by id""" From d4b4b29f97965f992d993d70e480b8115258fd45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Fri, 21 Jul 2023 13:31:34 +0000 Subject: [PATCH 20/52] type hints in error class --- src/connectors/record_error.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/connectors/record_error.py b/src/connectors/record_error.py index 1d965155..d947c0c1 100644 --- a/src/connectors/record_error.py +++ b/src/connectors/record_error.py @@ -1,6 +1,9 @@ +import dataclasses + + +@dataclasses.dataclass class RecordError: - def __init__(self, platform, id, type, error): - self.platform = platform - self.id = id - self.type = type - self.error = error + platform: str + id: str + type: str + error: str From eba131180f615a0d3c4938c820b6385fd95e49a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Fri, 21 Jul 2023 13:32:18 +0000 Subject: [PATCH 21/52] record error --- src/connectors/abstract/resource_connector_by_id.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectors/abstract/resource_connector_by_id.py b/src/connectors/abstract/resource_connector_by_id.py index 22dcfcc1..0c8533df 100644 --- a/src/connectors/abstract/resource_connector_by_id.py +++ b/src/connectors/abstract/resource_connector_by_id.py @@ -17,7 +17,7 @@ class ResourceConnectorById(ResourceConnector, Generic[RESOURCE]): """ @abc.abstractmethod - def retry(self, id: int) -> SQLModel | ResourceWithRelations[SQLModel]: + def retry(self, id: int) -> SQLModel | ResourceWithRelations[SQLModel] | RecordError: """Retrieve information of the resource identified by id""" pass From fb062d61e1ddc4bccf04f64b6cc65716b3f4bdee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= 
Date: Fri, 21 Jul 2023 13:32:57 +0000 Subject: [PATCH 22/52] add record to abstract connector class --- src/connectors/abstract/resource_connector_by_date.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectors/abstract/resource_connector_by_date.py b/src/connectors/abstract/resource_connector_by_date.py index f7c31d2c..55f17c66 100644 --- a/src/connectors/abstract/resource_connector_by_date.py +++ b/src/connectors/abstract/resource_connector_by_date.py @@ -17,7 +17,7 @@ class ResourceConnectorByDate(ResourceConnector, Generic[RESOURCE]): """ @abc.abstractmethod - def retry(self, id: str) -> SQLModel | ResourceWithRelations[SQLModel]: + def retry(self, id: str) -> SQLModel | ResourceWithRelations[SQLModel] | RecordError: """Retrieve information of the resource identified by id""" pass From 2bc4465619161522c68a4fc6d3fd6f5e3c37a662 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 24 Jul 2023 09:14:27 +0000 Subject: [PATCH 23/52] change id --- .../abstract/resource_connector_by_date.py | 4 +- .../abstract/resource_connector_by_id.py | 4 +- src/connectors/example/example_connector.py | 4 +- .../example/example_dataset_connector.py | 4 +- .../openml/openml_dataset_connector.py | 32 +++++------ src/connectors/record_error.py | 2 +- .../zenodo/zenodo_dataset_connector.py | 56 +++++++++---------- .../zenodo/test_get_datasets_zenodo.py | 7 ++- 8 files changed, 56 insertions(+), 57 deletions(-) diff --git a/src/connectors/abstract/resource_connector_by_date.py b/src/connectors/abstract/resource_connector_by_date.py index 55f17c66..4af86a9b 100644 --- a/src/connectors/abstract/resource_connector_by_date.py +++ b/src/connectors/abstract/resource_connector_by_date.py @@ -17,13 +17,13 @@ class ResourceConnectorByDate(ResourceConnector, Generic[RESOURCE]): """ @abc.abstractmethod - def retry(self, id: str) -> SQLModel | ResourceWithRelations[SQLModel] | RecordError: + def retry(self, _id: str) -> SQLModel | ResourceWithRelations[SQLModel] | RecordError: """Retrieve information of the resource identified by id""" pass @abc.abstractmethod def fetch( - self, from_incl: datetime | None = None, to_excl: datetime | None = None + self, from_incl: datetime, to_excl: datetime ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]: """Retrieve information of all resources""" pass diff --git a/src/connectors/abstract/resource_connector_by_id.py b/src/connectors/abstract/resource_connector_by_id.py index 0c8533df..b6e838a2 100644 --- a/src/connectors/abstract/resource_connector_by_id.py +++ b/src/connectors/abstract/resource_connector_by_id.py @@ -17,13 +17,13 @@ class ResourceConnectorById(ResourceConnector, Generic[RESOURCE]): """ @abc.abstractmethod - def retry(self, id: int) -> SQLModel | ResourceWithRelations[SQLModel] | RecordError: + def retry(self, _id: int) -> SQLModel | ResourceWithRelations[SQLModel] | RecordError: """Retrieve information of the resource identified by id""" pass @abc.abstractmethod def fetch( - self, from_id: int | None = None, to_id: int | None = None + self, from_id: int, to_id: int ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]: """Retrieve information of all resources""" pass diff --git a/src/connectors/example/example_connector.py b/src/connectors/example/example_connector.py index 4df83c63..76510389 100644 --- a/src/connectors/example/example_connector.py +++ b/src/connectors/example/example_connector.py @@ -31,13 +31,13 @@ def resource_class(self) -> type[RESOURCE]: 
def platform_name(self) -> PlatformName: return PlatformName.example - def retry(self, id: str) -> RESOURCE: + def retry(self, _id: str) -> RESOURCE: """Retrieve information of the resource identified by id""" with open(self.json_path) as f: json_data = json.load(f) pydantic_class = resource_create(self.resource_class) for json_item in json_data: - if json_item.get("platform_identifier") == id: + if json_item.get("platform_identifier") == _id: return pydantic_class(**json_item) raise ValueError("No resource associated with the id") diff --git a/src/connectors/example/example_dataset_connector.py b/src/connectors/example/example_dataset_connector.py index f9dbdfa2..9a433eab 100644 --- a/src/connectors/example/example_dataset_connector.py +++ b/src/connectors/example/example_dataset_connector.py @@ -19,7 +19,7 @@ def resource_class(self) -> type[Dataset]: def platform_name(self) -> PlatformName: return PlatformName.example - def retry(self, id_: str) -> ResourceWithRelations[Dataset]: + def retry(self, _id: str) -> ResourceWithRelations[Dataset]: """Retrieve information of the resource identified by id""" pydantic_class = resource_create(Dataset) pydantic_class_publication = resource_create(Publication) @@ -74,7 +74,7 @@ def retry(self, id_: str) -> ResourceWithRelations[Dataset]: ), ] for dataset in datasets: - if dataset.resource.platform_identifier == id_: + if dataset.resource.platform_identifier == _id: return dataset raise ValueError("No resource associated with the id") diff --git a/src/connectors/openml/openml_dataset_connector.py b/src/connectors/openml/openml_dataset_connector.py index c173dffb..86b5fa4f 100644 --- a/src/connectors/openml/openml_dataset_connector.py +++ b/src/connectors/openml/openml_dataset_connector.py @@ -34,23 +34,26 @@ def resource_class(self) -> type[Dataset]: def platform_name(self) -> PlatformName: return PlatformName.openml - def retry(self, id: int) -> SQLModel: - url_data = f"https://www.openml.org/api/v1/json/data/{id}" + def retry(self, _id: int) -> SQLModel: + url_data = f"https://www.openml.org/api/v1/json/data/{_id}" response = requests.get(url_data) if not response.ok: code = response.status_code if code == 412 and response.json()["error"]["message"] == "Unknown dataset": code = 404 msg = response.json()["error"]["message"] - raise HTTPException( - status_code=code, - detail=f"Error while fetching data from OpenML: '{msg}'.", + + return RecordError( + platform="openml", + _id=str(_id), + error=f"Error while fetching data from OpenML: '{msg}'.", + type="dataset", ) dataset_json = response.json()["data_set_description"] # Here we can format the response into some standardized way, maybe this includes some # dataset characteristics. 
These need to be retrieved separately from OpenML: - url_qual = f"https://www.openml.org/api/v1/json/data/qualities/{id}" + url_qual = f"https://www.openml.org/api/v1/json/data/qualities/{_id}" response = requests.get(url_qual) if not response.ok: msg = response.json()["error"]["message"] @@ -66,7 +69,7 @@ def retry(self, id: int) -> SQLModel: pydantic_class = resource_create(Dataset) return pydantic_class( platform=self.platform_name, - platform_identifier=id, + platform_identifier=_id, name=dataset_json["name"], same_as=url_data, description=dataset_json["description"], @@ -89,20 +92,13 @@ def retry(self, id: int) -> SQLModel: measured_values=[], ) - def fetch( - self, from_id: int | None = None, to_id: int | None = None - ) -> Iterator[SQLModel | RecordError]: - if from_id is None: - from_id = 1 - if to_id is None: - to_id = from_id + 10 - - for id in range(from_id, to_id): + def fetch(self, from_id: int, to_id: int) -> Iterator[SQLModel | RecordError]: + for _id in range(from_id, to_id): try: - dataset = self.retry(id) + dataset = self.retry(_id) yield dataset except Exception as e: - return RecordError(id=id, platform="openml", type="dataset", error=e) + return RecordError(_id=str(_id), platform="openml", type="dataset", error=str(e)) def _as_int(v: str) -> int: diff --git a/src/connectors/record_error.py b/src/connectors/record_error.py index d947c0c1..76153f6d 100644 --- a/src/connectors/record_error.py +++ b/src/connectors/record_error.py @@ -4,6 +4,6 @@ @dataclasses.dataclass class RecordError: platform: str - id: str + _id: str type: str error: str diff --git a/src/connectors/zenodo/zenodo_dataset_connector.py b/src/connectors/zenodo/zenodo_dataset_connector.py index 61aca4a7..539ca4d7 100644 --- a/src/connectors/zenodo/zenodo_dataset_connector.py +++ b/src/connectors/zenodo/zenodo_dataset_connector.py @@ -24,15 +24,15 @@ def resource_class(self) -> type[Dataset]: def platform_name(self) -> PlatformName: return PlatformName.zenodo - def retry(self, id: str) -> Dataset: + def retry(self, _id: str) -> Dataset: """Retrieve information of the resource identified by id""" - record = requests.get(f"https://zenodo.org/api/records/{id}").json() + record = requests.get(f"https://zenodo.org/api/records/{_id}").json() creators_list = [item["name"] for item in record["metadata"]["creators"]] creator = "; ".join(creators_list) # TODO change field to an array return Dataset( platform="zenodo", - platform_identifier=id, + platform_identifier=_id, date_published=record.get("created"), name=record.get("metadata").get("title"), description=record.get("metadata").get("description"), @@ -58,26 +58,26 @@ def _bad_record_format(self, dataset_id, field): ) def _dataset_from_record(self, record_raw) -> Dataset | RecordError: - id_, record = ZenodoDatasetConnector._get_record_dictionary(record_raw) + _id, record = ZenodoDatasetConnector._get_record_dictionary(record_raw) if isinstance(record["creators"]["creator"], list): creators_list = [item["creatorName"] for item in record["creators"]["creator"]] creator = "; ".join(creators_list) # TODO change field to an array elif isinstance(record["creators"]["creator"]["creatorName"], str): creator = record["creators"]["creator"]["creatorName"] else: - self._bad_record_format(id_, "creator") + self._bad_record_format(_id, "creator") return RecordError( - id=id_, platform="zenodo", type="dataset", error="error decoding creator" + _id=_id, platform="zenodo", type="dataset", error="error decoding creator" ) if isinstance(record["titles"]["title"], str): title = 
record["titles"]["title"] else: - self._bad_record_format(id_, "title") + self._bad_record_format(_id, "title") return RecordError( - id=id_, platform="zenodo", type="dataset", error="error decoding title" + _id=_id, platform="zenodo", type="dataset", error="error decoding title" ) - number_str = id_.rsplit("/", 1)[-1] + number_str = _id.rsplit("/", 1)[-1] id_number = "".join(filter(str.isdigit, number_str)) same_as = f"https://zenodo.org/api/records/{id_number}" @@ -89,9 +89,9 @@ def _dataset_from_record(self, record_raw) -> Dataset | RecordError: elif description_raw["@descriptionType"] == "Abstract": description = description_raw["#text"] else: - self._bad_record_format(id_, "description") + self._bad_record_format(_id, "description") return RecordError( - id=id_, platform="zenodo", type="dataset", error="error decoding description" + _id=_id, platform="zenodo", type="dataset", error="error decoding description" ) date_published = None @@ -102,17 +102,17 @@ def _dataset_from_record(self, record_raw) -> Dataset | RecordError: date_string = date_raw["#text"] date_published = datetime.strptime(date_string, DATE_FORMAT) else: - self._bad_record_format(id_, "date_published") + self._bad_record_format(_id, "date_published") return RecordError( - id=id_, platform="zenodo", type="dataset", error="error decoding date_published" + _id=_id, platform="zenodo", type="dataset", error="error decoding date_published" ) if isinstance(record["publisher"], str): publisher = record["publisher"] else: - self._bad_record_format(id_, "publisher") + self._bad_record_format(_id, "publisher") return RecordError( - id=id_, platform="zenodo", type="dataset", error="error decoding publisher" + _id=_id, platform="zenodo", type="dataset", error="error decoding publisher" ) if isinstance(record["rightsList"]["rights"], list): @@ -120,9 +120,9 @@ def _dataset_from_record(self, record_raw) -> Dataset | RecordError: elif isinstance(record["rightsList"]["rights"]["@rightsURI"], str): license_ = record["rightsList"]["rights"]["@rightsURI"] else: - self._bad_record_format(id_, "license") + self._bad_record_format(_id, "license") return RecordError( - id=id_, platform="zenodo", type="dataset", error="error decoding license" + _id=_id, platform="zenodo", type="dataset", error="error decoding license" ) keywords = [] @@ -132,14 +132,14 @@ def _dataset_from_record(self, record_raw) -> Dataset | RecordError: elif isinstance(record["subjects"]["subject"], list): keywords = [item for item in record["subjects"]["subject"] if isinstance(item, str)] else: - self._bad_record_format(id_, "keywords") + self._bad_record_format(_id, "keywords") return RecordError( - id=id_, platform="zenodo", type="dataset", error="error decoding keywords" + _id=_id, platform="zenodo", type="dataset", error="error decoding keywords" ) dataset = Dataset( platform="zenodo", - platform_identifier=id_, + platform_identifier=_id, name=title[:150], same_as=same_as, creator=creator[ @@ -167,7 +167,7 @@ def _get_resource_type(self, record): return None def _retrieve_dataset_from_datetime( - self, sk: Sickle, from_incl: datetime, to_excl: datetime | None = None + self, sk: Sickle, from_incl: datetime, to_excl: datetime ) -> Iterator[Dataset | RecordError]: records = sk.ListRecords( **{ @@ -177,18 +177,16 @@ def _retrieve_dataset_from_datetime( ) record = next(records, None) - last_date = None - while record and (to_excl is None or (last_date is not None and last_date < to_excl)): + last_date = datetime.min + while record and last_date < to_excl: if 
self._get_resource_type(record) == "Dataset": dataset = self._dataset_from_record(record) if not isinstance(dataset, RecordError): - last_date = dataset.date_published + if dataset.date_published is not None: + last_date = dataset.date_published yield dataset record = next(records, None) - def fetch( - self, from_incl: datetime | None = None, to_excl: datetime | None = None - ) -> Iterator[Dataset | RecordError]: + def fetch(self, from_incl: datetime, to_excl: datetime) -> Iterator[Dataset | RecordError]: sickle = Sickle("https://zenodo.org/oai2d") - date = from_incl if from_incl is not None else datetime(2000, 1, 1, 12, 0, 0) - return self._retrieve_dataset_from_datetime(sickle, date, to_excl=to_excl) + return self._retrieve_dataset_from_datetime(sickle, from_incl, to_excl=to_excl) diff --git a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py index 36950686..7ba9fca3 100644 --- a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py +++ b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py @@ -16,7 +16,12 @@ def test_fetch_happy_path(): connector = ZenodoDatasetConnector() with responses.RequestsMock() as mocked_requests: mock_zenodo_responses(mocked_requests) - datasets = list(connector.fetch()) + + datasets = list( + connector.fetch( + from_incl=datetime.datetime(2000, 1, 1, 12, 0, 0), to_excl=datetime.datetime.max + ) + ) assert len(datasets) == 1 dataset = datasets[0] assert dataset.name == "THE FIELD'S MALL MASS SHOOTING: EMERGENCY MEDICAL SERVICES RESPONSE" From a237ad5425f4f968639d42c6526daf5e832f8625 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 24 Jul 2023 09:21:54 +0000 Subject: [PATCH 24/52] zenodo retry error handling --- src/connectors/zenodo/zenodo_dataset_connector.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/connectors/zenodo/zenodo_dataset_connector.py b/src/connectors/zenodo/zenodo_dataset_connector.py index 539ca4d7..1bfdae5f 100644 --- a/src/connectors/zenodo/zenodo_dataset_connector.py +++ b/src/connectors/zenodo/zenodo_dataset_connector.py @@ -24,10 +24,19 @@ def resource_class(self) -> type[Dataset]: def platform_name(self) -> PlatformName: return PlatformName.zenodo - def retry(self, _id: str) -> Dataset: + def retry(self, _id: str) -> Dataset | RecordError: """Retrieve information of the resource identified by id""" - record = requests.get(f"https://zenodo.org/api/records/{_id}").json() + response = requests.get(f"https://zenodo.org/api/records/{_id}") + if not response.ok: + msg = response.json()["error"]["message"] + return RecordError( + platform="zenodo", + _id=str(_id), + error=f"Error while fetching data from OpenML: '{msg}'.", + type="dataset", + ) + record = response.json() creators_list = [item["name"] for item in record["metadata"]["creators"]] creator = "; ".join(creators_list) # TODO change field to an array return Dataset( From 320f4748f2b45d8b9e4de78cd5501e77c75bdf7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 24 Jul 2023 09:24:18 +0000 Subject: [PATCH 25/52] remove commented-out dataset flag --- src/main.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/main.py b/src/main.py index f53ef3c2..70b6d5ab 100644 --- a/src/main.py +++ b/src/main.py @@ -32,14 +32,7 @@ def _parse_args() -> argparse.Namespace: choices=["no", "only-if-empty", "always"], help="Determines if the database is recreated.", ) - """ - 
parser.add_argument( - "--populate-datasets", - default=[], - nargs="+", - choices=[p.name for p in PlatformName], - help="Zero, one or more platforms with which the datasets should get populated.", - )""" + parser.add_argument( "--fill-with-examples", default=[], From b51cc9afe93d32f470df8d71dd351782f171e979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 24 Jul 2023 09:42:08 +0000 Subject: [PATCH 26/52] fix test --- src/tests/connectors/example/test_example_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/connectors/example/test_example_connector.py b/src/tests/connectors/example/test_example_connector.py index a2da0b87..98632929 100644 --- a/src/tests/connectors/example/test_example_connector.py +++ b/src/tests/connectors/example/test_example_connector.py @@ -1,5 +1,5 @@ import pytest - +from datetime import datetime from connectors import example_connectors @@ -19,7 +19,7 @@ ) def test_fetch_happy_path(datatype: str): connector = example_connectors[datatype] - resources = list(connector.fetch()) + resources = list(connector.fetch(from_incl=datetime.min, to_excl=datetime.max)) assert len(resources) >= 1 resource = resources[0] if hasattr(resource, "keywords"): # otherwise, only tested that connector can run From 2735d9819e32911ace95cafcf82b9c6da75b784f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 24 Jul 2023 09:43:15 +0000 Subject: [PATCH 27/52] fix setup fetch --- src/database/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/database/setup.py b/src/database/setup.py index 0d052090..906e6af3 100644 --- a/src/database/setup.py +++ b/src/database/setup.py @@ -5,6 +5,7 @@ from typing import List from sqlalchemy import text, and_ +from datetime import datetime from sqlalchemy.engine import Engine from sqlalchemy.exc import IntegrityError from sqlmodel import create_engine, Session, select, SQLModel @@ -64,7 +65,6 @@ def populate_database( engine: Engine, connectors: List[ResourceConnectorByDate], only_if_empty: bool = True, - limit: int | None = None, ): """Add some data to the Dataset and Publication tables.""" @@ -86,7 +86,7 @@ def populate_database( # This is a temporary solution. After finishing the Connectors (so that they're # synchronizing), we will probably just perform a HTTP POST instead. 
- for item in connector.fetch(): + for item in connector.fetch(from_incl=datetime.min, to_excl=datetime.max): if isinstance(item, ResourceWithRelations): resource_create_instance = item.resource _create_or_fetch_related_objects(session, item) From 424bdffb7459695064f5601402162587a1a319be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 24 Jul 2023 09:46:38 +0000 Subject: [PATCH 28/52] limit remove --- src/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.py b/src/main.py index 70b6d5ab..0be2d368 100644 --- a/src/main.py +++ b/src/main.py @@ -135,7 +135,6 @@ def create_app() -> FastAPI: engine, connectors=examples_connectors, only_if_empty=True, - limit=args.limit, ) add_routes(app, engine, url_prefix=args.url_prefix) From ae623c52185fc37b175d392c19f67bddb9f38845 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 24 Jul 2023 10:00:54 +0000 Subject: [PATCH 29/52] huggingface --- .../huggingface/huggingface_dataset_connector.py | 12 +++++------- .../test_huggingface_dataset_connector.py | 8 ++------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/connectors/huggingface/huggingface_dataset_connector.py b/src/connectors/huggingface/huggingface_dataset_connector.py index 5eb05036..541e6217 100644 --- a/src/connectors/huggingface/huggingface_dataset_connector.py +++ b/src/connectors/huggingface/huggingface_dataset_connector.py @@ -1,4 +1,3 @@ -""" import itertools import logging import typing @@ -8,7 +7,7 @@ import dateutil.parser import requests -from connectors import ResourceConnector +from connectors.abstract.resource_connector import ResourceConnector from connectors.resource_with_relations import ResourceWithRelations from database.model.dataset.data_download import DataDownload from database.model.dataset.dataset import Dataset @@ -28,11 +27,11 @@ def platform_name(self) -> PlatformName: @staticmethod def _get(url: str, dataset_id: str) -> typing.List[typing.Dict[str, typing.Any]]: -""" -""" + """ Perform a GET request and raise an exception if the response code is not OK. 
-""" -""" + resultaod + """ + response = requests.get(url, params={"dataset": dataset_id}) response_json = response.json() if not response.ok: @@ -133,4 +132,3 @@ def fetch_all( logging.error( f"Error while fetching huggingface dataset with id {dataset.id}: " f"{str(e)}" ) -""" diff --git a/src/tests/connectors/huggingface/test_huggingface_dataset_connector.py b/src/tests/connectors/huggingface/test_huggingface_dataset_connector.py index 5cb75261..56ca7603 100644 --- a/src/tests/connectors/huggingface/test_huggingface_dataset_connector.py +++ b/src/tests/connectors/huggingface/test_huggingface_dataset_connector.py @@ -1,12 +1,10 @@ -""" import json import responses +from connectors.huggingface.huggingface_dataset_connector import HuggingFaceDatasetConnector -import connectors from connectors.resource_with_relations import ResourceWithRelations -from database.model.platform.platform_names import PlatformName from tests.testutils.paths import path_test_resources HUGGINGFACE_URL = "https://datasets-server.huggingface.co" @@ -20,7 +18,7 @@ def test_fetch_all_happy_path(): "acronym_identification", "air_dialogue", } - connector = connectors.dataset_connectors[PlatformName.huggingface] + connector = HuggingFaceDatasetConnector() with responses.RequestsMock() as mocked_requests: path_data_list = path_test_resources() / "connectors" / "huggingface" / "data_list.json" with open(path_data_list, "r") as f: @@ -46,7 +44,6 @@ def test_fetch_all_happy_path(): assert all(len(r.related_resources["citations"]) == 1 for r in resources_with_relations) - def mock_parquet(mocked_requests: responses.RequestsMock, dataset_id: str): filename = f"parquet_{dataset_id.replace('/', '_')}.json" path_split = path_test_resources() / "connectors" / "huggingface" / filename @@ -59,4 +56,3 @@ def mock_parquet(mocked_requests: responses.RequestsMock, dataset_id: str): json=response, status=status, ) -""" From e21c8b22f8537a76ae7c3c5d89dd374f3f2a4efd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 24 Jul 2023 10:02:44 +0000 Subject: [PATCH 30/52] comment --- src/connectors/huggingface/huggingface_dataset_connector.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/connectors/huggingface/huggingface_dataset_connector.py b/src/connectors/huggingface/huggingface_dataset_connector.py index 541e6217..a805f545 100644 --- a/src/connectors/huggingface/huggingface_dataset_connector.py +++ b/src/connectors/huggingface/huggingface_dataset_connector.py @@ -17,6 +17,11 @@ class HuggingFaceDatasetConnector(ResourceConnector[Dataset]): + """ + This must be only runned on the startu due to there is no way to + retrieve data from huggingface filtering by time creation + """ + @property def resource_class(self) -> type[Dataset]: return Dataset From bc48b599f75b83cc28d5e5c796b14baf311da520 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Mon, 24 Jul 2023 12:27:58 +0000 Subject: [PATCH 31/52] add check_valid_id --- .../openml/openml_dataset_connector.py | 27 ++++++++++++++++--- .../openml/test_openml_dataset_connector.py | 13 +++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/src/connectors/openml/openml_dataset_connector.py b/src/connectors/openml/openml_dataset_connector.py index 86b5fa4f..2d789d8a 100644 --- a/src/connectors/openml/openml_dataset_connector.py +++ b/src/connectors/openml/openml_dataset_connector.py @@ -3,7 +3,7 @@ and how to convert the OpenML response to some agreed AIoD format. 
""" -from typing import Iterator +from typing import Iterator, List import dateutil.parser import requests @@ -92,13 +92,34 @@ def retry(self, _id: int) -> SQLModel: measured_values=[], ) + def check_valid_id(self, from_id: int, to_id: int) -> List[int]: + url = f"https://www.openml.org/api/v1/json/data/list/data_id/{','.join(map(str, range(from_id, to_id)))}" # noqa E501 + response = requests.get(url) + + if response.status_code == 200: + data = response.json() + existing_ids = [int(item["did"]) for item in data.get("data", {}).get("dataset", [])] + return existing_ids + elif response.status_code == 412: + return [-1] + else: + raise ValueError() + def fetch(self, from_id: int, to_id: int) -> Iterator[SQLModel | RecordError]: - for _id in range(from_id, to_id): + valid_ids = self.check_valid_id(from_id, to_id) + if valid_ids[0] == -1: + return RecordError( + _id=str(from_id), + platform="openml", + type="datset", + error="No more datasets to retrieve", + ) + for _id in valid_ids: try: dataset = self.retry(_id) yield dataset except Exception as e: - return RecordError(_id=str(_id), platform="openml", type="dataset", error=str(e)) + yield RecordError(_id=str(_id), platform="openml", type="dataset", error=str(e)) def _as_int(v: str) -> int: diff --git a/src/tests/connectors/openml/test_openml_dataset_connector.py b/src/tests/connectors/openml/test_openml_dataset_connector.py index ad22fa15..e9de9af1 100644 --- a/src/tests/connectors/openml/test_openml_dataset_connector.py +++ b/src/tests/connectors/openml/test_openml_dataset_connector.py @@ -52,8 +52,20 @@ def test_fetch_happy_path(): def test_fetch_happy_path(): connector = OpenMlDatasetConnector() with responses.RequestsMock() as mocked_requests: + with open( + path_test_resources() / "connectors" / "openml" / "data_list.json", + "r", + ) as f: + data_list = json.load(f) + mocked_requests.add( + responses.GET, + f"{OPENML_URL}/data/list/data_id/2,3,4", + json=data_list, + status=200, + ) for i in range(2, 5): mock_openml_responses(mocked_requests, str(i)) + datasets = list(connector.fetch(2, 5)) assert len(datasets) == 3 @@ -78,6 +90,7 @@ def mock_openml_responses(mocked_requests: responses.RequestsMock, platform_iden "r", ) as f: data_qualities_response = json.load(f) + mocked_requests.add( responses.GET, f"{OPENML_URL}/data/{platform_identifier}", From e396e3df0e41ac62cd656e2fec92da6a8c522609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Tue, 25 Jul 2023 14:03:20 +0000 Subject: [PATCH 32/52] main connector --- src/connectors/synchronization.py | 135 ++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 src/connectors/synchronization.py diff --git a/src/connectors/synchronization.py b/src/connectors/synchronization.py new file mode 100644 index 00000000..26867749 --- /dev/null +++ b/src/connectors/synchronization.py @@ -0,0 +1,135 @@ +from typing import Dict, List +import argparse +from sqlmodel import SQLModel +from config import DB_CONFIG +from connectors.record_error import RecordError +from connectors.resource_with_relations import ResourceWithRelations +from database.model.platform.platform_names import PlatformName +from datetime import datetime +from connectors.abstract.resource_connector import ResourceConnector +from connectors.abstract.resource_connector_by_date import ResourceConnectorByDate +from connectors.abstract.resource_connector_by_id import ResourceConnectorById +from connectors.huggingface.huggingface_dataset_connector import 
HuggingFaceDatasetConnector +from connectors.openml.openml_dataset_connector import OpenMlDatasetConnector +from connectors.zenodo.zenodo_dataset_connector import ZenodoDatasetConnector +from database.setup import ( + connect_to_database, +) +from sqlalchemy.engine import Engine + + +class Synchronization: + dataset_connectors = { + c.platform_name: c + for c in ( + OpenMlDatasetConnector(), + HuggingFaceDatasetConnector(), + ZenodoDatasetConnector(), + ) + } + + def _parse_args(self) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Please refer to the README.") + parser.add_argument( + "--populate-datasets", + default=[], + nargs="+", + choices=[p.name for p in PlatformName], + help="Zero, one or more platforms with which the datasets should get populated.", + ) + return parser.parse_args() + + def _connector_from_platform_name( + self, connector_type: str, connector_dict: Dict, platform_name: str + ): + """Get the connector from the connector_dict, identified by its platform name.""" + try: + platform = PlatformName(platform_name) + except ValueError: + raise ValueError( + f"platform " f"'{platform_name}' not recognized.", + ) + connector = connector_dict.get(platform, None) + if connector is None: + possibilities = ", ".join(f"`{c}`" for c in self.dataset_connectors.keys()) + msg = ( + f"No {connector_type} connector for platform '{platform_name}' available. Possible " + f"values: {possibilities}" + ) + raise ValueError(msg) + return connector + + def _engine(self, rebuild_db: str) -> Engine: + """ + Return a SqlAlchemy engine, backed by the MySql connection as + configured in the configuration file. + """ + username = DB_CONFIG.get("name", "root") + password = DB_CONFIG.get("password", "ok") + host = DB_CONFIG.get("host", "demodb") + port = DB_CONFIG.get("port", 3306) + database = DB_CONFIG.get("database", "aiod") + + db_url = f"mysql://{username}:{password}@{host}:{port}/{database}" + + delete_before_create = rebuild_db == "always" + return connect_to_database(db_url, delete_first=delete_before_create) + + def store_records( + self, engine: Engine, items: List["SQLModel" | "ResourceWithRelations[SQLModel]"] + ): + """ + This function store on the database all the items using the engine + """ + pass + + def start(self): + args = self._parse_args() + dataset_connectors: List["ResourceConnector"] = [ + self._connector_from_platform_name("dataset", self.dataset_connectors, platform_name) + for platform_name in args.populate_datasets + ] + # add all dict connectors + connectors_ = dataset_connectors + engine = self._engine(args.rebuild_db) + + # init the database with all connectors + for connector in connectors_: + if isinstance(connector, HuggingFaceDatasetConnector): + records = connector.fetch_all() + self.store_records(engine, records) + + elif isinstance(connector, ResourceConnectorByDate): + records = [] + items = connector.fetch(datetime.min, datetime.max) + for item in items: + if isinstance(item, RecordError): + # handle error + pass + else: + records.append(item) + self.store_records(engine, records) + + elif isinstance(connector, ResourceConnectorById): + # Retrieve all records + from_id = 0 + to_id = from_id + 10 + finished = False + records: List["SQLModel" | "ResourceWithRelations[SQLModel]"] = [] + while not finished: + items = connector.fetch(from_id, to_id) + if records[0].error == "No more datasets to retrieve": + finished = True + else: + from_id += 10 + to_id = from_id + 10 + for item in items: + if isinstance(item, RecordError): + # handle error 
+ pass + else: + records.append(item) + self.store_records(engine, records) + + else: + pass # "Unknown connector type! From b1b0c5aa5609fdfc1b0311bc96d9e94e8317a1d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Tue, 25 Jul 2023 14:09:11 +0000 Subject: [PATCH 33/52] comments --- src/connectors/synchronization.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/connectors/synchronization.py b/src/connectors/synchronization.py index 26867749..51f0bb82 100644 --- a/src/connectors/synchronization.py +++ b/src/connectors/synchronization.py @@ -95,6 +95,7 @@ def start(self): # init the database with all connectors for connector in connectors_: + # This is a unique type of connector due to Huggingface API if isinstance(connector, HuggingFaceDatasetConnector): records = connector.fetch_all() self.store_records(engine, records) From e4d218d7d521e882fc1d211c69c3c9938595bf23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Wed, 26 Jul 2023 12:19:50 +0000 Subject: [PATCH 34/52] config --- src/connectors/config.py | 8 ++++++++ src/connectors/config.toml | 24 ++++++++++++++++++++++++ src/connectors/synchronization.py | 9 +++++++-- 3 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 src/connectors/config.py create mode 100644 src/connectors/config.toml diff --git a/src/connectors/config.py b/src/connectors/config.py new file mode 100644 index 00000000..bf10ed8c --- /dev/null +++ b/src/connectors/config.py @@ -0,0 +1,8 @@ +import pathlib +import tomllib + +with open(pathlib.Path(__file__).parent / "config.toml", "rb") as fh: + CONFIG = tomllib.load(fh) + +DB_CONFIG = CONFIG.get("database", {}) +KEYCLOAK_CONFIG = CONFIG.get("keycloak", {}) diff --git a/src/connectors/config.toml b/src/connectors/config.toml new file mode 100644 index 00000000..9148490e --- /dev/null +++ b/src/connectors/config.toml @@ -0,0 +1,24 @@ +# Configures the REST API +# TODO: refactor configuration (https://github.com/aiondemand/AIOD-rest-api/issues/82) + +# Information on which database to connect to +[database] +host = "sqlserver" +port = 3306 +database = "aiod" +username = "root" +password = "ok" + +# Additional options for development +[dev] +reload = true + +# Authentication and authorization +[keycloak] +server_url = "https://test.openml.org/aiod-auth/" +realm = "dev" +client_id = "aiod-api" # a private client, used by the backend +client_id_swagger = "aiod-api-swagger" # a public client, used by the Swagger Frontend +openid_connect_url = "https://test.openml.org/aiod-auth/realms/dev/.well-known/openid-configuration" +scopes = "openid profile microprofile-jwt" +role = "edit_aiod_resources" \ No newline at end of file diff --git a/src/connectors/synchronization.py b/src/connectors/synchronization.py index 51f0bb82..0b8f3757 100644 --- a/src/connectors/synchronization.py +++ b/src/connectors/synchronization.py @@ -1,8 +1,10 @@ from typing import Dict, List import argparse from sqlmodel import SQLModel -from config import DB_CONFIG +import logging + from connectors.record_error import RecordError +from connectors.config import DB_CONFIG from connectors.resource_with_relations import ResourceWithRelations from database.model.platform.platform_names import PlatformName from datetime import datetime @@ -17,6 +19,8 @@ ) from sqlalchemy.engine import Engine +logging.basicConfig(filename="example.log", encoding="utf-8", level=logging.DEBUG) + class Synchronization: dataset_connectors = { @@ -76,11 +80,12 @@ def _engine(self, 
rebuild_db: str) -> Engine: return connect_to_database(db_url, delete_first=delete_before_create) def store_records( - self, engine: Engine, items: List["SQLModel" | "ResourceWithRelations[SQLModel]"] + self, engine: Engine, items: List["SQLModel | ResourceWithRelations[SQLModel]"] ): """ This function store on the database all the items using the engine """ + items pass def start(self): From cdbaa192bedd336a4f86ef2d5e12d301ac1a41d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Wed, 26 Jul 2023 12:23:40 +0000 Subject: [PATCH 35/52] delete useless line --- src/connectors/synchronization.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/connectors/synchronization.py b/src/connectors/synchronization.py index 0b8f3757..f846cbd8 100644 --- a/src/connectors/synchronization.py +++ b/src/connectors/synchronization.py @@ -85,7 +85,6 @@ def store_records( """ This function store on the database all the items using the engine """ - items pass def start(self): From ed42f016fd16933a55ff663d6c4621cedee33f29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Wed, 26 Jul 2023 13:40:39 +0000 Subject: [PATCH 36/52] create new abstract class --- src/connectors/__init__.py | 4 +-- .../resource_connector_on_start_up.py | 23 +++++++++++++++++ src/connectors/example/example_connector.py | 21 +++------------- .../example/example_dataset_connector.py | 25 ++++++------------- src/database/setup.py | 7 +++--- .../example/test_example_connector.py | 24 +----------------- 6 files changed, 40 insertions(+), 64 deletions(-) create mode 100644 src/connectors/abstract/resource_connector_on_start_up.py diff --git a/src/connectors/__init__.py b/src/connectors/__init__.py index 176b1176..494a4786 100644 --- a/src/connectors/__init__.py +++ b/src/connectors/__init__.py @@ -10,7 +10,7 @@ from database.model.project.project import Project from database.model.publication.publication import Publication from database.model.organisation.organisation import Organisation -from .abstract.resource_connector_by_date import ResourceConnectorByDate # noqa:F401 +from .abstract.resource_connector_on_start_up import ResourceConnectorOnStartUp # noqa:F401 from .example.example_connector import ExampleConnector from .example.example_dataset_connector import ExampleDatasetConnector @@ -46,4 +46,4 @@ "organisations": ExampleConnector( resource_class=Organisation, json_path=_path_example_resources / "organisations.json" ), -} # type: Dict[str, ResourceConnectorByDate] +} # type: Dict[str, ResourceConnectorOnStartUp] diff --git a/src/connectors/abstract/resource_connector_on_start_up.py b/src/connectors/abstract/resource_connector_on_start_up.py new file mode 100644 index 00000000..c843628d --- /dev/null +++ b/src/connectors/abstract/resource_connector_on_start_up.py @@ -0,0 +1,23 @@ +import abc +from typing import Generic, Iterator, TypeVar +from connectors.abstract.resource_connector import ResourceConnector +from connectors.record_error import RecordError + +from sqlmodel import SQLModel + +from connectors.resource_with_relations import ResourceWithRelations + +RESOURCE = TypeVar("RESOURCE", bound=SQLModel) + + +class ResourceConnectorOnStartUp(ResourceConnector, Generic[RESOURCE]): + """ + For every platform that offers this resource, this ResourceConnector should be implemented. 
+ """ + + @abc.abstractmethod + def fetch( + self, limit: int | None = None + ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]: + """Retrieve information of all resources""" + pass diff --git a/src/connectors/example/example_connector.py b/src/connectors/example/example_connector.py index 76510389..baa392bd 100644 --- a/src/connectors/example/example_connector.py +++ b/src/connectors/example/example_connector.py @@ -1,10 +1,9 @@ -from datetime import datetime import json import pathlib from typing import Iterator, TypeVar from sqlmodel import SQLModel -from connectors.abstract.resource_connector_by_date import ResourceConnectorByDate +from connectors.abstract.resource_connector_on_start_up import ResourceConnectorOnStartUp from database.model.resource import resource_create @@ -14,7 +13,7 @@ RESOURCE = TypeVar("RESOURCE", bound=SQLModel) -class ExampleConnector(ResourceConnectorByDate[RESOURCE]): +class ExampleConnector(ResourceConnectorOnStartUp[RESOURCE]): """ Creating hardcoded values example values based on json files """ @@ -31,21 +30,9 @@ def resource_class(self) -> type[RESOURCE]: def platform_name(self) -> PlatformName: return PlatformName.example - def retry(self, _id: str) -> RESOURCE: - """Retrieve information of the resource identified by id""" + def fetch(self, limit: int | None = None) -> Iterator[RESOURCE]: with open(self.json_path) as f: json_data = json.load(f) pydantic_class = resource_create(self.resource_class) - for json_item in json_data: - if json_item.get("platform_identifier") == _id: - return pydantic_class(**json_item) - raise ValueError("No resource associated with the id") - - def fetch( - self, from_incl: datetime | None = None, to_excl: datetime | None = None - ) -> Iterator[RESOURCE]: - with open(self.json_path) as f: - json_data = json.load(f) - pydantic_class = resource_create(self.resource_class) - for json_item in json_data: + for json_item in json_data[:limit]: yield pydantic_class(**json_item) diff --git a/src/connectors/example/example_dataset_connector.py b/src/connectors/example/example_dataset_connector.py index 9a433eab..84c0bbaf 100644 --- a/src/connectors/example/example_dataset_connector.py +++ b/src/connectors/example/example_dataset_connector.py @@ -2,7 +2,7 @@ import typing # noqa:F401 (flake8 raises incorrect 'Module imported but unused' error) -from connectors.abstract.resource_connector_by_date import ResourceConnectorByDate +from connectors.abstract.resource_connector_on_start_up import ResourceConnectorOnStartUp from connectors.resource_with_relations import ResourceWithRelations from database.model.dataset.dataset import Dataset from database.model.publication.publication import Publication @@ -10,7 +10,7 @@ from database.model.platform.platform_names import PlatformName -class ExampleDatasetConnector(ResourceConnectorByDate[Dataset]): +class ExampleDatasetConnector(ResourceConnectorOnStartUp[Dataset]): @property def resource_class(self) -> type[Dataset]: return Dataset @@ -19,7 +19,7 @@ def resource_class(self) -> type[Dataset]: def platform_name(self) -> PlatformName: return PlatformName.example - def retry(self, _id: str) -> ResourceWithRelations[Dataset]: + def get(self, _id: str) -> ResourceWithRelations[Dataset]: """Retrieve information of the resource identified by id""" pydantic_class = resource_create(Dataset) pydantic_class_publication = resource_create(Publication) @@ -78,19 +78,8 @@ def retry(self, _id: str) -> ResourceWithRelations[Dataset]: return dataset raise ValueError("No resource associated 
with the id") - def fetch( - self, from_incl: datetime | None = None, to_excl: datetime | None = None - ) -> typing.Iterator[ResourceWithRelations[Dataset]]: + def fetch(self, limit: int | None = None) -> typing.Iterator[ResourceWithRelations[Dataset]]: id_list = ["42769", "42742"] - for id_ in id_list: - dataset = self.retry(id_) - if from_incl is None: - from_incl = datetime.min - if to_excl is None: - to_excl = datetime.max - if ( - dataset.resource.date_published is not None - and dataset.resource.date_published >= from_incl - and dataset.resource.date_published < to_excl - ): - yield dataset + for id_ in id_list[:limit]: + dataset = self.get(id_) + yield dataset diff --git a/src/database/setup.py b/src/database/setup.py index 906e6af3..35123743 100644 --- a/src/database/setup.py +++ b/src/database/setup.py @@ -5,13 +5,12 @@ from typing import List from sqlalchemy import text, and_ -from datetime import datetime from sqlalchemy.engine import Engine from sqlalchemy.exc import IntegrityError from sqlmodel import create_engine, Session, select, SQLModel import routers -from connectors.abstract.resource_connector_by_date import ResourceConnectorByDate +from connectors.abstract.resource_connector_on_start_up import ResourceConnectorOnStartUp from connectors.resource_with_relations import ResourceWithRelations from database.model.dataset.dataset import Dataset from database.model.platform.platform import Platform @@ -63,7 +62,7 @@ def drop_or_create_database(url: str, delete_first: bool): def populate_database( engine: Engine, - connectors: List[ResourceConnectorByDate], + connectors: List[ResourceConnectorOnStartUp], only_if_empty: bool = True, ): """Add some data to the Dataset and Publication tables.""" @@ -86,7 +85,7 @@ def populate_database( # This is a temporary solution. After finishing the Connectors (so that they're # synchronizing), we will probably just perform a HTTP POST instead. 
- for item in connector.fetch(from_incl=datetime.min, to_excl=datetime.max): + for item in connector.fetch(): if isinstance(item, ResourceWithRelations): resource_create_instance = item.resource _create_or_fetch_related_objects(session, item) diff --git a/src/tests/connectors/example/test_example_connector.py b/src/tests/connectors/example/test_example_connector.py index 98632929..664da5cb 100644 --- a/src/tests/connectors/example/test_example_connector.py +++ b/src/tests/connectors/example/test_example_connector.py @@ -1,5 +1,4 @@ import pytest -from datetime import datetime from connectors import example_connectors @@ -19,29 +18,8 @@ ) def test_fetch_happy_path(datatype: str): connector = example_connectors[datatype] - resources = list(connector.fetch(from_incl=datetime.min, to_excl=datetime.max)) + resources = list(connector.fetch()) assert len(resources) >= 1 resource = resources[0] if hasattr(resource, "keywords"): # otherwise, only tested that connector can run assert set(resource.keywords) == {"keyword1", "keyword2"} - - -@pytest.mark.parametrize( - "datatype", - [ - "case_studies", - "computational_resources", - "educational_resources", - "events", - "presentations", - "projects", - "publications", - "news", - "organisations", - ], -) -def test_retry_happy_path(datatype: str): - connector = example_connectors[datatype] - resource = connector.retry("1") - if hasattr(resource, "keywords"): # otherwise, only tested that connector can run - assert set(resource.keywords) == {"keyword1", "keyword2"} From 60908396041c969228f12ac2264a7326432abcf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Wed, 26 Jul 2023 13:46:40 +0000 Subject: [PATCH 37/52] change id type to int --- src/connectors/abstract/resource_connector_by_date.py | 2 +- src/connectors/zenodo/zenodo_dataset_connector.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/connectors/abstract/resource_connector_by_date.py b/src/connectors/abstract/resource_connector_by_date.py index 4af86a9b..393f4b8e 100644 --- a/src/connectors/abstract/resource_connector_by_date.py +++ b/src/connectors/abstract/resource_connector_by_date.py @@ -17,7 +17,7 @@ class ResourceConnectorByDate(ResourceConnector, Generic[RESOURCE]): """ @abc.abstractmethod - def retry(self, _id: str) -> SQLModel | ResourceWithRelations[SQLModel] | RecordError: + def retry(self, _id: int) -> SQLModel | ResourceWithRelations[SQLModel] | RecordError: """Retrieve information of the resource identified by id""" pass diff --git a/src/connectors/zenodo/zenodo_dataset_connector.py b/src/connectors/zenodo/zenodo_dataset_connector.py index 1bfdae5f..2733de98 100644 --- a/src/connectors/zenodo/zenodo_dataset_connector.py +++ b/src/connectors/zenodo/zenodo_dataset_connector.py @@ -24,7 +24,7 @@ def resource_class(self) -> type[Dataset]: def platform_name(self) -> PlatformName: return PlatformName.zenodo - def retry(self, _id: str) -> Dataset | RecordError: + def retry(self, _id: int) -> Dataset | RecordError: """Retrieve information of the resource identified by id""" response = requests.get(f"https://zenodo.org/api/records/{_id}") From 18ecf50dd006af4a0fed4091f15c66acc6d0131a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Wed, 26 Jul 2023 13:48:12 +0000 Subject: [PATCH 38/52] delete useless comment --- src/connectors/huggingface/huggingface_dataset_connector.py | 4 ---- 1 file changed, 4 deletions(-) diff --git 
a/src/connectors/huggingface/huggingface_dataset_connector.py b/src/connectors/huggingface/huggingface_dataset_connector.py index a805f545..e3ee77fb 100644 --- a/src/connectors/huggingface/huggingface_dataset_connector.py +++ b/src/connectors/huggingface/huggingface_dataset_connector.py @@ -32,10 +32,6 @@ def platform_name(self) -> PlatformName: @staticmethod def _get(url: str, dataset_id: str) -> typing.List[typing.Dict[str, typing.Any]]: - """ - Perform a GET request and raise an exception if the response code is not OK. - resultaod - """ response = requests.get(url, params={"dataset": dataset_id}) response_json = response.json() From 970448d65355b7d38a35496aee6359eb32def384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Wed, 26 Jul 2023 13:51:04 +0000 Subject: [PATCH 39/52] connector hugg changed to on startup --- .../huggingface/huggingface_dataset_connector.py | 8 +++----- .../huggingface/test_huggingface_dataset_connector.py | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/connectors/huggingface/huggingface_dataset_connector.py b/src/connectors/huggingface/huggingface_dataset_connector.py index e3ee77fb..e0adb299 100644 --- a/src/connectors/huggingface/huggingface_dataset_connector.py +++ b/src/connectors/huggingface/huggingface_dataset_connector.py @@ -7,7 +7,7 @@ import dateutil.parser import requests -from connectors.abstract.resource_connector import ResourceConnector +from connectors.abstract.resource_connector_on_start_up import ResourceConnectorOnStartUp from connectors.resource_with_relations import ResourceWithRelations from database.model.dataset.data_download import DataDownload from database.model.dataset.dataset import Dataset @@ -16,7 +16,7 @@ from database.model.platform.platform_names import PlatformName -class HuggingFaceDatasetConnector(ResourceConnector[Dataset]): +class HuggingFaceDatasetConnector(ResourceConnectorOnStartUp[Dataset]): """ This must be only runned on the startu due to there is no way to retrieve data from huggingface filtering by time creation @@ -43,9 +43,7 @@ def _get(url: str, dataset_id: str) -> typing.List[typing.Dict[str, typing.Any]] return [] return response_json["parquet_files"] - def fetch_all( - self, limit: int | None = None - ) -> typing.Iterator[ResourceWithRelations[Dataset]]: + def fetch(self, limit: int | None = None) -> typing.Iterator[ResourceWithRelations[Dataset]]: pydantic_class = resource_create(Dataset) pydantic_class_publication = resource_create(Publication) for dataset in itertools.islice(datasets.list_datasets(with_details=True), limit): diff --git a/src/tests/connectors/huggingface/test_huggingface_dataset_connector.py b/src/tests/connectors/huggingface/test_huggingface_dataset_connector.py index 56ca7603..9efd19a5 100644 --- a/src/tests/connectors/huggingface/test_huggingface_dataset_connector.py +++ b/src/tests/connectors/huggingface/test_huggingface_dataset_connector.py @@ -10,7 +10,7 @@ HUGGINGFACE_URL = "https://datasets-server.huggingface.co" -def test_fetch_all_happy_path(): +def test_fetch_happy_path(): ids_expected = { "0n1xus/codexglue", "04-07-22/wep-probes", @@ -31,7 +31,7 @@ def test_fetch_all_happy_path(): ) for dataset_id in ids_expected: mock_parquet(mocked_requests, dataset_id) - resources_with_relations = list(connector.fetch_all(limit=None)) + resources_with_relations = list(connector.fetch()) assert len(resources_with_relations) == 5 assert all(type(r) == ResourceWithRelations for r in resources_with_relations) From 
b36a84b54882f1f4e4597f067e43710cb6589884 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Wed, 26 Jul 2023 13:54:12 +0000 Subject: [PATCH 40/52] typo --- src/connectors/huggingface/huggingface_dataset_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectors/huggingface/huggingface_dataset_connector.py b/src/connectors/huggingface/huggingface_dataset_connector.py index e0adb299..8a62e77b 100644 --- a/src/connectors/huggingface/huggingface_dataset_connector.py +++ b/src/connectors/huggingface/huggingface_dataset_connector.py @@ -18,7 +18,7 @@ class HuggingFaceDatasetConnector(ResourceConnectorOnStartUp[Dataset]): """ - This must be only runned on the startu due to there is no way to + This must be only runned on the startup due to there is no way to retrieve data from huggingface filtering by time creation """ From 91be909980c77320c38fe56e7de09ab351694bfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Wed, 26 Jul 2023 14:05:31 +0000 Subject: [PATCH 41/52] error handling huggingface --- .../huggingface/huggingface_dataset_connector.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/connectors/huggingface/huggingface_dataset_connector.py b/src/connectors/huggingface/huggingface_dataset_connector.py index 8a62e77b..d8eb3a37 100644 --- a/src/connectors/huggingface/huggingface_dataset_connector.py +++ b/src/connectors/huggingface/huggingface_dataset_connector.py @@ -8,6 +8,7 @@ import requests from connectors.abstract.resource_connector_on_start_up import ResourceConnectorOnStartUp +from connectors.record_error import RecordError from connectors.resource_with_relations import ResourceWithRelations from database.model.dataset.data_download import DataDownload from database.model.dataset.dataset import Dataset @@ -32,7 +33,6 @@ def platform_name(self) -> PlatformName: @staticmethod def _get(url: str, dataset_id: str) -> typing.List[typing.Dict[str, typing.Any]]: - response = requests.get(url, params={"dataset": dataset_id}) response_json = response.json() if not response.ok: @@ -43,7 +43,9 @@ def _get(url: str, dataset_id: str) -> typing.List[typing.Dict[str, typing.Any]] return [] return response_json["parquet_files"] - def fetch(self, limit: int | None = None) -> typing.Iterator[ResourceWithRelations[Dataset]]: + def fetch( + self, limit: int | None = None + ) -> typing.Iterator[ResourceWithRelations[Dataset] | RecordError]: pydantic_class = resource_create(Dataset) pydantic_class_publication = resource_create(Publication) for dataset in itertools.islice(datasets.list_datasets(with_details=True), limit): @@ -128,6 +130,6 @@ def fetch(self, limit: int | None = None) -> typing.Iterator[ResourceWithRelatio related_resources={"citations": citations}, ) except Exception as e: - logging.error( - f"Error while fetching huggingface dataset with id {dataset.id}: " f"{str(e)}" + yield RecordError( + platform="huggingface", _id=dataset.id, type="dataset", error=e.args[0] ) From 9549cc9adcf24c27f212a175f3b90edbbfd6462d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Wed, 26 Jul 2023 14:10:33 +0000 Subject: [PATCH 42/52] add header error --- src/connectors/openml/openml_dataset_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectors/openml/openml_dataset_connector.py b/src/connectors/openml/openml_dataset_connector.py index 2d789d8a..59227f0c 100644 --- 
a/src/connectors/openml/openml_dataset_connector.py +++ b/src/connectors/openml/openml_dataset_connector.py @@ -34,7 +34,7 @@ def resource_class(self) -> type[Dataset]: def platform_name(self) -> PlatformName: return PlatformName.openml - def retry(self, _id: int) -> SQLModel: + def retry(self, _id: int) -> SQLModel | RecordError: url_data = f"https://www.openml.org/api/v1/json/data/{_id}" response = requests.get(url_data) if not response.ok: From 27fe77c3150dce9cd3f50166c91f0a3dc00d11a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Wed, 26 Jul 2023 14:20:19 +0000 Subject: [PATCH 43/52] delete raise exception --- src/connectors/openml/openml_dataset_connector.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/connectors/openml/openml_dataset_connector.py b/src/connectors/openml/openml_dataset_connector.py index 59227f0c..13432ddc 100644 --- a/src/connectors/openml/openml_dataset_connector.py +++ b/src/connectors/openml/openml_dataset_connector.py @@ -7,7 +7,6 @@ import dateutil.parser import requests -from fastapi import HTTPException from sqlmodel import SQLModel @@ -57,9 +56,11 @@ def retry(self, _id: int) -> SQLModel | RecordError: response = requests.get(url_qual) if not response.ok: msg = response.json()["error"]["message"] - raise HTTPException( - status_code=response.status_code, - detail=f"Error while fetching data qualities from OpenML: '{msg}'.", + return RecordError( + platform="openml", + _id=str(_id), + error=f"Error while fetching data from OpenML: '{msg}'.", + type="dataset", ) qualities_json = { From 2cf514b1e89bbf2e06b18bd53881d448b060168c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Fri, 28 Jul 2023 07:38:47 +0000 Subject: [PATCH 44/52] typo --- src/connectors/zenodo/zenodo_dataset_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectors/zenodo/zenodo_dataset_connector.py b/src/connectors/zenodo/zenodo_dataset_connector.py index 2733de98..39251550 100644 --- a/src/connectors/zenodo/zenodo_dataset_connector.py +++ b/src/connectors/zenodo/zenodo_dataset_connector.py @@ -33,7 +33,7 @@ def retry(self, _id: int) -> Dataset | RecordError: return RecordError( platform="zenodo", _id=str(_id), - error=f"Error while fetching data from OpenML: '{msg}'.", + error=f"Error while fetching data from Zenodo: '{msg}'.", type="dataset", ) record = response.json() From 322efe88caa6a9a9a23a42e5a7a35ae70d09498f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Fri, 28 Jul 2023 07:48:46 +0000 Subject: [PATCH 45/52] reason why zenodo uses 2 protocols --- src/connectors/zenodo/zenodo_dataset_connector.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/connectors/zenodo/zenodo_dataset_connector.py b/src/connectors/zenodo/zenodo_dataset_connector.py index 39251550..78313626 100644 --- a/src/connectors/zenodo/zenodo_dataset_connector.py +++ b/src/connectors/zenodo/zenodo_dataset_connector.py @@ -24,6 +24,15 @@ def resource_class(self) -> type[Dataset]: def platform_name(self) -> PlatformName: return PlatformName.zenodo + """ + This function fetches only one record from Zenodo using the Rest API instead of + the OAI-PMH one. When querying using the OAI protocol, we always receive all the + records, making it really inefficient to filter through all of them until we get + the one we want. Apart from using different protocols, they also employ different + serialization methods. 
The Rest API uses JSON, while OAI uses XML, which is why the + code shows no similarities. + """ + def retry(self, _id: int) -> Dataset | RecordError: """Retrieve information of the resource identified by id""" From a4c7ee392b89424e6de114d6399d153fb0706922 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Ar=C3=A9jula=20A=C3=ADsa?= Date: Fri, 28 Jul 2023 07:53:49 +0000 Subject: [PATCH 46/52] ad until to the protocol --- src/connectors/zenodo/zenodo_dataset_connector.py | 8 +++----- src/tests/connectors/zenodo/test_get_datasets_zenodo.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/connectors/zenodo/zenodo_dataset_connector.py b/src/connectors/zenodo/zenodo_dataset_connector.py index 78313626..013f67d0 100644 --- a/src/connectors/zenodo/zenodo_dataset_connector.py +++ b/src/connectors/zenodo/zenodo_dataset_connector.py @@ -191,17 +191,15 @@ def _retrieve_dataset_from_datetime( **{ "metadataPrefix": "oai_datacite", "from": from_incl.isoformat(), + "until": to_excl.isoformat(), } ) record = next(records, None) - last_date = datetime.min - while record and last_date < to_excl: + + while record: if self._get_resource_type(record) == "Dataset": dataset = self._dataset_from_record(record) - if not isinstance(dataset, RecordError): - if dataset.date_published is not None: - last_date = dataset.date_published yield dataset record = next(records, None) diff --git a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py index 7ba9fca3..ca09732c 100644 --- a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py +++ b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py @@ -89,7 +89,7 @@ def mock_zenodo_responses(mocked_requests: responses.RequestsMock): records_list = f.read() mocked_requests.add( responses.GET, - "https://zenodo.org/oai2d?metadataPrefix=oai_datacite&from=2000-01-01T12%3A00%3A00&verb=ListRecords", # noqa E501 + "https://zenodo.org/oai2d?metadataPrefix=oai_datacite&from=2000-01-01T12%3A00%3A00&until=9999-12-31T23%3A59%3A59.999999&verb=ListRecords", # noqa E501 body=records_list, status=200, ) From 9e3e8c2e832f0a5ee171e90888e7f1cb5675af16 Mon Sep 17 00:00:00 2001 From: Jos van der Velde Date: Fri, 18 Aug 2023 21:16:39 +0200 Subject: [PATCH 47/52] Updated the synchronization and connectors to be more like described in https://github.com/aiondemand/AIOD-rest-api/issues/16 --- src/connectors/abstract/resource_connector.py | 20 +- .../abstract/resource_connector_by_date.py | 49 ++- .../abstract/resource_connector_by_id.py | 66 +++- .../resource_connector_on_start_up.py | 21 +- src/connectors/config.py | 8 - src/connectors/config.toml | 24 -- .../huggingface_dataset_connector.py | 4 +- .../openml/openml_dataset_connector.py | 93 +++--- src/connectors/record_error.py | 6 +- src/connectors/synchronization.py | 292 ++++++++++-------- src/connectors/zenodo/__init__.py | 0 .../zenodo/zenodo_dataset_connector.py | 142 ++++----- src/database/setup.py | 18 ++ src/main.py | 23 +- .../openml/test_openml_dataset_connector.py | 132 ++++---- .../zenodo/test_get_datasets_zenodo.py | 16 +- .../{data_list.json => list_offset_0.json} | 51 --- .../connectors/openml/list_offset_2.json | 57 ++++ 18 files changed, 542 insertions(+), 480 deletions(-) delete mode 100644 src/connectors/config.py delete mode 100644 src/connectors/config.toml create mode 100644 src/connectors/zenodo/__init__.py rename src/tests/resources/connectors/openml/{data_list.json => list_offset_0.json} (67%) create mode 100644 
src/tests/resources/connectors/openml/list_offset_2.json diff --git a/src/connectors/abstract/resource_connector.py b/src/connectors/abstract/resource_connector.py index 95142b18..67d66d37 100644 --- a/src/connectors/abstract/resource_connector.py +++ b/src/connectors/abstract/resource_connector.py @@ -1,11 +1,11 @@ import abc -from typing import Generic, TypeVar - +from typing import Generic, TypeVar, Iterator from sqlmodel import SQLModel - +from connectors.record_error import RecordError +from connectors.resource_with_relations import ResourceWithRelations from database.model.platform.platform_names import PlatformName @@ -14,16 +14,22 @@ class ResourceConnector(abc.ABC, Generic[RESOURCE]): """ - For every platform that offers this resource, this ResourceConnector should be implemented. + For every platform that offers a resource, a subclass of the ResourceConnector should be + implemented. """ @property @abc.abstractmethod def resource_class(self) -> type[RESOURCE]: - pass + """The resource class that this connector fetches. E.g. Dataset.""" @property @abc.abstractmethod def platform_name(self) -> PlatformName: - """The platform of this connector""" - pass + """The platform of this connector.""" + + @abc.abstractmethod + def run( + self, state: dict, **kwargs + ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]: + """Fetch resources and update the state""" diff --git a/src/connectors/abstract/resource_connector_by_date.py b/src/connectors/abstract/resource_connector_by_date.py index 393f4b8e..36e43866 100644 --- a/src/connectors/abstract/resource_connector_by_date.py +++ b/src/connectors/abstract/resource_connector_by_date.py @@ -1,6 +1,7 @@ import abc -from datetime import datetime -from typing import Generic, Iterator, TypeVar +import logging +from datetime import datetime, date +from typing import Generic, Iterator, TypeVar, Tuple from connectors.abstract.resource_connector import ResourceConnector from connectors.record_error import RecordError @@ -12,18 +13,50 @@ class ResourceConnectorByDate(ResourceConnector, Generic[RESOURCE]): - """ - For every platform that offers this resource, this ResourceConnector should be implemented. - """ + """Connectors that synchronize by filtering the results on datetime. In every subsequent run, + the previous end-datetime is used as datetime-from.""" @abc.abstractmethod def retry(self, _id: int) -> SQLModel | ResourceWithRelations[SQLModel] | RecordError: """Retrieve information of the resource identified by id""" - pass @abc.abstractmethod def fetch( self, from_incl: datetime, to_excl: datetime - ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]: + ) -> Iterator[Tuple[date | None, SQLModel | ResourceWithRelations[SQLModel] | RecordError]]: """Retrieve information of all resources""" - pass + + def run( + self, + state: dict, + from_date: date | None = None, + limit: int | None = None, + to_excl: datetime | None = None, + **kwargs, + ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]: + if limit is not None: + raise ValueError( + "Limit not implemented for this connector. Please remove the command " + "line argument." 
+ ) + if to_excl is not None: + logging.warning("to_excl should only be set in (unit) tests") + else: + to_excl = datetime.now() + + first_run = not state + if first_run: + if from_date is None: + raise ValueError("In the first run, the from-date needs to be set") + from_incl = datetime.combine(from_date, datetime.min.time()) + else: + from_incl = state["to_excl"] + + logging.info(f"Starting synchronisation {from_incl=}, {to_excl=}.") + state["from_incl"] = from_incl + state["to_excl"] = to_excl + for datetime_, result in self.fetch(from_incl=from_incl, to_excl=to_excl): + yield result + if datetime_: + state["last_datetime"] = datetime_ # For manually resolving errors + state["result"] = "complete run successful" diff --git a/src/connectors/abstract/resource_connector_by_id.py b/src/connectors/abstract/resource_connector_by_id.py index b6e838a2..8789f48b 100644 --- a/src/connectors/abstract/resource_connector_by_id.py +++ b/src/connectors/abstract/resource_connector_by_id.py @@ -1,29 +1,73 @@ import abc +import logging from typing import Generic, Iterator, TypeVar from sqlmodel import SQLModel -from connectors.abstract.resource_connector import ResourceConnector - -from connectors.resource_with_relations import ResourceWithRelations +from connectors.abstract.resource_connector import ResourceConnector from connectors.record_error import RecordError +from connectors.resource_with_relations import ResourceWithRelations RESOURCE = TypeVar("RESOURCE", bound=SQLModel) class ResourceConnectorById(ResourceConnector, Generic[RESOURCE]): - """ - For every platform that offers this resource, this ResourceConnector should be implemented. - """ + """Connectors that synchronize by filtering the results on identifier. In every subsequent run, + only identifiers higher than the highest identifier of the previous run are fetched.""" + + def __init__(self, limit_per_iteration: int = 500): + self.limit_per_iteration = limit_per_iteration @abc.abstractmethod - def retry(self, _id: int) -> SQLModel | ResourceWithRelations[SQLModel] | RecordError: + def retry(self, identifier: int) -> SQLModel | ResourceWithRelations[SQLModel] | RecordError: """Retrieve information of the resource identified by id""" - pass @abc.abstractmethod def fetch( - self, from_id: int, to_id: int + self, offset: int, from_identifier: int ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]: - """Retrieve information of all resources""" - pass + """Retrieve information of resources""" + + def run( + self, state: dict, from_identifier: int | None = None, limit: int | None = None, **kwargs + ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]: + if limit is not None: + logging.warning( + "Limiting the results! Please remove the limit command line argument " + "in production." + ) + + first_run = not state + if first_run and from_identifier is None: + raise ValueError("In the first run, the from-identifier needs to be set") + elif first_run: + state = {"offset": 0, "from_id": 0} + else: + state["from_id"] = state["last_id"] + 1 + state["offset"] = state["offset"] # TODO: what if datasets are deleted? Or updated? 
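As a concrete illustration of the identifier-based resume logic being introduced here (the method body continues below), the new OpenML tests later in this patch drive it roughly as in the sketch that follows. The state keys (offset, last_id) and the run(state=..., from_identifier=...) signature are taken from this diff; note that running this for real would issue HTTP requests to OpenML.

    # Sketch: resuming an identifier-based connector, mirroring the new OpenML tests.
    from connectors.openml.openml_dataset_connector import OpenMlDatasetConnector

    connector = OpenMlDatasetConnector(limit_per_iteration=2)

    state: dict = {}  # first run: empty state, so from_identifier is required
    first_batch = list(connector.run(state=state, from_identifier=0, limit=4))
    # run() records its position, e.g. state now holds "offset" and "last_id"

    # A later run resumes from the stored offset/last_id instead of from_identifier.
    second_batch = list(connector.run(state=state, from_identifier=0, limit=4))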
+ + logging.info( + f"Starting synchronisation of records from id {state['from_id']} and" + f" offset {state['offset']}" + ) + + finished = False + n_results = 0 + while not finished: + i = 0 + for item in self.fetch(offset=state["offset"], from_identifier=state["from_id"]): + i += 1 + if hasattr(item, "platform_identifier"): + id_ = int(item.platform_identifier) + else: + id_ = None + if id_ is None or id_ >= state["from_id"]: + if id_ is not None: + state["last_id"] = id_ + yield item + n_results += 1 + if n_results == limit: + return + finished = i < self.limit_per_iteration + state["offset"] += i + state["result"] = "complete run successful" diff --git a/src/connectors/abstract/resource_connector_on_start_up.py b/src/connectors/abstract/resource_connector_on_start_up.py index c843628d..f408a383 100644 --- a/src/connectors/abstract/resource_connector_on_start_up.py +++ b/src/connectors/abstract/resource_connector_on_start_up.py @@ -1,4 +1,6 @@ import abc +import datetime +import logging from typing import Generic, Iterator, TypeVar from connectors.abstract.resource_connector import ResourceConnector from connectors.record_error import RecordError @@ -11,13 +13,24 @@ class ResourceConnectorOnStartUp(ResourceConnector, Generic[RESOURCE]): - """ - For every platform that offers this resource, this ResourceConnector should be implemented. - """ + """A connector that only runs once, on startup, and performs no synchronization later.""" @abc.abstractmethod def fetch( self, limit: int | None = None ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]: """Retrieve information of all resources""" - pass + + def run( + self, state: dict, limit: int | None = None, **kwargs + ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]: + if state: + raise ValueError("This connector has already been run before.") + if limit is not None: + logging.warning( + "Limiting the results! Please remove the limit command line argument " + "in production." 
+ ) + state["result"] = f"started on {datetime.datetime.now()}" + yield from self.fetch(limit=limit) + state["result"] = "complete run successful" diff --git a/src/connectors/config.py b/src/connectors/config.py deleted file mode 100644 index bf10ed8c..00000000 --- a/src/connectors/config.py +++ /dev/null @@ -1,8 +0,0 @@ -import pathlib -import tomllib - -with open(pathlib.Path(__file__).parent / "config.toml", "rb") as fh: - CONFIG = tomllib.load(fh) - -DB_CONFIG = CONFIG.get("database", {}) -KEYCLOAK_CONFIG = CONFIG.get("keycloak", {}) diff --git a/src/connectors/config.toml b/src/connectors/config.toml deleted file mode 100644 index 9148490e..00000000 --- a/src/connectors/config.toml +++ /dev/null @@ -1,24 +0,0 @@ -# Configures the REST API -# TODO: refactor configuration (https://github.com/aiondemand/AIOD-rest-api/issues/82) - -# Information on which database to connect to -[database] -host = "sqlserver" -port = 3306 -database = "aiod" -username = "root" -password = "ok" - -# Additional options for development -[dev] -reload = true - -# Authentication and authorization -[keycloak] -server_url = "https://test.openml.org/aiod-auth/" -realm = "dev" -client_id = "aiod-api" # a private client, used by the backend -client_id_swagger = "aiod-api-swagger" # a public client, used by the Swagger Frontend -openid_connect_url = "https://test.openml.org/aiod-auth/realms/dev/.well-known/openid-configuration" -scopes = "openid profile microprofile-jwt" -role = "edit_aiod_resources" \ No newline at end of file diff --git a/src/connectors/huggingface/huggingface_dataset_connector.py b/src/connectors/huggingface/huggingface_dataset_connector.py index d8eb3a37..98270ab0 100644 --- a/src/connectors/huggingface/huggingface_dataset_connector.py +++ b/src/connectors/huggingface/huggingface_dataset_connector.py @@ -130,6 +130,4 @@ def fetch( related_resources={"citations": citations}, ) except Exception as e: - yield RecordError( - platform="huggingface", _id=dataset.id, type="dataset", error=e.args[0] - ) + yield RecordError(identifier=dataset.id, error=e) diff --git a/src/connectors/openml/openml_dataset_connector.py b/src/connectors/openml/openml_dataset_connector.py index 13432ddc..42e578ba 100644 --- a/src/connectors/openml/openml_dataset_connector.py +++ b/src/connectors/openml/openml_dataset_connector.py @@ -3,19 +3,18 @@ and how to convert the OpenML response to some agreed AIoD format. 
""" -from typing import Iterator, List +from typing import Iterator import dateutil.parser import requests from sqlmodel import SQLModel - from connectors.abstract.resource_connector_by_id import ResourceConnectorById +from connectors.record_error import RecordError from database.model.dataset.data_download import DataDownload from database.model.dataset.dataset import Dataset -from database.model.resource import resource_create from database.model.platform.platform_names import PlatformName -from connectors.record_error import RecordError +from database.model.resource import resource_create class OpenMlDatasetConnector(ResourceConnectorById[Dataset]): @@ -33,44 +32,36 @@ def resource_class(self) -> type[Dataset]: def platform_name(self) -> PlatformName: return PlatformName.openml - def retry(self, _id: int) -> SQLModel | RecordError: - url_data = f"https://www.openml.org/api/v1/json/data/{_id}" - response = requests.get(url_data) + def retry(self, identifier: int) -> SQLModel | RecordError: + url_qual = f"https://www.openml.org/api/v1/json/data/qualities/{identifier}" + response = requests.get(url_qual) if not response.ok: - code = response.status_code - if code == 412 and response.json()["error"]["message"] == "Unknown dataset": - code = 404 msg = response.json()["error"]["message"] - return RecordError( - platform="openml", - _id=str(_id), + identifier=str(identifier), error=f"Error while fetching data from OpenML: '{msg}'.", - type="dataset", ) - dataset_json = response.json()["data_set_description"] + qualities = response.json()["data_qualities"]["quality"] + return self.fetch_record(identifier, qualities) - # Here we can format the response into some standardized way, maybe this includes some - # dataset characteristics. These need to be retrieved separately from OpenML: - url_qual = f"https://www.openml.org/api/v1/json/data/qualities/{_id}" - response = requests.get(url_qual) + def fetch_record( + self, identifier: int, qualities: list[dict[str, str]] + ) -> SQLModel | RecordError: + url_data = f"https://www.openml.org/api/v1/json/data/{identifier}" + response = requests.get(url_data) if not response.ok: msg = response.json()["error"]["message"] return RecordError( - platform="openml", - _id=str(_id), + identifier=str(identifier), error=f"Error while fetching data from OpenML: '{msg}'.", - type="dataset", ) + dataset_json = response.json()["data_set_description"] - qualities_json = { - quality["name"]: quality["value"] - for quality in response.json()["data_qualities"]["quality"] - } + qualities_json = {quality["name"]: quality["value"] for quality in qualities} pydantic_class = resource_create(Dataset) return pydantic_class( platform=self.platform_name, - platform_identifier=_id, + platform_identifier=identifier, name=dataset_json["name"], same_as=url_data, description=dataset_json["description"], @@ -93,34 +84,34 @@ def retry(self, _id: int) -> SQLModel | RecordError: measured_values=[], ) - def check_valid_id(self, from_id: int, to_id: int) -> List[int]: - url = f"https://www.openml.org/api/v1/json/data/list/data_id/{','.join(map(str, range(from_id, to_id)))}" # noqa E501 - response = requests.get(url) + def fetch(self, offset: int, from_identifier: int) -> Iterator[SQLModel | RecordError]: + url_data = ( + "https://www.openml.org/api/v1/json/data/list/" + f"limit/{self.limit_per_iteration}/offset/{offset}" + ) + response = requests.get(url_data) + if not response.ok: + msg = response.json()["error"]["message"] + yield RecordError( + identifier=None, + error=f"Error while 
fetching {url_data} from OpenML: '{msg}'.", + ) + return - if response.status_code == 200: - data = response.json() - existing_ids = [int(item["did"]) for item in data.get("data", {}).get("dataset", [])] - return existing_ids - elif response.status_code == 412: - return [-1] - else: - raise ValueError() + try: + dataset_summaries = response.json()["data"]["dataset"] + except Exception as e: + yield RecordError(identifier=None, error=e) + return - def fetch(self, from_id: int, to_id: int) -> Iterator[SQLModel | RecordError]: - valid_ids = self.check_valid_id(from_id, to_id) - if valid_ids[0] == -1: - return RecordError( - _id=str(from_id), - platform="openml", - type="datset", - error="No more datasets to retrieve", - ) - for _id in valid_ids: + for summary in dataset_summaries: + identifier = None try: - dataset = self.retry(_id) - yield dataset + identifier = summary["did"] + qualities = summary["quality"] + yield self.fetch_record(identifier, qualities) except Exception as e: - yield RecordError(_id=str(_id), platform="openml", type="dataset", error=str(e)) + yield RecordError(identifier=identifier, error=e) def _as_int(v: str) -> int: diff --git a/src/connectors/record_error.py b/src/connectors/record_error.py index 76153f6d..6d0d654a 100644 --- a/src/connectors/record_error.py +++ b/src/connectors/record_error.py @@ -3,7 +3,5 @@ @dataclasses.dataclass class RecordError: - platform: str - _id: str - type: str - error: str + identifier: str | None + error: BaseException | str diff --git a/src/connectors/synchronization.py b/src/connectors/synchronization.py index f846cbd8..22b71211 100644 --- a/src/connectors/synchronization.py +++ b/src/connectors/synchronization.py @@ -1,140 +1,164 @@ -from typing import Dict, List import argparse -from sqlmodel import SQLModel +import importlib +import json import logging +import pathlib +import sys +from datetime import datetime +from typing import Optional + +from sqlmodel import SQLModel, Session +import routers +from connectors.abstract.resource_connector import ResourceConnector from connectors.record_error import RecordError -from connectors.config import DB_CONFIG from connectors.resource_with_relations import ResourceWithRelations -from database.model.platform.platform_names import PlatformName -from datetime import datetime -from connectors.abstract.resource_connector import ResourceConnector -from connectors.abstract.resource_connector_by_date import ResourceConnectorByDate -from connectors.abstract.resource_connector_by_id import ResourceConnectorById -from connectors.huggingface.huggingface_dataset_connector import HuggingFaceDatasetConnector -from connectors.openml.openml_dataset_connector import OpenMlDatasetConnector -from connectors.zenodo.zenodo_dataset_connector import ZenodoDatasetConnector -from database.setup import ( - connect_to_database, -) -from sqlalchemy.engine import Engine - -logging.basicConfig(filename="example.log", encoding="utf-8", level=logging.DEBUG) - - -class Synchronization: - dataset_connectors = { - c.platform_name: c - for c in ( - OpenMlDatasetConnector(), - HuggingFaceDatasetConnector(), - ZenodoDatasetConnector(), - ) - } - - def _parse_args(self) -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Please refer to the README.") - parser.add_argument( - "--populate-datasets", - default=[], - nargs="+", - choices=[p.name for p in PlatformName], - help="Zero, one or more platforms with which the datasets should get populated.", +from database.setup import _create_or_fetch_related_objects, 
_get_existing_resource, sqlmodel_engine +from routers import ResourceRouter + +RELATIVE_PATH_LOG = pathlib.Path("connector.log") +RELATIVE_PATH_STATE_JSON = pathlib.Path("state.json") +RELATIVE_PATH_ERROR_CSV = pathlib.Path("errors.csv") + + +def _parse_args() -> argparse.Namespace: + # TODO: write readme + parser = argparse.ArgumentParser(description="Please refer to the README.") + parser.add_argument( + "--connector", + required=True, + help="The connector to use. Please provide a relative path such as " + "'connectors.zenodo.zenodo_dataset_connector.ZenodoDatasetConnector' where the " + "last part is the class name.", + ) + parser.add_argument( + "--working-dir", + required=True, + help="The working directory. The status will be stored here, next to the logs and a " + "list of failed resources", + ) + parser.add_argument( + "--from-date", + type=lambda d: datetime.strptime(d, "%Y-%m-%d").date(), + help="The start date. Only relevant for the first run of date-based connectors. " + "In subsequent runs, date-based connectors will synchronize from the previous " + "end-time. Format: YYYY-MM-DD", + ) + parser.add_argument( + "--from-identifier", + type=str, + help="The start identifier. Only relevant for the first run of identifier-based " + "connectors. In subsequent runs, identifier-based connectors will " + "synchronize from the previous end-identifier.", + ) + parser.add_argument( + "--limit", + type=int, + help="Implemented by some connectors for testing purposes: limit the number of results.", + ) + parser.add_argument( + "--save_every", + type=int, + help="Save the state file every N records. In case that the complete program is killed, " + "you can then resume the next run from the last saved state.", + ) + return parser.parse_args() + + +def exception_handler(exc_type, exc_value, exc_traceback): + logging.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback)) + + +def save_to_database( + session: Session, + connector: ResourceConnector, + router: ResourceRouter, + item: SQLModel | ResourceWithRelations[SQLModel] | RecordError, +) -> Optional[RecordError]: + if isinstance(item, RecordError): + return item + try: + if isinstance(item, ResourceWithRelations): + resource_create_instance = item.resource + _create_or_fetch_related_objects(session, item) + else: + resource_create_instance = item + existing = _get_existing_resource( + session, resource_create_instance, connector.resource_class ) - return parser.parse_args() - - def _connector_from_platform_name( - self, connector_type: str, connector_dict: Dict, platform_name: str - ): - """Get the connector from the connector_dict, identified by its platform name.""" - try: - platform = PlatformName(platform_name) - except ValueError: - raise ValueError( - f"platform " f"'{platform_name}' not recognized.", - ) - connector = connector_dict.get(platform, None) - if connector is None: - possibilities = ", ".join(f"`{c}`" for c in self.dataset_connectors.keys()) - msg = ( - f"No {connector_type} connector for platform '{platform_name}' available. Possible " - f"values: {possibilities}" - ) - raise ValueError(msg) - return connector - - def _engine(self, rebuild_db: str) -> Engine: - """ - Return a SqlAlchemy engine, backed by the MySql connection as - configured in the configuration file. 
- """ - username = DB_CONFIG.get("name", "root") - password = DB_CONFIG.get("password", "ok") - host = DB_CONFIG.get("host", "demodb") - port = DB_CONFIG.get("port", 3306) - database = DB_CONFIG.get("database", "aiod") - - db_url = f"mysql://{username}:{password}@{host}:{port}/{database}" - - delete_before_create = rebuild_db == "always" - return connect_to_database(db_url, delete_first=delete_before_create) - - def store_records( - self, engine: Engine, items: List["SQLModel | ResourceWithRelations[SQLModel]"] - ): - """ - This function store on the database all the items using the engine - """ - pass - - def start(self): - args = self._parse_args() - dataset_connectors: List["ResourceConnector"] = [ - self._connector_from_platform_name("dataset", self.dataset_connectors, platform_name) - for platform_name in args.populate_datasets - ] - # add all dict connectors - connectors_ = dataset_connectors - engine = self._engine(args.rebuild_db) - - # init the database with all connectors - for connector in connectors_: - # This is a unique type of connector due to Huggingface API - if isinstance(connector, HuggingFaceDatasetConnector): - records = connector.fetch_all() - self.store_records(engine, records) - - elif isinstance(connector, ResourceConnectorByDate): - records = [] - items = connector.fetch(datetime.min, datetime.max) - for item in items: - if isinstance(item, RecordError): - # handle error - pass - else: - records.append(item) - self.store_records(engine, records) - - elif isinstance(connector, ResourceConnectorById): - # Retrieve all records - from_id = 0 - to_id = from_id + 10 - finished = False - records: List["SQLModel" | "ResourceWithRelations[SQLModel]"] = [] - while not finished: - items = connector.fetch(from_id, to_id) - if records[0].error == "No more datasets to retrieve": - finished = True - else: - from_id += 10 - to_id = from_id + 10 - for item in items: - if isinstance(item, RecordError): - # handle error - pass - else: - records.append(item) - self.store_records(engine, records) - - else: - pass # "Unknown connector type! 
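The rewritten script (its new code continues below) resolves the --connector command-line value into a connector object at run time. Stripped of logging and error handling, that lookup is essentially the sketch below; the dotted path is the example value from the argparse help text, and whether the resolved attribute is a class or a module-level instance follows the repository's own conventions.

    # Sketch of the dynamic connector lookup performed by the new synchronization main().
    import importlib

    connector_path = "connectors.zenodo.zenodo_dataset_connector.ZenodoDatasetConnector"
    module_path, _, cls_name = connector_path.rpartition(".")
    connector = getattr(importlib.import_module(module_path), cls_name)
    # main() then calls connector.run(state=..., ...) and stores or logs each yielded item.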
+ if existing is None: # TODO: if not None, update + router.create_resource(session, resource_create_instance) + + except Exception as e: + return RecordError(identifier=str(item.identifier), error=e) # type:ignore + session.flush() + return None + + +def main(): + args = _parse_args() + working_dir = pathlib.Path(args.working_dir) + working_dir.mkdir(parents=True, exist_ok=True) + logging.basicConfig( + filename=working_dir / RELATIVE_PATH_LOG, encoding="utf-8", level=logging.INFO + ) + logging.getLogger().addHandler(logging.StreamHandler()) + sys.excepthook = exception_handler + + module_path = ".".join(args.connector.split(".")[0:-1]) + connector_cls_name = args.connector.split(".")[-1] + module = importlib.import_module(module_path) + connector: ResourceConnector = getattr(module, connector_cls_name) + + error_path = working_dir / RELATIVE_PATH_ERROR_CSV + state_path = working_dir / RELATIVE_PATH_STATE_JSON + error_path.parents[0].mkdir(parents=True, exist_ok=True) + state_path.parents[0].mkdir(parents=True, exist_ok=True) + first_run = not state_path.exists() + if not first_run: + with open(state_path, "r") as f: + state = json.load(f) + else: + state = {} + + items = connector.run( + state=state, + from_identifier=args.from_identifier, + from_datetime=args.from_datetime, + limit=args.limit, + ) + + (router,) = [ + router + for router in routers.resource_routers + if router.resource_class == connector.resource_class + ] + + engine = sqlmodel_engine(rebuild_db="never") + + with Session(engine) as session: + for i, item in enumerate(items): + error = save_to_database(router=router, connector=connector, session=session, item=item) + if error: + if isinstance(error.error, str): + logging.error(f"Error on identifier {error.identifier}: {error.error}") + else: + logging.error(f"Error on identifier {error.identifier}", exc_info=error.error) + with open(error_path, "a") as f: + error_cleaned = "".join( + c if c.isalnum() or c == "" else "_" for c in str(error.error) + ) + f.write(f'"{error.identifier}","{error_cleaned}"\n') + if args.save_every and i > 0 and i % args.save_every == 0: + logging.debug(f"Saving state after handling record {i}") + with open(state_path, "w") as f: + json.dump(state, f, indent=4) + session.commit() + with open(state_path, "w") as f: + session.commit() + json.dump(state, f, indent=4) + logging.info("Done") + + +if __name__ == "__main__": + main() diff --git a/src/connectors/zenodo/__init__.py b/src/connectors/zenodo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/connectors/zenodo/zenodo_dataset_connector.py b/src/connectors/zenodo/zenodo_dataset_connector.py index 013f67d0..6a22eba2 100644 --- a/src/connectors/zenodo/zenodo_dataset_connector.py +++ b/src/connectors/zenodo/zenodo_dataset_connector.py @@ -1,12 +1,14 @@ -from datetime import datetime -import logging -from typing import Iterator +from datetime import datetime, date +from typing import Iterator, Tuple + import requests -from connectors.record_error import RecordError -from sickle import Sickle import xmltodict +from sickle import Sickle +from sqlmodel import SQLModel from connectors.abstract.resource_connector_by_date import ResourceConnectorByDate +from connectors.record_error import RecordError +from connectors.resource_with_relations import ResourceWithRelations from database.model.dataset.dataset import Dataset from database.model.general.keyword import Keyword from database.model.general.license import License @@ -24,27 +26,24 @@ def resource_class(self) -> type[Dataset]: 
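For reference, the single-record lookup that retry() performs against the Zenodo REST API (see the docstring moved into the method just below) reduces to the following sketch. The URL, the response.ok check and the joined creator names are taken from this file; the return shape and everything else is simplified for illustration.

    # Illustrative sketch of the REST lookup behind ZenodoDatasetConnector.retry.
    import requests

    def fetch_zenodo_record(record_id: int):
        """Return the parsed record plus a joined creator string, or Zenodo's error message."""
        response = requests.get(f"https://zenodo.org/api/records/{record_id}")
        if not response.ok:
            # The connector wraps this message in a RecordError instead of returning it.
            return response.json()["error"]["message"]
        record = response.json()
        creator = "; ".join(item["name"] for item in record["metadata"]["creators"])
        return record, creator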
def platform_name(self) -> PlatformName: return PlatformName.zenodo - """ - This function fetches only one record from Zenodo using the Rest API instead of - the OAI-PMH one. When querying using the OAI protocol, we always receive all the - records, making it really inefficient to filter through all of them until we get - the one we want. Apart from using different protocols, they also employ different - serialization methods. The Rest API uses JSON, while OAI uses XML, which is why the - code shows no similarities. - """ - def retry(self, _id: int) -> Dataset | RecordError: - """Retrieve information of the resource identified by id""" + """ + This function fetches only one record from Zenodo using the Rest API instead of + the OAI-PMH one. When querying using the OAI protocol, we always receive all the + records, making it really inefficient to filter through all of them until we get + the one we want. Apart from using different protocols, they also employ different + serialization methods. The Rest API uses JSON, while OAI uses XML, which is why the + code shows no similarities. + """ response = requests.get(f"https://zenodo.org/api/records/{_id}") if not response.ok: msg = response.json()["error"]["message"] return RecordError( - platform="zenodo", - _id=str(_id), + identifier=str(_id), error=f"Error while fetching data from Zenodo: '{msg}'.", - type="dataset", ) + record = response.json() creators_list = [item["name"] for item in record["metadata"]["creators"]] creator = "; ".join(creators_list) # TODO change field to an array @@ -61,41 +60,26 @@ def retry(self, _id: int) -> Dataset | RecordError: ) @staticmethod - def _get_record_dictionary(record): - xml_string = record.raw - xml_dict = xmltodict.parse(xml_string) - id_ = xml_dict["record"]["header"]["identifier"] - if id_.startswith("oai:"): - id_ = id_.replace("oai:", "") - resource = xml_dict["record"]["metadata"]["oai_datacite"]["payload"]["resource"] - return id_, resource - - def _bad_record_format(self, dataset_id, field): - logging.error( - f"Error while fetching record info for dataset {dataset_id}: bad format {field}" - ) + def _error_msg_bad_format(field) -> str: + return f"Error while fetching record info: bad format {field}" - def _dataset_from_record(self, record_raw) -> Dataset | RecordError: - _id, record = ZenodoDatasetConnector._get_record_dictionary(record_raw) + @staticmethod + def _dataset_from_record(identifier: str, record: dict) -> Dataset | RecordError: + error_fmt = ZenodoDatasetConnector._error_msg_bad_format if isinstance(record["creators"]["creator"], list): creators_list = [item["creatorName"] for item in record["creators"]["creator"]] creator = "; ".join(creators_list) # TODO change field to an array elif isinstance(record["creators"]["creator"]["creatorName"], str): creator = record["creators"]["creator"]["creatorName"] else: - self._bad_record_format(_id, "creator") - return RecordError( - _id=_id, platform="zenodo", type="dataset", error="error decoding creator" - ) + error_fmt("") + return RecordError(identifier=identifier, error=error_fmt("creator")) if isinstance(record["titles"]["title"], str): title = record["titles"]["title"] else: - self._bad_record_format(_id, "title") - return RecordError( - _id=_id, platform="zenodo", type="dataset", error="error decoding title" - ) - number_str = _id.rsplit("/", 1)[-1] + return RecordError(identifier=identifier, error=error_fmt("title")) + number_str = identifier.rsplit("/", 1)[-1] id_number = "".join(filter(str.isdigit, number_str)) same_as = 
f"https://zenodo.org/api/records/{id_number}" @@ -107,10 +91,7 @@ def _dataset_from_record(self, record_raw) -> Dataset | RecordError: elif description_raw["@descriptionType"] == "Abstract": description = description_raw["#text"] else: - self._bad_record_format(_id, "description") - return RecordError( - _id=_id, platform="zenodo", type="dataset", error="error decoding description" - ) + return RecordError(identifier=identifier, error=error_fmt("description")) date_published = None date_raw = record["dates"]["date"] @@ -120,28 +101,19 @@ def _dataset_from_record(self, record_raw) -> Dataset | RecordError: date_string = date_raw["#text"] date_published = datetime.strptime(date_string, DATE_FORMAT) else: - self._bad_record_format(_id, "date_published") - return RecordError( - _id=_id, platform="zenodo", type="dataset", error="error decoding date_published" - ) + return RecordError(identifier=identifier, error=error_fmt("date_published")) if isinstance(record["publisher"], str): publisher = record["publisher"] else: - self._bad_record_format(_id, "publisher") - return RecordError( - _id=_id, platform="zenodo", type="dataset", error="error decoding publisher" - ) + return RecordError(identifier=identifier, error=error_fmt("publisher")) if isinstance(record["rightsList"]["rights"], list): license_ = record["rightsList"]["rights"][0]["@rightsURI"] elif isinstance(record["rightsList"]["rights"]["@rightsURI"], str): license_ = record["rightsList"]["rights"]["@rightsURI"] else: - self._bad_record_format(_id, "license") - return RecordError( - _id=_id, platform="zenodo", type="dataset", error="error decoding license" - ) + return RecordError(identifier=identifier, error=error_fmt("license")) keywords = [] if "subjects" in record: @@ -150,14 +122,11 @@ def _dataset_from_record(self, record_raw) -> Dataset | RecordError: elif isinstance(record["subjects"]["subject"], list): keywords = [item for item in record["subjects"]["subject"] if isinstance(item, str)] else: - self._bad_record_format(_id, "keywords") - return RecordError( - _id=_id, platform="zenodo", type="dataset", error="error decoding keywords" - ) + return RecordError(identifier=identifier, error=error_fmt("keywords")) dataset = Dataset( platform="zenodo", - platform_identifier=_id, + platform_identifier=identifier, name=title[:150], same_as=same_as, creator=creator[ @@ -171,7 +140,9 @@ def _dataset_from_record(self, record_raw) -> Dataset | RecordError: ) return dataset - def _get_resource_type(self, record): + @staticmethod + def _resource_type(record) -> str | None: + """Cheap check before parsing the complete XML.""" xml_string = record.raw start = xml_string.find(' Iterator[Dataset | RecordError]: - records = sk.ListRecords( + def fetch( + self, from_incl: datetime, to_excl: datetime + ) -> Iterator[Tuple[date | None, SQLModel | ResourceWithRelations[SQLModel] | RecordError]]: + sickle = Sickle("https://zenodo.org/oai2d") + records = sickle.ListRecords( **{ "metadataPrefix": "oai_datacite", "from": from_incl.isoformat(), @@ -195,14 +164,23 @@ def _retrieve_dataset_from_datetime( } ) - record = next(records, None) - - while record: - if self._get_resource_type(record) == "Dataset": - dataset = self._dataset_from_record(record) - yield dataset - record = next(records, None) - - def fetch(self, from_incl: datetime, to_excl: datetime) -> Iterator[Dataset | RecordError]: - sickle = Sickle("https://zenodo.org/oai2d") - return self._retrieve_dataset_from_datetime(sickle, from_incl, to_excl=to_excl) + while record := next(records, None): + id_ 
= None + datetime_ = None + resource_type = ZenodoDatasetConnector._resource_type(record) + if resource_type is None: + yield datetime_, RecordError( + identifier=id_, error="Resource type could not be " "determined" + ) + if resource_type == "Dataset": + try: + xml_string = record.raw + xml_dict = xmltodict.parse(xml_string) + id_ = xml_dict["record"]["header"]["identifier"] + if id_.startswith("oai:"): + id_ = id_.replace("oai:", "") + datetime_ = xml_dict["record"]["header"]["datestamp"] + resource = xml_dict["record"]["metadata"]["oai_datacite"]["payload"]["resource"] + yield datetime_, self._dataset_from_record(id_, resource) + except Exception as e: + yield datetime_, RecordError(identifier=id_, error=e) diff --git a/src/database/setup.py b/src/database/setup.py index 35123743..f04a8040 100644 --- a/src/database/setup.py +++ b/src/database/setup.py @@ -10,6 +10,7 @@ from sqlmodel import create_engine, Session, select, SQLModel import routers +from config import DB_CONFIG from connectors.abstract.resource_connector_on_start_up import ResourceConnectorOnStartUp from connectors.resource_with_relations import ResourceWithRelations from database.model.dataset.dataset import Dataset @@ -161,3 +162,20 @@ def _create_or_fetch_related_objects(session: Session, item: ResourceWithRelatio item.resource.__setattr__(field_name, id_) # E.g. Dataset.license_identifier = 1 else: item.resource.__setattr__(field_name, identifiers) # E.g. Dataset.keywords = [1, 4] + + +def sqlmodel_engine(rebuild_db: str) -> Engine: + """ + Return a SQLModel engine, backed by the MySql connection as configured in the configuration + file. + """ + username = DB_CONFIG.get("name", "root") + password = DB_CONFIG.get("password", "ok") + host = DB_CONFIG.get("host", "demodb") + port = DB_CONFIG.get("port", 3306) + database = DB_CONFIG.get("database", "aiod") + + db_url = f"mysql://{username}:{password}@{host}:{port}/{database}" + + delete_before_create = rebuild_db == "always" + return connect_to_database(db_url, delete_first=delete_before_create) diff --git a/src/main.py b/src/main.py index 0be2d368..c3eb5244 100644 --- a/src/main.py +++ b/src/main.py @@ -18,8 +18,8 @@ import connectors import routers from authentication import get_current_user -from config import DB_CONFIG, KEYCLOAK_CONFIG -from database.setup import connect_to_database, populate_database +from config import KEYCLOAK_CONFIG +from database.setup import populate_database, create_engine def _parse_args() -> argparse.Namespace: @@ -49,23 +49,6 @@ def _parse_args() -> argparse.Namespace: return parser.parse_args() -def _engine(rebuild_db: str) -> Engine: - """ - Return a SqlAlchemy engine, backed by the MySql connection as configured in the configuration - file. 
- """ - username = DB_CONFIG.get("name", "root") - password = DB_CONFIG.get("password", "ok") - host = DB_CONFIG.get("host", "demodb") - port = DB_CONFIG.get("port", 3306) - database = DB_CONFIG.get("database", "aiod") - - db_url = f"mysql://{username}:{password}@{host}:{port}/{database}" - - delete_before_create = rebuild_db == "always" - return connect_to_database(db_url, delete_first=delete_before_create) - - def _connector_example_from_resource(resource): connector_dict = connectors.example_connectors connector = connector_dict.get(resource, None) @@ -129,7 +112,7 @@ def create_app() -> FastAPI: _connector_example_from_resource(resource) for resource in args.fill_with_examples ] - engine = _engine(args.rebuild_db) + engine = create_engine(args.rebuild_db) if len(examples_connectors) > 0: populate_database( engine, diff --git a/src/tests/connectors/openml/test_openml_dataset_connector.py b/src/tests/connectors/openml/test_openml_dataset_connector.py index e9de9af1..9a332626 100644 --- a/src/tests/connectors/openml/test_openml_dataset_connector.py +++ b/src/tests/connectors/openml/test_openml_dataset_connector.py @@ -1,78 +1,88 @@ import json -import responses +import responses from connectors.openml.openml_dataset_connector import OpenMlDatasetConnector from tests.testutils.paths import path_test_resources OPENML_URL = "https://www.openml.org/api/v1/json" -""" -def test_fetch_happy_path(): - connector = connectors.dataset_connectors[PlatformName.openml] - id_ = "2" - with responses.RequestsMock() as mocked_requests: - mock_openml_responses(mocked_requests, id_) - dataset = connector.fetch(id_) - with open(path_test_resources() / "connectors" / "openml" / "data_2.json", "r") as f: - expected = json.load(f)["data_set_description"] - - assert dataset.name == "anneal" - assert dataset.description == expected["description"] - assert not hasattr(dataset, "identifier") # will be set when saving to the db - assert dataset.platform == PlatformName.openml.value - assert dataset.platform_identifier == id_ - assert dataset.same_as == "https://www.openml.org/api/v1/json/data/2" - assert len(dataset.citations) == 0 - assert dataset.license == "Public" - assert dataset.version == "1" - assert dataset.is_accessible_for_free - assert dataset.size == 898 - - assert len(dataset.distributions) == 1 - (distribution,) = dataset.distributions - assert distribution.encoding_format == "ARFF" - assert distribution.content_url == "https://api.openml.org/data/v1/download/1666876/anneal.arff" - - assert len(dataset.keywords) == 9 - assert set(dataset.keywords) == { - "study_1", - "study_14", - "study_34", - "study_37", - "study_41", - "study_70", - "study_76", - "test", - "uci", - } - -""" - - -def test_fetch_happy_path(): - connector = OpenMlDatasetConnector() + + +def test_first_run(): + connector = OpenMlDatasetConnector(limit_per_iteration=2) with responses.RequestsMock() as mocked_requests: - with open( - path_test_resources() / "connectors" / "openml" / "data_list.json", - "r", - ) as f: - data_list = json.load(f) - mocked_requests.add( - responses.GET, - f"{OPENML_URL}/data/list/data_id/2,3,4", - json=data_list, - status=200, - ) + for offset in (0, 2): + mock_list_data(mocked_requests, offset) for i in range(2, 5): - mock_openml_responses(mocked_requests, str(i)) - - datasets = list(connector.fetch(2, 5)) + mock_get_data(mocked_requests, str(i)) + datasets = list(connector.run(state={}, from_identifier=0, limit=None)) + assert {d.name for d in datasets} == {"anneal", "labor", "kr-vs-kp"} assert 
len(datasets) == 3 assert {len(d.citations) for d in datasets} == {0} -def mock_openml_responses(mocked_requests: responses.RequestsMock, platform_identifier: str): +def test_second_run(): + connector = OpenMlDatasetConnector(limit_per_iteration=2) + with responses.RequestsMock() as mocked_requests: + mock_list_data(mocked_requests, offset=2) + mock_get_data(mocked_requests, "4") + datasets = list( + connector.run(state={"offset": 2, "last_id": 3}, from_identifier=0, limit=None) + ) + assert len(datasets) == 1 + assert {d.name for d in datasets} == {"labor"} + + +def test_second_run_wrong_identifier(): + connector = OpenMlDatasetConnector(limit_per_iteration=2) + with responses.RequestsMock() as mocked_requests: + mock_list_data(mocked_requests, offset=2) + mock_get_data(mocked_requests, "4") + datasets = list( + connector.run(state={"offset": 2, "last_id": 1}, from_identifier=0, limit=None) + ) + assert len(datasets) == 1 + assert {d.name for d in datasets} == {"labor"} + + +def mock_list_data(mocked_requests, offset): + """ + Mocking requests to the OpenML dependency, so that we test only our own services + """ + + with open( + path_test_resources() / "connectors" / "openml" / f"list_offset_{offset}.json", + "r", + ) as f: + data_response = json.load(f) + mocked_requests.add( + responses.GET, + f"{OPENML_URL}/data/list/limit/2/offset/{offset}", + json=data_response, + status=200, + ) + + +def mock_get_data(mocked_requests: responses.RequestsMock, platform_identifier: str): + """ + Mocking requests to the OpenML dependency, so that we test only our own services + """ + + with open( + path_test_resources() / "connectors" / "openml" / f"data_{platform_identifier}.json", + "r", + ) as f: + data_response = json.load(f) + mocked_requests.add( + responses.GET, + f"{OPENML_URL}/data/{platform_identifier}", + json=data_response, + status=200, + ) + + +def mock_get_qualities(mocked_requests: responses.RequestsMock, platform_identifier: str): """ Mocking requests to the OpenML dependency, so that we test only our own services """ diff --git a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py index ca09732c..1df362db 100644 --- a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py +++ b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py @@ -6,22 +6,14 @@ from tests.testutils.paths import path_test_resources -def read_file(path): - with open(path, "r") as file: - content = file.read() - return content - - def test_fetch_happy_path(): connector = ZenodoDatasetConnector() with responses.RequestsMock() as mocked_requests: mock_zenodo_responses(mocked_requests) - datasets = list( - connector.fetch( - from_incl=datetime.datetime(2000, 1, 1, 12, 0, 0), to_excl=datetime.datetime.max - ) - ) + from_incl = datetime.datetime(2000, 1, 1, 12, 0, 0) + to_excl = datetime.datetime(2000, 1, 2, 12, 0, 0) + datasets = list(connector.run(state={}, from_date=from_incl, to_excl=to_excl)) assert len(datasets) == 1 dataset = datasets[0] assert dataset.name == "THE FIELD'S MALL MASS SHOOTING: EMERGENCY MEDICAL SERVICES RESPONSE" @@ -89,7 +81,7 @@ def mock_zenodo_responses(mocked_requests: responses.RequestsMock): records_list = f.read() mocked_requests.add( responses.GET, - "https://zenodo.org/oai2d?metadataPrefix=oai_datacite&from=2000-01-01T12%3A00%3A00&until=9999-12-31T23%3A59%3A59.999999&verb=ListRecords", # noqa E501 + 
"https://zenodo.org/oai2d?metadataPrefix=oai_datacite&from=2000-01-01T00%3A00%3A00&until=2000-01-02T12%3A00%3A00&verb=ListRecords", # noqa E501 body=records_list, status=200, ) diff --git a/src/tests/resources/connectors/openml/data_list.json b/src/tests/resources/connectors/openml/list_offset_0.json similarity index 67% rename from src/tests/resources/connectors/openml/data_list.json rename to src/tests/resources/connectors/openml/list_offset_0.json index 25746b94..b6cc76ac 100644 --- a/src/tests/resources/connectors/openml/data_list.json +++ b/src/tests/resources/connectors/openml/list_offset_0.json @@ -102,57 +102,6 @@ "value": "37.0" } ] - }, - { - "did": 4, - "name": "labor", - "version": 1, - "status": "active", - "format": "ARFF", - "md5_checksum": "", - "file_id": 4, - "quality": [ - { - "name": "MajorityClassSize", - "value": "37.0" - }, - { - "name": "MaxNominalAttDistinctValues", - "value": "3.0" - }, - { - "name": "MinorityClassSize", - "value": "20.0" - }, - { - "name": "NumberOfClasses", - "value": "2.0" - }, - { - "name": "NumberOfFeatures", - "value": "17.0" - }, - { - "name": "NumberOfInstances", - "value": "57.0" - }, - { - "name": "NumberOfInstancesWithMissingValues", - "value": "56.0" - }, - { - "name": "NumberOfMissingValues", - "value": "326.0" - }, - { - "name": "NumberOfNumericFeatures", - "value": "8.0" - }, - { - "name": "NumberOfSymbolicFeatures", - "value": "9.0" - } - ] } ] } diff --git a/src/tests/resources/connectors/openml/list_offset_2.json b/src/tests/resources/connectors/openml/list_offset_2.json new file mode 100644 index 00000000..7037fd57 --- /dev/null +++ b/src/tests/resources/connectors/openml/list_offset_2.json @@ -0,0 +1,57 @@ +{ + "data": { + "dataset": [ + { + "did": 4, + "name": "labor", + "version": 1, + "status": "active", + "format": "ARFF", + "md5_checksum": "", + "file_id": 4, + "quality": [ + { + "name": "MajorityClassSize", + "value": "37.0" + }, + { + "name": "MaxNominalAttDistinctValues", + "value": "3.0" + }, + { + "name": "MinorityClassSize", + "value": "20.0" + }, + { + "name": "NumberOfClasses", + "value": "2.0" + }, + { + "name": "NumberOfFeatures", + "value": "17.0" + }, + { + "name": "NumberOfInstances", + "value": "57.0" + }, + { + "name": "NumberOfInstancesWithMissingValues", + "value": "56.0" + }, + { + "name": "NumberOfMissingValues", + "value": "326.0" + }, + { + "name": "NumberOfNumericFeatures", + "value": "8.0" + }, + { + "name": "NumberOfSymbolicFeatures", + "value": "9.0" + } + ] + } + ] + } +} \ No newline at end of file From 1905e813e2b0062a648a961564b6536c88a8cea9 Mon Sep 17 00:00:00 2001 From: Jos van der Velde Date: Mon, 21 Aug 2023 14:17:33 +0200 Subject: [PATCH 48/52] Moved example connectors to separate docker container --- .env | 2 + Dockerfile | 4 +- connectors/fill-examples.sh | 29 ++++++++++ docker-compose.yaml | 33 +++++++++-- scripts/database-connect.sql | 4 ++ scripts/run_apiserver.sh | 20 ------- scripts/run_apiserver_devcontainer.sh | 13 ----- scripts/run_database_test.sh | 8 --- scripts/run_mysql_server.sh | 13 ----- src/connectors/__init__.py | 29 ---------- .../resource_connector_on_start_up.py | 11 +++- src/connectors/example/example.py | 54 ++++++++++++++++++ src/connectors/synchronization.py | 27 +++++++-- src/database/model/field_length.py | 2 +- src/database/setup.py | 56 +------------------ src/main.py | 50 +++-------------- src/routers/__init__.py | 4 +- .../example/test_example_connector.py | 26 ++++++++- 18 files changed, 186 insertions(+), 199 deletions(-) create mode 100755 
connectors/fill-examples.sh create mode 100755 scripts/database-connect.sql delete mode 100755 scripts/run_apiserver.sh delete mode 100755 scripts/run_apiserver_devcontainer.sh delete mode 100755 scripts/run_database_test.sh delete mode 100755 scripts/run_mysql_server.sh create mode 100644 src/connectors/example/example.py diff --git a/.env b/.env index 683696f5..343c3c9a 100644 --- a/.env +++ b/.env @@ -1,3 +1,5 @@ +PYTHONPATH=/app + #MYSQL MYSQL_ROOT_PASSWORD=ok diff --git a/Dockerfile b/Dockerfile index a2319bf3..16c9c546 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,6 +19,4 @@ ENV PATH="${PATH}:/home/apprunner/.local/bin" RUN pip install . -COPY ./src /app - -ENTRYPOINT ["python", "main.py"] +COPY ./src /app \ No newline at end of file diff --git a/connectors/fill-examples.sh b/connectors/fill-examples.sh new file mode 100755 index 00000000..a0583608 --- /dev/null +++ b/connectors/fill-examples.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +python3 connectors/synchronization.py -f \ + -c connectors.example.example.ExampleDatasetConnector \ + -w /opt/connectors/data/example/dataset + +python3 connectors/synchronization.py -f \ + -c connectors.example.example.ExampleExperimentConnector \ + -w /opt/connectors/data/example/experiment + +python3 connectors/synchronization.py -f \ + -c connectors.example.example.ExampleMLModelConnector \ + -w /opt/connectors/data/example/ml_model + +python3 connectors/synchronization.py -f \ + -c connectors.example.example.ExampleOrganisationConnector \ + -w /opt/connectors/data/example/organisation + +python3 connectors/synchronization.py -f \ + -c connectors.example.example.ExamplePersonConnector \ + -w /opt/connectors/data/example/person + +python3 connectors/synchronization.py -f \ + -c connectors.example.example.ExamplePublicationConnector \ + -w /opt/connectors/data/example/publication + +python3 connectors/synchronization.py -f \ + -c connectors.example.example.ExampleServiceConnector \ + -w /opt/connectors/data/example/service diff --git a/docker-compose.yaml b/docker-compose.yaml index 1a787c46..2181ded0 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -14,19 +14,38 @@ services: volumes: - ./src:/app command: > + python main.py --rebuild-db always - --fill-with-examples datasets experiments ml_models organisations persons publications services --reload - --limit 10 healthcheck: test: ["CMD", "python", "-c", "import requests; requests.get('http://localhost:8000')"] - interval: 30s + start_interval: 1s + start_period: 30s + interval: 5s timeout: 30s - retries: 30 + retries: 5 depends_on: sqlserver: condition: service_healthy + fill-db-with-examples: + build: + context: ./ + dockerfile: Dockerfile + container_name: fill-db-with-examples + env_file: .env + environment: + - KEYCLOAK_CLIENT_SECRET=$KEYCLOAK_CLIENT_SECRET + volumes: + - ./src:/app + - ./data/connectors:/opt/connectors/data + - ./connectors:/opt/connectors/script + command: > + /bin/bash -c "/opt/connectors/script/fill-examples.sh" + depends_on: + app: + condition: service_healthy + sqlserver: image: mysql container_name: sqlserver @@ -37,7 +56,9 @@ services: - ./data/mysql:/var/lib/mysql healthcheck: test: ["CMD", "mysqladmin", "-uroot", "-p$MYSQL_ROOT_PASSWORD", "ping", "-h", "localhost", "--protocol","tcp"] - interval: 30s + start_interval: 1s + start_period: 10s + interval: 5s timeout: 30s retries: 30 @@ -53,7 +74,7 @@ services: volumes: - ./quay-keycloak:/opt/keycloak/data/import command: > - start-dev + start-dev --hostname-url http://${HOSTNAME}/aiod-auth --hostname-admin-url 
http://${HOSTNAME}/aiod-auth --http-relative-path=/aiod-auth diff --git a/scripts/database-connect.sql b/scripts/database-connect.sql new file mode 100755 index 00000000..fb664d60 --- /dev/null +++ b/scripts/database-connect.sql @@ -0,0 +1,4 @@ +#!/bin/bash + +# Can be run after docker compose up (the sqlserver must be running) +docker exec -it sqlserver mysql -uroot -pok \ No newline at end of file diff --git a/scripts/run_apiserver.sh b/scripts/run_apiserver.sh deleted file mode 100755 index 069049fa..00000000 --- a/scripts/run_apiserver.sh +++ /dev/null @@ -1,20 +0,0 @@ -#/bin/bash - -SCRIPT_PATH="$(cd "$(dirname "$0")" && pwd)" -APP_ROOT="$(dirname $SCRIPT_PATH)" -SRC_PATH="${APP_ROOT}/src" - -docker run \ - --network sql-network\ - --rm \ - -p 8000:8000 \ - --name apiserver \ - -v $SRC_PATH:/app \ - ai4eu_server_demo \ - --rebuild-db always \ - --fill-with-examples datasets computational_resources publications news events case_studies \ - presentations projects educational_resources organisations\ - --limit 10 \ - --url-prefix "" \ - --reload - diff --git a/scripts/run_apiserver_devcontainer.sh b/scripts/run_apiserver_devcontainer.sh deleted file mode 100755 index 16b0c2b6..00000000 --- a/scripts/run_apiserver_devcontainer.sh +++ /dev/null @@ -1,13 +0,0 @@ -#/bin/bash - -SCRIPT_PATH="$(cd "$(dirname "$0")" && pwd)" -APP_ROOT="$(dirname $SCRIPT_PATH)" -SRC_PATH="${APP_ROOT}/src" - -cd $SRC_PATH - -python main.py \ - --rebuild-db always \ - --fill-with-examples datasets computational_resources publications news events case_studies \ - presentations projects educational_resources organisations\ - --url-prefix "" --reload \ No newline at end of file diff --git a/scripts/run_database_test.sh b/scripts/run_database_test.sh deleted file mode 100755 index e4331239..00000000 --- a/scripts/run_database_test.sh +++ /dev/null @@ -1,8 +0,0 @@ -#/bin/bash - -docker run \ - -it --rm \ - --network sql-network \ - mysql mysql \ - -hsqlserver \ - -uroot -pok \ No newline at end of file diff --git a/scripts/run_mysql_server.sh b/scripts/run_mysql_server.sh deleted file mode 100755 index 84034d21..00000000 --- a/scripts/run_mysql_server.sh +++ /dev/null @@ -1,13 +0,0 @@ -#/bin/bash - -SCRIPT_PATH="$(cd "$(dirname "$0")" && pwd)" -APP_ROOT="$(dirname $SCRIPT_PATH)" -DATA_DIR="${APP_ROOT}/data/mysql" - -docker run \ - -e MYSQL_ROOT_PASSWORD=ok \ - --name sqlserver \ - --network sql-network \ - --rm \ - -v $DATA_DIR:/var/lib/mysql \ - mysql \ No newline at end of file diff --git a/src/connectors/__init__.py b/src/connectors/__init__.py index f2783c4a..e69de29b 100644 --- a/src/connectors/__init__.py +++ b/src/connectors/__init__.py @@ -1,29 +0,0 @@ -import pathlib -from typing import Dict # noqa:F401 - -from database.model.agent.organisation import Organisation -from database.model.agent.person import Person -from database.model.dataset.dataset import Dataset -from database.model.knowledge_asset.publication import Publication -from database.model.models_and_experiments.experiment import Experiment -from database.model.models_and_experiments.ml_model import MLModel -from database.model.service.service import Service -from .example.example_connector import ExampleConnector - -_path_example_resources = pathlib.Path(__file__).parent / "example" / "resources" - -example_connectors = { - name: ExampleConnector( - resource_class=cls, - json_path=_path_example_resources / f"{name}.json", - ) - for name, cls in ( - ("datasets", Dataset), - ("experiments", Experiment), - ("ml_models", MLModel), - ("organisations", 
Organisation), - ("persons", Person), - ("publications", Publication), - ("services", Service), - ) -} # type: Dict[str, ExampleConnector] diff --git a/src/connectors/abstract/resource_connector_on_start_up.py b/src/connectors/abstract/resource_connector_on_start_up.py index d427d2da..3177c539 100644 --- a/src/connectors/abstract/resource_connector_on_start_up.py +++ b/src/connectors/abstract/resource_connector_on_start_up.py @@ -18,10 +18,17 @@ def fetch( """Retrieve information of all resources""" def run( - self, state: dict, limit: int | None = None, **kwargs + self, state: dict, limit: int | None = None, force_rerun: bool = False, **kwargs ) -> Iterator[RESOURCE | ResourceWithRelations[RESOURCE] | RecordError]: if state: - raise ValueError("This connector has already been run before.") + if force_rerun: + logging.warning( + "Rerunning this connector, although the state shows that it has " + "already been run. Please remove the force_rerun command in " + "production." + ) + else: + raise ValueError("This connector has already been run before.") if limit is not None: logging.warning( "Limiting the results! Please remove the limit command line argument " diff --git a/src/connectors/example/example.py b/src/connectors/example/example.py new file mode 100644 index 00000000..6340c01b --- /dev/null +++ b/src/connectors/example/example.py @@ -0,0 +1,54 @@ +import pathlib + +from connectors.example.example_connector import ExampleConnector +from database.model.agent.organisation import Organisation +from database.model.agent.person import Person +from database.model.dataset.dataset import Dataset +from database.model.knowledge_asset.publication import Publication +from database.model.models_and_experiments.experiment import Experiment +from database.model.models_and_experiments.ml_model import MLModel +from database.model.service.service import Service + +_path_example_resources = pathlib.Path(__file__).parent.parent / "example" / "resources" + + +class ExampleDatasetConnector(ExampleConnector[Dataset]): + def __init__(self): + json_path = _path_example_resources / "datasets.json" + super().__init__(json_path, Dataset) + + +class ExampleExperimentConnector(ExampleConnector[Experiment]): + def __init__(self): + json_path = _path_example_resources / "experiments.json" + super().__init__(json_path, Experiment) + + +class ExampleMLModelConnector(ExampleConnector[MLModel]): + def __init__(self): + json_path = _path_example_resources / "ml_models.json" + super().__init__(json_path, MLModel) + + +class ExampleOrganisationConnector(ExampleConnector[Organisation]): + def __init__(self): + json_path = _path_example_resources / "organisations.json" + super().__init__(json_path, Organisation) + + +class ExamplePersonConnector(ExampleConnector[Person]): + def __init__(self): + json_path = _path_example_resources / "persons.json" + super().__init__(json_path, Person) + + +class ExamplePublicationConnector(ExampleConnector[Publication]): + def __init__(self): + json_path = _path_example_resources / "publications.json" + super().__init__(json_path, Publication) + + +class ExampleServiceConnector(ExampleConnector[Service]): + def __init__(self): + json_path = _path_example_resources / "services.json" + super().__init__(json_path, Service) diff --git a/src/connectors/synchronization.py b/src/connectors/synchronization.py index 503f45b1..9292b0e8 100644 --- a/src/connectors/synchronization.py +++ b/src/connectors/synchronization.py @@ -7,12 +7,13 @@ from datetime import datetime from typing import Optional -from 
sqlmodel import SQLModel, Session +from sqlmodel import Session import routers -from connectors.abstract.resource_connector import ResourceConnector +from connectors.abstract.resource_connector import ResourceConnector, RESOURCE from connectors.record_error import RecordError from connectors.resource_with_relations import ResourceWithRelations +from database.model.concept.concept import AIoDConcept from database.setup import _create_or_fetch_related_objects, _get_existing_resource, sqlmodel_engine from routers import ResourceRouter @@ -25,6 +26,7 @@ def _parse_args() -> argparse.Namespace: # TODO: write readme parser = argparse.ArgumentParser(description="Please refer to the README.") parser.add_argument( + "-c", "--connector", required=True, help="The connector to use. Please provide a relative path such as " @@ -32,11 +34,19 @@ def _parse_args() -> argparse.Namespace: "last part is the class name.", ) parser.add_argument( + "-w", "--working-dir", required=True, help="The working directory. The status will be stored here, next to the logs and a " "list of failed resources", ) + parser.add_argument( + "-f", + "--force-rerun", + action=argparse.BooleanOptionalAction, + help="Run this connector even if it has run before (only applicable for connectors that " + "only run on startup). This is only meant for development, not for production!", + ) parser.add_argument( "--from-date", type=lambda d: datetime.strptime(d, "%Y-%m-%d").date(), @@ -52,6 +62,7 @@ def _parse_args() -> argparse.Namespace: "synchronize from the previous end-identifier.", ) parser.add_argument( + "-l", "--limit", type=int, help="Implemented by some connectors for testing purposes: limit the number of results.", @@ -73,7 +84,7 @@ def save_to_database( session: Session, connector: ResourceConnector, router: ResourceRouter, - item: SQLModel | ResourceWithRelations[SQLModel] | RecordError, + item: RESOURCE | ResourceWithRelations[RESOURCE] | RecordError, ) -> Optional[RecordError]: if isinstance(item, RecordError): return item @@ -90,7 +101,14 @@ def save_to_database( router.create_resource(session, resource_create_instance) except Exception as e: - return RecordError(identifier=str(item.identifier), error=e) # type:ignore + id_ = None + if isinstance(item, AIoDConcept): + id_ = item.aiod_entry.platform_identifier + elif isinstance(item, ResourceWithRelations): + id_ = item.resource.aiod_entry.platform_identifier + elif isinstance(item, RecordError): + id_ = item.identifier + return RecordError(identifier=id_, error=e) # type:ignore session.flush() return None @@ -126,6 +144,7 @@ def main(): from_identifier=args.from_identifier, from_date=args.from_date, limit=args.limit, + force_rerun=args.force_rerun, ) (router,) = [ diff --git a/src/database/model/field_length.py b/src/database/model/field_length.py index 437f1a20..78372246 100644 --- a/src/database/model/field_length.py +++ b/src/database/model/field_length.py @@ -6,4 +6,4 @@ SHORT = 64 NORMAL = 256 -DESCRIPTION = 5400 # three A4s full of text should be enough? +DESCRIPTION = 3600 # two A4s full of text should be enough? diff --git a/src/database/setup.py b/src/database/setup.py index 08cc1af4..c6071ae4 100644 --- a/src/database/setup.py +++ b/src/database/setup.py @@ -1,22 +1,15 @@ """ Utility functions for initializing the database and tables through SQLAlchemy. 
""" -import logging -from typing import List from sqlalchemy import text from sqlalchemy.engine import Engine -from sqlalchemy.exc import IntegrityError -from sqlmodel import create_engine, Session, select, SQLModel +from sqlmodel import create_engine, Session, SQLModel import routers from config import DB_CONFIG -from connectors.abstract.resource_connector_on_start_up import ResourceConnectorOnStartUp from connectors.resource_with_relations import ResourceWithRelations from database.model.concept.concept import AIoDConcept -from database.model.dataset.dataset import Dataset -from database.model.knowledge_asset.publication import Publication -from database.model.platform.platform import Platform from database.model.platform.platform_names import PlatformName @@ -61,53 +54,6 @@ def drop_or_create_database(url: str, delete_first: bool): engine.dispose() -def populate_database( - engine: Engine, - connectors: List[ResourceConnectorOnStartUp], - only_if_empty: bool = True, -): - """Add some data to the Dataset and Publication tables.""" - - with Session(engine) as session: - session.add_all([Platform(name=name) for name in PlatformName]) - data_exists = ( - session.scalars(select(Publication)).first() or session.scalars(select(Dataset)).first() - ) - if only_if_empty and data_exists: - return - - for connector in connectors: - (router,) = [ - router - for router in routers.resource_routers - if router.resource_class == connector.resource_class - ] - # We use the create_resource function for this router. - # This is a temporary solution. After finishing the Connectors (so that they're - # synchronizing), we will probably just perform a HTTP POST instead. - - for item in connector.fetch(): - if isinstance(item, ResourceWithRelations): - resource_create_instance = item.resource - _create_or_fetch_related_objects(session, item) - else: - resource_create_instance = item - if ( - _get_existing_resource( - session, resource_create_instance, connector.resource_class - ) - is None - ): - try: - router.create_resource(session, resource_create_instance) - except IntegrityError as e: - logging.warning( - f"Error while creating resource. Continuing for now: " f" {e}" - ) - session.flush() - session.commit() - - def _get_existing_resource( session: Session, resource: AIoDConcept, clazz: type[SQLModel] ) -> AIoDConcept | None: diff --git a/src/main.py b/src/main.py index c3eb5244..99cc0e52 100644 --- a/src/main.py +++ b/src/main.py @@ -5,21 +5,20 @@ (https://fastapi.tiangolo.com/tutorial/path-params/#order-matters). 
""" import argparse -import logging - import uvicorn -from fastapi import Depends, FastAPI, HTTPException +from fastapi import Depends, FastAPI from fastapi.responses import HTMLResponse from pydantic import Json from sqlalchemy.engine import Engine -from starlette.status import HTTP_501_NOT_IMPLEMENTED +from sqlmodel import Session -import connectors import routers from authentication import get_current_user from config import KEYCLOAK_CONFIG -from database.setup import populate_database, create_engine +from database.model.platform.platform import Platform +from database.model.platform.platform_names import PlatformName +from database.setup import sqlmodel_engine def _parse_args() -> argparse.Namespace: @@ -32,15 +31,6 @@ def _parse_args() -> argparse.Namespace: choices=["no", "only-if-empty", "always"], help="Determines if the database is recreated.", ) - - parser.add_argument( - "--fill-with-examples", - default=[], - nargs="+", - choices=connectors.example_connectors.keys(), - help="Zero, one or more resources with which the database will have examples.", - ) - parser.add_argument( "--reload", action="store_true", @@ -49,20 +39,6 @@ def _parse_args() -> argparse.Namespace: return parser.parse_args() -def _connector_example_from_resource(resource): - connector_dict = connectors.example_connectors - connector = connector_dict.get(resource, None) - if connector is None: - possibilities = ", ".join(f"`{c}`" for c in connectors.example_connectors.keys()) - msg = ( - f"No example connector for resource '{resource}' available. Possible " - f"values: {possibilities}" - ) - logging.warning(msg) - raise HTTPException(status_code=HTTP_501_NOT_IMPLEMENTED, detail=msg) - return connector - - def add_routes(app: FastAPI, engine: Engine, url_prefix=""): """Add routes to the FastAPI application""" @@ -107,18 +83,10 @@ def create_app() -> FastAPI: "scopes": KEYCLOAK_CONFIG.get("scopes"), }, ) - - examples_connectors = [ - _connector_example_from_resource(resource) for resource in args.fill_with_examples - ] - - engine = create_engine(args.rebuild_db) - if len(examples_connectors) > 0: - populate_database( - engine, - connectors=examples_connectors, - only_if_empty=True, - ) + engine = sqlmodel_engine(args.rebuild_db) + with Session(engine) as session: + session.add_all([Platform(name=name) for name in PlatformName]) + session.commit() add_routes(app, engine, url_prefix=args.url_prefix) return app diff --git a/src/routers/__init__.py b/src/routers/__init__.py index 647f251d..f71fdd05 100644 --- a/src/routers/__init__.py +++ b/src/routers/__init__.py @@ -1,5 +1,3 @@ -import typing # noqa:F401 - from .dataset_router import DatasetRouter from .experiment_router import ExperimentRouter from .ml_model_router import MLModelRouter @@ -27,6 +25,6 @@ # ProjectRouter(), # PresentationRouter(), ServiceRouter(), -] # type: typing.List[ResourceRouter] +] # type: list[ResourceRouter] other_routers = [UploadRouterHuggingface()] diff --git a/src/tests/connectors/example/test_example_connector.py b/src/tests/connectors/example/test_example_connector.py index 8e096de3..64057421 100644 --- a/src/tests/connectors/example/test_example_connector.py +++ b/src/tests/connectors/example/test_example_connector.py @@ -1,5 +1,29 @@ +from typing import TYPE_CHECKING + import pytest -from connectors import example_connectors + +from connectors.example.example import ( + ExampleDatasetConnector, + ExamplePublicationConnector, + ExampleServiceConnector, + ExamplePersonConnector, + ExampleOrganisationConnector, + 
ExampleMLModelConnector, + ExampleExperimentConnector, +) + +if TYPE_CHECKING: + from connectors.example.example_connector import ExampleConnector # noqa:F401 + +example_connectors = { + "datasets": ExampleDatasetConnector(), + "experiments": ExampleExperimentConnector(), + "ml_models": ExampleMLModelConnector(), + "organisations": ExampleOrganisationConnector(), + "persons": ExamplePersonConnector(), + "publications": ExamplePublicationConnector(), + "services": ExampleServiceConnector(), +} # type: dict[str, ExampleConnector] @pytest.mark.parametrize( From 45c505400f02fa06db8214be452bd02b3aa51272 Mon Sep 17 00:00:00 2001 From: Jos van der Velde Date: Tue, 22 Aug 2023 12:21:04 +0200 Subject: [PATCH 49/52] Openml connector running as cron job inside a separate container --- Dockerfile | 23 ++++---- connectors/fill-examples.sh | 16 ++---- connectors/openml/Dockerfile | 12 ++++ connectors/openml/cron | 1 + connectors/openml/datasets.sh | 20 +++++++ connectors/openml/entry.sh | 7 +++ docker-compose.yaml | 26 +++++++-- .../abstract/resource_connector_by_id.py | 9 ++- .../resource_connector_on_start_up.py | 12 +--- .../openml/openml_dataset_connector.py | 26 +++++++-- src/connectors/record_error.py | 1 + src/connectors/synchronization.py | 56 +++++++++++-------- src/database/model/field_length.py | 2 +- src/main.py | 10 ++-- 14 files changed, 151 insertions(+), 70 deletions(-) create mode 100644 connectors/openml/Dockerfile create mode 100644 connectors/openml/cron create mode 100755 connectors/openml/datasets.sh create mode 100755 connectors/openml/entry.sh diff --git a/Dockerfile b/Dockerfile index 16c9c546..d91d001b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,22 +1,25 @@ FROM python:3.11-slim-bullseye # default-mysql-client is not necessary, but can be useful when debugging connection issues. -RUN apt-get update && apt-get -y install python3-dev default-libmysqlclient-dev build-essential default-mysql-client pkg-config +RUN apt-get update && apt-get -y install python3-dev default-libmysqlclient-dev build-essential \ + default-mysql-client pkg-config WORKDIR /app -COPY ./pyproject.toml /app/pyproject.toml - -# Create a non-root user for security -RUN groupadd -r apprunner && \ - useradd -mg apprunner apprunner \ - && chown -R apprunner:apprunner /app -USER apprunner:apprunner - # Add ~/.local/bin to the PATH. Not necessary, but can be useful for debugging and bypasses pip # warnings. ENV PATH="${PATH}:/home/apprunner/.local/bin" + +# Install python packages globally, so that it can also be used from cron dockers (running as root) +COPY ./pyproject.toml /app/pyproject.toml RUN pip install . 
-COPY ./src /app \ No newline at end of file +# This can be overwritten by a live volume, to support live code changes +COPY ./src /app + +# Create a non-root user for security +RUN groupadd -r apprunner && \ + useradd -mg apprunner apprunner \ + && chown -R apprunner:apprunner /app +USER apprunner:apprunner \ No newline at end of file diff --git a/connectors/fill-examples.sh b/connectors/fill-examples.sh index a0583608..488eb9ba 100755 --- a/connectors/fill-examples.sh +++ b/connectors/fill-examples.sh @@ -1,29 +1,25 @@ #!/bin/bash -python3 connectors/synchronization.py -f \ - -c connectors.example.example.ExampleDatasetConnector \ - -w /opt/connectors/data/example/dataset - -python3 connectors/synchronization.py -f \ +python3 connectors/synchronization.py \ -c connectors.example.example.ExampleExperimentConnector \ -w /opt/connectors/data/example/experiment -python3 connectors/synchronization.py -f \ +python3 connectors/synchronization.py \ -c connectors.example.example.ExampleMLModelConnector \ -w /opt/connectors/data/example/ml_model -python3 connectors/synchronization.py -f \ +python3 connectors/synchronization.py \ -c connectors.example.example.ExampleOrganisationConnector \ -w /opt/connectors/data/example/organisation -python3 connectors/synchronization.py -f \ +python3 connectors/synchronization.py \ -c connectors.example.example.ExamplePersonConnector \ -w /opt/connectors/data/example/person -python3 connectors/synchronization.py -f \ +python3 connectors/synchronization.py \ -c connectors.example.example.ExamplePublicationConnector \ -w /opt/connectors/data/example/publication -python3 connectors/synchronization.py -f \ +python3 connectors/synchronization.py \ -c connectors.example.example.ExampleServiceConnector \ -w /opt/connectors/data/example/service diff --git a/connectors/openml/Dockerfile b/connectors/openml/Dockerfile new file mode 100644 index 00000000..6cba9257 --- /dev/null +++ b/connectors/openml/Dockerfile @@ -0,0 +1,12 @@ +FROM ai4eu_server + +COPY cron /etc/cron.d/aiod +COPY datasets.sh /opt/connectors/script/datasets.sh +COPY entry.sh /opt/connectors/script/entry.sh + +USER root +RUN apt -y install cron +RUN chmod +x /etc/cron.d/aiod /opt/connectors/script/datasets.sh +RUN crontab /etc/cron.d/aiod + +WORKDIR /app \ No newline at end of file diff --git a/connectors/openml/cron b/connectors/openml/cron new file mode 100644 index 00000000..b3961434 --- /dev/null +++ b/connectors/openml/cron @@ -0,0 +1 @@ +25 * * * * bash /opt/connectors/script/datasets.sh >> /opt/connectors/data/openml/dataset/cron.log 2>&1 diff --git a/connectors/openml/datasets.sh b/connectors/openml/datasets.sh new file mode 100755 index 00000000..fffc9297 --- /dev/null +++ b/connectors/openml/datasets.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +WORK_DIR=/opt/connectors/data/openml/dataset +CONNECTOR=connectors.openml.openml_dataset_connector.OpenMlDatasetConnector + +another_instance() +{ + echo $(date -u) "This script is already running in a different thread." + exit 1 +} +exec 9< "$0" +flock -n -x 9 || another_instance + +echo $(date -u) "Starting synchronization..." +PYTHONPATH=/app /usr/local/bin/python3 /app/connectors/synchronization.py \ + -c $CONNECTOR \ + -w $WORK_DIR \ + --from-identifier 4500 \ + --save-every 100 > ${WORK_DIR}/connector.log 2>&1 +echo $(date -u) "Synchronization Done." 
diff --git a/connectors/openml/entry.sh b/connectors/openml/entry.sh new file mode 100755 index 00000000..215f04d4 --- /dev/null +++ b/connectors/openml/entry.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# If this directory does not exist, the cron job cannot log (and cannot run) +MKDIR -p /opt/connectors/data/openml/dataset + +# Run cron on the foreground with log level WARN +/usr/sbin/cron -f -l 4 \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index 2181ded0..3a8fc3bd 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -5,6 +5,7 @@ services: build: context: ./ dockerfile: Dockerfile + image: ai4eu_server container_name: apiserver env_file: .env environment: @@ -15,7 +16,7 @@ services: - ./src:/app command: > python main.py - --rebuild-db always + --rebuild-db only-if-empty --reload healthcheck: test: ["CMD", "python", "-c", "import requests; requests.get('http://localhost:8000')"] @@ -29,9 +30,7 @@ services: condition: service_healthy fill-db-with-examples: - build: - context: ./ - dockerfile: Dockerfile + image: ai4eu_server container_name: fill-db-with-examples env_file: .env environment: @@ -46,6 +45,25 @@ services: app: condition: service_healthy + openml-dataset-connector: + build: + context: ./ + dockerfile: connectors/openml/Dockerfile + image: ai4eu_openml_connector + container_name: openml-dataset-connector + env_file: .env + environment: + - KEYCLOAK_CLIENT_SECRET=$KEYCLOAK_CLIENT_SECRET + volumes: + - ./src:/app + - ./data/connectors:/opt/connectors/data + - ./connectors/openml/:/opt/connectors/script + command: > + /bin/bash -c "/opt/connectors/script/entry.sh" + depends_on: + app: + condition: service_healthy + sqlserver: image: mysql container_name: sqlserver diff --git a/src/connectors/abstract/resource_connector_by_id.py b/src/connectors/abstract/resource_connector_by_id.py index 020adb5c..4ea7bf18 100644 --- a/src/connectors/abstract/resource_connector_by_id.py +++ b/src/connectors/abstract/resource_connector_by_id.py @@ -37,7 +37,8 @@ def run( if first_run and from_identifier is None: raise ValueError("In the first run, the from-identifier needs to be set") elif first_run: - state = {"offset": 0, "from_id": 0} + state["offset"] = 0 + state["from_id"] = from_identifier if from_identifier is not None else 0 else: state["from_id"] = state["last_id"] + 1 state["offset"] = state["offset"] # TODO: what if datasets are deleted? Or updated? 
@@ -53,8 +54,8 @@ def run( i = 0 for item in self.fetch(offset=state["offset"], from_identifier=state["from_id"]): i += 1 - if hasattr(item, "platform_identifier"): - id_ = int(item.platform_identifier) + if hasattr(item, "aiod_entry") and item.aiod_entry.platform_identifier is not None: + id_ = int(item.aiod_entry.platform_identifier) else: id_ = None if id_ is None or id_ >= state["from_id"]: @@ -64,6 +65,8 @@ def run( n_results += 1 if n_results == limit: return + finished = i < self.limit_per_iteration + logging.info(f"Finished: {i} < {self.limit_per_iteration}") state["offset"] += i state["result"] = "complete run successful" diff --git a/src/connectors/abstract/resource_connector_on_start_up.py b/src/connectors/abstract/resource_connector_on_start_up.py index 3177c539..bc94f75f 100644 --- a/src/connectors/abstract/resource_connector_on_start_up.py +++ b/src/connectors/abstract/resource_connector_on_start_up.py @@ -18,17 +18,11 @@ def fetch( """Retrieve information of all resources""" def run( - self, state: dict, limit: int | None = None, force_rerun: bool = False, **kwargs + self, state: dict, limit: int | None = None, **kwargs ) -> Iterator[RESOURCE | ResourceWithRelations[RESOURCE] | RecordError]: if state: - if force_rerun: - logging.warning( - "Rerunning this connector, although the state shows that it has " - "already been run. Please remove the force_rerun command in " - "production." - ) - else: - raise ValueError("This connector has already been run before.") + logging.warning("This connector has run before. Exiting.") + return if limit is not None: logging.warning( "Limiting the results! Please remove the limit command line argument " diff --git a/src/connectors/openml/openml_dataset_connector.py b/src/connectors/openml/openml_dataset_connector.py index c9cfb529..78a46096 100644 --- a/src/connectors/openml/openml_dataset_connector.py +++ b/src/connectors/openml/openml_dataset_connector.py @@ -2,7 +2,6 @@ This module knows how to load an OpenML object based on its AIoD implementation, and how to convert the OpenML response to some agreed AIoD format. 
""" - from typing import Iterator import dateutil.parser @@ -11,6 +10,7 @@ from connectors.abstract.resource_connector_by_id import ResourceConnectorById from connectors.record_error import RecordError +from database.model import field_length from database.model.ai_asset.distribution import Distribution from database.model.concept.aiod_entry import AIoDEntryCreate from database.model.dataset.dataset import Dataset @@ -61,6 +61,17 @@ def fetch_record( qualities_json = {quality["name"]: quality["value"] for quality in qualities} pydantic_class = resource_create(Dataset) + description = dataset_json["description"] + if isinstance(description, list) and len(description) == 0: + description = "" + elif not isinstance(description, str): + return RecordError(identifier=str(identifier), error="Description of unknown format.") + if len(description) > field_length.DESCRIPTION: + text_break = " [...]" + description = description[: field_length.DESCRIPTION - len(text_break)] + text_break + size = None + if "NumberOfInstances" in qualities_json: + size = Size(value=_as_int(qualities_json["NumberOfInstances"]), unit="instances") return pydantic_class( aiod_entry=AIoDEntryCreate( platform=self.platform_name, @@ -68,16 +79,16 @@ def fetch_record( ), name=dataset_json["name"], same_as=url_data, - description=dataset_json["description"], + description=description, date_published=dateutil.parser.parse(dataset_json["upload_date"]), distribution=[ Distribution( content_url=dataset_json["url"], encoding_format=dataset_json["format"] ) ], - size=Size(value=_as_int(qualities_json["NumberOfInstances"]), unit="instances"), + size=size, is_accessible_for_free=True, - keyword=[tag for tag in dataset_json["tag"]], + keyword=[tag for tag in dataset_json["tag"]] if "tag" in dataset_json else [], license=dataset_json["licence"] if "licence" in dataset_json else None, version=dataset_json["version"], ) @@ -106,8 +117,11 @@ def fetch(self, offset: int, from_identifier: int) -> Iterator[SQLModel | Record identifier = None try: identifier = summary["did"] - qualities = summary["quality"] - yield self.fetch_record(identifier, qualities) + if identifier < from_identifier: + yield RecordError(identifier=identifier, error="Id too low", ignore_error=True) + if from_identifier is None or identifier >= from_identifier: + qualities = summary["quality"] + yield self.fetch_record(identifier, qualities) except Exception as e: yield RecordError(identifier=identifier, error=e) diff --git a/src/connectors/record_error.py b/src/connectors/record_error.py index 6d0d654a..77f0684b 100644 --- a/src/connectors/record_error.py +++ b/src/connectors/record_error.py @@ -5,3 +5,4 @@ class RecordError: identifier: str | None error: BaseException | str + ignore_error: bool = False diff --git a/src/connectors/synchronization.py b/src/connectors/synchronization.py index 9292b0e8..8501fcb2 100644 --- a/src/connectors/synchronization.py +++ b/src/connectors/synchronization.py @@ -3,6 +3,7 @@ import json import logging import pathlib +import shutil import sys from datetime import datetime from typing import Optional @@ -17,7 +18,6 @@ from database.setup import _create_or_fetch_related_objects, _get_existing_resource, sqlmodel_engine from routers import ResourceRouter -RELATIVE_PATH_LOG = pathlib.Path("connector.log") RELATIVE_PATH_STATE_JSON = pathlib.Path("state.json") RELATIVE_PATH_ERROR_CSV = pathlib.Path("errors.csv") @@ -41,11 +41,12 @@ def _parse_args() -> argparse.Namespace: "list of failed resources", ) parser.add_argument( - "-f", - 
"--force-rerun", + "-rm", + "--remove_state", action=argparse.BooleanOptionalAction, - help="Run this connector even if it has run before (only applicable for connectors that " - "only run on startup). This is only meant for development, not for production!", + help="Remove the existing state files (the files in the working directory) on startup (so " + "to start with a clean sheet). This is only meant for development, not for " + "production!", ) parser.add_argument( "--from-date", @@ -56,7 +57,7 @@ def _parse_args() -> argparse.Namespace: ) parser.add_argument( "--from-identifier", - type=str, + type=int, help="The start identifier. Only relevant for the first run of identifier-based " "connectors. In subsequent runs, identifier-based connectors will " "synchronize from the previous end-identifier.", @@ -68,7 +69,7 @@ def _parse_args() -> argparse.Namespace: help="Implemented by some connectors for testing purposes: limit the number of results.", ) parser.add_argument( - "--save_every", + "--save-every", type=int, help="Save the state file every N records. In case that the complete program is killed, " "you can then resume the next run from the last saved state.", @@ -101,6 +102,7 @@ def save_to_database( router.create_resource(session, resource_create_instance) except Exception as e: + session.rollback() id_ = None if isinstance(item, AIoDConcept): id_ = item.aiod_entry.platform_identifier @@ -115,12 +117,17 @@ def save_to_database( def main(): args = _parse_args() + working_dir = pathlib.Path(args.working_dir) + if args.remove_state and working_dir.exists(): + shutil.rmtree(working_dir) working_dir.mkdir(parents=True, exist_ok=True) + logging.basicConfig( - filename=working_dir / RELATIVE_PATH_LOG, encoding="utf-8", level=logging.INFO + level=logging.INFO, + format="%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", ) - logging.getLogger().addHandler(logging.StreamHandler()) sys.excepthook = exception_handler module_path = ".".join(args.connector.split(".")[0:-1]) @@ -133,18 +140,18 @@ def main(): error_path.parents[0].mkdir(parents=True, exist_ok=True) state_path.parents[0].mkdir(parents=True, exist_ok=True) first_run = not state_path.exists() - if not first_run: + + if first_run: + state = {} + else: with open(state_path, "r") as f: state = json.load(f) - else: - state = {} items = connector.run( state=state, from_identifier=args.from_identifier, from_date=args.from_date, limit=args.limit, - force_rerun=args.force_rerun, ) (router,) = [ @@ -159,17 +166,20 @@ def main(): for i, item in enumerate(items): error = save_to_database(router=router, connector=connector, session=session, item=item) if error: - if isinstance(error.error, str): - logging.error(f"Error on identifier {error.identifier}: {error.error}") - else: - logging.error(f"Error on identifier {error.identifier}", exc_info=error.error) - with open(error_path, "a") as f: - error_cleaned = "".join( - c if c.isalnum() or c == "" else "_" for c in str(error.error) - ) - f.write(f'"{error.identifier}","{error_cleaned}"\n') + if not error.ignore_error: + if isinstance(error.error, str): + logging.error(f"Error on identifier {error.identifier}: {error.error}") + else: + logging.error( + f"Error on identifier {error.identifier}", exc_info=error.error + ) + with open(error_path, "a") as f: + error_cleaned = "".join( + c if c.isalnum() or c == "" else "_" for c in str(error.error) + ) + f.write(f'"{error.identifier}","{error_cleaned}"\n') if args.save_every and i > 0 and i % 
args.save_every == 0: - logging.debug(f"Saving state after handling record {i}") + logging.info(f"Saving state after handling {i}th result: {json.dumps(state)}") with open(state_path, "w") as f: json.dump(state, f, indent=4) session.commit() diff --git a/src/database/model/field_length.py b/src/database/model/field_length.py index 78372246..2b2dddfb 100644 --- a/src/database/model/field_length.py +++ b/src/database/model/field_length.py @@ -6,4 +6,4 @@ SHORT = 64 NORMAL = 256 -DESCRIPTION = 3600 # two A4s full of text should be enough? +DESCRIPTION = 1800 # two A4s full of text should be enough? diff --git a/src/main.py b/src/main.py index 99cc0e52..dff1e4ed 100644 --- a/src/main.py +++ b/src/main.py @@ -11,7 +11,7 @@ from fastapi.responses import HTMLResponse from pydantic import Json from sqlalchemy.engine import Engine -from sqlmodel import Session +from sqlmodel import Session, select import routers from authentication import get_current_user @@ -33,7 +33,7 @@ def _parse_args() -> argparse.Namespace: ) parser.add_argument( "--reload", - action="store_true", + action=argparse.BooleanOptionalAction, help="Use `--reload` for FastAPI.", ) return parser.parse_args() @@ -85,8 +85,10 @@ def create_app() -> FastAPI: ) engine = sqlmodel_engine(args.rebuild_db) with Session(engine) as session: - session.add_all([Platform(name=name) for name in PlatformName]) - session.commit() + existing_platforms = session.scalars(select(Platform)).all() + if not any(existing_platforms): + session.add_all([Platform(name=name) for name in PlatformName]) + session.commit() add_routes(app, engine, url_prefix=args.url_prefix) return app From 70d330e3609e59e5a2bdedf7296681ed60275ebc Mon Sep 17 00:00:00 2001 From: Jos van der Velde Date: Tue, 22 Aug 2023 14:52:13 +0200 Subject: [PATCH 50/52] Huggingface and zenodo connectors in containers --- connectors/huggingface/datasets.sh | 8 +++ connectors/openml/entry.sh | 4 +- connectors/zenodo/Dockerfile | 12 +++++ connectors/zenodo/cron | 1 + connectors/zenodo/datasets.sh | 20 ++++++++ connectors/zenodo/entry.sh | 7 +++ docker-compose.yaml | 39 +++++++++++++- .../abstract/resource_connector_by_date.py | 10 ++-- .../huggingface_dataset_connector.py | 23 +++++---- .../openml/openml_dataset_connector.py | 2 +- src/connectors/record_error.py | 2 +- src/connectors/synchronization.py | 2 +- .../zenodo/zenodo_dataset_connector.py | 51 ++++++++++++++----- src/database/setup.py | 11 ++-- .../zenodo/test_get_datasets_zenodo.py | 4 +- 15 files changed, 153 insertions(+), 43 deletions(-) create mode 100755 connectors/huggingface/datasets.sh create mode 100644 connectors/zenodo/Dockerfile create mode 100644 connectors/zenodo/cron create mode 100755 connectors/zenodo/datasets.sh create mode 100755 connectors/zenodo/entry.sh diff --git a/connectors/huggingface/datasets.sh b/connectors/huggingface/datasets.sh new file mode 100755 index 00000000..99ba48c7 --- /dev/null +++ b/connectors/huggingface/datasets.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +WORK_DIR=/opt/connectors/data/huggingface/dataset + +python3 connectors/synchronization.py \ + -c connectors.huggingface.huggingface_dataset_connector.HuggingFaceDatasetConnector \ + -w ${WORK_DIR} \ + --save-every 100 > ${WORK_DIR}/connector.log 2>&1 diff --git a/connectors/openml/entry.sh b/connectors/openml/entry.sh index 215f04d4..5974cadc 100755 --- a/connectors/openml/entry.sh +++ b/connectors/openml/entry.sh @@ -1,7 +1,7 @@ #!/bin/bash # If this directory does not exist, the cron job cannot log (and cannot run) -MKDIR -p 
/opt/connectors/data/openml/dataset +mkdir -p /opt/connectors/data/openml/dataset # Run cron on the foreground with log level WARN -/usr/sbin/cron -f -l 4 \ No newline at end of file +/usr/sbin/cron -f -l 4 diff --git a/connectors/zenodo/Dockerfile b/connectors/zenodo/Dockerfile new file mode 100644 index 00000000..6cba9257 --- /dev/null +++ b/connectors/zenodo/Dockerfile @@ -0,0 +1,12 @@ +FROM ai4eu_server + +COPY cron /etc/cron.d/aiod +COPY datasets.sh /opt/connectors/script/datasets.sh +COPY entry.sh /opt/connectors/script/entry.sh + +USER root +RUN apt -y install cron +RUN chmod +x /etc/cron.d/aiod /opt/connectors/script/datasets.sh +RUN crontab /etc/cron.d/aiod + +WORKDIR /app \ No newline at end of file diff --git a/connectors/zenodo/cron b/connectors/zenodo/cron new file mode 100644 index 00000000..423fa889 --- /dev/null +++ b/connectors/zenodo/cron @@ -0,0 +1 @@ +55 * * * * bash /opt/connectors/script/datasets.sh >> /opt/connectors/data/zenodo/dataset/cron.log 2>&1 diff --git a/connectors/zenodo/datasets.sh b/connectors/zenodo/datasets.sh new file mode 100755 index 00000000..810bba01 --- /dev/null +++ b/connectors/zenodo/datasets.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +WORK_DIR=/opt/connectors/data/zenodo/dataset +CONNECTOR=connectors.zenodo.zenodo_dataset_connector.ZenodoDatasetConnector + +another_instance() +{ + echo $(date -u) "This script is already running in a different thread." + exit 1 +} +exec 9< "$0" +flock -n -x 9 || another_instance + +echo $(date -u) "Starting synchronization..." +PYTHONPATH=/app /usr/local/bin/python3 /app/connectors/synchronization.py \ + -c $CONNECTOR \ + -w $WORK_DIR \ + --from-date "2023-08-01" \ + --save-every 100 > ${WORK_DIR}/connector.log 2>&1 +echo $(date -u) "Synchronization Done." diff --git a/connectors/zenodo/entry.sh b/connectors/zenodo/entry.sh new file mode 100755 index 00000000..c6e5fc08 --- /dev/null +++ b/connectors/zenodo/entry.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# If this directory does not exist, the cron job cannot log (and cannot run) +mkdir -p /opt/connectors/data/zenodo/dataset + +# Run cron on the foreground with log level WARN +/usr/sbin/cron -f -l 4 diff --git a/docker-compose.yaml b/docker-compose.yaml index 3a8fc3bd..06fe58ff 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -45,10 +45,26 @@ services: app: condition: service_healthy + huggingface-dataset-connector: + image: ai4eu_server + container_name: huggingface-dataset-connector + env_file: .env + environment: + - KEYCLOAK_CLIENT_SECRET=$KEYCLOAK_CLIENT_SECRET + volumes: + - ./src:/app + - ./data/connectors:/opt/connectors/data + - ./connectors/huggingface/:/opt/connectors/script + command: > + /bin/bash -c "/opt/connectors/script/datasets.sh" + depends_on: + app: + condition: service_healthy + openml-dataset-connector: build: - context: ./ - dockerfile: connectors/openml/Dockerfile + context: connectors/openml + dockerfile: Dockerfile image: ai4eu_openml_connector container_name: openml-dataset-connector env_file: .env @@ -64,6 +80,25 @@ services: app: condition: service_healthy + zenodo-dataset-connector: + build: + context: connectors/zenodo + dockerfile: Dockerfile + image: ai4eu_zenodo_connector + container_name: zenodo-dataset-connector + env_file: .env + environment: + - KEYCLOAK_CLIENT_SECRET=$KEYCLOAK_CLIENT_SECRET + volumes: + - ./src:/app + - ./data/connectors:/opt/connectors/data + - ./connectors/zenodo/:/opt/connectors/script + command: > + /bin/bash -c "/opt/connectors/script/entry.sh" + depends_on: + app: + condition: service_healthy + 
sqlserver: image: mysql container_name: sqlserver diff --git a/src/connectors/abstract/resource_connector_by_date.py b/src/connectors/abstract/resource_connector_by_date.py index 72dd86fa..07a8bffc 100644 --- a/src/connectors/abstract/resource_connector_by_date.py +++ b/src/connectors/abstract/resource_connector_by_date.py @@ -21,7 +21,7 @@ def retry(self, _id: int) -> RESOURCE | ResourceWithRelations[RESOURCE] | Record @abc.abstractmethod def fetch( self, from_incl: datetime, to_excl: datetime - ) -> Iterator[Tuple[date | None, RESOURCE | ResourceWithRelations[RESOURCE] | RecordError]]: + ) -> Iterator[Tuple[datetime | None, RESOURCE | ResourceWithRelations[RESOURCE] | RecordError]]: """Retrieve information of all resources""" def run( @@ -48,13 +48,13 @@ def run( raise ValueError("In the first run, the from-date needs to be set") from_incl = datetime.combine(from_date, datetime.min.time()) else: - from_incl = state["to_excl"] + from_incl = datetime.fromtimestamp(state["last"] + 0.001) logging.info(f"Starting synchronisation {from_incl=}, {to_excl=}.") - state["from_incl"] = from_incl - state["to_excl"] = to_excl + state["from_incl"] = from_incl.timestamp() + state["to_excl"] = to_excl.timestamp() for datetime_, result in self.fetch(from_incl=from_incl, to_excl=to_excl): yield result if datetime_: - state["last_datetime"] = datetime_ # For manually resolving errors + state["last"] = datetime_.timestamp() state["result"] = "complete run successful" diff --git a/src/connectors/huggingface/huggingface_dataset_connector.py b/src/connectors/huggingface/huggingface_dataset_connector.py index 27a4ec94..1fe58ac0 100644 --- a/src/connectors/huggingface/huggingface_dataset_connector.py +++ b/src/connectors/huggingface/huggingface_dataset_connector.py @@ -10,6 +10,7 @@ from connectors.abstract.resource_connector_on_start_up import ResourceConnectorOnStartUp from connectors.record_error import RecordError from connectors.resource_with_relations import ResourceWithRelations +from database.model import field_length from database.model.agent.person import Person from database.model.ai_asset.distribution import Distribution from database.model.concept.aiod_entry import AIoDEntryCreate @@ -21,8 +22,8 @@ class HuggingFaceDatasetConnector(ResourceConnectorOnStartUp[Dataset]): """ - This must be only runned on the startup due to there is no way to - retrieve data from huggingface filtering by time creation + This must be only ran on the startup because there is no way to retrieve data from + huggingface filtering by the created time """ @property @@ -67,8 +68,7 @@ def fetch_dataset(self, dataset, pydantic_class, pydantic_class_publication): name=dataset.citation, ) ] - elif len(parsed_citations) == 1: - citation = parsed_citations[0] + else: citations = [ pydantic_class_publication( aiod_entry=AIoDEntryCreate( @@ -79,12 +79,8 @@ def fetch_dataset(self, dataset, pydantic_class, pydantic_class_publication): same_as=citation["link"] if "link" in citation else None, type=citation["ENTRYTYPE"], ) + for citation in parsed_citations ] - else: - raise ValueError( - f"Unexpected number of citations found for dataset " - f"{dataset.id} in {dataset.citation}: {len(parsed_citations)}" - ) parquet_info = HuggingFaceDatasetConnector._get( url="https://datasets-server.huggingface.co/parquet", @@ -102,7 +98,7 @@ def fetch_dataset(self, dataset, pydantic_class, pydantic_class_publication): ] size = None ds_license = None - if dataset.cardData is not None: + if dataset.cardData is not None and "license" in dataset.cardData: 
if isinstance(dataset.cardData["license"], str): ds_license = dataset.cardData["license"] else: @@ -117,13 +113,18 @@ def fetch_dataset(self, dataset, pydantic_class, pydantic_class_publication): related_resources = {"citation": citations} if dataset.author is not None: related_resources["creator"] = [Person(name=dataset.author)] + description = dataset.description + if len(description) > field_length.DESCRIPTION: + text_break = " [...]" + description = description[: field_length.DESCRIPTION - len(text_break)] + text_break + return ResourceWithRelations[Dataset]( resource=pydantic_class( aiod_entry=AIoDEntryCreate( platform_identifier=dataset.id, platform=self.platform_name, ), - description=dataset.description, + description=description, name=dataset.id, same_as=f"https://huggingface.co/datasets/{dataset.id}", date_modified=dateutil.parser.parse(dataset.lastModified), diff --git a/src/connectors/openml/openml_dataset_connector.py b/src/connectors/openml/openml_dataset_connector.py index 78a46096..9a914ff4 100644 --- a/src/connectors/openml/openml_dataset_connector.py +++ b/src/connectors/openml/openml_dataset_connector.py @@ -118,7 +118,7 @@ def fetch(self, offset: int, from_identifier: int) -> Iterator[SQLModel | Record try: identifier = summary["did"] if identifier < from_identifier: - yield RecordError(identifier=identifier, error="Id too low", ignore_error=True) + yield RecordError(identifier=identifier, error="Id too low", ignore=True) if from_identifier is None or identifier >= from_identifier: qualities = summary["quality"] yield self.fetch_record(identifier, qualities) diff --git a/src/connectors/record_error.py b/src/connectors/record_error.py index 77f0684b..f8c71343 100644 --- a/src/connectors/record_error.py +++ b/src/connectors/record_error.py @@ -5,4 +5,4 @@ class RecordError: identifier: str | None error: BaseException | str - ignore_error: bool = False + ignore: bool = False diff --git a/src/connectors/synchronization.py b/src/connectors/synchronization.py index 8501fcb2..f2bdebb6 100644 --- a/src/connectors/synchronization.py +++ b/src/connectors/synchronization.py @@ -166,7 +166,7 @@ def main(): for i, item in enumerate(items): error = save_to_database(router=router, connector=connector, session=session, item=item) if error: - if not error.ignore_error: + if not error.ignore: if isinstance(error.error, str): logging.error(f"Error on identifier {error.identifier}: {error.error}") else: diff --git a/src/connectors/zenodo/zenodo_dataset_connector.py b/src/connectors/zenodo/zenodo_dataset_connector.py index 4384635c..53d6a008 100644 --- a/src/connectors/zenodo/zenodo_dataset_connector.py +++ b/src/connectors/zenodo/zenodo_dataset_connector.py @@ -1,6 +1,7 @@ -from datetime import datetime, date +from datetime import datetime from typing import Iterator, Tuple +import dateutil.parser import requests import xmltodict from sickle import Sickle @@ -9,6 +10,7 @@ from connectors.abstract.resource_connector_by_date import ResourceConnectorByDate from connectors.record_error import RecordError from connectors.resource_with_relations import ResourceWithRelations +from database.model import field_length from database.model.agent.person import Person from database.model.concept.aiod_entry import AIoDEntryCreate from database.model.dataset.dataset import Dataset @@ -47,10 +49,18 @@ def retry(self, _id: int) -> ResourceWithRelations[Dataset] | RecordError: record = response.json() creator_names = [item["name"] for item in record["metadata"]["creators"]] - creators = [ - 
Person(given_name=name.split(", ")[1], surname=name.split(", ")[0]) - for name in creator_names - ] + creators = [] + for name in creator_names: + name_splits = name.split(", ") + if len(name_splits) == 2: + creators.append(Person(given_name=name_splits[1], surname=name_splits[0])) + else: + creators.append(Person(name=name)) + + description = record.get("metadata").get("description") + if len(description) > field_length.DESCRIPTION: + text_break = " [...]" + description = description[: field_length.DESCRIPTION - len(text_break)] + text_break pydantic_class = resource_create(Dataset) dataset = pydantic_class( @@ -60,7 +70,7 @@ def retry(self, _id: int) -> ResourceWithRelations[Dataset] | RecordError: ), date_published=record.get("created"), name=record.get("metadata").get("title"), - description=record.get("metadata").get("description"), + description=description, license=record.get("metadata").get("license").get("id"), keyword=record.get("metadata").get("keywords"), ) @@ -84,15 +94,23 @@ def _dataset_from_record( else: error_fmt("") return RecordError(identifier=identifier, error=error_fmt("creator")) - creators = [ - Person(given_name=name.split(", ")[1], surname=name.split(", ")[0]) - for name in creator_names - ] + + creators = [] + for name in creator_names: + name_splits = name.split(", ") + if len(name_splits) == 2: + creators.append(Person(given_name=name_splits[1], surname=name_splits[0])) + else: + creators.append(Person(name=name)) if isinstance(record["titles"]["title"], str): title = record["titles"]["title"] else: return RecordError(identifier=identifier, error=error_fmt("title")) + if len(title) > field_length.NORMAL: + text_break = " [...]" + title = title[: field_length.NORMAL - len(text_break)] + text_break + number_str = identifier.rsplit("/", 1)[-1] id_number = "".join(filter(str.isdigit, number_str)) same_as = f"https://zenodo.org/api/records/{id_number}" @@ -106,6 +124,9 @@ def _dataset_from_record( description = description_raw["#text"] else: return RecordError(identifier=identifier, error=error_fmt("description")) + if len(description) > field_length.DESCRIPTION: + text_break = " [...]" + description = description[: field_length.DESCRIPTION - len(text_break)] + text_break date_published = None date_raw = record["dates"]["date"] @@ -144,7 +165,7 @@ def _dataset_from_record( platform="zenodo", platform_identifier=identifier, ), - name=title[:150], + name=title, same_as=same_as, description=description, date_published=date_published, @@ -171,7 +192,7 @@ def _resource_type(record) -> str | None: def fetch( self, from_incl: datetime, to_excl: datetime - ) -> Iterator[Tuple[date | None, SQLModel | ResourceWithRelations[SQLModel] | RecordError]]: + ) -> Iterator[Tuple[datetime | None, SQLModel | ResourceWithRelations[SQLModel] | RecordError]]: sickle = Sickle("https://zenodo.org/oai2d") records = sickle.ListRecords( **{ @@ -187,7 +208,7 @@ def fetch( resource_type = ZenodoDatasetConnector._resource_type(record) if resource_type is None: yield datetime_, RecordError( - identifier=id_, error="Resource type could not be " "determined" + identifier=id_, error="Resource type could not be determined" ) if resource_type == "Dataset": try: @@ -196,8 +217,10 @@ def fetch( id_ = xml_dict["record"]["header"]["identifier"] if id_.startswith("oai:"): id_ = id_.replace("oai:", "") - datetime_ = xml_dict["record"]["header"]["datestamp"] + datetime_ = dateutil.parser.parse(xml_dict["record"]["header"]["datestamp"]) resource = 
                     yield datetime_, self._dataset_from_record(id_, resource)
                 except Exception as e:
                     yield datetime_, RecordError(identifier=id_, error=e)
+            else:
+                yield datetime_, RecordError(identifier=id_, error="Wrong type", ignore=True)
diff --git a/src/database/setup.py b/src/database/setup.py
index c6071ae4..8edc0aeb 100644
--- a/src/database/setup.py
+++ b/src/database/setup.py
@@ -34,7 +34,7 @@ def connect_to_database(
     if delete_first or create_if_not_exists:
         drop_or_create_database(url, delete_first)
 
-    engine = create_engine(url, echo=True, pool_recycle=3600)
+    engine = create_engine(url, echo=False, pool_recycle=3600)
     with engine.connect() as connection:
         AIoDConcept.metadata.create_all(connection, checkfirst=True)
@@ -44,7 +44,7 @@ def connect_to_database(
 
 def drop_or_create_database(url: str, delete_first: bool):
     server, database = url.rsplit("/", 1)
-    engine = create_engine(server, echo=True)  # Temporary engine, not connected to a database
+    engine = create_engine(server, echo=False)  # Temporary engine, not connected to a database
 
     with engine.connect() as connection:
         if delete_first:
@@ -83,9 +83,10 @@ def _create_or_fetch_related_objects(session: Session, item: ResourceWithRelatio
     identifiers = []
     for resource in resources:
         if (
-            resource.platform is not None
-            and resource.platform != PlatformName.aiod
-            and resource.platform_identifier is not None
+            resource.aiod_entry is not None
+            and resource.aiod_entry.platform is not None
+            and resource.aiod_entry.platform != PlatformName.aiod
+            and resource.aiod_entry.platform_identifier is not None
         ):
             # Get the router of this resource. The difficulty is, that the resource will be a
             # ResourceRead (e.g. a DatasetRead). So we search for the router for which the
diff --git a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py
index 1e101ea0..a8e7a61c 100644
--- a/src/tests/connectors/zenodo/test_get_datasets_zenodo.py
+++ b/src/tests/connectors/zenodo/test_get_datasets_zenodo.py
@@ -2,6 +2,7 @@
 
 import responses
 
+from connectors.record_error import RecordError
 from connectors.zenodo.zenodo_dataset_connector import ZenodoDatasetConnector
 from database.model.agent.person import Person
 from tests.testutils.paths import path_test_resources
@@ -14,7 +15,8 @@ def test_fetch_happy_path():
     from_incl = datetime.datetime(2000, 1, 1, 12, 0, 0)
     to_excl = datetime.datetime(2000, 1, 2, 12, 0, 0)
 
-    datasets = list(connector.run(state={}, from_date=from_incl, to_excl=to_excl))
+    resources = list(connector.run(state={}, from_date=from_incl, to_excl=to_excl))
+    datasets = [r for r in resources if not isinstance(r, RecordError)]
     assert len(datasets) == 1
     dataset = datasets[0].resource
     assert dataset.name == "THE FIELD'S MALL MASS SHOOTING: EMERGENCY MEDICAL SERVICES RESPONSE"

From 1df23e4edefa368fe2ff0ce36febbf0081125d54 Mon Sep 17 00:00:00 2001
From: Jos van der Velde
Date: Tue, 22 Aug 2023 14:55:44 +0200
Subject: [PATCH 51/52] Removed devcontainer (dev docker compose) because it's
 outdated, and I don't think anybody uses it.
 If anybody wants it back, feel free to add it back.

---
 DevDockerfile    | 14 --------------
 README.md        | 14 --------------
 compose-dev.yaml | 34 ----------------------------------
 3 files changed, 62 deletions(-)
 delete mode 100644 DevDockerfile
 delete mode 100644 compose-dev.yaml

diff --git a/DevDockerfile b/DevDockerfile
deleted file mode 100644
index b52bb29f..00000000
--- a/DevDockerfile
+++ /dev/null
@@ -1,14 +0,0 @@
-FROM python:3.11-slim-bullseye
-
-# default-mysql-client is not necessary, but can be useful when debugging connection issues.
-RUN apt-get update && apt-get -y install python3-dev git default-libmysqlclient-dev build-essential default-mysql-client
-
-WORKDIR /app
-
-COPY ./pyproject.toml /app/pyproject.toml
-
-
-RUN pip install .
-RUN python -m pip install ".[dev]"
-
-
diff --git a/README.md b/README.md
index 09eb40e0..e78c6c06 100644
--- a/README.md
+++ b/README.md
@@ -197,20 +197,6 @@ The `--reload` argument will automatically restart the app if changes are made t
 2. Run using docker. For instance using `scripts/run_apiserver.sh`
 3. Run using DevContainer (see next subsection)
 
-#### (Optional) Devcontainer Installation
-If you want to run the server on and isolated container pre configured for the proyect you can open the proyect via docker dashboard. On the dev container section click `create a new enviroment`
-
-image
-
-Follow the instructions and select the root folder of the project:
-
-image
-
- After this, docker will ask you for selection de devcontainer and open it on vscode, you should
- choose aiod-app-1:
-
- image
-
 ### Authentication
 Currently, the code is on default coupled with a keycloak running on test.openml.org. To make
 this work, you need to set an environment variable. You can do this by setting the
diff --git a/compose-dev.yaml b/compose-dev.yaml
deleted file mode 100644
index e08ffba2..00000000
--- a/compose-dev.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-services:
-  app:
-    entrypoint:
-      - sleep
-      - infinity
-    build:
-      dockerfile: DevDockerfile
-    init: true
-    volumes:
-      - type: bind
-        source: /var/run/docker.sock
-        target: /var/run/docker.sock
-      - type: bind
-        source: ~/.ssh/
-        target: /root/.ssh/
-    ports:
-      - "8000:8000"
-    networks:
-      - sql-network
-
-  sqlserver:
-    image: mysql
-    environment:
-      MYSQL_ROOT_PASSWORD: ok
-    volumes:
-      - ./data/mysql:/var/lib/mysql
-    networks:
-      - sql-network
-
-networks:
-  sql-network:
-    driver: bridge
-
-

From 778f86f5aaedc0f5694384c2024c42abc27fd776 Mon Sep 17 00:00:00 2001
From: Jos van der Velde
Date: Tue, 22 Aug 2023 15:11:14 +0200
Subject: [PATCH 52/52] Cleanup

---
 src/connectors/abstract/resource_connector_by_date.py |  2 +-
 src/connectors/abstract/resource_connector_by_id.py   |  2 +-
 .../abstract/resource_connector_on_start_up.py        |  2 +-
 .../huggingface/huggingface_dataset_connector.py      |  4 ++--
 src/connectors/openml/openml_dataset_connector.py     |  8 ++++----
 src/connectors/synchronization.py                     | 10 ++++++----
 src/database/model/field_length.py                    |  2 +-
 7 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/connectors/abstract/resource_connector_by_date.py b/src/connectors/abstract/resource_connector_by_date.py
index 07a8bffc..e655995d 100644
--- a/src/connectors/abstract/resource_connector_by_date.py
+++ b/src/connectors/abstract/resource_connector_by_date.py
@@ -57,4 +57,4 @@ def run(
             yield result
             if datetime_:
                 state["last"] = datetime_.timestamp()
-        state["result"] = "complete run successful"
+        state["result"] = "Complete run done (although there might be errors)."
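
All three run() variants touched by this commit (by date above, by id and on-startup below) share the same bookkeeping: the connector records in the state dict how far it got, so an interrupted synchronization can resume, and the final message no longer claims success when individual records failed. As a rough illustrative sketch only — this is not part of the patch, and the fetch callable here stands in for the real connector method — the date-based loop amounts to:

    from datetime import datetime
    from typing import Callable, Iterator, Tuple


    def run_by_date(
        state: dict,
        from_date: datetime,
        to_excl: datetime,
        fetch: Callable[[datetime, datetime], Iterator[Tuple[datetime | None, object]]],
    ) -> Iterator[object]:
        # Resume from the last saved timestamp if a previous, interrupted run stored one.
        if "last" in state:
            from_date = datetime.fromtimestamp(state["last"])
        for datetime_, record in fetch(from_date, to_excl):
            yield record
            if datetime_:
                # Remember the datestamp of the last processed record for the next run.
                state["last"] = datetime_.timestamp()
        state["result"] = "Complete run done (although there might be errors)."

How errors are surfaced is left to the caller: synchronization.py logs them, and with the renamed ignore flag it silently skips benign ones, such as the Zenodo "Wrong type" records.
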
diff --git a/src/connectors/abstract/resource_connector_by_id.py b/src/connectors/abstract/resource_connector_by_id.py
index 4ea7bf18..b6749324 100644
--- a/src/connectors/abstract/resource_connector_by_id.py
+++ b/src/connectors/abstract/resource_connector_by_id.py
@@ -69,4 +69,4 @@ def run(
             finished = i < self.limit_per_iteration
             logging.info(f"Finished: {i} < {self.limit_per_iteration}")
             state["offset"] += i
-        state["result"] = "complete run successful"
+        state["result"] = "Complete run done (although there might be errors)."
diff --git a/src/connectors/abstract/resource_connector_on_start_up.py b/src/connectors/abstract/resource_connector_on_start_up.py
index bc94f75f..0ea11af2 100644
--- a/src/connectors/abstract/resource_connector_on_start_up.py
+++ b/src/connectors/abstract/resource_connector_on_start_up.py
@@ -30,4 +30,4 @@ def run(
         )
         state["result"] = f"started on {datetime.datetime.now()}"
         yield from self.fetch(limit=limit)
-        state["result"] = "complete run successful"
+        state["result"] = "Complete run done (although there might be errors)."
diff --git a/src/connectors/huggingface/huggingface_dataset_connector.py b/src/connectors/huggingface/huggingface_dataset_connector.py
index 1fe58ac0..f0db4600 100644
--- a/src/connectors/huggingface/huggingface_dataset_connector.py
+++ b/src/connectors/huggingface/huggingface_dataset_connector.py
@@ -22,8 +22,8 @@
 
 class HuggingFaceDatasetConnector(ResourceConnectorOnStartUp[Dataset]):
     """
-    This must be only ran on the startup because there is no way to retrieve data from
-    huggingface filtering by the created time
+    This connector only runs on startup, because there is no endpoint to filter the huggingface
+    data by last modified datetime
     """
 
     @property
diff --git a/src/connectors/openml/openml_dataset_connector.py b/src/connectors/openml/openml_dataset_connector.py
index 9a914ff4..5d630edc 100644
--- a/src/connectors/openml/openml_dataset_connector.py
+++ b/src/connectors/openml/openml_dataset_connector.py
@@ -20,10 +20,10 @@
 
 class OpenMlDatasetConnector(ResourceConnectorById[Dataset]):
-    """ "
-    Openml orders its records with a numeric id in ascendent order but does not allow
-    gather them from a certain date. This is the reason why the ResourceConnectorById
-    is needed
+    """
+    Openml does not allow gathering the records based on the last modified datetime. Instead,
+    it does guarantee strictly ascending identifiers. This is the reason why the
+    ResourceConnectorById is used.
     """
 
     @property
diff --git a/src/connectors/synchronization.py b/src/connectors/synchronization.py
index f2bdebb6..376961ca 100644
--- a/src/connectors/synchronization.py
+++ b/src/connectors/synchronization.py
@@ -23,8 +23,9 @@
 
 
 def _parse_args() -> argparse.Namespace:
-    # TODO: write readme
-    parser = argparse.ArgumentParser(description="Please refer to the README.")
+    parser = argparse.ArgumentParser(
+        description="Synchronize a resource from a platform to the " "AIoD database."
+    )
     parser.add_argument(
         "-c",
         "--connector",
@@ -71,7 +72,7 @@ def _parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--save-every",
         type=int,
-        help="Save the state file every N records. In case that the complete program is killed, "
+        help="Save the state file every N records. In case the complete program is killed, "
         "you can then resume the next run from the last saved state.",
     )
     return parser.parse_args()
@@ -98,7 +99,8 @@ def save_to_database(
         existing = _get_existing_resource(
             session, resource_create_instance, connector.resource_class
         )
-        if existing is None:  # TODO: if not None, update
+        # TODO: if not None, update (https://github.com/aiondemand/AIOD-rest-api/issues/131)
+        if existing is None:
             router.create_resource(session, resource_create_instance)
 
     except Exception as e:
diff --git a/src/database/model/field_length.py b/src/database/model/field_length.py
index 2b2dddfb..cb965ddc 100644
--- a/src/database/model/field_length.py
+++ b/src/database/model/field_length.py
@@ -6,4 +6,4 @@
 
 SHORT = 64
 NORMAL = 256
-DESCRIPTION = 1800  # two A4s full of text should be enough?
+DESCRIPTION = 1800  # an A4 full of text should be enough?
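
The HuggingFace and Zenodo connectors now guard long free-text fields (descriptions, and the Zenodo title) with the same truncation idiom against the limits in field_length.py. A minimal standalone sketch of that idiom — the truncate helper and its name are illustrative only; the patches repeat the logic inline at each call site:

    DESCRIPTION = 1800  # mirrors src/database/model/field_length.py
    NORMAL = 256


    def truncate(text: str, max_length: int = DESCRIPTION) -> str:
        # Cut text that would overflow the database column, marking the cut with " [...]".
        text_break = " [...]"
        if len(text) > max_length:
            return text[: max_length - len(text_break)] + text_break
        return text


    # A 2000-character description comes back at exactly the column limit.
    assert len(truncate("x" * 2000)) == DESCRIPTION
    assert truncate("short text") == "short text"

Appending the marker inside the limit, rather than after it, is what keeps the stored value at, not over, the column length.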