Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/connector sync #100

Merged
merged 54 commits into from
Aug 23, 2023
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
143bf70
example changed
arejula27 Jul 6, 2023
d698a6a
connector by date
arejula27 Jul 6, 2023
5128a38
zenodo conector
arejula27 Jul 10, 2023
f848db7
openml connector
arejula27 Jul 12, 2023
acaf5d1
remove connectors from api
arejula27 Jul 12, 2023
d4fd687
error on zenodo
arejula27 Jul 13, 2023
6eba04a
openml
arejula27 Jul 13, 2023
ac7e485
delete retun
arejula27 Jul 14, 2023
6a182df
delete code duplication dataset example
arejula27 Jul 17, 2023
eaf8d85
change exception to Value error
arejula27 Jul 17, 2023
3860b61
change excepcion to value error
arejula27 Jul 17, 2023
06ebd63
change id to id_ test zenodo
arejula27 Jul 17, 2023
daf0408
change a nmethod to static
arejula27 Jul 17, 2023
0869102
change var name
arejula27 Jul 17, 2023
ea2fb14
remove commnets
arejula27 Jul 17, 2023
0344c82
example filter by time
arejula27 Jul 17, 2023
4f0a358
change default example date dataset
arejula27 Jul 17, 2023
14efe89
comments
arejula27 Jul 21, 2023
f16823f
reduce code duplication
arejula27 Jul 21, 2023
d4b4b29
type hints in error class
arejula27 Jul 21, 2023
eba1311
record error
arejula27 Jul 21, 2023
fb062d6
add record to abstact connector class
arejula27 Jul 21, 2023
2bc4465
change id
arejula27 Jul 24, 2023
a237ad5
zenodo retry error handling
arejula27 Jul 24, 2023
320f474
remove comment sflag dataset
arejula27 Jul 24, 2023
b51cc9a
fix test
arejula27 Jul 24, 2023
2735d98
fix setup fetch
arejula27 Jul 24, 2023
424bdff
limit remove
arejula27 Jul 24, 2023
ae623c5
huggingface
arejula27 Jul 24, 2023
e21c8b2
comment
arejula27 Jul 24, 2023
bc48b59
add check_valid_id
arejula27 Jul 24, 2023
e396e3d
main connector
arejula27 Jul 25, 2023
b1b0c5a
comments
arejula27 Jul 25, 2023
e4d218d
config
arejula27 Jul 26, 2023
cdbaa19
delete useless line
arejula27 Jul 26, 2023
ed42f01
create new abstract class
arejula27 Jul 26, 2023
6090839
change id type to int
arejula27 Jul 26, 2023
18ecf50
delete useless comment
arejula27 Jul 26, 2023
970448d
connector hugg changed to on startup
arejula27 Jul 26, 2023
b36a84b
typo
arejula27 Jul 26, 2023
91be909
error handling huggingface
arejula27 Jul 26, 2023
9549cc9
add header error
arejula27 Jul 26, 2023
27fe77c
delete raise exception
arejula27 Jul 26, 2023
2cf514b
typo
arejula27 Jul 28, 2023
322efe8
reason why zenodo uses 2 protocols
arejula27 Jul 28, 2023
a4c7ee3
ad until to the protocol
arejula27 Jul 28, 2023
3ef74d2
Merge branch 'develop' of https://github.com/aiondemand/AIOD-rest-api…
josvandervelde Aug 18, 2023
9e3e8c2
Updated the synchronization and connectors to be more like described …
josvandervelde Aug 18, 2023
e773777
Fixed merge conflicts between connector-sync and develop (the new met…
josvandervelde Aug 21, 2023
1905e81
Moved example connectors to separate docker container
josvandervelde Aug 21, 2023
45c5054
Openml connector running as cron job inside a separate container
josvandervelde Aug 22, 2023
70d330e
Huggingface and zenodo connectors in containers
josvandervelde Aug 22, 2023
1df23e4
Removed devcontainer (dev docker compose) because its outdated, and I…
josvandervelde Aug 22, 2023
778f86f
Cleanup
josvandervelde Aug 22, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 2 additions & 13 deletions src/connectors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,10 @@
from database.model.project.project import Project
from database.model.publication.publication import Publication
from database.model.organisation.organisation import Organisation
from .abstract.resource_connector import ResourceConnector # noqa:F401
from .abstract.resource_connector_by_date import ResourceConnectorByDate # noqa:F401
from .example.example_connector import ExampleConnector
from .example.example_dataset_connector import ExampleDatasetConnector
from .huggingface.huggingface_dataset_connector import HuggingFaceDatasetConnector
from .openml.openml_dataset_connector import OpenMlDatasetConnector
from .zenodo.zenodo_dataset_connector import ZenodoDatasetConnector

dataset_connectors = {
c.platform_name: c
for c in (
OpenMlDatasetConnector(),
HuggingFaceDatasetConnector(),
ZenodoDatasetConnector(),
)
}

_path_example_resources = pathlib.Path(__file__).parent / "example" / "resources"

Expand Down Expand Up @@ -57,4 +46,4 @@
"organisations": ExampleConnector(
resource_class=Organisation, json_path=_path_example_resources / "organisations.json"
),
} # type: Dict[str, ResourceConnector]
} # type: Dict[str, ResourceConnectorByDate]
13 changes: 4 additions & 9 deletions src/connectors/abstract/resource_connector.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import abc
from typing import Generic, TypeVar, Iterator

from typing import Generic, TypeVar


from sqlmodel import SQLModel

from connectors.resource_with_relations import ResourceWithRelations

from database.model.platform.platform_names import PlatformName


Expand All @@ -25,10 +27,3 @@ def resource_class(self) -> type[RESOURCE]:
def platform_name(self) -> PlatformName:
"""The platform of this connector"""
pass

@abc.abstractmethod
def fetch_all(
self, limit: int | None = None
) -> Iterator[SQLModel | ResourceWithRelations[SQLModel]]:
"""Retrieve information of all resources"""
pass
29 changes: 29 additions & 0 deletions src/connectors/abstract/resource_connector_by_date.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import abc
from datetime import datetime
from typing import Generic, Iterator, TypeVar
from connectors.abstract.resource_connector import ResourceConnector
from connectors.record_error import RecordError

from sqlmodel import SQLModel

from connectors.resource_with_relations import ResourceWithRelations

RESOURCE = TypeVar("RESOURCE", bound=SQLModel)


class ResourceConnectorByDate(ResourceConnector, Generic[RESOURCE]):
    """
    Abstract connector whose resources are requested per time window.

    For every platform that offers this resource, a concrete subclass should be implemented.
    Subclasses provide `fetch` for a window of datetimes and `retry` for a single resource.
    """

    @abc.abstractmethod
    def fetch(
        self, from_incl: datetime, to_excl: datetime
    ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]:
        """
        Retrieve information of the resources that fall within the given time window.

        As the parameter names indicate, `from_incl` is meant as an inclusive lower bound
        and `to_excl` as an exclusive upper bound. Every yielded item is either a resource
        (optionally wrapped together with its related resources) or a RecordError.
        """

    @abc.abstractmethod
    def retry(self, _id: str) -> SQLModel | ResourceWithRelations[SQLModel] | RecordError:
        """
        Retrieve information of the single resource identified by `_id`.

        The result is either the resource (optionally wrapped together with its related
        resources) or a RecordError.
        """
29 changes: 29 additions & 0 deletions src/connectors/abstract/resource_connector_by_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import abc
from typing import Generic, Iterator, TypeVar

from sqlmodel import SQLModel
from connectors.abstract.resource_connector import ResourceConnector

from connectors.resource_with_relations import ResourceWithRelations

from connectors.record_error import RecordError

RESOURCE = TypeVar("RESOURCE", bound=SQLModel)


class ResourceConnectorById(ResourceConnector, Generic[RESOURCE]):
    """
    Abstract connector whose resources are requested per range of integer identifiers.

    For every platform that offers this resource, a concrete subclass should be implemented.
    Subclasses provide `fetch` for a range of ids and `retry` for a single resource.
    """

    @abc.abstractmethod
    def fetch(
        self, from_id: int, to_id: int
    ) -> Iterator[SQLModel | ResourceWithRelations[SQLModel] | RecordError]:
        """
        Retrieve information of the resources whose identifier lies between `from_id` and `to_id`.

        NOTE(review): whether the bounds are inclusive or exclusive is not fixed by this
        interface - confirm against the concrete connectors. Every yielded item is either a
        resource (optionally wrapped together with its related resources) or a RecordError.
        """

    @abc.abstractmethod
    def retry(self, _id: int) -> SQLModel | ResourceWithRelations[SQLModel] | RecordError:
        """
        Retrieve information of the single resource identified by `_id`.

        The result is either the resource (optionally wrapped together with its related
        resources) or a RecordError.
        """
22 changes: 18 additions & 4 deletions src/connectors/example/example_connector.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
from datetime import datetime
import json
import pathlib
from typing import Iterator, TypeVar

from sqlmodel import SQLModel
from connectors.abstract.resource_connector_by_date import ResourceConnectorByDate


from connectors import ResourceConnector
from database.model.resource import resource_create
from database.model.platform.platform_names import PlatformName


RESOURCE = TypeVar("RESOURCE", bound=SQLModel)


class ExampleConnector(ResourceConnector[RESOURCE]):
class ExampleConnector(ResourceConnectorByDate[RESOURCE]):
arejula27 marked this conversation as resolved.
Show resolved Hide resolved
"""
Creating hardcoded values example values based on json files
"""
Expand All @@ -29,9 +31,21 @@ def resource_class(self) -> type[RESOURCE]:
def platform_name(self) -> PlatformName:
return PlatformName.example

def fetch_all(self, limit: int | None = None) -> Iterator[RESOURCE]:
def retry(self, _id: str) -> RESOURCE:
    """
    Retrieve the example resource whose "platform_identifier" equals `_id`.

    The json file at `self.json_path` is read on every call, and the first item with a
    matching "platform_identifier" is converted into the create-model of
    `self.resource_class`.

    Raises:
        ValueError: if no item in the json file has this platform identifier.
    """
    # Read as UTF-8 explicitly, so the result does not depend on the platform's default
    # encoding.
    with open(self.json_path, encoding="utf-8") as f:
        json_data = json.load(f)
    pydantic_class = resource_create(self.resource_class)
    for json_item in json_data:
        if json_item.get("platform_identifier") == _id:
            return pydantic_class(**json_item)
    raise ValueError("No resource associated with the id")

def fetch(
self, from_incl: datetime | None = None, to_excl: datetime | None = None
) -> Iterator[RESOURCE]:
with open(self.json_path) as f:
json_data = json.load(f)
pydantic_class = resource_create(self.resource_class)
for json_item in json_data[:limit]:
for json_item in json_data:
yield pydantic_class(**json_item)
arejula27 marked this conversation as resolved.
Show resolved Hide resolved
38 changes: 31 additions & 7 deletions src/connectors/example/example_dataset_connector.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from datetime import datetime
import typing # noqa:F401 (flake8 raises incorrect 'Module imported but unused' error)

from connectors import ResourceConnector

from connectors.abstract.resource_connector_by_date import ResourceConnectorByDate
from connectors.resource_with_relations import ResourceWithRelations
from database.model.dataset.dataset import Dataset
from database.model.publication.publication import Publication
from database.model.resource import resource_create
from database.model.platform.platform_names import PlatformName


class ExampleDatasetConnector(ResourceConnector[Dataset]):
class ExampleDatasetConnector(ResourceConnectorByDate[Dataset]):
arejula27 marked this conversation as resolved.
Show resolved Hide resolved
@property
def resource_class(self) -> type[Dataset]:
return Dataset
Expand All @@ -17,12 +19,11 @@ def resource_class(self) -> type[Dataset]:
def platform_name(self) -> PlatformName:
return PlatformName.example

def fetch_all(
self, limit: int | None = None
) -> typing.Iterator[ResourceWithRelations[Dataset]]:
def retry(self, _id: str) -> ResourceWithRelations[Dataset]:
"""Retrieve information of the resource identified by id"""
pydantic_class = resource_create(Dataset)
pydantic_class_publication = resource_create(Publication)
yield from [
datasets = [
ResourceWithRelations[Dataset](
resource=pydantic_class(
name="Higgs",
Expand All @@ -37,6 +38,7 @@ def fetch_all(
has_parts=[],
keywords=["keyword1", "keyword2"],
measured_values=[],
date_published=datetime.now(),
),
related_resources={
"citations": [
Expand Down Expand Up @@ -67,6 +69,28 @@ def fetch_all(
has_parts=[],
keywords=[],
measured_values=[],
date_published=datetime.min,
)
),
][:limit]
]
for dataset in datasets:
if dataset.resource.platform_identifier == _id:
return dataset
raise ValueError("No resource associated with the id")

def fetch(
    self, from_incl: datetime | None = None, to_excl: datetime | None = None
) -> typing.Iterator[ResourceWithRelations[Dataset]]:
    """
    Yield the example datasets that were published within the given time window.

    Args:
        from_incl: inclusive lower bound on `date_published`; None means no lower bound.
        to_excl: exclusive upper bound on `date_published`; None means no upper bound.

    Datasets without a `date_published` are never yielded. The datasets are obtained
    lazily, one `self.retry` call per hardcoded identifier.
    """
    # Resolve the open-ended bounds once, instead of on every iteration.
    lower_bound = datetime.min if from_incl is None else from_incl
    upper_bound = datetime.max if to_excl is None else to_excl
    for id_ in ("42769", "42742"):
        dataset = self.retry(id_)
        published = dataset.resource.date_published
        if published is not None and lower_bound <= published < upper_bound:
            yield dataset
2 changes: 1 addition & 1 deletion src/connectors/example/resources/publications.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[
{
"platform": "example",
"platform_identifier": 1,
"platform_identifier": "1",
"title": "The Art of Fiction",
"doi": "10.1234/567890",
"creators": "Jane Smith",
Expand Down
9 changes: 8 additions & 1 deletion src/connectors/huggingface/huggingface_dataset_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import dateutil.parser
import requests

from connectors import ResourceConnector
from connectors.abstract.resource_connector import ResourceConnector
from connectors.resource_with_relations import ResourceWithRelations
from database.model.dataset.data_download import DataDownload
from database.model.dataset.dataset import Dataset
Expand All @@ -17,6 +17,11 @@


class HuggingFaceDatasetConnector(ResourceConnector[Dataset]):
arejula27 marked this conversation as resolved.
Show resolved Hide resolved
arejula27 marked this conversation as resolved.
Show resolved Hide resolved
"""
This must only be run on startup, because there is no way to
arejula27 marked this conversation as resolved.
Show resolved Hide resolved
retrieve data from Hugging Face filtered by creation time
"""

@property
def resource_class(self) -> type[Dataset]:
return Dataset
Expand All @@ -29,7 +34,9 @@ def platform_name(self) -> PlatformName:
def _get(url: str, dataset_id: str) -> typing.List[typing.Dict[str, typing.Any]]:
"""
Perform a GET request and raise an exception if the response code is not OK.
resultaod
arejula27 marked this conversation as resolved.
Show resolved Hide resolved
"""

response = requests.get(url, params={"dataset": dataset_id})
response_json = response.json()
if not response.ok:
Expand Down
Loading