Update dataset connectors #186

Merged: 8 commits, Nov 14, 2023
24 changes: 13 additions & 11 deletions src/connectors/huggingface/huggingface_dataset_connector.py
@@ -3,9 +3,8 @@
 import typing

 import bibtexparser
-import datasets
 import dateutil.parser
-import requests
+from huggingface_hub import list_datasets

 from connectors.abstract.resource_connector_on_start_up import ResourceConnectorOnStartUp
 from connectors.record_error import RecordError
@@ -14,6 +13,7 @@
 from database.model.agent.person import Person
 from database.model.ai_asset.distribution import Distribution
 from database.model.ai_resource.text import Text
+from database.model.concept.aiod_entry import AIoDEntryCreate
 from database.model.dataset.dataset import Dataset
 from database.model.knowledge_asset.publication import Publication
 from database.model.platform.platform_names import PlatformName
@@ -51,15 +51,15 @@ def fetch(
     ) -> typing.Iterator[ResourceWithRelations[Dataset] | RecordError]:
         pydantic_class = resource_create(Dataset)
         pydantic_class_publication = resource_create(Publication)
-        for dataset in itertools.islice(datasets.list_datasets(with_details=True), limit):
+        for dataset in itertools.islice(list_datasets(full=True), limit):
             try:
                 yield self.fetch_dataset(dataset, pydantic_class, pydantic_class_publication)
             except Exception as e:
                 yield RecordError(identifier=dataset.id, error=e)

     def fetch_dataset(self, dataset, pydantic_class, pydantic_class_publication):
         citations = []
-        if dataset.citation is not None:
+        if hasattr(dataset, "citation") and isinstance(dataset.citation, str):
             parsed_citations = bibtexparser.loads(dataset.citation).entries
             if len(parsed_citations) == 0:
                 if dataset.citation:
@@ -96,11 +96,11 @@ def fetch_dataset(self, dataset, pydantic_class, pydantic_class_publication):
         ]
         size = None
         ds_license = None
-        if dataset.cardData is not None and "license" in dataset.cardData:
-            if isinstance(dataset.cardData["license"], str):
-                ds_license = dataset.cardData["license"]
+        if dataset.card_data is not None and "license" in dataset.card_data:
+            if isinstance(dataset.card_data["license"], str):
+                ds_license = dataset.card_data["license"]
             else:
-                (ds_license,) = dataset.cardData["license"]
+                (ds_license,) = dataset.card_data["license"]

         # TODO(issue 8): implement
         # if "dataset_info" in dataset.cardData:
@@ -111,7 +111,8 @@ def fetch_dataset(self, dataset, pydantic_class, pydantic_class_publication):
         related_resources = {"citation": citations}
         if dataset.author is not None:
             related_resources["creator"] = [Person(name=dataset.author)]
-        description = dataset.description
+
+        description = getattr(dataset, "description", "")
         if len(description) > field_length.LONG:
             text_break = " [...]"
             description = description[: field_length.LONG - len(text_break)] + text_break
@@ -120,12 +121,13 @@ def fetch_dataset(self, dataset, pydantic_class, pydantic_class_publication):
-
         return ResourceWithRelations[Dataset](
             resource=pydantic_class(
+                aiod_entry=AIoDEntryCreate(status="published"),
                 platform_resource_identifier=dataset.id,
                 platform=self.platform_name,
+                description=description,
                 name=dataset.id,
                 same_as=f"https://huggingface.co/datasets/{dataset.id}",
                 date_modified=dateutil.parser.parse(dataset.lastModified),
-                description=description,
+                date_published=dataset.createdAt,
                 license=ds_license,
                 distributions=distributions,
                 is_accessible_for_free=True,
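Note: a minimal, self-contained sketch of the new listing call used above, assuming a recent huggingface_hub release (the preview_datasets helper is ours, for illustration only; per-record fields such as description or citation are not guaranteed to exist, which is why the connector guards with hasattr/getattr):

import itertools

from huggingface_hub import list_datasets


def preview_datasets(limit: int = 3) -> None:
    # full=True asks the Hub for extended metadata (card data, tags,
    # downloads, ...) on each DatasetInfo it yields.
    for info in itertools.islice(list_datasets(full=True), limit):
        # Mirror the connector's defensive attribute access.
        description = getattr(info, "description", "") or ""
        print(info.id, description[:60])


preview_datasets()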
9 changes: 5 additions & 4 deletions src/connectors/openml/openml_dataset_connector.py
@@ -77,22 +77,23 @@ def fetch_record(
         size = DatasetSize(value=_as_int(qualities_json["NumberOfInstances"]), unit="instances")
         return pydantic_class(
             aiod_entry=AIoDEntryCreate(
-                platform=self.platform_name,
-                platform_resource_identifier=identifier,
                 status="published",
             ),
+            platform_resource_identifier=identifier,
+            platform=self.platform_name,
             name=dataset_json["name"],
             same_as=url_data,
             description=description,
             date_published=dateutil.parser.parse(dataset_json["upload_date"]),
-            license=dataset_json["licence"] if "licence" in dataset_json else None,
             distribution=[
                 Distribution(
                     content_url=dataset_json["url"], encoding_format=dataset_json["format"]
                 )
             ],
-            size=size,
             is_accessible_for_free=True,
+            size=size,
             keyword=[tag for tag in dataset_json["tag"]] if "tag" in dataset_json else [],
+            license=dataset_json["licence"] if "licence" in dataset_json else None,
             version=dataset_json["version"],
         )
+
3 changes: 3 additions & 0 deletions src/connectors/zenodo/zenodo_dataset_connector.py
@@ -13,6 +13,7 @@
 from database.model import field_length
 from database.model.agent.person import Person
 from database.model.ai_resource.text import Text
+from database.model.concept.aiod_entry import AIoDEntryCreate
 from database.model.dataset.dataset import Dataset
 from database.model.platform.platform_names import PlatformName
 from database.model.resource_read_and_create import resource_create
@@ -66,6 +67,7 @@ def retry(self, _id: int) -> ResourceWithRelations[Dataset] | RecordError:

         pydantic_class = resource_create(Dataset)
         dataset = pydantic_class(
+            aiod_entry=AIoDEntryCreate(status="published"),
             platform="zenodo",
             platform_resource_identifier=_id,
             date_published=record.get("created"),
@@ -163,6 +165,7 @@ def _dataset_from_record(

         pydantic_class = resource_create(Dataset)
         dataset = pydantic_class(
+            aiod_entry=AIoDEntryCreate(status="published"),
             platform="zenodo",
             platform_resource_identifier=identifier,
             name=title,
@@ -1,6 +1,5 @@
 import json

-import pytest
 import responses

 from connectors.huggingface.huggingface_dataset_connector import HuggingFaceDatasetConnector
@@ -10,14 +9,14 @@
 HUGGINGFACE_URL = "https://datasets-server.huggingface.co"


-@pytest.mark.skip(reason="We'll fix this in a separate PR")
 def test_fetch_all_happy_path():
     ids_expected = {
         "0n1xus/codexglue",
         "04-07-22/wep-probes",
         "rotten_tomatoes",
         "acronym_identification",
         "air_dialogue",
+        "bobbydylan/top2k",
     }
     connector = HuggingFaceDatasetConnector()
     with responses.RequestsMock() as mocked_requests:
@@ -34,15 +33,17 @@ def test_fetch_all_happy_path():
             mock_parquet(mocked_requests, dataset_id)
         resources_with_relations = list(connector.fetch())

-    assert len(resources_with_relations) == 5
+    assert len(resources_with_relations) == len(ids_expected)
     assert all(type(r) == ResourceWithRelations for r in resources_with_relations)

     datasets = [r.resource for r in resources_with_relations]
-    ids = {d.platform_resource_identifier for d in datasets}
-    names = {d.name for d in datasets}
-    assert ids == ids_expected
-    assert names == ids_expected
+    assert {d.platform_resource_identifier for d in datasets} == ids_expected
+    assert {d.name for d in datasets} == ids_expected
+    assert all(d.date_published for d in datasets)
+    assert all(d.aiod_entry for d in datasets)

     assert all(len(r.related_resources) in (1, 2) for r in resources_with_relations)
-    assert all(len(r.related_resources["citation"]) == 1 for r in resources_with_relations)
+    assert all(len(r.related_resources["citation"]) == 1 for r in resources_with_relations[:5])


 def mock_parquet(mocked_requests: responses.RequestsMock, dataset_id: str):
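Note: the body of the mock_parquet helper is outside this diff. A sketch of the pattern it presumably follows with the responses library, with the URL and payload shape inferred from HUGGINGFACE_URL and the parquet fixture at the bottom of this PR:

import responses


def mock_parquet_sketch(mocked_requests: responses.RequestsMock, dataset_id: str) -> None:
    # Register a canned reply so code issuing
    # GET https://datasets-server.huggingface.co/parquet?dataset=<id>
    # is served from the mock instead of the network.
    mocked_requests.add(
        responses.GET,
        f"https://datasets-server.huggingface.co/parquet?dataset={dataset_id}",
        json={"parquet_files": [], "pending": [], "failed": [], "partial": False},
        status=200,
    )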
65 changes: 44 additions & 21 deletions src/tests/resources/connectors/huggingface/data_list.json
@@ -96,12 +96,12 @@
     "disabled": false,
     "gated": false,
     "lastModified": "2023-01-25T14:18:28.000Z",
-    "likes": 10,
+    "likes": 17,
     "private": false,
     "sha": "c3c245a18bbd57b1682b099e14460eebf154cbdf",
     "citation": "@inproceedings{veyseh-et-al-2020-what,\n title={{What Does This Acronym Mean? Introducing a New Dataset for Acronym Identification and Disambiguation}},\n author={Amir Pouran Ben Veyseh and Franck Dernoncourt and Quan Hung Tran and Thien Huu Nguyen},\n year={2020},\n booktitle={Proceedings of COLING},\n link={https://arxiv.org/pdf/2010.14678v1.pdf}\n}",
     "description": "Acronym identification training and development sets for the acronym identification task at SDU@AAAI-21.",
-    "downloads": 6011,
+    "downloads": 2189,
     "paperswithcode_id": "acronym-identification",
     "tags": [
       "task_categories:token-classification",
@@ -113,9 +113,10 @@
       "language:en",
       "license:mit",
       "acronym-identification",
-      "arxiv:2010.14678"
+      "arxiv:2010.14678",
+      "region:us"
     ],
-    "siblings": null,
+    "createdAt": "2022-03-02T23:29:22.000Z",
     "key": ""
   },
   {
@@ -276,13 +277,13 @@
     },
     "disabled": false,
     "gated": false,
-    "lastModified": "2023-01-25T14:43:24.000Z",
-    "likes": 11,
+    "lastModified": "2023-04-05T13:39:30.000Z",
+    "likes": 28,
     "private": false,
-    "sha": "c33cbf965006dba64f134f7bef69c53d5d0d285d",
+    "sha": "c9f4562ef4a6c84f0098f7845944a5472cb52cad",
     "citation": "@InProceedings{Pang+Lee:05a,\n author = {Bo Pang and Lillian Lee},\n title = {Seeing stars: Exploiting class relationships for sentiment\n categorization with respect to rating scales},\n booktitle = {Proceedings of the ACL},\n year = 2005\n}",
     "description": "Movie Review Dataset.\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\nsentences from Rotten Tomatoes movie reviews. This data was first used in Bo\nPang and Lillian Lee, ``Seeing stars: Exploiting class relationships for\nsentiment categorization with respect to rating scales.'', Proceedings of the\nACL, 2005.",
-    "downloads": 68254,
+    "downloads": 65361,
     "paperswithcode_id": "mr",
     "tags": [
       "task_categories:text-classification",
@@ -293,9 +294,10 @@
       "size_categories:1K<n<10K",
       "source_datasets:original",
       "language:en",
-      "license:unknown"
+      "license:unknown",
+      "region:us"
     ],
-    "siblings": null,
+    "createdAt": "2022-03-02T23:29:22.000Z",
     "key": ""
   },
   {
@@ -305,14 +307,16 @@
     "disabled": false,
     "gated": false,
     "lastModified": "2021-11-18T08:45:46.000Z",
-    "likes": 1,
+    "likes": 3,
     "private": false,
     "sha": "e4604616235cdfa7398d489ba1f95d44a18d2f5d",
     "citation": "@article{Lu2021,\nauthor = {Lu, Shuai and Guo, Daya and Ren, Shuo and Huang, Junjie and Svyatkovskiy, Alexey and Blanco, Ambrosio and Clement, Colin B. and Drain, Dawn and Jiang, Daxin and Tang, Duyu and Li, Ge and Zhou, Lidong and Shou, Linjun and Zhou, Long and Tufano, Michele and Gong, Ming and Zhou, Ming and Duan, Nan and Sundaresan, Neel and Deng, Shao Kun and Fu, Shengyu and Liu, Shujie},\nyear = {2021},\nbooktitle = {arXiv},\ntitle = {CodeXGLUE - A Machine Learning Benchmark Dataset for Code Understanding and Generation}\n}",
     "description": "CodeXGLUE is a benchmark dataset to foster machine learning research for program understanding and generation. \nCodeXGLUE includes a collection of 10 tasks across 14 datasets and a platform for model evaluation and comparison.",
-    "downloads": 295,
-    "tags": [],
-    "siblings": null,
+    "downloads": 387,
+    "tags": [
+      "region:us"
+    ],
+    "createdAt": "2022-03-02T23:29:22.000Z",
     "key": ""
   },
   {
Expand All @@ -330,11 +334,12 @@
"sha": "0681013e6518c8d53cac727b2ca4dc821ffd954c",
"citation": "Probing neural language models for understanding of words of estimative probability\nAnonymous submission",
"description": "Probing neural language models for understanding of words of estimative probability\nAnonymous submission",
"downloads": 0,
"downloads": 3,
"tags": [
"license:apache-2.0"
"license:apache-2.0",
"region:us"
],
"siblings": null,
"createdAt": "2022-07-12T11:48:56.000Z",
"key": ""
},
{
@@ -592,12 +597,12 @@
     "disabled": false,
     "gated": false,
     "lastModified": "2022-11-03T16:31:11.000Z",
-    "likes": 1,
+    "likes": 6,
     "private": false,
     "sha": "3ef284c2b1ca63cebd46335641fa31b09763f4e5",
     "citation": "@inproceedings{wei-etal-2018-airdialogue,\n title = \"{A}ir{D}ialogue: An Environment for Goal-Oriented Dialogue Research\",\n author = \"Wei, Wei and\n Le, Quoc and\n Dai, Andrew and\n Li, Jia\",\n booktitle = \"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing\",\n month = oct # \"-\" # nov,\n year = \"2018\",\n address = \"Brussels, Belgium\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D18-1419\",\n doi = \"10.18653/v1/D18-1419\",\n pages = \"3844--3854\",\n abstract = \"Recent progress in dialogue generation has inspired a number of studies on dialogue systems that are capable of accomplishing tasks through natural language interactions. A promising direction among these studies is the use of reinforcement learning techniques, such as self-play, for training dialogue agents. However, current datasets are limited in size, and the environment for training agents and evaluating progress is relatively unsophisticated. We present AirDialogue, a large dataset that contains 301,427 goal-oriented conversations. To collect this dataset, we create a context-generator which provides travel and flight restrictions. We then ask human annotators to play the role of a customer or an agent and interact with the goal of successfully booking a trip given the restrictions. Key to our environment is the ease of evaluating the success of the dialogue, which is achieved by using ground-truth states (e.g., the flight being booked) generated by the restrictions. Any dialogue agent that does not generate the correct states is considered to fail. Our experimental results indicate that state-of-the-art dialogue models can only achieve a score of 0.17 while humans can reach a score of 0.91, which suggests significant opportunities for future improvement.\",\n}",
     "description": "AirDialogue, is a large dataset that contains 402,038 goal-oriented conversations. To collect this dataset, we create a contextgenerator which provides travel and flight restrictions. Then the human annotators are asked to play the role of a customer or an agent and interact with the goal of successfully booking a trip given the restrictions.",
-    "downloads": 422,
+    "downloads": 420,
     "tags": [
       "task_categories:conversational",
       "task_categories:text-generation",
@@ -612,9 +617,27 @@
       "size_categories:100K<n<1M",
       "source_datasets:original",
       "language:en",
-      "license:cc-by-nc-4.0"
+      "license:cc-by-nc-4.0",
+      "region:us"
     ],
-    "siblings": null,
+    "createdAt": "2022-03-02T23:29:22.000Z",
     "key": ""
   },
+  {
+    "_id": "621ffdd236468d709f182fdf",
+    "id": "bobbydylan/top2k",
+    "author": "bobbydylan",
+    "disabled": false,
+    "gated": false,
+    "lastModified": "2022-02-01T02:59:04.000Z",
+    "likes": 0,
+    "private": false,
+    "sha": "5e264a6bb7728ba1fd46e96ccc8c7a56461ad661",
+    "downloads": 164,
+    "tags": [
+      "region:us"
+    ],
+    "createdAt": "2022-03-02T23:29:22.000Z",
+    "key": ""
+  }
 ]
@@ -0,0 +1,15 @@
+{
+  "parquet_files": [
+    {
+      "dataset": "bobbydylan/top2k",
+      "config": "default",
+      "split": "train",
+      "url": "https://huggingface.co/datasets/bobbydylan/top2k/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet",
+      "filename": "0000.parquet",
+      "size": 493181
+    }
+  ],
+  "pending": [],
+  "failed": [],
+  "partial": false
+}
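Note: this fixture mirrors the response shape of the Hugging Face datasets-server parquet endpoint. A sketch of the live request it stands in for (the test suite mocks this call, so it never reaches the network):

import requests

resp = requests.get(
    "https://datasets-server.huggingface.co/parquet",
    params={"dataset": "bobbydylan/top2k"},
    timeout=30,
)
resp.raise_for_status()
for parquet_file in resp.json()["parquet_files"]:
    # Each entry describes one downloadable parquet shard and its size in bytes.
    print(parquet_file["url"], parquet_file["size"])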