Update dataset connectors #186

Merged: 8 commits, Nov 14, 2023
24 changes: 13 additions & 11 deletions src/connectors/huggingface/huggingface_dataset_connector.py
@@ -3,9 +3,8 @@
 import typing

 import bibtexparser
-import datasets
 import dateutil.parser
-import requests
+from huggingface_hub import list_datasets

 from connectors.abstract.resource_connector_on_start_up import ResourceConnectorOnStartUp
 from connectors.record_error import RecordError
@@ -14,6 +13,7 @@
 from database.model.agent.person import Person
 from database.model.ai_asset.distribution import Distribution
 from database.model.ai_resource.text import Text
+from database.model.concept.aiod_entry import AIoDEntryCreate
 from database.model.dataset.dataset import Dataset
 from database.model.knowledge_asset.publication import Publication
 from database.model.platform.platform_names import PlatformName
@@ -51,15 +51,15 @@ def fetch(
     ) -> typing.Iterator[ResourceWithRelations[Dataset] | RecordError]:
         pydantic_class = resource_create(Dataset)
         pydantic_class_publication = resource_create(Publication)
-        for dataset in itertools.islice(datasets.list_datasets(with_details=True), limit):
+        for dataset in itertools.islice(list_datasets(full=True), limit):
             try:
                 yield self.fetch_dataset(dataset, pydantic_class, pydantic_class_publication)
             except Exception as e:
                 yield RecordError(identifier=dataset.id, error=e)

     def fetch_dataset(self, dataset, pydantic_class, pydantic_class_publication):
         citations = []
-        if dataset.citation is not None:
+        if hasattr(dataset, "citation") and isinstance(dataset.citation, str):
             parsed_citations = bibtexparser.loads(dataset.citation).entries
             if len(parsed_citations) == 0:
                 if dataset.citation:
@@ -96,11 +96,11 @@ def fetch_dataset(self, dataset, pydantic_class, pydantic_class_publication):
         ]
         size = None
         ds_license = None
-        if dataset.cardData is not None and "license" in dataset.cardData:
-            if isinstance(dataset.cardData["license"], str):
-                ds_license = dataset.cardData["license"]
+        if dataset.card_data is not None and "license" in dataset.card_data:
+            if isinstance(dataset.card_data["license"], str):
+                ds_license = dataset.card_data["license"]
             else:
-                (ds_license,) = dataset.cardData["license"]
+                (ds_license,) = dataset.card_data["license"]

         # TODO(issue 8): implement
         # if "dataset_info" in dataset.cardData:
@@ -111,7 +111,8 @@ def fetch_dataset(self, dataset, pydantic_class, pydantic_class_publication):
         related_resources = {"citation": citations}
         if dataset.author is not None:
             related_resources["creator"] = [Person(name=dataset.author)]
-        description = dataset.description
+
+        description = getattr(dataset, "description", "")
         if len(description) > field_length.LONG:
             text_break = " [...]"
             description = description[: field_length.LONG - len(text_break)] + text_break
@@ -120,12 +121,13 @@ def fetch_dataset(self, dataset, pydantic_class, pydantic_class_publication):
-
         return ResourceWithRelations[Dataset](
             resource=pydantic_class(
+                aiod_entry=AIoDEntryCreate(status="published"),
                 platform_resource_identifier=dataset.id,
                 platform=self.platform_name,
+                description=description,
                 name=dataset.id,
                 same_as=f"https://huggingface.co/datasets/{dataset.id}",
                 date_modified=dateutil.parser.parse(dataset.lastModified),
-                description=description,
+                date_published=dataset.createdAt,
                 license=ds_license,
                 distributions=distributions,
                 is_accessible_for_free=True,
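Note: a minimal, self-contained sketch of the new listing call used above, assuming a recent huggingface_hub release (the preview_datasets helper is ours, for illustration only; per-record fields such as description or citation are not guaranteed to exist, which is why the connector guards with hasattr/getattr):

import itertools

from huggingface_hub import list_datasets


def preview_datasets(limit: int = 3) -> None:
    # full=True asks the Hub for extended metadata (card data, tags,
    # downloads, ...) on each DatasetInfo it yields.
    for info in itertools.islice(list_datasets(full=True), limit):
        # Mirror the connector's defensive attribute access.
        description = getattr(info, "description", "") or ""
        print(info.id, description[:60])


preview_datasets()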
9 changes: 5 additions & 4 deletions src/connectors/openml/openml_dataset_connector.py
@@ -77,22 +77,23 @@ def fetch_record(
         size = DatasetSize(value=_as_int(qualities_json["NumberOfInstances"]), unit="instances")
         return pydantic_class(
             aiod_entry=AIoDEntryCreate(
-                platform=self.platform_name,
-                platform_resource_identifier=identifier,
                 status="published",
             ),
+            platform_resource_identifier=identifier,
+            platform=self.platform_name,
             name=dataset_json["name"],
             same_as=url_data,
             description=description,
             date_published=dateutil.parser.parse(dataset_json["upload_date"]),
-            license=dataset_json["licence"] if "licence" in dataset_json else None,
             distribution=[
                 Distribution(
                     content_url=dataset_json["url"], encoding_format=dataset_json["format"]
                 )
             ],
-            size=size,
             is_accessible_for_free=True,
+            size=size,
             keyword=[tag for tag in dataset_json["tag"]] if "tag" in dataset_json else [],
+            license=dataset_json["licence"] if "licence" in dataset_json else None,
             version=dataset_json["version"],
         )
+
3 changes: 3 additions & 0 deletions src/connectors/zenodo/zenodo_dataset_connector.py
@@ -13,6 +13,7 @@
 from database.model import field_length
 from database.model.agent.person import Person
 from database.model.ai_resource.text import Text
+from database.model.concept.aiod_entry import AIoDEntryCreate
 from database.model.dataset.dataset import Dataset
 from database.model.platform.platform_names import PlatformName
 from database.model.resource_read_and_create import resource_create
@@ -66,6 +67,7 @@ def retry(self, _id: int) -> ResourceWithRelations[Dataset] | RecordError:

         pydantic_class = resource_create(Dataset)
         dataset = pydantic_class(
+            aiod_entry=AIoDEntryCreate(status="published"),
             platform="zenodo",
             platform_resource_identifier=_id,
             date_published=record.get("created"),
@@ -163,6 +165,7 @@ def _dataset_from_record(

         pydantic_class = resource_create(Dataset)
         dataset = pydantic_class(
+            aiod_entry=AIoDEntryCreate(status="published"),
             platform="zenodo",
             platform_resource_identifier=identifier,
             name=title,
@@ -1,6 +1,5 @@
 import json

-import pytest
 import responses

 from connectors.huggingface.huggingface_dataset_connector import HuggingFaceDatasetConnector
@@ -10,14 +9,14 @@
 HUGGINGFACE_URL = "https://datasets-server.huggingface.co"


-@pytest.mark.skip(reason="We'll fix this in a separate PR")
 def test_fetch_all_happy_path():
     ids_expected = {
         "0n1xus/codexglue",
         "04-07-22/wep-probes",
         "rotten_tomatoes",
         "acronym_identification",
         "air_dialogue",
+        "bobbydylan/top2k",
     }
     connector = HuggingFaceDatasetConnector()
     with responses.RequestsMock() as mocked_requests:
@@ -34,15 +33,17 @@ def test_fetch_all_happy_path():
             mock_parquet(mocked_requests, dataset_id)
         resources_with_relations = list(connector.fetch())

-    assert len(resources_with_relations) == 5
+    assert len(resources_with_relations) == len(ids_expected)
     assert all(type(r) == ResourceWithRelations for r in resources_with_relations)

     datasets = [r.resource for r in resources_with_relations]
-    ids = {d.platform_resource_identifier for d in datasets}
-    names = {d.name for d in datasets}
-    assert ids == ids_expected
-    assert names == ids_expected
+    assert {d.platform_resource_identifier for d in datasets} == ids_expected
+    assert {d.name for d in datasets} == ids_expected
+    assert all(d.date_published for d in datasets)
+    assert all(d.aiod_entry for d in datasets)

     assert all(len(r.related_resources) in (1, 2) for r in resources_with_relations)
-    assert all(len(r.related_resources["citation"]) == 1 for r in resources_with_relations)
+    assert all(len(r.related_resources["citation"]) == 1 for r in resources_with_relations[:5])


 def mock_parquet(mocked_requests: responses.RequestsMock, dataset_id: str):
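Note: the body of the mock_parquet helper is outside this diff. A sketch of the pattern it presumably follows with the responses library, with the URL and payload shape inferred from HUGGINGFACE_URL and the parquet fixture at the bottom of this PR:

import responses


def mock_parquet_sketch(mocked_requests: responses.RequestsMock, dataset_id: str) -> None:
    # Register a canned reply so code issuing
    # GET https://datasets-server.huggingface.co/parquet?dataset=<id>
    # is served from the mock instead of the network.
    mocked_requests.add(
        responses.GET,
        f"https://datasets-server.huggingface.co/parquet?dataset={dataset_id}",
        json={"parquet_files": [], "pending": [], "failed": [], "partial": False},
        status=200,
    )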
65 changes: 44 additions & 21 deletions src/tests/resources/connectors/huggingface/data_list.json
@@ -96,12 +96,12 @@
     "disabled": false,
     "gated": false,
     "lastModified": "2023-01-25T14:18:28.000Z",
-    "likes": 10,
+    "likes": 17,
     "private": false,
     "sha": "c3c245a18bbd57b1682b099e14460eebf154cbdf",
     "citation": "@inproceedings{veyseh-et-al-2020-what,\n title={{What Does This Acronym Mean? Introducing a New Dataset for Acronym Identification and Disambiguation}},\n author={Amir Pouran Ben Veyseh and Franck Dernoncourt and Quan Hung Tran and Thien Huu Nguyen},\n year={2020},\n booktitle={Proceedings of COLING},\n link={https://arxiv.org/pdf/2010.14678v1.pdf}\n}",
     "description": "Acronym identification training and development sets for the acronym identification task at SDU@AAAI-21.",
-    "downloads": 6011,
+    "downloads": 2189,
     "paperswithcode_id": "acronym-identification",
     "tags": [
       "task_categories:token-classification",
@@ -113,9 +113,10 @@
       "language:en",
       "license:mit",
       "acronym-identification",
-      "arxiv:2010.14678"
+      "arxiv:2010.14678",
+      "region:us"
     ],
-    "siblings": null,
+    "createdAt": "2022-03-02T23:29:22.000Z",
     "key": ""
   },
   {
@@ -276,13 +277,13 @@
     },
     "disabled": false,
     "gated": false,
-    "lastModified": "2023-01-25T14:43:24.000Z",
-    "likes": 11,
+    "lastModified": "2023-04-05T13:39:30.000Z",
+    "likes": 28,
     "private": false,
-    "sha": "c33cbf965006dba64f134f7bef69c53d5d0d285d",
+    "sha": "c9f4562ef4a6c84f0098f7845944a5472cb52cad",
     "citation": "@InProceedings{Pang+Lee:05a,\n author = {Bo Pang and Lillian Lee},\n title = {Seeing stars: Exploiting class relationships for sentiment\n categorization with respect to rating scales},\n booktitle = {Proceedings of the ACL},\n year = 2005\n}",
     "description": "Movie Review Dataset.\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\nsentences from Rotten Tomatoes movie reviews. This data was first used in Bo\nPang and Lillian Lee, ``Seeing stars: Exploiting class relationships for\nsentiment categorization with respect to rating scales.'', Proceedings of the\nACL, 2005.",
-    "downloads": 68254,
+    "downloads": 65361,
     "paperswithcode_id": "mr",
     "tags": [
       "task_categories:text-classification",
@@ -293,9 +294,10 @@
       "size_categories:1K<n<10K",
       "source_datasets:original",
       "language:en",
-      "license:unknown"
+      "license:unknown",
+      "region:us"
     ],
-    "siblings": null,
+    "createdAt": "2022-03-02T23:29:22.000Z",
     "key": ""
   },
   {
@@ -305,14 +307,16 @@
     "disabled": false,
     "gated": false,
     "lastModified": "2021-11-18T08:45:46.000Z",
-    "likes": 1,
+    "likes": 3,
     "private": false,
     "sha": "e4604616235cdfa7398d489ba1f95d44a18d2f5d",
     "citation": "@article{Lu2021,\nauthor = {Lu, Shuai and Guo, Daya and Ren, Shuo and Huang, Junjie and Svyatkovskiy, Alexey and Blanco, Ambrosio and Clement, Colin B. and Drain, Dawn and Jiang, Daxin and Tang, Duyu and Li, Ge and Zhou, Lidong and Shou, Linjun and Zhou, Long and Tufano, Michele and Gong, Ming and Zhou, Ming and Duan, Nan and Sundaresan, Neel and Deng, Shao Kun and Fu, Shengyu and Liu, Shujie},\nyear = {2021},\nbooktitle = {arXiv},\ntitle = {CodeXGLUE - A Machine Learning Benchmark Dataset for Code Understanding and Generation}\n}",
     "description": "CodeXGLUE is a benchmark dataset to foster machine learning research for program understanding and generation. \nCodeXGLUE includes a collection of 10 tasks across 14 datasets and a platform for model evaluation and comparison.",
-    "downloads": 295,
-    "tags": [],
-    "siblings": null,
+    "downloads": 387,
+    "tags": [
+      "region:us"
+    ],
+    "createdAt": "2022-03-02T23:29:22.000Z",
     "key": ""
   },
   {
Expand All @@ -330,11 +334,12 @@
"sha": "0681013e6518c8d53cac727b2ca4dc821ffd954c",
"citation": "Probing neural language models for understanding of words of estimative probability\nAnonymous submission",
"description": "Probing neural language models for understanding of words of estimative probability\nAnonymous submission",
"downloads": 0,
"downloads": 3,
"tags": [
"license:apache-2.0"
"license:apache-2.0",
"region:us"
],
"siblings": null,
"createdAt": "2022-07-12T11:48:56.000Z",
"key": ""
},
{
@@ -592,12 +597,12 @@
     "disabled": false,
     "gated": false,
     "lastModified": "2022-11-03T16:31:11.000Z",
-    "likes": 1,
+    "likes": 6,
     "private": false,
     "sha": "3ef284c2b1ca63cebd46335641fa31b09763f4e5",
     "citation": "@inproceedings{wei-etal-2018-airdialogue,\n title = \"{A}ir{D}ialogue: An Environment for Goal-Oriented Dialogue Research\",\n author = \"Wei, Wei and\n Le, Quoc and\n Dai, Andrew and\n Li, Jia\",\n booktitle = \"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing\",\n month = oct # \"-\" # nov,\n year = \"2018\",\n address = \"Brussels, Belgium\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D18-1419\",\n doi = \"10.18653/v1/D18-1419\",\n pages = \"3844--3854\",\n abstract = \"Recent progress in dialogue generation has inspired a number of studies on dialogue systems that are capable of accomplishing tasks through natural language interactions. A promising direction among these studies is the use of reinforcement learning techniques, such as self-play, for training dialogue agents. However, current datasets are limited in size, and the environment for training agents and evaluating progress is relatively unsophisticated. We present AirDialogue, a large dataset that contains 301,427 goal-oriented conversations. To collect this dataset, we create a context-generator which provides travel and flight restrictions. We then ask human annotators to play the role of a customer or an agent and interact with the goal of successfully booking a trip given the restrictions. Key to our environment is the ease of evaluating the success of the dialogue, which is achieved by using ground-truth states (e.g., the flight being booked) generated by the restrictions. Any dialogue agent that does not generate the correct states is considered to fail. Our experimental results indicate that state-of-the-art dialogue models can only achieve a score of 0.17 while humans can reach a score of 0.91, which suggests significant opportunities for future improvement.\",\n}",
     "description": "AirDialogue, is a large dataset that contains 402,038 goal-oriented conversations. To collect this dataset, we create a contextgenerator which provides travel and flight restrictions. Then the human annotators are asked to play the role of a customer or an agent and interact with the goal of successfully booking a trip given the restrictions.",
-    "downloads": 422,
+    "downloads": 420,
     "tags": [
       "task_categories:conversational",
       "task_categories:text-generation",
@@ -612,9 +617,27 @@
       "size_categories:100K<n<1M",
       "source_datasets:original",
       "language:en",
-      "license:cc-by-nc-4.0"
+      "license:cc-by-nc-4.0",
+      "region:us"
     ],
-    "siblings": null,
+    "createdAt": "2022-03-02T23:29:22.000Z",
     "key": ""
   },
+  {
+    "_id": "621ffdd236468d709f182fdf",
+    "id": "bobbydylan/top2k",
+    "author": "bobbydylan",
+    "disabled": false,
+    "gated": false,
+    "lastModified": "2022-02-01T02:59:04.000Z",
+    "likes": 0,
+    "private": false,
+    "sha": "5e264a6bb7728ba1fd46e96ccc8c7a56461ad661",
+    "downloads": 164,
+    "tags": [
+      "region:us"
+    ],
+    "createdAt": "2022-03-02T23:29:22.000Z",
+    "key": ""
+  }
 ]
@@ -0,0 +1,15 @@
+{
+  "parquet_files": [
+    {
+      "dataset": "bobbydylan/top2k",
+      "config": "default",
+      "split": "train",
+      "url": "https://huggingface.co/datasets/bobbydylan/top2k/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet",
+      "filename": "0000.parquet",
+      "size": 493181
+    }
+  ],
+  "pending": [],
+  "failed": [],
+  "partial": false
+}
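Note: this fixture mirrors the response shape of the Hugging Face datasets-server parquet endpoint. A sketch of the live request it stands in for (the test suite mocks this call, so it never reaches the network):

import requests

resp = requests.get(
    "https://datasets-server.huggingface.co/parquet",
    params={"dataset": "bobbydylan/top2k"},
    timeout=30,
)
resp.raise_for_status()
for parquet_file in resp.json()["parquet_files"]:
    # Each entry describes one downloadable parquet shard and its size in bytes.
    print(parquet_file["url"], parquet_file["size"])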