From f67b8139e2ddf6659e133456f62365f27faefa8c Mon Sep 17 00:00:00 2001 From: Binh Vu Date: Thu, 11 Jan 2024 23:30:55 +0000 Subject: [PATCH] add fro dict --- kgdata/models/multilingual.py | 4 + scripts/build.sh | 46 ++++----- scripts/download-data.ipynb | 186 ++++++++++++++++++++++++++++------ scripts/process-data.ipynb | 41 ++++++-- 4 files changed, 214 insertions(+), 63 deletions(-) diff --git a/kgdata/models/multilingual.py b/kgdata/models/multilingual.py index b6debbd..084ea68 100644 --- a/kgdata/models/multilingual.py +++ b/kgdata/models/multilingual.py @@ -84,6 +84,10 @@ def get_all(self) -> set[str]: def to_dict(self): return {"lang2values": self.lang2values, "lang": self.lang} + @staticmethod + def from_dict(obj: dict): + return MultiLingualStringList(obj["lang2values"], obj["lang"]) + def to_tuple(self): return self.lang2values, self.lang diff --git a/scripts/build.sh b/scripts/build.sh index e650205..e825ae9 100644 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -50,29 +50,29 @@ function wikidata_db { # # python -m kgdata.wikidata.datasets -d entity_redirection_dump --sign # # python -m kgdata.wikidata.datasets -d page_dump --sign -# wikidata_dataset page_ids -# wikidata_dataset entity_ids -# wikidata_dataset entity_redirections -# wikidata_dataset entities -# wikidata_dataset entity_types - -# wikidata_dataset classes -# wikidata_dataset properties - -# wikidata_dataset class_count -# wikidata_dataset property_count -# wikidata_dataset property_domains -# wikidata_dataset property_ranges - -# wikidata_dataset cross_wiki_mapping - -# wikidata_dataset entity_metadata -# wikidata_dataset entity_all_types -# wikidata_dataset entity_degrees -# wikidata_dataset entity_labels -# wikidata_dataset entity_types_and_degrees -# wikidata_dataset entity_outlinks -# wikidata_dataset entity_pagerank +wikidata_dataset page_ids +wikidata_dataset entity_ids +wikidata_dataset entity_redirections +wikidata_dataset entities +wikidata_dataset entity_types + +wikidata_dataset classes +wikidata_dataset properties + +wikidata_dataset class_count +wikidata_dataset property_count +wikidata_dataset property_domains +wikidata_dataset property_ranges + +wikidata_dataset cross_wiki_mapping + +wikidata_dataset entity_metadata +wikidata_dataset entity_all_types +wikidata_dataset entity_degrees +wikidata_dataset entity_labels +wikidata_dataset entity_types_and_degrees +wikidata_dataset entity_outlinks +wikidata_dataset entity_pagerank wikidata_dataset entity_wiki_aliases # ====================================================================== diff --git a/scripts/download-data.ipynb b/scripts/download-data.ipynb index f9939ff..1f03018 100644 --- a/scripts/download-data.ipynb +++ b/scripts/download-data.ipynb @@ -22,7 +22,15 @@ "cell_type": "code", "execution_count": 1, "id": "7b7128b3-ab65-43c7-b228-824a039e6515", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-08T14:43:34.259197Z", + "iopub.status.busy": "2024-01-08T14:43:34.258233Z", + "iopub.status.idle": "2024-01-08T14:43:34.270642Z", + "shell.execute_reply": "2024-01-08T14:43:34.269411Z", + "shell.execute_reply.started": "2024-01-08T14:43:34.259154Z" + } + }, "outputs": [], "source": [ "from os.path import expanduser\n", @@ -43,12 +51,20 @@ "cell_type": "code", "execution_count": 2, "id": "9c5fcfc2-a961-4780-892e-4bca7a0de15f", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-08T14:43:34.273546Z", + "iopub.status.busy": "2024-01-08T14:43:34.273027Z", + "iopub.status.idle": "2024-01-08T14:43:35.198066Z", + "shell.execute_reply": "2024-01-08T14:43:35.196633Z", + "shell.execute_reply.started": "2024-01-08T14:43:34.273510Z" + } + }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3f953b8eee6746389eab7561355e3422", + "model_id": "352e074cad1f4104af32c190bc49ee09", "version_major": 2, "version_minor": 0 }, @@ -77,17 +93,25 @@ "cell_type": "code", "execution_count": 3, "id": "d84f52a4", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-08T14:38:21.018659Z", + "iopub.status.busy": "2024-01-08T14:38:21.017432Z", + "iopub.status.idle": "2024-01-08T14:38:21.074577Z", + "shell.execute_reply": "2024-01-08T14:38:21.073715Z", + "shell.execute_reply.started": "2024-01-08T14:38:21.018602Z" + } + }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "77e8475eac214e21a9f850ac58249d9b", + "model_id": "3b2a5e9f86774e9990087537fe80eefa", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HBox(children=(Button(description='2023-08-21', style=ButtonStyle()), Button(description='2023-07-31', style=B…" + "HBox(children=(Button(description='2024-01-01', style=ButtonStyle()), Button(description='2023-12-18', style=B…" ] }, "metadata": {}, @@ -96,7 +120,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e0d7e19bc0b849be8146cec96885ef1f", + "model_id": "7763343af7bb4cba9e58fa0cf8d12232", "version_major": 2, "version_minor": 0 }, @@ -149,22 +173,30 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "b3a39dfc", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-08T14:43:35.276278Z", + "iopub.status.busy": "2024-01-08T14:43:35.275603Z", + "iopub.status.idle": "2024-01-08T14:43:35.288422Z", + "shell.execute_reply": "2024-01-08T14:43:35.286988Z", + "shell.execute_reply.started": "2024-01-08T14:43:35.276211Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Selected date: 2023-06-19\n" + "Selected date: 2024-01-01\n" ] } ], "source": [ "# set the selected date from the selected button, or you can manually set it to the date you want\n", "# SELECTED_DUMP_DATE = selected_date\n", - "SELECTED_DUMP_DATE = datetime.date(2023, 6, 19)\n", + "SELECTED_DUMP_DATE = datetime.date(2024, 1, 1)\n", "print(\"Selected date:\", SELECTED_DUMP_DATE.isoformat())" ] }, @@ -186,19 +218,27 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "0bf19885", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-08T14:43:37.722477Z", + "iopub.status.busy": "2024-01-08T14:43:37.721947Z", + "iopub.status.idle": "2024-01-08T14:43:37.733911Z", + "shell.execute_reply": "2024-01-08T14:43:37.732473Z", + "shell.execute_reply.started": "2024-01-08T14:43:37.722430Z" + } + }, "outputs": [ { "data": { "text/plain": [ - "[DumpFile(date=datetime.date(2023, 6, 19), url='https://dumps.wikimedia.org/wikidatawiki/entities/20230619/wikidata-20230619-all.json.bz2'),\n", - " DumpFile(date=datetime.date(2023, 6, 20), url='https://dumps.wikimedia.org/wikidatawiki/20230620/wikidatawiki-20230620-page.sql.gz'),\n", - " DumpFile(date=datetime.date(2023, 6, 20), url='https://dumps.wikimedia.org/wikidatawiki/20230620/wikidatawiki-20230620-redirect.sql.gz')]" + "[DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/entities/20240101/wikidata-20240101-all.json.bz2'),\n", + " DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz'),\n", + " DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz')]" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -210,9 +250,16 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "id": "cb09fe48-4c4f-4b32-ae59-04a18b9ae0c4", "metadata": { + "execution": { + "iopub.execute_input": "2024-01-08T14:43:39.327579Z", + "iopub.status.busy": "2024-01-08T14:43:39.327077Z", + "iopub.status.idle": "2024-01-08T14:43:39.334583Z", + "shell.execute_reply": "2024-01-08T14:43:39.333210Z", + "shell.execute_reply.started": "2024-01-08T14:43:39.327537Z" + }, "tags": [] }, "outputs": [], @@ -222,12 +269,35 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "eceb23a2-98fa-41fd-b297-74414e773541", - "metadata": {}, - "outputs": [], + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-08T14:43:51.569735Z", + "iopub.status.busy": "2024-01-08T14:43:51.569185Z", + "iopub.status.idle": "2024-01-08T14:43:51.586622Z", + "shell.execute_reply": "2024-01-08T14:43:51.585166Z", + "shell.execute_reply.started": "2024-01-08T14:43:51.569689Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz',\n", + " PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-page.sql.gz')),\n", + " ('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz',\n", + " PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-redirect.sql.gz'))]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "wdjobs = wikidata.create_download_jobs(wd_files, WIKIDATA_DIR)" + "wdjobs = wikidata.create_download_jobs(wd_files, WIKIDATA_DIR)\n", + "wdjobs" ] }, { @@ -280,19 +350,78 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, + "id": "8fe99975-8c7a-47ec-aa1b-21b9a47ce756", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-08T14:45:00.269360Z", + "iopub.status.busy": "2024-01-08T14:45:00.268768Z", + "iopub.status.idle": "2024-01-08T14:45:00.279659Z", + "shell.execute_reply": "2024-01-08T14:45:00.278195Z", + "shell.execute_reply.started": "2024-01-08T14:45:00.269310Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz',\n", + " PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-page.sql.gz')),\n", + " ('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz',\n", + " PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-redirect.sql.gz'))]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "jobs = []\n", + "if \"wdjobs\" in locals():\n", + " jobs += wdjobs\n", + "if \"wpjobs\" in locals():\n", + " jobs += wpjobs\n", + "jobs" + ] + }, + { + "cell_type": "code", + "execution_count": 10, "id": "39cdeb3b", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-08T14:45:04.095000Z", + "iopub.status.busy": "2024-01-08T14:45:04.094299Z", + "iopub.status.idle": "2024-01-08T14:57:11.806833Z", + "shell.execute_reply": "2024-01-08T14:57:11.805552Z", + "shell.execute_reply.started": "2024-01-08T14:45:04.094953Z" + } + }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "17d1fee843d6488ab7f3ed75b7f6e070", + "model_id": "94635d16d4204e1bbb5df280c67fe726", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Download wikidatawiki-20240101-page.sql.gz: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3a3bdbe709f746f48be015779de8f51d", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Download enwiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz: 0.00B [00:00, ?B/s]" + "Download wikidatawiki-20240101-redirect.sql.gz: 0.00B [00:00, ?B/s]" ] }, "metadata": {}, @@ -300,7 +429,6 @@ } ], "source": [ - "jobs = wdjobs + wpjobs\n", "with WGet.start() as wget:\n", " for url, outfile in jobs:\n", " wget.download(url, outfile)\n", @@ -318,9 +446,9 @@ ], "metadata": { "kernelspec": { - "display_name": "gramsplus", + "display_name": "resm", "language": "python", - "name": "gramsplus" + "name": "resm" }, "language_info": { "codemirror_mode": { diff --git a/scripts/process-data.ipynb b/scripts/process-data.ipynb index 83d44ea..40a7a1f 100644 --- a/scripts/process-data.ipynb +++ b/scripts/process-data.ipynb @@ -20,24 +20,35 @@ "cell_type": "code", "execution_count": 1, "id": "e39534bf-e36b-4b38-9fbe-7008899d40ee", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-08T14:47:20.736884Z", + "iopub.status.busy": "2024-01-08T14:47:20.736207Z", + "iopub.status.idle": "2024-01-08T14:47:21.125873Z", + "shell.execute_reply": "2024-01-08T14:47:21.125068Z", + "shell.execute_reply.started": "2024-01-08T14:47:20.736837Z" + } + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2023-08-20 11:23:18.010 | INFO | kgdata.config:init_dbdir_from_env:23 - Wikidata directory: /nas/home/binhvu/kgdata/wikidata/20230619\n", - "2023-08-20 11:23:18.016 | INFO | kgdata.config:init_dbdir_from_env:29 - DBpedia directory: /nas/home/binhvu/kgdata/dbpedia/20221201\n", - "2023-08-20 11:23:18.018 | INFO | kgdata.config:init_dbdir_from_env:37 - Wikipedia directory: /nas/home/binhvu/kgdata/wikipedia/20230620\n" + "\u001b[32m2024-01-08 14:47:20.782\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mkgdata.config\u001b[0m:\u001b[36minit_dbdir_from_env\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1mWikidata directory: /nas/home/binhvu/kgdata/wikidata/20240101\u001b[0m\n", + "\u001b[32m2024-01-08 14:47:20.783\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mkgdata.config\u001b[0m:\u001b[36minit_dbdir_from_env\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1mDBpedia directory: /nas/home/binhvu/kgdata/dbpedia/20221201\u001b[0m\n", + "\u001b[32m2024-01-08 14:47:20.785\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mkgdata.config\u001b[0m:\u001b[36minit_dbdir_from_env\u001b[0m:\u001b[36m42\u001b[0m - \u001b[1mWikipedia directory: /nas/home/binhvu/kgdata/wikipedia/20240101\u001b[0m\n" ] } ], "source": [ "import os, subprocess, sys\n", "\n", - "WD_DATE = \"20230619\"\n", + "WD_DATE = \"20240101\"\n", + "WP_DATE = \"20240101\"\n", + "\n", + "# WD_DATE = \"20230619\"\n", "# WD_DATE = \"20211213\"\n", - "WP_DATE = \"20230620\"\n", + "# WP_DATE = \"20230620\"\n", "DBP_DATE = \"20221201\"\n", "\n", "os.environ[\"WD_DIR\"] = os.path.expanduser(f\"~/kgdata/wikidata/{WD_DATE}\")\n", @@ -48,9 +59,7 @@ "\n", "init_dbdir_from_env()\n", "\n", - "# from kgdata.wikidata.datasets import import_dataset as import_wd_ds\n", - "# from kgdata.wikipedia.datasets import import_dataset as import_wp_ds\n", - "# from kgdata.dbpedia.datasets import import_dataset as import_db_ds" + "from kgdata.dataset import import_dataset" ] }, { @@ -69,6 +78,16 @@ "#### Make datasets" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "22fa9417-d389-4064-8cc2-289ec2dee64a", + "metadata": {}, + "outputs": [], + "source": [ + "import_dataset(\"wikidata.entities\")" + ] + }, { "cell_type": "markdown", "id": "e3962948-a962-4d9c-a92e-b4288c5a316e", @@ -609,9 +628,9 @@ ], "metadata": { "kernelspec": { - "display_name": "gramsplus", + "display_name": "resm", "language": "python", - "name": "gramsplus" + "name": "resm" }, "language_info": { "codemirror_mode": {