Skip to content

Commit

Permalink
add from dict
Browse files Browse the repository at this point in the history
  • Loading branch information
Binh Vu committed Jan 11, 2024
1 parent 7db7965 commit f67b813
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 63 deletions.
4 changes: 4 additions & 0 deletions kgdata/models/multilingual.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ def get_all(self) -> set[str]:
def to_dict(self):
    """Return a plain-dict view of this object.

    The result has exactly two keys, ``"lang2values"`` and ``"lang"``,
    holding the attributes of the same names (no copy is made).
    """
    return dict(lang2values=self.lang2values, lang=self.lang)

@staticmethod
def from_dict(obj: dict):
    """Build a MultiLingualStringList from a mapping shaped like ``to_dict``'s output.

    ``obj`` must provide the keys ``"lang2values"`` and ``"lang"``; they are
    passed positionally, in that order, to the constructor. A missing key
    raises ``KeyError``.
    """
    lang2values = obj["lang2values"]
    lang = obj["lang"]
    return MultiLingualStringList(lang2values, lang)

def to_tuple(self):
    """Return the pair ``(lang2values, lang)`` read from this instance."""
    pair = (self.lang2values, self.lang)
    return pair

Expand Down
46 changes: 23 additions & 23 deletions scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,29 +50,29 @@ function wikidata_db {
# # python -m kgdata.wikidata.datasets -d entity_redirection_dump --sign
# # python -m kgdata.wikidata.datasets -d page_dump --sign

# wikidata_dataset page_ids
# wikidata_dataset entity_ids
# wikidata_dataset entity_redirections
# wikidata_dataset entities
# wikidata_dataset entity_types

# wikidata_dataset classes
# wikidata_dataset properties

# wikidata_dataset class_count
# wikidata_dataset property_count
# wikidata_dataset property_domains
# wikidata_dataset property_ranges

# wikidata_dataset cross_wiki_mapping

# wikidata_dataset entity_metadata
# wikidata_dataset entity_all_types
# wikidata_dataset entity_degrees
# wikidata_dataset entity_labels
# wikidata_dataset entity_types_and_degrees
# wikidata_dataset entity_outlinks
# wikidata_dataset entity_pagerank
wikidata_dataset page_ids
wikidata_dataset entity_ids
wikidata_dataset entity_redirections
wikidata_dataset entities
wikidata_dataset entity_types

wikidata_dataset classes
wikidata_dataset properties

wikidata_dataset class_count
wikidata_dataset property_count
wikidata_dataset property_domains
wikidata_dataset property_ranges

wikidata_dataset cross_wiki_mapping

wikidata_dataset entity_metadata
wikidata_dataset entity_all_types
wikidata_dataset entity_degrees
wikidata_dataset entity_labels
wikidata_dataset entity_types_and_degrees
wikidata_dataset entity_outlinks
wikidata_dataset entity_pagerank
wikidata_dataset entity_wiki_aliases

# ======================================================================
Expand Down
186 changes: 157 additions & 29 deletions scripts/download-data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,15 @@
"cell_type": "code",
"execution_count": 1,
"id": "7b7128b3-ab65-43c7-b228-824a039e6515",
"metadata": {},
"metadata": {
"execution": {
"iopub.execute_input": "2024-01-08T14:43:34.259197Z",
"iopub.status.busy": "2024-01-08T14:43:34.258233Z",
"iopub.status.idle": "2024-01-08T14:43:34.270642Z",
"shell.execute_reply": "2024-01-08T14:43:34.269411Z",
"shell.execute_reply.started": "2024-01-08T14:43:34.259154Z"
}
},
"outputs": [],
"source": [
"from os.path import expanduser\n",
Expand All @@ -43,12 +51,20 @@
"cell_type": "code",
"execution_count": 2,
"id": "9c5fcfc2-a961-4780-892e-4bca7a0de15f",
"metadata": {},
"metadata": {
"execution": {
"iopub.execute_input": "2024-01-08T14:43:34.273546Z",
"iopub.status.busy": "2024-01-08T14:43:34.273027Z",
"iopub.status.idle": "2024-01-08T14:43:35.198066Z",
"shell.execute_reply": "2024-01-08T14:43:35.196633Z",
"shell.execute_reply.started": "2024-01-08T14:43:34.273510Z"
}
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3f953b8eee6746389eab7561355e3422",
"model_id": "352e074cad1f4104af32c190bc49ee09",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -77,17 +93,25 @@
"cell_type": "code",
"execution_count": 3,
"id": "d84f52a4",
"metadata": {},
"metadata": {
"execution": {
"iopub.execute_input": "2024-01-08T14:38:21.018659Z",
"iopub.status.busy": "2024-01-08T14:38:21.017432Z",
"iopub.status.idle": "2024-01-08T14:38:21.074577Z",
"shell.execute_reply": "2024-01-08T14:38:21.073715Z",
"shell.execute_reply.started": "2024-01-08T14:38:21.018602Z"
}
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "77e8475eac214e21a9f850ac58249d9b",
"model_id": "3b2a5e9f86774e9990087537fe80eefa",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(Button(description='2023-08-21', style=ButtonStyle()), Button(description='2023-07-31', style=B…"
"HBox(children=(Button(description='2024-01-01', style=ButtonStyle()), Button(description='2023-12-18', style=B…"
]
},
"metadata": {},
Expand All @@ -96,7 +120,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e0d7e19bc0b849be8146cec96885ef1f",
"model_id": "7763343af7bb4cba9e58fa0cf8d12232",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -149,22 +173,30 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"id": "b3a39dfc",
"metadata": {},
"metadata": {
"execution": {
"iopub.execute_input": "2024-01-08T14:43:35.276278Z",
"iopub.status.busy": "2024-01-08T14:43:35.275603Z",
"iopub.status.idle": "2024-01-08T14:43:35.288422Z",
"shell.execute_reply": "2024-01-08T14:43:35.286988Z",
"shell.execute_reply.started": "2024-01-08T14:43:35.276211Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Selected date: 2023-06-19\n"
"Selected date: 2024-01-01\n"
]
}
],
"source": [
"# set the selected date from the selected button, or you can manually set it to the date you want\n",
"# SELECTED_DUMP_DATE = selected_date\n",
"SELECTED_DUMP_DATE = datetime.date(2023, 6, 19)\n",
"SELECTED_DUMP_DATE = datetime.date(2024, 1, 1)\n",
"print(\"Selected date:\", SELECTED_DUMP_DATE.isoformat())"
]
},
Expand All @@ -186,19 +218,27 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"id": "0bf19885",
"metadata": {},
"metadata": {
"execution": {
"iopub.execute_input": "2024-01-08T14:43:37.722477Z",
"iopub.status.busy": "2024-01-08T14:43:37.721947Z",
"iopub.status.idle": "2024-01-08T14:43:37.733911Z",
"shell.execute_reply": "2024-01-08T14:43:37.732473Z",
"shell.execute_reply.started": "2024-01-08T14:43:37.722430Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[DumpFile(date=datetime.date(2023, 6, 19), url='https://dumps.wikimedia.org/wikidatawiki/entities/20230619/wikidata-20230619-all.json.bz2'),\n",
" DumpFile(date=datetime.date(2023, 6, 20), url='https://dumps.wikimedia.org/wikidatawiki/20230620/wikidatawiki-20230620-page.sql.gz'),\n",
" DumpFile(date=datetime.date(2023, 6, 20), url='https://dumps.wikimedia.org/wikidatawiki/20230620/wikidatawiki-20230620-redirect.sql.gz')]"
"[DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/entities/20240101/wikidata-20240101-all.json.bz2'),\n",
" DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz'),\n",
" DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz')]"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -210,9 +250,16 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 5,
"id": "cb09fe48-4c4f-4b32-ae59-04a18b9ae0c4",
"metadata": {
"execution": {
"iopub.execute_input": "2024-01-08T14:43:39.327579Z",
"iopub.status.busy": "2024-01-08T14:43:39.327077Z",
"iopub.status.idle": "2024-01-08T14:43:39.334583Z",
"shell.execute_reply": "2024-01-08T14:43:39.333210Z",
"shell.execute_reply.started": "2024-01-08T14:43:39.327537Z"
},
"tags": []
},
"outputs": [],
Expand All @@ -222,12 +269,35 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"id": "eceb23a2-98fa-41fd-b297-74414e773541",
"metadata": {},
"outputs": [],
"metadata": {
"execution": {
"iopub.execute_input": "2024-01-08T14:43:51.569735Z",
"iopub.status.busy": "2024-01-08T14:43:51.569185Z",
"iopub.status.idle": "2024-01-08T14:43:51.586622Z",
"shell.execute_reply": "2024-01-08T14:43:51.585166Z",
"shell.execute_reply.started": "2024-01-08T14:43:51.569689Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz',\n",
" PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-page.sql.gz')),\n",
" ('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz',\n",
" PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-redirect.sql.gz'))]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wdjobs = wikidata.create_download_jobs(wd_files, WIKIDATA_DIR)"
"wdjobs = wikidata.create_download_jobs(wd_files, WIKIDATA_DIR)\n",
"wdjobs"
]
},
{
Expand Down Expand Up @@ -280,27 +350,85 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "8fe99975-8c7a-47ec-aa1b-21b9a47ce756",
"metadata": {
"execution": {
"iopub.execute_input": "2024-01-08T14:45:00.269360Z",
"iopub.status.busy": "2024-01-08T14:45:00.268768Z",
"iopub.status.idle": "2024-01-08T14:45:00.279659Z",
"shell.execute_reply": "2024-01-08T14:45:00.278195Z",
"shell.execute_reply.started": "2024-01-08T14:45:00.269310Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz',\n",
" PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-page.sql.gz')),\n",
" ('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz',\n",
" PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-redirect.sql.gz'))]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"jobs = []\n",
"if \"wdjobs\" in locals():\n",
" jobs += wdjobs\n",
"if \"wpjobs\" in locals():\n",
" jobs += wpjobs\n",
"jobs"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "39cdeb3b",
"metadata": {},
"metadata": {
"execution": {
"iopub.execute_input": "2024-01-08T14:45:04.095000Z",
"iopub.status.busy": "2024-01-08T14:45:04.094299Z",
"iopub.status.idle": "2024-01-08T14:57:11.806833Z",
"shell.execute_reply": "2024-01-08T14:57:11.805552Z",
"shell.execute_reply.started": "2024-01-08T14:45:04.094953Z"
}
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "17d1fee843d6488ab7f3ed75b7f6e070",
"model_id": "94635d16d4204e1bbb5df280c67fe726",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Download wikidatawiki-20240101-page.sql.gz: 0.00B [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3a3bdbe709f746f48be015779de8f51d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Download enwiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz: 0.00B [00:00, ?B/s]"
"Download wikidatawiki-20240101-redirect.sql.gz: 0.00B [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"jobs = wdjobs + wpjobs\n",
"with WGet.start() as wget:\n",
" for url, outfile in jobs:\n",
" wget.download(url, outfile)\n",
Expand All @@ -318,9 +446,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "gramsplus",
"display_name": "resm",
"language": "python",
"name": "gramsplus"
"name": "resm"
},
"language_info": {
"codemirror_mode": {
Expand Down
Loading

0 comments on commit f67b813

Please sign in to comment.