From 794a83c137e29919bd881066bd5aa0d8b56f454f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=9B=A7=E5=9B=A7?= Date: Tue, 20 Aug 2024 22:59:36 -0400 Subject: [PATCH] Automatically upload binary dataset to s3 (#4114) (cherry picked from commit e90380ea89fd37a7fd29bf933d7e81ab1af51318) --- .github/workflows/ci-workflow.yml | 14 +++ benchmark/serialize.cypher | 124 +++++++++++++------------- benchmark/serializer.py | 3 +- dataset/demo-db/parquet/copy.cypher | 8 +- dataset/demo-db/parquet/schema.cypher | 8 +- scripts/generate_binary_tinysnb.sh | 5 ++ 6 files changed, 91 insertions(+), 71 deletions(-) create mode 100755 scripts/generate_binary_tinysnb.sh diff --git a/.github/workflows/ci-workflow.yml b/.github/workflows/ci-workflow.yml index c3bf61d8a40..ad19b100109 100644 --- a/.github/workflows/ci-workflow.yml +++ b/.github/workflows/ci-workflow.yml @@ -76,6 +76,20 @@ jobs: - name: Generate datasets run: bash scripts/generate_binary_demo.sh + + - name: Generate and upload tinysnb + run: | + bash scripts/generate_binary_tinysnb.sh + s3cmd get s3://kuzu-test/tinysnb/tinysnb/version.txt + if [ "$(cat tinysnb/version.txt)" == "$(cat version.txt)" ]; then + echo "TinySNB dataset is up to date, skipping upload" + rm -rf tinysnb version.txt + exit 0 + fi + echo "TinySNB dataset is outdated, uploading..." + s3cmd del -r s3://kuzu-test/tinysnb/ + s3cmd sync ./tinysnb s3://kuzu-test/tinysnb/ + rm -rf tinysnb version.txt - name: Upload binary-demo uses: actions/upload-artifact@v4 diff --git a/benchmark/serialize.cypher b/benchmark/serialize.cypher index 0f00dbbf802..af4442e6e60 100644 --- a/benchmark/serialize.cypher +++ b/benchmark/serialize.cypher @@ -1,62 +1,62 @@ -create node table Person (ID INT64,firstName STRING,lastName STRING,gender STRING,birthday DATE,creationDate TIMESTAMP,locationIP STRING,browserUsed STRING, PRIMARY KEY(ID)) -copy Person from "{}/person_0_0.csv" (HEADER=true, DELIM="|") -create node table Forum (ID INT64,title STRING,creationDate TIMESTAMP, PRIMARY KEY(ID)) -copy Forum from "{}/forum_0_0.csv" (HEADER=true, DELIM="|") -create node table Post (ID INT64,imageFile STRING,creationDate TIMESTAMP,locationIP STRING,browserUsed STRING,language STRING,content STRING,length INT64, PRIMARY KEY(ID)) -copy Post from "{}/post_0_0.csv" (HEADER=true, DELIM="|") -create node table Comment (ID INT64,creationDate TIMESTAMP,locationIP STRING,browserUsed STRING,content STRING,length INT64, PRIMARY KEY(ID)) -copy Comment from "{}/comment_0_0.csv" (HEADER=true, DELIM="|") -create node table Tag (ID INT64,name STRING,url STRING, PRIMARY KEY(ID)) -copy Tag from "{}/tag_0_0.csv" (HEADER=true, DELIM="|") -create node table Tagclass (ID INT64,name STRING,url STRING, PRIMARY KEY(ID)) -copy Tagclass from "{}/tagclass_0_0.csv" (HEADER=true, DELIM="|") -create node table Place (ID INT64,name STRING,url STRING,type STRING, PRIMARY KEY(ID)) -copy Place from "{}/place_0_0.csv" (HEADER=true, DELIM="|") -create node table Organisation (ID INT64,type STRING,name STRING,url STRING, PRIMARY KEY(ID)) -copy Organisation from "{}/organisation_0_0.csv" (HEADER=true, DELIM="|") -create rel table containerOf (FROM Forum TO Post,ONE_MANY) -copy containerOf from "{}/forum_containerOf_post_0_0.csv" (HEADER=true, DELIM="|") -create rel table comment_hasCreator (FROM Comment TO Person, MANY_ONE) -copy comment_hasCreator from "{}/comment_hasCreator_person_0_0.csv" (HEADER=true, DELIM="|") -create rel table post_hasCreator (FROM Post TO Person,MANY_ONE) -copy post_hasCreator from "{}/post_hasCreator_person_0_0.csv" (HEADER=true, DELIM="|") -create rel table hasInterest (FROM Person TO Tag, MANY_MANY) -copy hasInterest from "{}/person_hasInterest_tag_0_0.csv" (HEADER=true, DELIM="|") -create rel table hasMember (FROM Forum TO Person,joinDate TIMESTAMP,MANY_MANY) -copy hasMember from "{}/forum_hasMember_person_0_0.csv" (HEADER=true, DELIM="|") -create rel table hasModerator (FROM Forum TO Person,MANY_ONE) -copy hasModerator from "{}/forum_hasModerator_person_0_0.csv" (HEADER=true, DELIM="|") -create rel table comment_hasTag (FROM Comment TO Tag,MANY_MANY) -copy comment_hasTag from "{}/comment_hasTag_tag_0_0.csv" (HEADER=true, DELIM="|") -create rel table forum_hasTag (FROM Forum TO Tag,MANY_MANY) -copy forum_hasTag from "{}/forum_hasTag_tag_0_0.csv" (HEADER=true, DELIM="|") -create rel table post_hasTag (FROM Post TO Tag,MANY_MANY) -copy post_hasTag from "{}/post_hasTag_tag_0_0.csv" (HEADER=true, DELIM="|") -create rel table hasType (FROM Tag TO Tagclass,MANY_ONE) -copy hasType from "{}/tag_hasType_tagclass_0_0.csv" (HEADER=true, DELIM="|") -create rel table comment_isLocatedIn (FROM Comment TO Place,MANY_ONE) -copy comment_isLocatedIn from "{}/comment_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|") -create rel table organisation_isLocatedIn (FROM Organisation TO Place,MANY_ONE) -copy organisation_isLocatedIn from "{}/organisation_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|") -create rel table person_isLocatedIn (FROM Person TO Place,MANY_ONE) -copy person_isLocatedIn from "{}/person_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|") -create rel table post_isLocatedIn (FROM Post TO Place,MANY_ONE) -copy post_isLocatedIn from "{}/post_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|") -create rel table isPartOf (FROM Place TO Place,MANY_ONE) -copy isPartOf from "{}/place_isPartOf_place_0_0.csv" (HEADER=true, DELIM="|") -create rel table isSubclassOf (FROM Tagclass TO Tagclass,MANY_ONE) -copy isSubclassOf from "{}/tagclass_isSubclassOf_tagclass_0_0.csv" (HEADER=true, DELIM="|") -create rel table knows (FROM Person TO Person,creationDate TIMESTAMP,MANY_MANY) -copy knows from "{}/person_knows_person_0_0.csv" (HEADER=true, DELIM="|") -create rel table likes_comment (FROM Person TO Comment,creationDate TIMESTAMP,MANY_MANY) -copy likes_comment from "{}/person_likes_comment_0_0.csv" (HEADER=true, DELIM="|") -create rel table likes_post (FROM Person TO Post,creationDate TIMESTAMP,MANY_MANY) -copy likes_post from "{}/person_likes_post_0_0.csv" (HEADER=true, DELIM="|") -create rel table replyOf_comment (FROM Comment TO Comment,MANY_ONE) -copy replyOf_comment from "{}/comment_replyOf_comment_0_0.csv" (HEADER=true, DELIM="|") -create rel table replyOf_post (FROM Comment TO Post,MANY_ONE) -copy replyOf_post from "{}/comment_replyOf_post_0_0.csv" (HEADER=true, DELIM="|") -create rel table studyAt (FROM Person TO Organisation,classYear INT64,MANY_MANY) -copy studyAt from "{}/person_studyAt_organisation_0_0.csv" (HEADER=true, DELIM="|") -create rel table workAt (FROM Person TO Organisation,workFrom INT64,MANY_MANY) -copy workAt from "{}/person_workAt_organisation_0_0.csv" (HEADER=true, DELIM="|") +create node table Person (ID INT64,firstName STRING,lastName STRING,gender STRING,birthday DATE,creationDate TIMESTAMP,locationIP STRING,browserUsed STRING, PRIMARY KEY(ID)); +copy Person from "{}/person_0_0.csv" (HEADER=true, DELIM="|"); +create node table Forum (ID INT64,title STRING,creationDate TIMESTAMP, PRIMARY KEY(ID)); +copy Forum from "{}/forum_0_0.csv" (HEADER=true, DELIM="|"); +create node table Post (ID INT64,imageFile STRING,creationDate TIMESTAMP,locationIP STRING,browserUsed STRING,language STRING,content STRING,length INT64, PRIMARY KEY(ID)); +copy Post from "{}/post_0_0.csv" (HEADER=true, DELIM="|"); +create node table Comment (ID INT64,creationDate TIMESTAMP,locationIP STRING,browserUsed STRING,content STRING,length INT64, PRIMARY KEY(ID)); +copy Comment from "{}/comment_0_0.csv" (HEADER=true, DELIM="|"); +create node table Tag (ID INT64,name STRING,url STRING, PRIMARY KEY(ID)); +copy Tag from "{}/tag_0_0.csv" (HEADER=true, DELIM="|"); +create node table Tagclass (ID INT64,name STRING,url STRING, PRIMARY KEY(ID)); +copy Tagclass from "{}/tagclass_0_0.csv" (HEADER=true, DELIM="|"); +create node table Place (ID INT64,name STRING,url STRING,type STRING, PRIMARY KEY(ID)); +copy Place from "{}/place_0_0.csv" (HEADER=true, DELIM="|"); +create node table Organisation (ID INT64,type STRING,name STRING,url STRING, PRIMARY KEY(ID)); +copy Organisation from "{}/organisation_0_0.csv" (HEADER=true, DELIM="|"); +create rel table containerOf (FROM Forum TO Post,ONE_MANY); +copy containerOf from "{}/forum_containerOf_post_0_0.csv" (HEADER=true, DELIM="|"); +create rel table comment_hasCreator (FROM Comment TO Person, MANY_ONE); +copy comment_hasCreator from "{}/comment_hasCreator_person_0_0.csv" (HEADER=true, DELIM="|"); +create rel table post_hasCreator (FROM Post TO Person,MANY_ONE); +copy post_hasCreator from "{}/post_hasCreator_person_0_0.csv" (HEADER=true, DELIM="|"); +create rel table hasInterest (FROM Person TO Tag, MANY_MANY); +copy hasInterest from "{}/person_hasInterest_tag_0_0.csv" (HEADER=true, DELIM="|"); +create rel table hasMember (FROM Forum TO Person,joinDate TIMESTAMP,MANY_MANY); +copy hasMember from "{}/forum_hasMember_person_0_0.csv" (HEADER=true, DELIM="|"); +create rel table hasModerator (FROM Forum TO Person,MANY_ONE); +copy hasModerator from "{}/forum_hasModerator_person_0_0.csv" (HEADER=true, DELIM="|"); +create rel table comment_hasTag (FROM Comment TO Tag,MANY_MANY); +copy comment_hasTag from "{}/comment_hasTag_tag_0_0.csv" (HEADER=true, DELIM="|"); +create rel table forum_hasTag (FROM Forum TO Tag,MANY_MANY); +copy forum_hasTag from "{}/forum_hasTag_tag_0_0.csv" (HEADER=true, DELIM="|"); +create rel table post_hasTag (FROM Post TO Tag,MANY_MANY); +copy post_hasTag from "{}/post_hasTag_tag_0_0.csv" (HEADER=true, DELIM="|"); +create rel table hasType (FROM Tag TO Tagclass,MANY_ONE); +copy hasType from "{}/tag_hasType_tagclass_0_0.csv" (HEADER=true, DELIM="|"); +create rel table comment_isLocatedIn (FROM Comment TO Place,MANY_ONE); +copy comment_isLocatedIn from "{}/comment_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|"); +create rel table organisation_isLocatedIn (FROM Organisation TO Place,MANY_ONE); +copy organisation_isLocatedIn from "{}/organisation_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|"); +create rel table person_isLocatedIn (FROM Person TO Place,MANY_ONE); +copy person_isLocatedIn from "{}/person_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|"); +create rel table post_isLocatedIn (FROM Post TO Place,MANY_ONE); +copy post_isLocatedIn from "{}/post_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|"); +create rel table isPartOf (FROM Place TO Place,MANY_ONE); +copy isPartOf from "{}/place_isPartOf_place_0_0.csv" (HEADER=true, DELIM="|"); +create rel table isSubclassOf (FROM Tagclass TO Tagclass,MANY_ONE); +copy isSubclassOf from "{}/tagclass_isSubclassOf_tagclass_0_0.csv" (HEADER=true, DELIM="|"); +create rel table knows (FROM Person TO Person,creationDate TIMESTAMP,MANY_MANY); +copy knows from "{}/person_knows_person_0_0.csv" (HEADER=true, DELIM="|"); +create rel table likes_comment (FROM Person TO Comment,creationDate TIMESTAMP,MANY_MANY); +copy likes_comment from "{}/person_likes_comment_0_0.csv" (HEADER=true, DELIM="|"); +create rel table likes_post (FROM Person TO Post,creationDate TIMESTAMP,MANY_MANY); +copy likes_post from "{}/person_likes_post_0_0.csv" (HEADER=true, DELIM="|"); +create rel table replyOf_comment (FROM Comment TO Comment,MANY_ONE); +copy replyOf_comment from "{}/comment_replyOf_comment_0_0.csv" (HEADER=true, DELIM="|"); +create rel table replyOf_post (FROM Comment TO Post,MANY_ONE); +copy replyOf_post from "{}/comment_replyOf_post_0_0.csv" (HEADER=true, DELIM="|"); +create rel table studyAt (FROM Person TO Organisation,classYear INT64,MANY_MANY); +copy studyAt from "{}/person_studyAt_organisation_0_0.csv" (HEADER=true, DELIM="|"); +create rel table workAt (FROM Person TO Organisation,workFrom INT64,MANY_MANY); +copy workAt from "{}/person_workAt_organisation_0_0.csv" (HEADER=true, DELIM="|"); \ No newline at end of file diff --git a/benchmark/serializer.py b/benchmark/serializer.py index 5975aa7ccb1..da8de62ff16 100644 --- a/benchmark/serializer.py +++ b/benchmark/serializer.py @@ -54,6 +54,7 @@ def serialize(kuzu_exec_path, dataset_name, dataset_path, serialized_graph_path, serialize_queries += f.readlines() serialize_queries = [q.strip().replace('{}', dataset_path) for q in serialize_queries] + serialize_queries = [q for q in serialize_queries if q] table_types = {} @@ -66,7 +67,7 @@ def serialize(kuzu_exec_path, dataset_name, dataset_path, serialized_graph_path, stdout = sys.stdout if create_match or not benchmark_copy_log_dir else subprocess.PIPE process = subprocess.Popen([kuzu_exec_path, serialized_graph_path], stdin=subprocess.PIPE, stdout=stdout, encoding="utf-8") - process.stdin.write(s + ";\n") + process.stdin.write(s) process.stdin.close() if create_match: table_types[create_match.group(2)] = create_match.group(1).lower() diff --git a/dataset/demo-db/parquet/copy.cypher b/dataset/demo-db/parquet/copy.cypher index e05bb79c4d3..c6df01c16fd 100644 --- a/dataset/demo-db/parquet/copy.cypher +++ b/dataset/demo-db/parquet/copy.cypher @@ -1,4 +1,4 @@ -COPY User From "dataset/demo-db/parquet/user.parquet" -COPY City FROM "dataset/demo-db/parquet/city.parquet" -COPY Follows FROM "dataset/demo-db/parquet/follows.parquet" -COPY LivesIn FROM "dataset/demo-db/parquet/lives-in.parquet" \ No newline at end of file +COPY User From "dataset/demo-db/parquet/user.parquet"; +COPY City FROM "dataset/demo-db/parquet/city.parquet"; +COPY Follows FROM "dataset/demo-db/parquet/follows.parquet"; +COPY LivesIn FROM "dataset/demo-db/parquet/lives-in.parquet"; \ No newline at end of file diff --git a/dataset/demo-db/parquet/schema.cypher b/dataset/demo-db/parquet/schema.cypher index 5d523454e5a..fa991d1e4fc 100644 --- a/dataset/demo-db/parquet/schema.cypher +++ b/dataset/demo-db/parquet/schema.cypher @@ -1,4 +1,4 @@ -CREATE NODE TABLE User(name STRING, age INT64, PRIMARY KEY (name)) -CREATE NODE TABLE City(name STRING, population INT64, PRIMARY KEY (name)) -CREATE REL TABLE Follows(FROM User TO User, since INT64) -CREATE REL TABLE LivesIn(FROM User TO City) \ No newline at end of file +CREATE NODE TABLE User(name STRING, age INT64, PRIMARY KEY (name)); +CREATE NODE TABLE City(name STRING, population INT64, PRIMARY KEY (name)); +CREATE REL TABLE Follows(FROM User TO User, since INT64); +CREATE REL TABLE LivesIn(FROM User TO City); \ No newline at end of file diff --git a/scripts/generate_binary_tinysnb.sh b/scripts/generate_binary_tinysnb.sh new file mode 100755 index 00000000000..49b6d8b7cf3 --- /dev/null +++ b/scripts/generate_binary_tinysnb.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +CD=`dirname "$0"` +DATASET_DIR=$CD/../dataset +python3 $CD/../benchmark/serializer.py TinySNB $DATASET_DIR/tinysnb $CD/../tinysnb