Skip to content

Commit

Permalink
Automatically upload binary dataset to s3 (kuzudb#4114)
Browse files Browse the repository at this point in the history
(cherry picked from commit e90380e)
  • Loading branch information
mewim authored and wangqiang committed Aug 26, 2024
1 parent bd8ef57 commit 794a83c
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 71 deletions.
14 changes: 14 additions & 0 deletions .github/workflows/ci-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,20 @@ jobs:

- name: Generate datasets
run: bash scripts/generate_binary_demo.sh

- name: Generate and upload tinysnb
  run: |
    # Always remove the generated dataset and the downloaded version marker,
    # including when one of the commands below fails.
    trap 'rm -rf tinysnb version.txt' EXIT
    bash scripts/generate_binary_tinysnb.sh
    # --force overwrites a stale version.txt left behind by an earlier run.
    # A failed download (for example, nothing has been uploaded yet) is
    # treated as "outdated" instead of aborting the step.
    if ! s3cmd get --force s3://kuzu-test/tinysnb/tinysnb/version.txt version.txt; then
      echo "Could not fetch remote TinySNB version, treating dataset as outdated"
      rm -f version.txt
    fi
    # Skip the upload only when the remote marker was fetched and matches the
    # version of the freshly generated dataset.
    if [ -f version.txt ] && [ "$(cat tinysnb/version.txt)" == "$(cat version.txt)" ]; then
      echo "TinySNB dataset is up to date, skipping upload"
      exit 0
    fi
    echo "TinySNB dataset is outdated, uploading..."
    s3cmd del -r s3://kuzu-test/tinysnb/
    s3cmd sync ./tinysnb s3://kuzu-test/tinysnb/
- name: Upload binary-demo
uses: actions/upload-artifact@v4
Expand Down
124 changes: 62 additions & 62 deletions benchmark/serialize.cypher
Original file line number Diff line number Diff line change
@@ -1,62 +1,62 @@
create node table Person (ID INT64,firstName STRING,lastName STRING,gender STRING,birthday DATE,creationDate TIMESTAMP,locationIP STRING,browserUsed STRING, PRIMARY KEY(ID))
copy Person from "{}/person_0_0.csv" (HEADER=true, DELIM="|")
create node table Forum (ID INT64,title STRING,creationDate TIMESTAMP, PRIMARY KEY(ID))
copy Forum from "{}/forum_0_0.csv" (HEADER=true, DELIM="|")
create node table Post (ID INT64,imageFile STRING,creationDate TIMESTAMP,locationIP STRING,browserUsed STRING,language STRING,content STRING,length INT64, PRIMARY KEY(ID))
copy Post from "{}/post_0_0.csv" (HEADER=true, DELIM="|")
create node table Comment (ID INT64,creationDate TIMESTAMP,locationIP STRING,browserUsed STRING,content STRING,length INT64, PRIMARY KEY(ID))
copy Comment from "{}/comment_0_0.csv" (HEADER=true, DELIM="|")
create node table Tag (ID INT64,name STRING,url STRING, PRIMARY KEY(ID))
copy Tag from "{}/tag_0_0.csv" (HEADER=true, DELIM="|")
create node table Tagclass (ID INT64,name STRING,url STRING, PRIMARY KEY(ID))
copy Tagclass from "{}/tagclass_0_0.csv" (HEADER=true, DELIM="|")
create node table Place (ID INT64,name STRING,url STRING,type STRING, PRIMARY KEY(ID))
copy Place from "{}/place_0_0.csv" (HEADER=true, DELIM="|")
create node table Organisation (ID INT64,type STRING,name STRING,url STRING, PRIMARY KEY(ID))
copy Organisation from "{}/organisation_0_0.csv" (HEADER=true, DELIM="|")
create rel table containerOf (FROM Forum TO Post,ONE_MANY)
copy containerOf from "{}/forum_containerOf_post_0_0.csv" (HEADER=true, DELIM="|")
create rel table comment_hasCreator (FROM Comment TO Person, MANY_ONE)
copy comment_hasCreator from "{}/comment_hasCreator_person_0_0.csv" (HEADER=true, DELIM="|")
create rel table post_hasCreator (FROM Post TO Person,MANY_ONE)
copy post_hasCreator from "{}/post_hasCreator_person_0_0.csv" (HEADER=true, DELIM="|")
create rel table hasInterest (FROM Person TO Tag, MANY_MANY)
copy hasInterest from "{}/person_hasInterest_tag_0_0.csv" (HEADER=true, DELIM="|")
create rel table hasMember (FROM Forum TO Person,joinDate TIMESTAMP,MANY_MANY)
copy hasMember from "{}/forum_hasMember_person_0_0.csv" (HEADER=true, DELIM="|")
create rel table hasModerator (FROM Forum TO Person,MANY_ONE)
copy hasModerator from "{}/forum_hasModerator_person_0_0.csv" (HEADER=true, DELIM="|")
create rel table comment_hasTag (FROM Comment TO Tag,MANY_MANY)
copy comment_hasTag from "{}/comment_hasTag_tag_0_0.csv" (HEADER=true, DELIM="|")
create rel table forum_hasTag (FROM Forum TO Tag,MANY_MANY)
copy forum_hasTag from "{}/forum_hasTag_tag_0_0.csv" (HEADER=true, DELIM="|")
create rel table post_hasTag (FROM Post TO Tag,MANY_MANY)
copy post_hasTag from "{}/post_hasTag_tag_0_0.csv" (HEADER=true, DELIM="|")
create rel table hasType (FROM Tag TO Tagclass,MANY_ONE)
copy hasType from "{}/tag_hasType_tagclass_0_0.csv" (HEADER=true, DELIM="|")
create rel table comment_isLocatedIn (FROM Comment TO Place,MANY_ONE)
copy comment_isLocatedIn from "{}/comment_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|")
create rel table organisation_isLocatedIn (FROM Organisation TO Place,MANY_ONE)
copy organisation_isLocatedIn from "{}/organisation_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|")
create rel table person_isLocatedIn (FROM Person TO Place,MANY_ONE)
copy person_isLocatedIn from "{}/person_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|")
create rel table post_isLocatedIn (FROM Post TO Place,MANY_ONE)
copy post_isLocatedIn from "{}/post_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|")
create rel table isPartOf (FROM Place TO Place,MANY_ONE)
copy isPartOf from "{}/place_isPartOf_place_0_0.csv" (HEADER=true, DELIM="|")
create rel table isSubclassOf (FROM Tagclass TO Tagclass,MANY_ONE)
copy isSubclassOf from "{}/tagclass_isSubclassOf_tagclass_0_0.csv" (HEADER=true, DELIM="|")
create rel table knows (FROM Person TO Person,creationDate TIMESTAMP,MANY_MANY)
copy knows from "{}/person_knows_person_0_0.csv" (HEADER=true, DELIM="|")
create rel table likes_comment (FROM Person TO Comment,creationDate TIMESTAMP,MANY_MANY)
copy likes_comment from "{}/person_likes_comment_0_0.csv" (HEADER=true, DELIM="|")
create rel table likes_post (FROM Person TO Post,creationDate TIMESTAMP,MANY_MANY)
copy likes_post from "{}/person_likes_post_0_0.csv" (HEADER=true, DELIM="|")
create rel table replyOf_comment (FROM Comment TO Comment,MANY_ONE)
copy replyOf_comment from "{}/comment_replyOf_comment_0_0.csv" (HEADER=true, DELIM="|")
create rel table replyOf_post (FROM Comment TO Post,MANY_ONE)
copy replyOf_post from "{}/comment_replyOf_post_0_0.csv" (HEADER=true, DELIM="|")
create rel table studyAt (FROM Person TO Organisation,classYear INT64,MANY_MANY)
copy studyAt from "{}/person_studyAt_organisation_0_0.csv" (HEADER=true, DELIM="|")
create rel table workAt (FROM Person TO Organisation,workFrom INT64,MANY_MANY)
copy workAt from "{}/person_workAt_organisation_0_0.csv" (HEADER=true, DELIM="|")
create node table Person (ID INT64,firstName STRING,lastName STRING,gender STRING,birthday DATE,creationDate TIMESTAMP,locationIP STRING,browserUsed STRING, PRIMARY KEY(ID));
copy Person from "{}/person_0_0.csv" (HEADER=true, DELIM="|");
create node table Forum (ID INT64,title STRING,creationDate TIMESTAMP, PRIMARY KEY(ID));
copy Forum from "{}/forum_0_0.csv" (HEADER=true, DELIM="|");
create node table Post (ID INT64,imageFile STRING,creationDate TIMESTAMP,locationIP STRING,browserUsed STRING,language STRING,content STRING,length INT64, PRIMARY KEY(ID));
copy Post from "{}/post_0_0.csv" (HEADER=true, DELIM="|");
create node table Comment (ID INT64,creationDate TIMESTAMP,locationIP STRING,browserUsed STRING,content STRING,length INT64, PRIMARY KEY(ID));
copy Comment from "{}/comment_0_0.csv" (HEADER=true, DELIM="|");
create node table Tag (ID INT64,name STRING,url STRING, PRIMARY KEY(ID));
copy Tag from "{}/tag_0_0.csv" (HEADER=true, DELIM="|");
create node table Tagclass (ID INT64,name STRING,url STRING, PRIMARY KEY(ID));
copy Tagclass from "{}/tagclass_0_0.csv" (HEADER=true, DELIM="|");
create node table Place (ID INT64,name STRING,url STRING,type STRING, PRIMARY KEY(ID));
copy Place from "{}/place_0_0.csv" (HEADER=true, DELIM="|");
create node table Organisation (ID INT64,type STRING,name STRING,url STRING, PRIMARY KEY(ID));
copy Organisation from "{}/organisation_0_0.csv" (HEADER=true, DELIM="|");
create rel table containerOf (FROM Forum TO Post,ONE_MANY);
copy containerOf from "{}/forum_containerOf_post_0_0.csv" (HEADER=true, DELIM="|");
create rel table comment_hasCreator (FROM Comment TO Person, MANY_ONE);
copy comment_hasCreator from "{}/comment_hasCreator_person_0_0.csv" (HEADER=true, DELIM="|");
create rel table post_hasCreator (FROM Post TO Person,MANY_ONE);
copy post_hasCreator from "{}/post_hasCreator_person_0_0.csv" (HEADER=true, DELIM="|");
create rel table hasInterest (FROM Person TO Tag, MANY_MANY);
copy hasInterest from "{}/person_hasInterest_tag_0_0.csv" (HEADER=true, DELIM="|");
create rel table hasMember (FROM Forum TO Person,joinDate TIMESTAMP,MANY_MANY);
copy hasMember from "{}/forum_hasMember_person_0_0.csv" (HEADER=true, DELIM="|");
create rel table hasModerator (FROM Forum TO Person,MANY_ONE);
copy hasModerator from "{}/forum_hasModerator_person_0_0.csv" (HEADER=true, DELIM="|");
create rel table comment_hasTag (FROM Comment TO Tag,MANY_MANY);
copy comment_hasTag from "{}/comment_hasTag_tag_0_0.csv" (HEADER=true, DELIM="|");
create rel table forum_hasTag (FROM Forum TO Tag,MANY_MANY);
copy forum_hasTag from "{}/forum_hasTag_tag_0_0.csv" (HEADER=true, DELIM="|");
create rel table post_hasTag (FROM Post TO Tag,MANY_MANY);
copy post_hasTag from "{}/post_hasTag_tag_0_0.csv" (HEADER=true, DELIM="|");
create rel table hasType (FROM Tag TO Tagclass,MANY_ONE);
copy hasType from "{}/tag_hasType_tagclass_0_0.csv" (HEADER=true, DELIM="|");
create rel table comment_isLocatedIn (FROM Comment TO Place,MANY_ONE);
copy comment_isLocatedIn from "{}/comment_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|");
create rel table organisation_isLocatedIn (FROM Organisation TO Place,MANY_ONE);
copy organisation_isLocatedIn from "{}/organisation_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|");
create rel table person_isLocatedIn (FROM Person TO Place,MANY_ONE);
copy person_isLocatedIn from "{}/person_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|");
create rel table post_isLocatedIn (FROM Post TO Place,MANY_ONE);
copy post_isLocatedIn from "{}/post_isLocatedIn_place_0_0.csv" (HEADER=true, DELIM="|");
create rel table isPartOf (FROM Place TO Place,MANY_ONE);
copy isPartOf from "{}/place_isPartOf_place_0_0.csv" (HEADER=true, DELIM="|");
create rel table isSubclassOf (FROM Tagclass TO Tagclass,MANY_ONE);
copy isSubclassOf from "{}/tagclass_isSubclassOf_tagclass_0_0.csv" (HEADER=true, DELIM="|");
create rel table knows (FROM Person TO Person,creationDate TIMESTAMP,MANY_MANY);
copy knows from "{}/person_knows_person_0_0.csv" (HEADER=true, DELIM="|");
create rel table likes_comment (FROM Person TO Comment,creationDate TIMESTAMP,MANY_MANY);
copy likes_comment from "{}/person_likes_comment_0_0.csv" (HEADER=true, DELIM="|");
create rel table likes_post (FROM Person TO Post,creationDate TIMESTAMP,MANY_MANY);
copy likes_post from "{}/person_likes_post_0_0.csv" (HEADER=true, DELIM="|");
create rel table replyOf_comment (FROM Comment TO Comment,MANY_ONE);
copy replyOf_comment from "{}/comment_replyOf_comment_0_0.csv" (HEADER=true, DELIM="|");
create rel table replyOf_post (FROM Comment TO Post,MANY_ONE);
copy replyOf_post from "{}/comment_replyOf_post_0_0.csv" (HEADER=true, DELIM="|");
create rel table studyAt (FROM Person TO Organisation,classYear INT64,MANY_MANY);
copy studyAt from "{}/person_studyAt_organisation_0_0.csv" (HEADER=true, DELIM="|");
create rel table workAt (FROM Person TO Organisation,workFrom INT64,MANY_MANY);
copy workAt from "{}/person_workAt_organisation_0_0.csv" (HEADER=true, DELIM="|");
3 changes: 2 additions & 1 deletion benchmark/serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def serialize(kuzu_exec_path, dataset_name, dataset_path, serialized_graph_path,
serialize_queries += f.readlines()
serialize_queries = [q.strip().replace('{}', dataset_path)
for q in serialize_queries]
serialize_queries = [q for q in serialize_queries if q]

table_types = {}

Expand All @@ -66,7 +67,7 @@ def serialize(kuzu_exec_path, dataset_name, dataset_path, serialized_graph_path,
stdout = sys.stdout if create_match or not benchmark_copy_log_dir else subprocess.PIPE
process = subprocess.Popen([kuzu_exec_path, serialized_graph_path],
stdin=subprocess.PIPE, stdout=stdout, encoding="utf-8")
process.stdin.write(s + ";\n")
process.stdin.write(s)
process.stdin.close()
if create_match:
table_types[create_match.group(2)] = create_match.group(1).lower()
Expand Down
8 changes: 4 additions & 4 deletions dataset/demo-db/parquet/copy.cypher
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
COPY User From "dataset/demo-db/parquet/user.parquet"
COPY City FROM "dataset/demo-db/parquet/city.parquet"
COPY Follows FROM "dataset/demo-db/parquet/follows.parquet"
COPY LivesIn FROM "dataset/demo-db/parquet/lives-in.parquet"
COPY User From "dataset/demo-db/parquet/user.parquet";
COPY City FROM "dataset/demo-db/parquet/city.parquet";
COPY Follows FROM "dataset/demo-db/parquet/follows.parquet";
COPY LivesIn FROM "dataset/demo-db/parquet/lives-in.parquet";
8 changes: 4 additions & 4 deletions dataset/demo-db/parquet/schema.cypher
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CREATE NODE TABLE User(name STRING, age INT64, PRIMARY KEY (name))
CREATE NODE TABLE City(name STRING, population INT64, PRIMARY KEY (name))
CREATE REL TABLE Follows(FROM User TO User, since INT64)
CREATE REL TABLE LivesIn(FROM User TO City)
CREATE NODE TABLE User(name STRING, age INT64, PRIMARY KEY (name));
CREATE NODE TABLE City(name STRING, population INT64, PRIMARY KEY (name));
CREATE REL TABLE Follows(FROM User TO User, since INT64);
CREATE REL TABLE LivesIn(FROM User TO City);
5 changes: 5 additions & 0 deletions scripts/generate_binary_tinysnb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Runs benchmark/serializer.py for the TinySNB dataset, passing the dataset
# directory (dataset/tinysnb) and the output directory (tinysnb). All paths are
# resolved relative to this script's own location.
# NOTE(review): the argument meanings are inferred from serializer.py's
# serialize() signature; confirm against its command-line parser.

# Stop on the first failing command, unset variable, or failed pipeline stage.
set -euo pipefail

CD=$(dirname "$0")
DATASET_DIR="$CD/../dataset"
# Quote every expansion so a checkout path with spaces or glob characters is
# passed through as a single argument.
python3 "$CD/../benchmark/serializer.py" TinySNB "$DATASET_DIR/tinysnb" "$CD/../tinysnb"

0 comments on commit 794a83c

Please sign in to comment.