-
Notifications
You must be signed in to change notification settings - Fork 45
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
sardev
committed
Sep 13, 2024
1 parent
b6b5d52
commit 996bba9
Showing
7 changed files
with
619 additions
and
474 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
Processing snapshot update_57 | ||
Uploading result to remapped_new_node_embeddings_update_57.npy | ||
Processing snapshot update_44 | ||
Uploading result to remapped_new_node_embeddings_update_44.npy | ||
Processing snapshot update_53 | ||
Uploading result to remapped_new_node_embeddings_update_53.npy | ||
Processing snapshot update_46 | ||
Uploading result to remapped_new_node_embeddings_update_46.npy | ||
Processing snapshot update_34 | ||
Uploading result to remapped_new_node_embeddings_update_34.npy | ||
Processing snapshot update_58 | ||
Uploading result to remapped_new_node_embeddings_update_58.npy | ||
Processing snapshot update_1 | ||
Uploading result to remapped_new_node_embeddings_update_1.npy | ||
Processing snapshot update_27 | ||
Uploading result to remapped_new_node_embeddings_update_27.npy | ||
Processing snapshot update_12 | ||
Uploading result to remapped_new_node_embeddings_update_12.npy | ||
Processing snapshot update_18 | ||
Uploading result to remapped_new_node_embeddings_update_18.npy | ||
Processing snapshot update_11 | ||
Uploading result to remapped_new_node_embeddings_update_11.npy | ||
Processing snapshot update_28 | ||
Uploading result to remapped_new_node_embeddings_update_28.npy | ||
Processing snapshot update_22 | ||
Uploading result to remapped_new_node_embeddings_update_22.npy | ||
Processing snapshot update_39 | ||
Uploading result to remapped_new_node_embeddings_update_39.npy | ||
Processing snapshot update_20 | ||
Uploading result to remapped_new_node_embeddings_update_20.npy | ||
Processing snapshot update_48 | ||
Uploading result to remapped_new_node_embeddings_update_48.npy | ||
Processing snapshot update_26 | ||
Uploading result to remapped_new_node_embeddings_update_26.npy | ||
Processing snapshot update_16 | ||
Uploading result to remapped_new_node_embeddings_update_16.npy | ||
Processing snapshot update_29 | ||
Uploading result to remapped_new_node_embeddings_update_29.npy | ||
Processing snapshot update_43 | ||
Uploading result to remapped_new_node_embeddings_update_43.npy | ||
Processing snapshot update_50 | ||
Uploading result to remapped_new_node_embeddings_update_50.npy | ||
Processing snapshot update_37 | ||
Uploading result to remapped_new_node_embeddings_update_37.npy | ||
Processing snapshot update_23 | ||
Uploading result to remapped_new_node_embeddings_update_23.npy | ||
Processing snapshot update_17 | ||
Uploading result to remapped_new_node_embeddings_update_17.npy | ||
Processing snapshot update_33 | ||
Uploading result to remapped_new_node_embeddings_update_33.npy | ||
Processing snapshot update_55 | ||
Uploading result to remapped_new_node_embeddings_update_55.npy | ||
Processing snapshot update_47 | ||
Uploading result to remapped_new_node_embeddings_update_47.npy | ||
Processing snapshot update_5 | ||
Uploading result to remapped_new_node_embeddings_update_5.npy | ||
Processing snapshot update_30 | ||
Uploading result to remapped_new_node_embeddings_update_30.npy | ||
Processing snapshot update_52 | ||
Uploading result to remapped_new_node_embeddings_update_52.npy | ||
Processing snapshot update_35 | ||
Uploading result to remapped_new_node_embeddings_update_35.npy | ||
Processing snapshot update_59 | ||
Uploading result to remapped_new_node_embeddings_update_59.npy | ||
Processing snapshot update_6 | ||
Uploading result to remapped_new_node_embeddings_update_6.npy | ||
Processing snapshot update_21 | ||
Uploading result to remapped_new_node_embeddings_update_21.npy | ||
Processing snapshot update_40 | ||
Uploading result to remapped_new_node_embeddings_update_40.npy | ||
Processing snapshot update_60 | ||
Uploading result to remapped_new_node_embeddings_update_60.npy | ||
Processing snapshot update_56 | ||
Uploading result to remapped_new_node_embeddings_update_56.npy | ||
Processing snapshot update_13 | ||
Uploading result to remapped_new_node_embeddings_update_13.npy | ||
Processing snapshot update_0 | ||
Uploading result to remapped_new_node_embeddings_update_0.npy | ||
Processing snapshot update_19 | ||
Uploading result to remapped_new_node_embeddings_update_19.npy | ||
Processing snapshot update_10 | ||
Uploading result to remapped_new_node_embeddings_update_10.npy | ||
Processing snapshot update_9 | ||
Uploading result to remapped_new_node_embeddings_update_9.npy | ||
Processing snapshot update_45 | ||
Uploading result to remapped_new_node_embeddings_update_45.npy | ||
Processing snapshot update_2 | ||
Uploading result to remapped_new_node_embeddings_update_2.npy | ||
Processing snapshot update_25 | ||
Uploading result to remapped_new_node_embeddings_update_25.npy | ||
Processing snapshot update_24 | ||
Uploading result to remapped_new_node_embeddings_update_24.npy | ||
Processing snapshot update_15 | ||
Uploading result to remapped_new_node_embeddings_update_15.npy | ||
Processing snapshot update_41 | ||
Uploading result to remapped_new_node_embeddings_update_41.npy | ||
Processing snapshot update_49 | ||
Uploading result to remapped_new_node_embeddings_update_49.npy | ||
Processing snapshot initial_snapshot | ||
Uploading result to remapped_new_node_embeddings_initial_snapshot.npy | ||
Processing snapshot update_14 | ||
Uploading result to remapped_new_node_embeddings_update_14.npy | ||
Processing snapshot update_7 | ||
Uploading result to remapped_new_node_embeddings_update_7.npy | ||
Processing snapshot update_32 | ||
Uploading result to remapped_new_node_embeddings_update_32.npy | ||
Processing snapshot update_54 | ||
Uploading result to remapped_new_node_embeddings_update_54.npy | ||
Processing snapshot update_3 | ||
Uploading result to remapped_new_node_embeddings_update_3.npy | ||
Processing snapshot update_4 | ||
Uploading result to remapped_new_node_embeddings_update_4.npy | ||
Processing snapshot update_38 | ||
Uploading result to remapped_new_node_embeddings_update_38.npy | ||
Processing snapshot update_36 | ||
Uploading result to remapped_new_node_embeddings_update_36.npy | ||
Processing snapshot update_51 | ||
Uploading result to remapped_new_node_embeddings_update_51.npy | ||
Processing snapshot update_31 | ||
Uploading result to remapped_new_node_embeddings_update_31.npy | ||
Processing snapshot update_62 | ||
Uploading result to remapped_new_node_embeddings_update_62.npy | ||
Processing snapshot update_61 | ||
Uploading result to remapped_new_node_embeddings_update_61.npy |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
import pandas as pd | ||
import os | ||
import boto3 | ||
import numpy as np | ||
import time | ||
|
||
# Local dataset layout: ~/all_datasets/wikipedia_dataset/<snapshot_name>/...
USER_DIR = os.path.expanduser("~")
DATA_DIR = os.path.join(USER_DIR, "all_datasets")
INDIVIDUAL_GRAPH_SNAPSHOTS = os.path.join(DATA_DIR, "wikipedia_dataset")

# Snapshot directories other than "initial_snapshot" are named "update_<k>".
UPDATE_DIR_PREFIX = "update_"

# S3 bucket that receives the generated artifacts, and the object-name prefix
# used for the per-snapshot new-node ID arrays.
BUCKET_NAME = "wikidata-update-history"
NEW_NODES_OBJECT_PREFIX = "remapped_new_nodes_"


# Inclusive range of snapshot ids to process. Id 0 is "initial_snapshot";
# id k > 0 maps to the directory "update_<k - 1>".
SNAPSHOT_RANGE = [0, 103]  # [0, 104]


def new_node_creator() -> None:
    """Compute, save and upload the nodes first seen in each snapshot.

    Snapshots are visited in id order over ``SNAPSHOT_RANGE`` (inclusive).
    For each one, ``graph.csv`` is read as headerless
    ``src_node, edge_type, dst_node`` rows, and every node id that did not
    appear in any earlier snapshot of this run is written to
    ``new_nodes.npy`` inside the snapshot directory. That file is then
    uploaded to ``BUCKET_NAME`` as
    ``NEW_NODES_OBJECT_PREFIX + snapshot_name + ".npy"``.
    """
    s3_client = boto3.client("s3")

    # Node ids seen in earlier snapshots. Starts as None rather than an empty
    # Series so that no dtype-less empty Series is ever created/concatenated.
    seen_nodes = None

    first_id, last_id = SNAPSHOT_RANGE
    for snapshot_id in range(first_id, last_id + 1):
        if snapshot_id == 0:
            snapshot_name = "initial_snapshot"
        else:
            snapshot_name = UPDATE_DIR_PREFIX + str(snapshot_id - 1)

        snapshot_dir = os.path.join(INDIVIDUAL_GRAPH_SNAPSHOTS, snapshot_name)
        save_path = os.path.join(snapshot_dir, "new_nodes.npy")

        # Load in the graph
        print("Processing dir", snapshot_dir)
        graph_path = os.path.join(snapshot_dir, "graph.csv")
        graph_df = pd.read_csv(
            graph_path,
            header=None,
            names=["src_node", "edge_type", "dst_node"],
        )

        # Get the unique nodes in the graph, then keep the ones not seen before
        all_nodes = pd.concat(
            [graph_df["src_node"], graph_df["dst_node"]]
        ).drop_duplicates()
        if seen_nodes is None:
            unseen_nodes = all_nodes
        else:
            unseen_nodes = all_nodes[~all_nodes.isin(seen_nodes)]
        new_nodes = unseen_nodes.values

        # Write those to disk and upload to S3
        np.save(save_path, new_nodes)
        upload_name = NEW_NODES_OBJECT_PREFIX + snapshot_name + ".npy"
        print("Uploading file", upload_name, "with", new_nodes.shape[0], "nodes")
        s3_client.upload_file(save_path, BUCKET_NAME, upload_name)

        # Update the nodes already seen. The unseen nodes are unique and
        # disjoint from seen_nodes, so appending them needs no de-duplication.
        if seen_nodes is None:
            seen_nodes = unseen_nodes
        else:
            seen_nodes = pd.concat([seen_nodes, unseen_nodes], ignore_index=True)


# Object-name prefix for the per-snapshot new-node embedding arrays on S3.
NEW_NODE_EMBEDDINGS_PREFIX = "remapped_new_node_embeddings_"
# NOTE(review): SLEEP_TIME is not referenced anywhere in the visible code;
# presumably a polling interval in seconds -- confirm before relying on it.
SLEEP_TIME = 60 * 10


def delta_creator(embedding_dim: int = 128) -> None:
    """Extract and upload the embeddings of each snapshot's new nodes.

    Every entry of ``INDIVIDUAL_GRAPH_SNAPSHOTS`` is scanned, skipping names
    that start with ``.`` or contain ``"json"``. A snapshot is processed when
    both ``new_nodes.npy`` and ``marius_formatted/model_0/embeddings.bin``
    exist and ``new_node_embeddings.npy`` does not. The embeddings file is
    read as float32 and reshaped to ``(-1, embedding_dim)``; the rows indexed
    by the new-node ids are saved locally and uploaded to ``BUCKET_NAME`` as
    ``NEW_NODE_EMBEDDINGS_PREFIX + snapshot_name + ".npy"``.

    The directory scan repeats until a full pass processes nothing new.

    Args:
        embedding_dim: Number of float32 values per embedding row.
    """
    s3_client = boto3.client("s3")
    found_new_results = True
    while found_new_results:
        found_new_results = False
        for snapshot_name in os.listdir(INDIVIDUAL_GRAPH_SNAPSHOTS):
            if snapshot_name.startswith(".") or "json" in snapshot_name:
                continue

            # Get the paths and make sure they exist
            snapshot_dir = os.path.join(INDIVIDUAL_GRAPH_SNAPSHOTS, snapshot_name)
            model_dir = os.path.join(snapshot_dir, "marius_formatted", "model_0")
            new_nodes_path = os.path.join(snapshot_dir, "new_nodes.npy")
            embeddings_path = os.path.join(model_dir, "embeddings.bin")
            if not (os.path.exists(new_nodes_path) and os.path.exists(embeddings_path)):
                continue

            # Determine the save path and see if we already processed this one
            save_path = os.path.join(snapshot_dir, "new_node_embeddings.npy")
            if os.path.exists(save_path):
                continue

            # Read in the new nodes and embeddings
            print("Processing snapshot", snapshot_name)
            new_nodes = np.load(new_nodes_path)
            node_embeddings = np.fromfile(
                embeddings_path, dtype=np.float32
            ).reshape(-1, embedding_dim)

            # Save the new node embeddings and upload to S3. save_path doubles
            # as the "already processed" marker, so it is only created (via
            # rename) after the upload succeeded; an interrupted save or a
            # failed upload is retried on the next run instead of being
            # skipped forever.
            new_node_embeddings = node_embeddings[new_nodes]
            partial_path = os.path.join(
                snapshot_dir, "new_node_embeddings.partial.npy"
            )
            np.save(partial_path, new_node_embeddings)
            upload_name = NEW_NODE_EMBEDDINGS_PREFIX + snapshot_name + ".npy"
            print("Uploading result to", upload_name)
            s3_client.upload_file(partial_path, BUCKET_NAME, upload_name)
            os.replace(partial_path, save_path)

            found_new_results = True


if __name__ == "__main__":
    # new_node_creator() writes the new_nodes.npy files that delta_creator()
    # reads; it is left disabled here, as in the original script. Uncomment
    # to regenerate those files first.
    # new_node_creator()
    delta_creator()
Oops, something went wrong.