Skip to content

Commit

Permalink
Updated code
Browse files Browse the repository at this point in the history
  • Loading branch information
sardev committed Sep 13, 2024
1 parent b6b5d52 commit 996bba9
Show file tree
Hide file tree
Showing 7 changed files with 619 additions and 474 deletions.
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def __init__(self, name, sourcedir=""):
class CMakeBuild(build_ext):
def run(self):
try:
_ = subprocess.check_output(["cmake", "--version"])
_ = subprocess.run(["cmake", "--version"])
except OSError:
raise RuntimeError(
"CMake must be installed to build the following extensions: "
Expand Down Expand Up @@ -69,8 +69,8 @@ def build_extension(self, ext):

print(cmake_args)

subprocess.check_call(["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
subprocess.check_call(["cmake", "--build", ".", "--target", "bindings"] + build_args, cwd=self.build_temp)
subprocess.run(["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
subprocess.run(["cmake", "--build", ".", "--target", "bindings"] + build_args, cwd=self.build_temp)
print() # Add an empty line for cleaner output


Expand Down
124 changes: 124 additions & 0 deletions wikipedia_processing/new_node_delta.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
Processing snapshot update_57
Uploading result to remapped_new_node_embeddings_update_57.npy
Processing snapshot update_44
Uploading result to remapped_new_node_embeddings_update_44.npy
Processing snapshot update_53
Uploading result to remapped_new_node_embeddings_update_53.npy
Processing snapshot update_46
Uploading result to remapped_new_node_embeddings_update_46.npy
Processing snapshot update_34
Uploading result to remapped_new_node_embeddings_update_34.npy
Processing snapshot update_58
Uploading result to remapped_new_node_embeddings_update_58.npy
Processing snapshot update_1
Uploading result to remapped_new_node_embeddings_update_1.npy
Processing snapshot update_27
Uploading result to remapped_new_node_embeddings_update_27.npy
Processing snapshot update_12
Uploading result to remapped_new_node_embeddings_update_12.npy
Processing snapshot update_18
Uploading result to remapped_new_node_embeddings_update_18.npy
Processing snapshot update_11
Uploading result to remapped_new_node_embeddings_update_11.npy
Processing snapshot update_28
Uploading result to remapped_new_node_embeddings_update_28.npy
Processing snapshot update_22
Uploading result to remapped_new_node_embeddings_update_22.npy
Processing snapshot update_39
Uploading result to remapped_new_node_embeddings_update_39.npy
Processing snapshot update_20
Uploading result to remapped_new_node_embeddings_update_20.npy
Processing snapshot update_48
Uploading result to remapped_new_node_embeddings_update_48.npy
Processing snapshot update_26
Uploading result to remapped_new_node_embeddings_update_26.npy
Processing snapshot update_16
Uploading result to remapped_new_node_embeddings_update_16.npy
Processing snapshot update_29
Uploading result to remapped_new_node_embeddings_update_29.npy
Processing snapshot update_43
Uploading result to remapped_new_node_embeddings_update_43.npy
Processing snapshot update_50
Uploading result to remapped_new_node_embeddings_update_50.npy
Processing snapshot update_37
Uploading result to remapped_new_node_embeddings_update_37.npy
Processing snapshot update_23
Uploading result to remapped_new_node_embeddings_update_23.npy
Processing snapshot update_17
Uploading result to remapped_new_node_embeddings_update_17.npy
Processing snapshot update_33
Uploading result to remapped_new_node_embeddings_update_33.npy
Processing snapshot update_55
Uploading result to remapped_new_node_embeddings_update_55.npy
Processing snapshot update_47
Uploading result to remapped_new_node_embeddings_update_47.npy
Processing snapshot update_5
Uploading result to remapped_new_node_embeddings_update_5.npy
Processing snapshot update_30
Uploading result to remapped_new_node_embeddings_update_30.npy
Processing snapshot update_52
Uploading result to remapped_new_node_embeddings_update_52.npy
Processing snapshot update_35
Uploading result to remapped_new_node_embeddings_update_35.npy
Processing snapshot update_59
Uploading result to remapped_new_node_embeddings_update_59.npy
Processing snapshot update_6
Uploading result to remapped_new_node_embeddings_update_6.npy
Processing snapshot update_21
Uploading result to remapped_new_node_embeddings_update_21.npy
Processing snapshot update_40
Uploading result to remapped_new_node_embeddings_update_40.npy
Processing snapshot update_60
Uploading result to remapped_new_node_embeddings_update_60.npy
Processing snapshot update_56
Uploading result to remapped_new_node_embeddings_update_56.npy
Processing snapshot update_13
Uploading result to remapped_new_node_embeddings_update_13.npy
Processing snapshot update_0
Uploading result to remapped_new_node_embeddings_update_0.npy
Processing snapshot update_19
Uploading result to remapped_new_node_embeddings_update_19.npy
Processing snapshot update_10
Uploading result to remapped_new_node_embeddings_update_10.npy
Processing snapshot update_9
Uploading result to remapped_new_node_embeddings_update_9.npy
Processing snapshot update_45
Uploading result to remapped_new_node_embeddings_update_45.npy
Processing snapshot update_2
Uploading result to remapped_new_node_embeddings_update_2.npy
Processing snapshot update_25
Uploading result to remapped_new_node_embeddings_update_25.npy
Processing snapshot update_24
Uploading result to remapped_new_node_embeddings_update_24.npy
Processing snapshot update_15
Uploading result to remapped_new_node_embeddings_update_15.npy
Processing snapshot update_41
Uploading result to remapped_new_node_embeddings_update_41.npy
Processing snapshot update_49
Uploading result to remapped_new_node_embeddings_update_49.npy
Processing snapshot initial_snapshot
Uploading result to remapped_new_node_embeddings_initial_snapshot.npy
Processing snapshot update_14
Uploading result to remapped_new_node_embeddings_update_14.npy
Processing snapshot update_7
Uploading result to remapped_new_node_embeddings_update_7.npy
Processing snapshot update_32
Uploading result to remapped_new_node_embeddings_update_32.npy
Processing snapshot update_54
Uploading result to remapped_new_node_embeddings_update_54.npy
Processing snapshot update_3
Uploading result to remapped_new_node_embeddings_update_3.npy
Processing snapshot update_4
Uploading result to remapped_new_node_embeddings_update_4.npy
Processing snapshot update_38
Uploading result to remapped_new_node_embeddings_update_38.npy
Processing snapshot update_36
Uploading result to remapped_new_node_embeddings_update_36.npy
Processing snapshot update_51
Uploading result to remapped_new_node_embeddings_update_51.npy
Processing snapshot update_31
Uploading result to remapped_new_node_embeddings_update_31.npy
Processing snapshot update_62
Uploading result to remapped_new_node_embeddings_update_62.npy
Processing snapshot update_61
Uploading result to remapped_new_node_embeddings_update_61.npy
85 changes: 85 additions & 0 deletions wikipedia_processing/new_node_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import pandas as pd
import os
import boto3
import numpy as np
import time

# Local dataset layout: ~/all_datasets/wikipedia_dataset/<snapshot_name>/...
USER_DIR = os.path.expanduser("~")
DATA_DIR = os.path.join(USER_DIR, "all_datasets")
# Directory that holds one sub-directory per graph snapshot.
INDIVIDUAL_GRAPH_SNAPSHOTS = os.path.join(DATA_DIR, "wikipedia_dataset")
# Name prefix of the update snapshot directories; new_node_creator builds the
# same "update_<n>" names when it walks the snapshots.
UPDATE_DIR_PREFIX = "update_"
# S3 bucket that receives the generated .npy files.
BUCKET_NAME = "wikidata-update-history"
# Object-name prefix for the uploaded per-snapshot arrays of new node ids.
NEW_NODES_OBJECT_PREFIX = "remapped_new_nodes_"

# Inclusive [first, last] snapshot ids walked by new_node_creator: id 0 is
# "initial_snapshot" and id k > 0 is "update_<k-1>".
SNAPSHOT_RANGE = [0, 103] # [0, 104]
def new_node_creator():
    """Save and upload the node ids first introduced by each snapshot.

    Snapshots are visited in increasing id order over the inclusive
    SNAPSHOT_RANGE: id 0 is the "initial_snapshot" directory and id k > 0 is
    the "update_<k-1>" directory under INDIVIDUAL_GRAPH_SNAPSHOTS.  For each
    one, ``graph.csv`` is read as header-less rows of
    (src_node, edge_type, dst_node), and every node id that has not appeared
    in an earlier snapshot is collected.  The collected ids are written to
    ``new_nodes.npy`` inside the snapshot directory and uploaded to
    BUCKET_NAME as ``remapped_new_nodes_<snapshot_name>.npy``.

    Returns:
        None.  The results are the files written to disk and to S3.
    """
    s3_client = boto3.client('s3')

    # Union of every node id seen in earlier snapshots.  It starts as None
    # rather than an empty, dtype-less pd.Series(): that constructor warns on
    # pandas 1.x, and concatenating an empty object Series with integer ids
    # relies on concat's deprecated handling of empty entries.
    seen_nodes = None

    first_id, last_id = SNAPSHOT_RANGE
    for snapshot_id in range(first_id, last_id + 1):
        if snapshot_id == 0:
            snapshot_name = "initial_snapshot"
        else:
            snapshot_name = UPDATE_DIR_PREFIX + str(snapshot_id - 1)

        snapshot_dir = os.path.join(INDIVIDUAL_GRAPH_SNAPSHOTS, snapshot_name)
        save_path = os.path.join(snapshot_dir, "new_nodes.npy")

        # Load in the graph
        print("Processing dir", snapshot_dir)
        graph_path = os.path.join(snapshot_dir, "graph.csv")
        graph_df = pd.read_csv(
            graph_path,
            header=None,
            names=["src_node", "edge_type", "dst_node"],
        )

        # Unique node ids of this snapshot, in first-appearance order
        # (all sources first, then destinations).
        all_nodes = pd.concat(
            [graph_df["src_node"], graph_df["dst_node"]]
        ).drop_duplicates()

        if seen_nodes is None:
            # Nothing has been seen yet, so every node of the first snapshot
            # is new.
            new_nodes = all_nodes.values
            seen_nodes = all_nodes
        else:
            new_nodes = all_nodes[~all_nodes.isin(seen_nodes)].values
            seen_nodes = pd.concat([all_nodes, seen_nodes]).drop_duplicates()

        # Write those to disk and upload to S3
        np.save(save_path, new_nodes)
        upload_name = NEW_NODES_OBJECT_PREFIX + snapshot_name + ".npy"
        print("Uploading file", upload_name, "with", new_nodes.shape[0], "nodes")
        s3_client.upload_file(save_path, BUCKET_NAME, upload_name)

# Object-name prefix for the uploaded per-snapshot new-node embedding arrays.
NEW_NODE_EMBEDDINGS_PREFIX = "remapped_new_node_embeddings_"
# 600 -- presumably ten minutes expressed in seconds.
# NOTE(review): nothing in the visible file references SLEEP_TIME; it looks
# like an intended polling delay for delta_creator -- confirm or remove.
SLEEP_TIME = 60 * 10
def delta_creator(embedding_dim=128):
    """Extract and upload the embeddings of each snapshot's new nodes.

    Every entry of INDIVIDUAL_GRAPH_SNAPSHOTS whose name does not start with
    "." and does not contain "json" is treated as a snapshot directory.  A
    snapshot is processed when it has both ``new_nodes.npy`` and
    ``marius_formatted/model_0/embeddings.bin`` and does not yet have
    ``new_node_embeddings.npy``.  Processing reads the embeddings file as a
    flat float32 array, reshapes it to rows of ``embedding_dim`` values,
    selects the rows indexed by the new node ids, saves them to
    ``new_node_embeddings.npy`` and uploads that file to BUCKET_NAME as
    ``remapped_new_node_embeddings_<snapshot_name>.npy``.

    The directory scan is repeated for as long as the previous pass processed
    at least one snapshot, so inputs that appear during a pass are picked up
    by the next one.  There is no delay between passes.

    NOTE(review): the result is saved locally before it is uploaded, and an
    existing local file marks the snapshot as done.  A failed upload is
    therefore not retried on a later pass or run.

    Args:
        embedding_dim: Number of float32 values per node in
            ``embeddings.bin``.  Defaults to 128, the width that was
            previously hard-coded.

    Returns:
        None.  The results are the files written to disk and to S3.
    """
    s3_client = boto3.client('s3')
    found_new_results = True
    while found_new_results:
        found_new_results = False
        for snapshot_name in os.listdir(INDIVIDUAL_GRAPH_SNAPSHOTS):
            # Skip hidden entries and JSON files that sit next to the
            # snapshot directories.
            if snapshot_name.startswith('.') or "json" in snapshot_name:
                continue

            # Get the paths and make sure they exist
            snapshot_dir = os.path.join(INDIVIDUAL_GRAPH_SNAPSHOTS, snapshot_name)
            model_dir = os.path.join(snapshot_dir, "marius_formatted", "model_0")
            new_nodes_path = os.path.join(snapshot_dir, "new_nodes.npy")
            embeddings_path = os.path.join(model_dir, "embeddings.bin")
            if not os.path.exists(new_nodes_path) or not os.path.exists(embeddings_path):
                continue

            # Determine the save path and see if we already processed this one
            save_path = os.path.join(snapshot_dir, "new_node_embeddings.npy")
            if os.path.exists(save_path):
                continue

            # Read in the new nodes and embeddings
            print("Processing snapshot", snapshot_name)
            new_nodes = np.load(new_nodes_path)
            node_embeddings = np.fromfile(
                embeddings_path, dtype=np.float32
            ).reshape(-1, embedding_dim)

            # Save the new node embeddings and upload to S3
            new_node_embeddings = node_embeddings[new_nodes]
            np.save(save_path, new_node_embeddings)
            upload_name = NEW_NODE_EMBEDDINGS_PREFIX + snapshot_name + ".npy"
            print("Uploading result to", upload_name)
            s3_client.upload_file(save_path, BUCKET_NAME, upload_name)

            found_new_results = True

if __name__ == "__main__":
    # new_node_creator() produces the new_nodes.npy files that delta_creator()
    # consumes.  It is left disabled here, so running this script only
    # performs the embedding extraction step.
    # new_node_creator()
    delta_creator()
Loading

0 comments on commit 996bba9

Please sign in to comment.