Skip to content

Commit

Permalink
Not working code
Browse files Browse the repository at this point in the history
  • Loading branch information
Devesh Sarda committed Oct 27, 2023
1 parent 82d6b9c commit 8177358
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 143 deletions.
13 changes: 12 additions & 1 deletion src/python/tools/marius_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import shutil
from pathlib import Path

from marius.tools.preprocess import custom
from marius.tools.preprocess.datasets import (
fb15k,
fb15k_237,
Expand All @@ -19,6 +18,7 @@
twitter
)
from preprocess.datasets import ogbl_collab
from preprocess import custom


def set_args():
Expand Down Expand Up @@ -101,6 +101,15 @@ def set_args():
help="List of column ids of input delimited files which denote the src node, edge-type, and dst node of edges.",
)

parser.add_argument(
"--edge_weight_column",
metavar="edge_weight_column",
required=False,
type=int,
default=-1,
help="The column id which denotes the edge weight column",
)

return parser


Expand Down Expand Up @@ -133,6 +142,7 @@ def main():

dataset = dataset_dict.get(args.dataset.upper())
if dataset is not None:
print("Using existing dataset of", args.dataset.upper())
dataset = dataset(args.output_directory, spark=args.spark)
dataset.download(args.overwrite)
dataset.preprocess(
Expand Down Expand Up @@ -161,6 +171,7 @@ def main():
partitioned_eval=args.partitioned_eval,
sequential_train_nodes=args.sequential_train_nodes,
columns=args.columns,
edge_weight_column = args.edge_weight_column,
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,8 @@

import torch # isort:skip


def dataframe_to_tensor(input_dataframe):
    """Convert a lazily evaluated dataframe into a torch tensor.

    Args:
        input_dataframe: Object exposing ``to_dask_array()`` whose result
            has a ``compute()`` method returning a NumPy array
            (presumably a dask DataFrame -- confirm against callers).

    Returns:
        A ``torch.Tensor`` built with ``torch.from_numpy``, so it shares
        memory with the materialized NumPy array rather than copying it.
    """
    # compute() materializes the lazy array before it is handed to torch.
    np_array = input_dataframe.to_dask_array().compute()
    return torch.from_numpy(np_array)

def dataframe_to_tensor(df):
    """Convert a dataframe into a torch tensor.

    Args:
        df: Object exposing ``to_numpy()`` (presumably a pandas DataFrame --
            confirm against callers). The resulting array must have a dtype
            torch can represent; object/string arrays are rejected by
            ``torch.tensor``.

    Returns:
        A new ``torch.Tensor`` holding a copy of the dataframe's values
        (``torch.tensor`` always copies its input).
    """
    values = df.to_numpy()
    return torch.tensor(values)

def partition_edges(edges, num_nodes, num_partitions):
partition_size = int(np.ceil(num_nodes / num_partitions))
Expand Down
16 changes: 4 additions & 12 deletions src/python/tools/preprocess/converters/readers/pandas_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,31 +40,23 @@ def __init__(

self.delim = delim

if len(self.columns) == 2:
self.has_rels = False
elif len(self.columns) == 3:
self.has_rels = True
else:
raise RuntimeError(
"Incorrect number of columns specified, expected length 2 or 3, received {}".format(len(self.columns))
)

def read(self):
    """Load the train/valid/test edge files as string-typed DataFrames.

    Each file is parsed with ``pd.read_csv`` using ``self.delim`` as the
    delimiter, skipping ``self.header_length`` leading rows and treating
    the file as header-less. Only the columns at the positions listed in
    ``self.columns`` are kept, and every kept value is cast to ``str``.

    Returns:
        A ``(train_edges_df, valid_edges_df, test_edges_df)`` tuple. The
        valid and test entries are ``None`` when ``self.valid_edges`` /
        ``self.test_edges`` is ``None``.

    Raises:
        AssertionError: If ``self.train_edges`` is ``None``.
    """

    def load_edges(path):
        frame = pd.read_csv(path, delimiter=self.delim, skiprows=self.header_length, header=None)
        # Select exactly once, on the freshly parsed frame: self.columns
        # holds positions in the raw file, so re-applying them to an
        # already narrowed frame would pick the wrong columns or fail.
        return frame[frame.columns[self.columns]].astype(str)

    assert self.train_edges is not None, "train_edges must be provided"
    train_edges_df = load_edges(self.train_edges)

    valid_edges_df = None
    if self.valid_edges is not None:
        valid_edges_df = load_edges(self.valid_edges)

    test_edges_df = None
    if self.test_edges is not None:
        test_edges_df = load_edges(self.test_edges)

    return train_edges_df, valid_edges_df, test_edges_df
Loading

0 comments on commit 8177358

Please sign in to comment.