diff --git a/src/python/tools/marius_preprocess.py b/src/python/tools/marius_preprocess.py
index 5638bdd3..f22bee93 100644
--- a/src/python/tools/marius_preprocess.py
+++ b/src/python/tools/marius_preprocess.py
@@ -2,7 +2,6 @@
 import shutil
 from pathlib import Path
 
-from marius.tools.preprocess import custom
 from marius.tools.preprocess.datasets import (
     fb15k,
     fb15k_237,
@@ -19,6 +18,7 @@
     twitter
 )
 from preprocess.datasets import ogbl_collab
+from marius.tools.preprocess import custom
 
 
 def set_args():
@@ -101,6 +101,15 @@ def set_args():
         help="List of column ids of input delimited files which denote the src node, edge-type, and dst node of edges.",
     )
 
+    parser.add_argument(
+        "--edge_weight_column",
+        metavar="edge_weight_column",
+        required=False,
+        type=int,
+        default=-1,
+        help="The column id of input delimited files which denotes the edge weight of edges.",
+    )
+
     return parser
@@ -133,6 +142,7 @@ def main():
     dataset = dataset_dict.get(args.dataset.upper())
 
     if dataset is not None:
+        print("Using existing dataset", args.dataset.upper())
         dataset = dataset(args.output_directory, spark=args.spark)
         dataset.download(args.overwrite)
         dataset.preprocess(
@@ -161,6 +171,7 @@ def main():
            partitioned_eval=args.partitioned_eval,
            sequential_train_nodes=args.sequential_train_nodes,
            columns=args.columns,
+           edge_weight_column=args.edge_weight_column,
        )
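A sketch of the intended command line: --edge_weight_column is taken from this diff, while the other flag spellings and the file layout are assumptions for illustration.

    # edges file laid out as: src, edge-type, dst, weight  (hypothetical layout)
    marius_preprocess ... --columns 0 1 2 --edge_weight_column 3

Leaving the flag at its default of -1 disables weights, so existing invocations are unchanged.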
diff --git a/src/python/tools/preprocess/converters/partitioners/torch_partitioner.py b/src/python/tools/preprocess/converters/partitioners/torch_partitioner.py
index b96a4633..050eb8bf 100644
--- a/src/python/tools/preprocess/converters/partitioners/torch_partitioner.py
+++ b/src/python/tools/preprocess/converters/partitioners/torch_partitioner.py
@@ -4,11 +4,8 @@
 import torch  # isort:skip
 
-
-def dataframe_to_tensor(input_dataframe):
-    np_array = input_dataframe.to_dask_array().compute()
-    return torch.from_numpy(np_array)
-
+def dataframe_to_tensor(df):
+    return torch.tensor(df.to_numpy())
+
 
 def partition_edges(edges, num_nodes, num_partitions):
     partition_size = int(np.ceil(num_nodes / num_partitions))
diff --git a/src/python/tools/preprocess/converters/readers/pandas_readers.py b/src/python/tools/preprocess/converters/readers/pandas_readers.py
index 6c5265fc..63447ceb 100644
--- a/src/python/tools/preprocess/converters/readers/pandas_readers.py
+++ b/src/python/tools/preprocess/converters/readers/pandas_readers.py
@@ -40,15 +40,6 @@ def __init__(
 
         self.delim = delim
 
-        if len(self.columns) == 2:
-            self.has_rels = False
-        elif len(self.columns) == 3:
-            self.has_rels = True
-        else:
-            raise RuntimeError(
-                "Incorrect number of columns specified, expected length 2 or 3, received {}".format(len(self.columns))
-            )
-
     def read(self):
         train_edges_df: pd.DataFrame = None
         valid_edges_df: pd.DataFrame = None
@@ -56,15 +47,16 @@ def read(self):
 
         assert self.train_edges is not None
         train_edges_df = pd.read_csv(self.train_edges, delimiter=self.delim, skiprows=self.header_length, header=None)
-        train_edges_df = train_edges_df[train_edges_df.columns[self.columns]]
+        train_edges_df = train_edges_df[train_edges_df.columns[self.columns]].astype(str)
 
         if self.valid_edges is not None:
             valid_edges_df = pd.read_csv(
                 self.valid_edges, delimiter=self.delim, skiprows=self.header_length, header=None
             )
-            valid_edges_df = valid_edges_df[valid_edges_df.columns[self.columns]]
+            valid_edges_df = valid_edges_df[valid_edges_df.columns[self.columns]].astype(str)
+
         if self.test_edges is not None:
             test_edges_df = pd.read_csv(self.test_edges, delimiter=self.delim, skiprows=self.header_length, header=None)
-            test_edges_df = test_edges_df[test_edges_df.columns[self.columns]]
+            test_edges_df = test_edges_df[test_edges_df.columns[self.columns]].astype(str)
 
         return train_edges_df, valid_edges_df, test_edges_df
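Because PandasDelimitedFileReader now casts the whole frame to str, weight values arrive as strings and must be re-cast to float before tensor conversion. A small self-contained illustration of that round trip (synthetic data, standard pandas/torch only; not repo code):

    import io

    import pandas as pd
    import torch

    raw = io.StringIO("a\tr1\tb\t0.5\nb\tr2\tc\t1.5")
    df = pd.read_csv(raw, delimiter="\t", header=None)
    df = df[df.columns[[0, 1, 2, 3]]].astype(str)  # mirrors PandasDelimitedFileReader.read()

    # mirrors the downstream fix-up: dataframe_to_tensor() needs numeric columns,
    # hence the .astype(float) applied to the weight column before conversion
    weights = torch.tensor(df[3].astype(float).to_numpy())  # tensor([0.5, 1.5], dtype=float64)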
diff --git a/src/python/tools/preprocess/converters/torch_converter.py b/src/python/tools/preprocess/converters/torch_converter.py
index a503ee66..c280685e 100644
--- a/src/python/tools/preprocess/converters/torch_converter.py
+++ b/src/python/tools/preprocess/converters/torch_converter.py
@@ -76,14 +76,16 @@ def apply_mapping1d(input_ids, mapping_df):
     raise RuntimeError("Unsupported datatype for input. Must be a pandas.Series or a 1D torch.Tensor")
 
 
-def map_edge_list_dfs(edge_lists: list, known_node_ids=None, sequential_train_nodes=False, sequential_deg_nodes=0):
+def map_edge_list_dfs(edge_lists: list, known_node_ids=None, sequential_train_nodes=False, sequential_deg_nodes=0,
+                      columns=None, edge_type_column=-1, edge_weight_column=-1):
     if sequential_train_nodes or sequential_deg_nodes > 0:
         raise RuntimeError("sequential_train_nodes not yet supported for map_edge_list_dfs")
 
+    src_col, dst_col = columns[0], columns[-1]
     all_edges_df = pd.concat(edge_lists)
 
-    unique_src = all_edges_df.iloc[:, 0].unique()
-    unique_dst = all_edges_df.iloc[:, -1].unique()
+    unique_src = all_edges_df.iloc[:, src_col].unique()
+    unique_dst = all_edges_df.iloc[:, dst_col].unique()
 
     if known_node_ids is None:
         unique_nodes = np.unique(np.concatenate([unique_src.astype(str), unique_dst.astype(str)]))
@@ -98,15 +100,13 @@ def map_edge_list_dfs(edge_lists: list, known_node_ids=None, sequential_train_no
     mapped_node_ids = np.random.permutation(num_nodes)
     nodes_dict = dict(zip(list(unique_nodes), list(mapped_node_ids)))
 
-    has_rels = False
+    has_rels = edge_type_column >= 0
     unique_rels = torch.empty([0])
     mapped_rel_ids = torch.empty([0])
     rels_dict = None
-    if len(all_edges_df.columns) == 3:
-        has_rels = True
 
     if has_rels:
-        unique_rels = all_edges_df.iloc[:, 1].unique()
+        unique_rels = all_edges_df.iloc[:, edge_type_column].unique()
         num_rels = unique_rels.shape[0]
         mapped_rel_ids = np.random.permutation(num_rels)
         rels_dict = dict(zip(list(unique_rels), list(mapped_rel_ids)))
@@ -114,25 +114,40 @@ def map_edge_list_dfs(edge_lists: list, known_node_ids=None, sequential_train_no
     all_edges_df = None  # can safely free this df
 
     output_edge_lists = []
-    for edge_list in edge_lists:
-        node_columns = edge_list.columns[[0, -1]]
+    save_order = ["src_node", "dest_node"]
+    for idx, edge_list in enumerate(edge_lists):
+        node_columns = edge_list.columns[[src_col, dst_col]]
         edge_list[node_columns] = edge_list[node_columns].applymap(nodes_dict.get)
 
         if has_rels:
-            rel_columns = edge_list.columns[1]
+            rel_columns = edge_list.columns[edge_type_column]
             edge_list[rel_columns] = edge_list[rel_columns].map(rels_dict.get)
 
+        cols_ids_to_keep = [src_col, dst_col]
+        if edge_type_column >= 0:
+            cols_ids_to_keep.insert(len(cols_ids_to_keep) - 1, edge_type_column)
+            if idx == 0:
+                save_order.insert(len(save_order) - 1, "edge_type")
+
+        if edge_weight_column >= 0:
+            edge_list[edge_weight_column] = edge_list[edge_weight_column].astype(float)
+            cols_ids_to_keep.insert(len(cols_ids_to_keep) - 1, edge_weight_column)
+            if idx == 0:
+                save_order.insert(len(save_order) - 1, "edge_weight")
+
+        edge_list = edge_list[edge_list.columns[cols_ids_to_keep]]
         output_edge_lists.append(dataframe_to_tensor(edge_list))
 
     node_mapping = np.stack([unique_nodes, mapped_node_ids], axis=1)
     rel_mapping = None
     if has_rels:
         rel_mapping = np.stack([unique_rels, mapped_rel_ids], axis=1)
-    return output_edge_lists, node_mapping, rel_mapping
+
+    return output_edge_lists, node_mapping, rel_mapping, save_order
"edge_weight") @@ -411,14 +428,12 @@ def __init__( E.g. columns=[0, 2, 1] means that the source nodes are found in the first column, the edge-types are found in in the third column, and the destination nodes are found in the second column. For datasets with both edge-types and edge-weights, the edge-weights column should be specified by edge_weight_column - field while the edge-types is read from this parameter. - For datasets with only edge-weights, the edge-weight column should be replaced with the edge-type column in this parameter - as well as passed into the edge_weight_column parameter. Thus if edge weights are in column 3, edge src in column 0, edge - dst in column 1, then columns should be set to [0, 3, 1] and edge_weight_column should be set to 3. - For datasets without edge-types or weight, only two column ids should be specified. E.g. columns=[0, 2] - :param edge_weight_column: The column id storing the edge weight. If the file has edge weights and not edge types then the edge weight column id must - also be passed into the columns parameter. See the columns parameter for more details on how to do so. If the data is in - memory then this parameter specifies the idx in each row that contains the edge weight. + field while the edge-types is read from the middle field of this parameter. + For datasets with only edge-weights, the edge-weight column should be set and the src and dest node columns should be + specified by this field. Thus if edge weights are in column 3, edge src in column 0, edge + dst in column 1, then columns should be set to [0, 1] and edge_weight_column should be set to 3. + For datasets without edge-types or weight, only the src and dest node columns need to be passed in through this parameter + :param edge_weight_column: The column id storing the edge weight. :param header_length: Length of the header for input delimited files :param delim: Delimiter of the input delimited files :param dtype: Datatype of the node ids in the output preprocessed datasets. 
@@ -449,14 +464,15 @@ def __init__(
         self.edge_type_column = -1
         self.edge_weight_column = edge_weight_column
         if len(self.columns) > 2:
-            possible_edge_type_column = self.columns[1]
-            if possible_edge_type_column > 0 and possible_edge_type_column != self.edge_weight_column:
-                self.edge_type_column = possible_edge_type_column
+            self.edge_type_column = self.columns[1]
+
+        print("Edge type column", self.edge_type_column, "with edge weight column", self.edge_weight_column)
 
         if format.upper() in SUPPORTED_DELIM_FORMATS:
             assert isinstance(train_edges, str) or isinstance(train_edges, Path)
-            self.reader = PandasDelimitedFileReader(train_edges, valid_edges, test_edges, columns, header_length, delim)
+            max_col_num = max(max(self.columns), self.edge_weight_column)
+            all_cols_to_read = list(range(max_col_num + 1))
+            self.reader = PandasDelimitedFileReader(train_edges, valid_edges, test_edges, all_cols_to_read, header_length, delim)
 
         elif format.upper() in SUPPORTED_IN_MEMORY_FORMATS:
             self.reader = None
@@ -499,7 +515,7 @@ def __init__(
         # Determine if this has edge types
         self.has_rels = False
-        if len(columns) == 3 and columns[1] != self.edge_weight_column:
+        if self.edge_type_column >= 0:
             self.has_rels = True
 
         if dtype.upper() == "INT32" or dtype.upper() == "INT":
@@ -552,13 +568,8 @@ def extract_edge_mapping(self):
             save_order.insert(1, "edge_weight")
         elif self.edge_type_column >= 0 and self.edge_weight_column >= 0:
             # Have both edge type and edge weight
-            first_val, second_val = "edge_type", "edge_weight"
-            if self.edge_type_column > self.edge_weight_column:
-                # Edge type column comes later
-                first_val = "edge_weight"
-                second_val = "edge_type"
-            save_order.insert(1, first_val)
-            save_order.insert(2, second_val)
+            save_order.insert(1, "edge_type")
+            save_order.insert(2, "edge_weight")
 
         return save_order
@@ -583,6 +594,7 @@ def convert(self):
                     sequential_deg_nodes=self.sequential_deg_nodes,
                     edge_type_column = self.edge_type_column,
                     edge_weight_column = self.edge_weight_column,
+                    columns=self.columns,
                 )
 
                 self.num_nodes = node_mapping.shape[0]
@@ -624,13 +636,23 @@ def convert(self):
                     delimiter=",",
                 )
             else:
+                cols_to_keep = [self.columns[0], self.columns[-1]]
+                if self.edge_type_column >= 0:
+                    cols_to_keep.insert(len(cols_to_keep) - 1, self.edge_type_column)
+
+                if self.edge_weight_column >= 0:
+                    cols_to_keep.insert(len(cols_to_keep) - 1, self.edge_weight_column)
+
+                # cast via float so string-valued weight columns are not truncated
+                train_edges_df = train_edges_df[train_edges_df.columns[cols_to_keep]].astype(float)
                 train_edges_tens = dataframe_to_tensor(train_edges_df)
 
                 if valid_edges_df is not None:
+                    valid_edges_df = valid_edges_df[valid_edges_df.columns[cols_to_keep]].astype(float)
                     valid_edges_tens = dataframe_to_tensor(valid_edges_df)
 
                 if test_edges_df is not None:
-                    test_edges_tens = dataframe_to_tensor(test_edges_df)
+                    test_edges_df = test_edges_df[test_edges_df.columns[cols_to_keep]].astype(float)
+                    test_edges_tens = dataframe_to_tensor(test_edges_df)
 
         else:
             print("Using in memory data")
@@ -646,6 +668,7 @@ def convert(self):
                     sequential_deg_nodes=self.sequential_deg_nodes,
                     edge_type_column = self.edge_type_column,
                     edge_weight_column = self.edge_weight_column,
+                    columns=self.columns,
                 )
 
                 self.num_nodes = node_mapping.shape[0]
@@ -685,19 +708,60 @@ def convert(self):
                     fmt="%s",
                     delimiter=",",
                 )
+
+            else:
+                cols_to_keep = [self.columns[0], self.columns[-1]]
+                if self.edge_type_column >= 0:
+                    cols_to_keep.insert(len(cols_to_keep) - 1, self.edge_type_column)
+
+                if self.edge_weight_column >= 0:
+                    cols_to_keep.insert(len(cols_to_keep) - 1, self.edge_weight_column)
+
+                train_edges_tens = train_edges_tens[:, cols_to_keep]
+                if valid_edges_tens is not None:
+                    valid_edges_tens = valid_edges_tens[:, cols_to_keep]
+                if test_edges_tens is not None:
+                    test_edges_tens = test_edges_tens[:, cols_to_keep]
+
+        # Split the edges
+        if self.splits is not None:
+            train_edges_tens, valid_edges_tens, test_edges_tens = split_edges(train_edges_tens, self.splits)
+
+        if save_order is None:
+            save_order = self.save_order()
+        print("Determined tensor save order:", save_order)
+
+        # Extract the weights if they exist
+        train_edges_weights, valid_edges_weights, test_edges_weights = None, None, None
+        if "edge_weight" in save_order:
+            edge_idx = save_order.index("edge_weight")
+            cols_to_keep = list(range(len(save_order)))
+            cols_to_keep.pop(edge_idx)
+            save_order.pop(edge_idx)
+
+            train_edges_weights = train_edges_tens[:, [edge_idx]].to(torch.float32)
+            train_edges_tens = train_edges_tens[:, cols_to_keep]
+
+            if valid_edges_tens is not None:
+                valid_edges_weights = valid_edges_tens[:, [edge_idx]].to(torch.float32)
+                valid_edges_tens = valid_edges_tens[:, cols_to_keep]
+
+            if test_edges_tens is not None:
+                test_edges_weights = test_edges_tens[:, [edge_idx]].to(torch.float32)
+                test_edges_tens = test_edges_tens[:, cols_to_keep]
 
         train_edges_tens = train_edges_tens.to(self.dtype)
         if valid_edges_tens is not None:
             valid_edges_tens = valid_edges_tens.to(self.dtype)
         if test_edges_tens is not None:
             test_edges_tens = test_edges_tens.to(self.dtype)
+
+        # At this point the data is in save order: the src node is always in
+        # column 0 and the dst node in column -1.
+        if train_edges_weights is not None:
+            print("First row of train_edges_tens is", train_edges_tens[0], "with weights", train_edges_weights[0])
 
-        if self.splits is not None:
-            train_edges_tens, valid_edges_tens, test_edges_tens = split_edges(train_edges_tens, self.splits)
-
-        if save_order is None:
-            save_order = self.save_order()
-
         if self.partitioner is not None:
             print("Partition nodes into {} partitions".format(self.num_partitions))
             (
@@ -721,9 +785,9 @@ def convert(self):
                 train_edges_offsets,
                 valid_edges_offsets,
                 test_edges_offsets,
-                save_order = save_order
             )
         else:
             return self.writer.write_to_binary(
                 train_edges_tens, valid_edges_tens, test_edges_tens, self.num_nodes,
-                self.num_rels, self.num_partitions, save_order = save_order)
\ No newline at end of file
+                self.num_rels, self.num_partitions)
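The docstring above fixes the contract: columns carries only the node (and optionally edge-type) ids, while weights are requested solely through edge_weight_column. A minimal sketch of that usage with toy in-memory data (the output path is illustrative, and only keyword arguments visible in this diff are used):

    import numpy as np
    from marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter

    # src, dst, weight; the weight column is deliberately NOT listed in `columns`
    train = np.array([[0, 2, 0.7],
                      [1, 0, 0.1],
                      [2, 1, 0.9]])

    converter = TorchEdgeListConverter(
        output_dir="datasets/toy_weighted",  # hypothetical output directory
        train_edges=train,
        format="numpy",
        columns=[0, 1],          # node columns only
        edge_weight_column=2,    # -1 (the default) would disable weights
        remap_ids=True,
    )
    converter.convert()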
diff --git a/src/python/tools/preprocess/custom.py b/src/python/tools/preprocess/custom.py
index 9f032c0a..183a1a99 100644
--- a/src/python/tools/preprocess/custom.py
+++ b/src/python/tools/preprocess/custom.py
@@ -44,12 +44,13 @@ def preprocess(
         partitioned_eval=False,
         sequential_train_nodes=False,
         columns=[0, 1, 2],
+        edge_weight_column=-1,
     ):
         if self.spark and pyspark_found:
             converter_class = SparkEdgeListConverter
         else:
             converter_class = TorchEdgeListConverter
 
         converter = converter_class(
             output_dir=self.output_directory,
             train_edges=self.train_edges_file,
@@ -61,6 +62,7 @@ def preprocess(
             splits=splits,
             remap_ids=remap_ids,
             partitioned_evaluation=partitioned_eval,
+            edge_weight_column=edge_weight_column,
         )
 
-        converter.convert()
+        return converter.convert()
\ No newline at end of file
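Assuming the dataset class this module defines (its name is not visible in these hunks), the new keyword flows straight through to the converter; a hypothetical call:

    # columns: src in 0, edge-type in 1, dst in 2; weights in a hypothetical column 3
    dataset.preprocess(
        columns=[0, 1, 2],
        edge_weight_column=3,
    )

Note that preprocess() now returns converter.convert(), so callers can pick up the conversion result directly.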
diff --git a/src/python/tools/preprocess/datasets/ogbl_collab.py b/src/python/tools/preprocess/datasets/ogbl_collab.py
index 8dc76f0b..cb867f7a 100644
--- a/src/python/tools/preprocess/datasets/ogbl_collab.py
+++ b/src/python/tools/preprocess/datasets/ogbl_collab.py
@@ -23,7 +23,7 @@ class OGBLCollab(LinkPredictionDataset):
     than one year.
     """
 
-    def __init__(self, output_directory: Path, spark=False, include_edge_type = False, include_edge_weight = False):
+    def __init__(self, output_directory: Path, spark=False, include_edge_type=False, include_edge_weight=True):
         super().__init__(output_directory, spark)
 
         self.dataset_name = "ogbl_citation2"
@@ -60,75 +60,64 @@ def download(self, overwrite=False):
         self.num_nodes = df.iloc[0][0]
 
     def preprocess(self, num_partitions=1, remap_ids=True, splits=None,
-                   sequential_train_nodes=False, partitioned_eval=False):
+                   sequential_train_nodes=False, partitioned_eval=False, df_save_dir=None):
+
+        # Read in the training data
+        train_idx = torch.load(self.input_train_edges_file)
+        train_edges = train_idx.get("edge")
+
+        # Read in the valid data
+        valid_idx = torch.load(self.input_valid_edges_file)
+        valid_edges = valid_idx.get("edge")
+
+        # Read in the test data
+        test_idx = torch.load(self.input_test_edges_file)
+        test_edges = test_idx.get("edge")
+
+        weights_col_id = -1
+        col_ids = [0, 1]
+
+        if self.include_edge_type:
+            # Add in the year information
+            train_edges = np.hstack([train_edges, train_idx.get("year").reshape(-1, 1)])
+            valid_edges = np.hstack([valid_edges, valid_idx.get("year").reshape(-1, 1)])
+            test_edges = np.hstack([test_edges, test_idx.get("year").reshape(-1, 1)])
+
+            # Normalize the edge types
+            min_year = min([np.min(train_edges[:, -1]), np.min(valid_edges[:, -1]), np.min(test_edges[:, -1])])
+            train_edges[:, -1] = train_edges[:, -1] - min_year
+            valid_edges[:, -1] = valid_edges[:, -1] - min_year
+            test_edges[:, -1] = test_edges[:, -1] - min_year
+
+            # Add in the edge type column id
+            col_ids.insert(1, train_edges.shape[1] - 1)
 
-        for edge_type in [False, True]:
-            for edge_weight in [False, True]:
-                self.include_edge_type = edge_type
-                self.include_edge_weight = edge_weight
-                save_dir = os.path.join(self.output_directory, "type_" + str(self.include_edge_type) + "_weight_" + str(self.include_edge_weight))
-                os.makedirs(save_dir, exist_ok = True)
-
-                # Read in the training data
-                train_idx = torch.load(self.input_train_edges_file)
-                train_edges = train_idx.get("edge")
-
-                # Read in the valid data
-                valid_idx = torch.load(self.input_valid_edges_file)
-                valid_edges = valid_idx.get("edge")
-
-                # Read in the test data
-                test_idx = torch.load(self.input_test_edges_file)
-                test_edges = test_idx.get("edge")
-
-                weights_col_id = -1
-                col_ids = [0, 1]
-                if self.include_edge_weight:
-                    # Add in the weights
-                    train_weights = train_idx.get("weight").reshape(-1, 1)
-                    train_edges = np.hstack([train_edges, train_weights])
-
-                    valid_weights = valid_idx.get("weight").reshape(-1, 1)
-                    valid_edges = np.hstack([valid_edges, valid_weights])
-
-                    test_weights = test_idx.get("weight").reshape(-1, 1)
-                    test_edges = np.hstack([test_edges, test_weights])
-
-                    weights_col_id = 2
-                    col_ids.insert(1, weights_col_id)
-
-                if self.include_edge_type:
-                    # Added in the year information
-                    train_edges = np.hstack([train_edges, train_idx.get("year").reshape(-1, 1)])
-                    valid_edges = np.hstack([valid_edges, valid_idx.get("year").reshape(-1, 1)])
-                    test_edges = np.hstack([test_edges, test_idx.get("year").reshape(-1, 1)])
-
-                    # Normalize the edge types
-                    min_year = min([np.min(train_edges[ : , -1]), np.min(valid_edges[ : , -1]), np.min(test_edges[ : , -1])])
-                    train_edges[ : , -1] = train_edges[ : , -1] - min_year
-                    valid_edges[ : , -1] = valid_edges[ : , -1] - min_year
-                    test_edges[ : , -1] = test_edges[ : , -1] - min_year
-
-                    # Added in edge type column id
-                    last_col_id = train_edges.shape[1] - 1
-                    if len(col_ids) == 3:
-                        col_ids[1] = last_col_id
-                    else:
-                        col_ids.insert(1, last_col_id)
-
-                # Add in the edge type information
-                converter = TorchEdgeListConverter(
-                    output_dir = save_dir,
-                    train_edges = train_edges,
-                    valid_edges = valid_edges,
-                    test_edges = test_edges,
-                    num_partitions = num_partitions,
-                    remap_ids = remap_ids,
-                    known_node_ids = [ torch.arange(self.num_nodes) ],
-                    format = "numpy",
-                    edge_weight_column = weights_col_id,
-                    columns = col_ids,
-                    partitioned_evaluation=partitioned_eval,
-                )
-
-                converter.convert()
\ No newline at end of file
+        if self.include_edge_weight:
+            # Add in the weights
+            train_weights = train_idx.get("weight").reshape(-1, 1)
+            train_edges = np.hstack([train_edges, train_weights])
+
+            valid_weights = valid_idx.get("weight").reshape(-1, 1)
+            valid_edges = np.hstack([valid_edges, valid_weights])
+
+            test_weights = test_idx.get("weight").reshape(-1, 1)
+            test_edges = np.hstack([test_edges, test_weights])
+
+            weights_col_id = train_edges.shape[1] - 1
+
+        # Convert and write the edge lists
+        converter = TorchEdgeListConverter(
+            output_dir=self.output_directory,
+            train_edges=train_edges,
+            valid_edges=valid_edges,
+            test_edges=test_edges,
+            num_partitions=num_partitions,
+            remap_ids=remap_ids,
+            known_node_ids=[torch.arange(self.num_nodes)],
+            format="numpy",
+            edge_weight_column=weights_col_id,
+            columns=col_ids,
+            partitioned_evaluation=partitioned_eval,
+        )
+
+        converter.convert()
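An end-to-end sketch of the reworked OGBL-COLLAB flow, built only from the signatures visible in this file (the output directory is illustrative):

    from pathlib import Path

    from marius.tools.preprocess.datasets.ogbl_collab import OGBLCollab

    # include_edge_weight now defaults to True; opt into edge types (publication year) explicitly
    dataset = OGBLCollab(Path("datasets/ogbl_collab"), include_edge_type=True)
    dataset.download(overwrite=False)
    dataset.preprocess(num_partitions=1, remap_ids=True)

A single preprocessed output is written to the dataset's output directory, replacing the old behavior of materializing all four type/weight combinations into separate type_*_weight_* subdirectories.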