forked from frigategnn/Bonsai
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
169 lines (150 loc) · 6.38 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
r"""Utilities for Bonsai: Gradient-free Graph Distillation for Node Classification
"""
import os
from typing import List, Tuple
from collections import defaultdict
import heapq
import numpy as np
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier
import torch
from torch_geometric.data import Data
def wl2rknn(WL_distances: np.ndarray, *, sampled_nodes: np.ndarray, k: int) -> dict:
    r"""Compute k-NN and reverse-k-NN sets from a WL distance matrix.

    Args:
        WL_distances: Square ``(n, n)`` pairwise distance matrix; row/column
            index ``i`` corresponds to ``sampled_nodes[i]``.
        sampled_nodes: Original node ids for each row of ``WL_distances``.
        k: Number of nearest neighbours to keep per node.

    Returns:
        dict with two entries:
            ``"knn"``: ``(n, k)`` array of row indices (into ``WL_distances``)
            of each node's k nearest neighbours (unordered within the k).
            ``"rknn"``: mapping row-index -> set of *original* node ids
            (values from ``sampled_nodes``) that picked it as a neighbour.
    """
    assert isinstance(WL_distances, np.ndarray)
    n_sampled = WL_distances.shape[0]
    knn = []
    for node in tqdm(range(n_sampled), desc="Eval KNN", ascii=True, ncols=120):
        # argpartition is O(n) per row: the first k entries are the k smallest
        # distances, in arbitrary order.
        # NOTE(review): a node's zero self-distance means the node itself is
        # typically among its own k nearest neighbours — confirm this is intended.
        topk = np.argpartition(WL_distances[node], k)[:k]
        knn.append(topk)
    # Invert the k-NN relation: rknn keys/values share the row-index space of
    # WL_distances for keys, but values are original node ids.
    rknn = defaultdict(set)
    for node, neighbours in tqdm(enumerate(knn), desc="Eval rKNN", ascii=True, ncols=120):
        for q in neighbours:
            rknn[q].add(sampled_nodes[node])
    return {"rknn": dict(rknn), "knn": np.asarray(knn)}
def select_max_coverage_rknn_celf(rknn_dict: dict) -> List[int]:
    r"""Greedy max-coverage ordering of nodes via CELF lazy evaluation.

    Repeatedly selects the node whose reverse-k-NN set covers the most
    not-yet-covered elements, using stale heap keys as upper bounds to avoid
    recomputing every marginal gain (CELF: Leskovec et al., KDD'07,
    https://www.cs.cmu.edu/~jure/pubs/detect-kdd07.pdf).

    Args:
        rknn_dict: mapping node -> iterable of covered elements (as produced
            by ``wl2rknn``). NOTE: mutated in place (selected keys are deleted).

    Returns:
        List of all keys of ``rknn_dict``, ordered by greedy coverage gain.
    """
    covered_nodes = set()
    selected_nodes = []
    # Initialize priority queue with initial marginal gains
    # (negated, since heapq is a min-heap and we want the max gain on top).
    #print(f"init calc: {len(rknn_dict)}")
    pq = [(-len(neighbors), node) for node, neighbors in rknn_dict.items()]
    heapq.heapify(pq)
    # First pick: nothing is covered yet, so the initial gains are exact.
    # NOTE(review): raises IndexError if rknn_dict is empty — confirm callers
    # never pass an empty dict.
    neg_gain, node = heapq.heappop(pq)
    covered_nodes.update(rknn_dict[node])
    selected_nodes.append(node)
    while pq:
        # Get the node with the highest (possibly stale) marginal gain
        neg_gain, node = heapq.heappop(pq)
        if neg_gain == 0:
            # Stale keys are upper bounds, so a zero top key means every
            # remaining node has zero true gain: append them in any order.
            selected_nodes.extend(node for _, node in pq)
            selected_nodes.append(node)
            break
        # Add the selected node to the list and update the set of covered nodes
        #print(f"Recalc: {node}")
        # Recompute the true marginal gain of the popped candidate.
        new_gain = len(set(rknn_dict[node]) - covered_nodes)
        max_gain, max_node = new_gain, node
        if len(pq) != 0:
            stale_gain, top_node = pq[0]
            # temp_list collects every candidate whose gain we recompute this
            # round; all but the winner are pushed back with fresh keys.
            temp_list = [(new_gain, node)]
            max_node_idx = 0
            idx = 0
            # Keep recomputing while the last fresh gain is still below the
            # next stale upper bound, so a better candidate may exist.
            # NOTE(review): the bound test uses new_gain (last recomputed)
            # rather than max_gain (best so far) — correct, since stale keys
            # are upper bounds, but it may recompute more nodes than CELF
            # strictly requires. Also: if the heap drains inside this loop,
            # ``pq[0]`` below raises IndexError — confirm unreachable for the
            # inputs used here.
            while new_gain < -stale_gain:
                new_gain = len(set(rknn_dict[top_node]) - covered_nodes)
                temp_list.append((new_gain, top_node))
                idx += 1
                if new_gain > max_gain:
                    max_node_idx = idx
                    max_gain = new_gain
                    max_node = top_node
                heapq.heappop(pq)
                stale_gain, top_node = pq[0]
            # Drop the winner from temp_list; push the rest back with their
            # freshly computed (negated) gains.
            temp_list.pop(max_node_idx)
            [heapq.heappush(pq, (-new_gain, node)) for new_gain, node in temp_list]
        selected_nodes.append(max_node)
        covered_nodes.update(rknn_dict[max_node])
        # Free the selected node's entry; it is never pushed back onto pq.
        del rknn_dict[max_node]
    return selected_nodes
# def select_max_coverage_rknn_celf(rknn_dict: dict) -> List[int]:
# r"""https://www.cs.cmu.edu/~jure/pubs/detect-kdd07.pdf"""
# covered_nodes = set()
# selected_nodes = []
#
# # Initialize priority queue with initial marginal gains
# pq = [(-len(neighbors), 0, node) for node, neighbors in rknn_dict.items()]
# heapq.heapify(pq)
#
# iteration = 1
#
# while pq:
# # Get the node with the highest marginal gain
# neg_gain, last_iteration, node = heapq.heappop(pq)
#
# # If this node's gain was last calculated in the current iteration, it's the best node
# if last_iteration == iteration - 1:
# # Early stopping: if the max coverage is zero, append remaining nodes and break
# if neg_gain == 0:
# selected_nodes.extend(node for _, _, node in pq)
# selected_nodes.append(node)
# break
#
# # Add the selected node to the list and update the set of covered nodes
# selected_nodes.append(node)
# covered_nodes.update(rknn_dict[node])
# del rknn_dict[node]
# iteration += 1
# else:
# # Recalculate the marginal gain
# new_gain = len(set(rknn_dict[node]) - covered_nodes)
# heapq.heappush(pq, (-new_gain, iteration - 1, node))
#
# return selected_nodes
# Set environment variables to limit the number of threads
# used by OpenMP, MKL, OpenBLAS and numexpr to one per process.
# NOTE(review): these variables are generally only honoured if set *before*
# numpy (and its BLAS backend) is first imported, but numpy is imported at
# the top of this module — verify these actually take effect, or move them
# to the entry-point script before any scientific imports.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
def collect_features_used(tree) -> List[int]:
    r"""Extract the indices of all features a fitted decision tree splits on.

    Args:
        tree: A fitted estimator exposing a sklearn-style ``tree_`` attribute
            with parallel arrays ``feature``, ``children_left`` and
            ``children_right`` indexed by node id.

    Returns:
        Sorted list of the unique feature indices used at any internal
        (split) node of the tree.
    """
    # Hoist the attribute fetch: the original re-read tree.tree_ on every
    # recursive call even though it never changes.
    tree_ = tree.tree_
    features_used = set()
    # Iterative DFS instead of recursion: same visit semantics, but immune to
    # Python's recursion limit on very deep trees.
    stack = [0]  # start from the root node
    while stack:
        node_id = stack.pop()
        # Leaves carry a negative sentinel feature (sklearn uses -2);
        # only internal nodes have a real split feature.
        if tree_.feature[node_id] >= 0:
            features_used.add(tree_.feature[node_id])
            stack.append(tree_.children_left[node_id])
            stack.append(tree_.children_right[node_id])
    return sorted(features_used)  # deterministic, sorted output
def transform_features_with_tree(
    data: Data,
    ego_graph_vmed: List[torch.Tensor],
    train: List[int],
    max_depth: int = 50,
) -> Tuple[List[torch.Tensor], List[int]]:
    r"""Select the most informative feature dimensions with a decision tree.

    Fits a decision tree on the per-node ego-graph feature vectors and keeps
    only the feature columns the tree actually splits on.

    Args:
        data: Graph data object; ``data.y[train]`` supplies the labels.
        ego_graph_vmed: One 1-D feature tensor per training node
            (assumed aligned element-wise with ``train`` — TODO confirm
            against the caller).
        train: Indices of the training nodes.
        max_depth: Maximum depth of the fitted tree. Defaults to 50, the
            previously hard-coded value, so existing callers are unaffected.

    Returns:
        Tuple of (feature tensors restricted to the selected columns,
        sorted list of the selected feature indices).
    """
    # Extract features and labels as numpy arrays for sklearn.
    features = [sample.numpy() for sample in ego_graph_vmed]
    labels = data.y[train].numpy()
    # Binary features are already in [0, 1], so no scaling is applied.
    # NOTE(review): no random_state is set, so tie-breaking in the tree can
    # make the selected feature set non-deterministic across runs — confirm
    # whether reproducibility matters here.
    clf = DecisionTreeClassifier(max_depth=max_depth)
    clf.fit(features, labels)
    # All features used at any split node, sorted.
    features_used = collect_features_used(clf)
    # Keep only the selected columns of every feature tensor (fancy indexing
    # with a list of indices yields a shorter tensor, not a masked one).
    filtered_ego_graph_vmed = [tensor[features_used] for tensor in ego_graph_vmed]
    return filtered_ego_graph_vmed, features_used