diff --git a/src/graph/sampler.cc b/src/graph/sampler.cc
index e3a3043a43f1..5b0ceb851406 100644
--- a/src/graph/sampler.cc
+++ b/src/graph/sampler.cc
@@ -220,116 +220,15 @@ static void GetNonUniformSample(const float* probability,
 }
 
 /*
- * Used for subgraph sampling
+ * This constructs a subgraph from the sampled result.
  */
-struct neigh_list {
-  std::vector<dgl_id_t> neighs;
-  std::vector<dgl_id_t> edges;
-  neigh_list(const std::vector<dgl_id_t> &_neighs,
-             const std::vector<dgl_id_t> &_edges)
-    : neighs(_neighs), edges(_edges) {}
-};
-
-SampledSubgraph ImmutableGraph::SampleSubgraph(IdArray seed_arr,
-                                               const float* probability,
-                                               const std::string &neigh_type,
-                                               int num_hops,
-                                               size_t num_neighbor) const {
-  unsigned int time_seed = time(nullptr);
-  size_t num_seeds = seed_arr->shape[0];
-  auto orig_csr = neigh_type == "in" ? GetInCSR() : GetOutCSR();
-  const dgl_id_t* val_list = orig_csr->edge_ids.data();
-  const dgl_id_t* col_list = orig_csr->indices.data();
-  const int64_t* indptr = orig_csr->indptr.data();
-  const dgl_id_t* seed = static_cast<dgl_id_t*>(seed_arr->data);
-
-  std::unordered_set<dgl_id_t> sub_ver_map;  // The vertex Ids in a layer.
-  std::vector<std::pair<dgl_id_t, int> > sub_vers;
-  sub_vers.reserve(num_seeds * 10);
-  // add seed vertices
-  for (size_t i = 0; i < num_seeds; ++i) {
-    auto ret = sub_ver_map.insert(seed[i]);
-    // If the vertex is inserted successfully.
-    if (ret.second) {
-      sub_vers.emplace_back(seed[i], 0);
-    }
-  }
-  std::vector<dgl_id_t> tmp_sampled_src_list;
-  std::vector<dgl_id_t> tmp_sampled_edge_list;
-  // ver_id, position
-  std::vector<std::pair<dgl_id_t, size_t> > neigh_pos;
-  neigh_pos.reserve(num_seeds);
-  std::vector<dgl_id_t> neighbor_list;
-  std::vector<size_t> layer_offsets(num_hops + 1);
-  int64_t num_edges = 0;
-
-  layer_offsets[0] = 0;
-  layer_offsets[1] = sub_vers.size();
-  size_t idx = 0;
-  for (size_t layer_id = 1; layer_id < num_hops; layer_id++) {
-    // We need to avoid resampling the same node in a layer, but we allow a node
-    // to be resampled in multiple layers. We use `sub_ver_map` to keep track of
-    // sampled nodes in a layer, and clear it when entering a new layer.
-    sub_ver_map.clear();
-    // sub_vers is used both as a node collection and a queue.
-    // In the while loop, we iterate over sub_vers and new nodes are added to the vector.
-    // A vertex in the vector only needs to be accessed once. If there is a vertex behind idx
-    // isn't in the last level, we will sample its neighbors. If not, the while loop terminates.
-    while (idx < sub_vers.size() && layer_id - 1 == sub_vers[idx].second) {
-      dgl_id_t dst_id = sub_vers[idx].first;
-      int cur_node_level = sub_vers[idx].second;
-      idx++;
-
-      tmp_sampled_src_list.clear();
-      tmp_sampled_edge_list.clear();
-      dgl_id_t ver_len = *(indptr+dst_id+1) - *(indptr+dst_id);
-      if (probability == nullptr) {  // uniform-sample
-        GetUniformSample(val_list + *(indptr + dst_id),
-                         col_list + *(indptr + dst_id),
-                         ver_len,
-                         num_neighbor,
-                         &tmp_sampled_src_list,
-                         &tmp_sampled_edge_list,
-                         &time_seed);
-      } else {  // non-uniform-sample
-        GetNonUniformSample(probability,
-                            val_list + *(indptr + dst_id),
-                            col_list + *(indptr + dst_id),
-                            ver_len,
-                            num_neighbor,
-                            &tmp_sampled_src_list,
-                            &tmp_sampled_edge_list,
-                            &time_seed);
-      }
-      CHECK_EQ(tmp_sampled_src_list.size(), tmp_sampled_edge_list.size());
-      size_t pos = neighbor_list.size();
-      neigh_pos.emplace_back(dst_id, pos);
-      // First we push the size of neighbor vector
-      neighbor_list.push_back(tmp_sampled_edge_list.size());
-      // Then push the vertices
-      for (size_t i = 0; i < tmp_sampled_src_list.size(); ++i) {
-        neighbor_list.push_back(tmp_sampled_src_list[i]);
-      }
-      // Finally we push the edge list
-      for (size_t i = 0; i < tmp_sampled_edge_list.size(); ++i) {
-        neighbor_list.push_back(tmp_sampled_edge_list[i]);
-      }
-      num_edges += tmp_sampled_src_list.size();
-      for (size_t i = 0; i < tmp_sampled_src_list.size(); ++i) {
-        // We need to add the neighbor in the hashtable here. This ensures that
-        // the vertex in the queue is unique. If we see a vertex before, we don't
-        // need to add it to the queue again.
-        auto ret = sub_ver_map.insert(tmp_sampled_src_list[i]);
-        // If the sampled neighbor is inserted to the map successfully.
-        if (ret.second) {
-          sub_vers.emplace_back(tmp_sampled_src_list[i], cur_node_level + 1);
-        }
-      }
-    }
-    layer_offsets[layer_id + 1] = layer_offsets[layer_id] + sub_ver_map.size();
-  }
-
+SampledSubgraph ConstructSubgraph(const std::vector<std::pair<dgl_id_t, int>> &sub_vers,
+                                  const std::vector<size_t> &layer_offsets,
+                                  const std::vector<std::pair<dgl_id_t, size_t>> &neigh_pos,
+                                  const std::vector<dgl_id_t> &neighbor_list,
+                                  size_t num_edges) {
   uint64_t num_vertices = sub_vers.size();
+  size_t num_hops = layer_offsets.size() - 1;
   SampledSubgraph subg;
   subg.induced_vertices = IdArray::Empty({static_cast<int64_t>(num_vertices)},
                                          DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
@@ -345,17 +244,13 @@ SampledSubgraph ImmutableGraph::SampleSubgraph(IdArray seed_arr,
   dgl_id_t* val_list_out = static_cast<dgl_id_t*>(subg.induced_edges->data);
 
   // Construct sub_csr_graph
-  auto subg_csr = std::make_shared<CSR>(num_vertices, num_edges);
+  auto subg_csr = std::make_shared<ImmutableGraph::CSR>(num_vertices, num_edges);
   subg_csr->indices.resize(num_edges);
   subg_csr->edge_ids.resize(num_edges);
   dgl_id_t* col_list_out = subg_csr->indices.data();
   int64_t* indptr_out = subg_csr->indptr.data();
   size_t collected_nedges = 0;
 
-  // The data from the previous steps:
-  // * node data: sub_vers (vid, layer), neigh_pos,
-  // * edge data: neighbor_list, probability.
-  // * layer_offsets: the offset in sub_vers.
   dgl_id_t ver_id = 0;
   std::vector<std::unordered_map<dgl_id_t, dgl_id_t>> layer_ver_maps;
   layer_ver_maps.resize(num_hops + 1);
@@ -364,8 +259,7 @@ SampledSubgraph ImmutableGraph::SampleSubgraph(IdArray seed_arr,
     // after remap to a subgraph.
     std::sort(sub_vers.begin() + layer_offsets[layer_id],
               sub_vers.begin() + layer_offsets[layer_id + 1],
-              [](const std::pair<dgl_id_t, int> &a1,
-                const std::pair<dgl_id_t, int> &a2) {
+              [](const std::pair<dgl_id_t, int> &a1, const std::pair<dgl_id_t, int> &a2) {
                 return a1.first < a2.first;
               });
 
@@ -437,6 +331,107 @@ SampledSubgraph ImmutableGraph::SampleSubgraph(IdArray seed_arr,
   return subg;
 }
+SampledSubgraph ImmutableGraph::SampleSubgraph(IdArray seed_arr,
+                                               const float* probability,
+                                               const std::string &neigh_type,
+                                               int num_hops,
+                                               size_t num_neighbor) const {
+  unsigned int time_seed = time(nullptr);
+  size_t num_seeds = seed_arr->shape[0];
+  auto orig_csr = neigh_type == "in" ? GetInCSR() : GetOutCSR();
+  const dgl_id_t* val_list = orig_csr->edge_ids.data();
+  const dgl_id_t* col_list = orig_csr->indices.data();
+  const int64_t* indptr = orig_csr->indptr.data();
+  const dgl_id_t* seed = static_cast<dgl_id_t*>(seed_arr->data);
+
+  std::unordered_set<dgl_id_t> sub_ver_map;  // The vertex Ids in a layer.
+  std::vector<std::pair<dgl_id_t, int> > sub_vers;
+  sub_vers.reserve(num_seeds * 10);
+  // add seed vertices
+  for (size_t i = 0; i < num_seeds; ++i) {
+    auto ret = sub_ver_map.insert(seed[i]);
+    // If the vertex is inserted successfully.
+    if (ret.second) {
+      sub_vers.emplace_back(seed[i], 0);
+    }
+  }
+  std::vector<dgl_id_t> tmp_sampled_src_list;
+  std::vector<dgl_id_t> tmp_sampled_edge_list;
+  // ver_id, position
+  std::vector<std::pair<dgl_id_t, size_t> > neigh_pos;
+  neigh_pos.reserve(num_seeds);
+  std::vector<dgl_id_t> neighbor_list;
+  std::vector<size_t> layer_offsets(num_hops + 1);
+  int64_t num_edges = 0;
+
+  layer_offsets[0] = 0;
+  layer_offsets[1] = sub_vers.size();
+  size_t idx = 0;
+  for (size_t layer_id = 1; layer_id < num_hops; layer_id++) {
+    // We need to avoid resampling the same node in a layer, but we allow a node
+    // to be resampled in multiple layers. We use `sub_ver_map` to keep track of
+    // sampled nodes in a layer, and clear it when entering a new layer.
+    sub_ver_map.clear();
+    // sub_vers is used both as a node collection and a queue.
+    // In the while loop, we iterate over sub_vers and new nodes are added to the vector.
+    // A vertex in the vector only needs to be accessed once. If the vertex at idx is in the
+    // last level, we sample its neighbors. If not, the while loop terminates.
+    while (idx < sub_vers.size() && layer_id - 1 == sub_vers[idx].second) {
+      dgl_id_t dst_id = sub_vers[idx].first;
+      int cur_node_level = sub_vers[idx].second;
+      idx++;
+
+      tmp_sampled_src_list.clear();
+      tmp_sampled_edge_list.clear();
+      dgl_id_t ver_len = *(indptr+dst_id+1) - *(indptr+dst_id);
+      if (probability == nullptr) {  // uniform-sample
+        GetUniformSample(val_list + *(indptr + dst_id),
+                         col_list + *(indptr + dst_id),
+                         ver_len,
+                         num_neighbor,
+                         &tmp_sampled_src_list,
+                         &tmp_sampled_edge_list,
+                         &time_seed);
+      } else {  // non-uniform-sample
+        GetNonUniformSample(probability,
+                            val_list + *(indptr + dst_id),
+                            col_list + *(indptr + dst_id),
+                            ver_len,
+                            num_neighbor,
+                            &tmp_sampled_src_list,
+                            &tmp_sampled_edge_list,
+                            &time_seed);
+      }
+      CHECK_EQ(tmp_sampled_src_list.size(), tmp_sampled_edge_list.size());
+      size_t pos = neighbor_list.size();
+      neigh_pos.emplace_back(dst_id, pos);
+      // First we push the size of neighbor vector
+      neighbor_list.push_back(tmp_sampled_edge_list.size());
+      // Then push the vertices
+      for (size_t i = 0; i < tmp_sampled_src_list.size(); ++i) {
+        neighbor_list.push_back(tmp_sampled_src_list[i]);
+      }
+      // Finally we push the edge list
+      for (size_t i = 0; i < tmp_sampled_edge_list.size(); ++i) {
+        neighbor_list.push_back(tmp_sampled_edge_list[i]);
+      }
+      num_edges += tmp_sampled_src_list.size();
+      for (size_t i = 0; i < tmp_sampled_src_list.size(); ++i) {
+        // We need to add the neighbor in the hashtable here. This ensures that
+        // the vertex in the queue is unique. If we see a vertex before, we don't
+        // need to add it to the queue again.
+        auto ret = sub_ver_map.insert(tmp_sampled_src_list[i]);
+        // If the sampled neighbor is inserted to the map successfully.
+        if (ret.second) {
+          sub_vers.emplace_back(tmp_sampled_src_list[i], cur_node_level + 1);
+        }
+      }
+    }
+    layer_offsets[layer_id + 1] = layer_offsets[layer_id] + sub_ver_map.size();
+  }
+  return ConstructSubgraph(sub_vers, layer_offsets, neigh_pos, neighbor_list, num_edges);
+}
+
 SampledSubgraph ImmutableGraph::NeighborUniformSample(IdArray seeds,
                                                       const std::string &neigh_type,
                                                       int num_hops, int expand_factor) const {
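Note (not part of the patch): the sampling loop above packs its per-vertex results into a flat `neighbor_list` buffer, with `neigh_pos` recording where each sampled vertex's record starts; `ConstructSubgraph` then consumes `sub_vers`, `layer_offsets`, `neigh_pos`, and `neighbor_list` to build the subgraph CSR. Below is a minimal, hypothetical sketch of how one record in that buffer could be decoded, assuming the layout shown above (neighbor count, then neighbor vertex IDs, then edge IDs) and assuming `dgl_id_t` is a 64-bit unsigned integer. The `NeighborBlock` struct and `DecodeNeighborBlock` helper are illustrative names only, not part of DGL.

#include <cstddef>
#include <cstdint>
#include <vector>

using dgl_id_t = uint64_t;  // assumption: DGL's vertex/edge id type is a 64-bit unsigned integer

struct NeighborBlock {           // hypothetical helper for illustration
  dgl_id_t dst;                  // the sampled vertex
  std::vector<dgl_id_t> neighs;  // its sampled neighbor vertex ids
  std::vector<dgl_id_t> edges;   // the corresponding edge ids
};

// Decode one record of the packed buffer written by the sampling loop:
//   neighbor_list[pos]                     = n, the number of sampled neighbors
//   neighbor_list[pos + 1 .. pos + n]      = neighbor vertex ids
//   neighbor_list[pos + n + 1 .. pos + 2n] = edge ids
NeighborBlock DecodeNeighborBlock(const std::vector<dgl_id_t> &neighbor_list,
                                  dgl_id_t dst, size_t pos) {
  NeighborBlock blk;
  blk.dst = dst;
  const size_t n = static_cast<size_t>(neighbor_list[pos]);
  for (size_t i = 0; i < n; ++i) {
    blk.neighs.push_back(neighbor_list[pos + 1 + i]);      // vertex section
    blk.edges.push_back(neighbor_list[pos + 1 + n + i]);   // edge-id section
  }
  return blk;
}

// Usage: each entry of neigh_pos is a (vertex id, offset into neighbor_list) pair,
// so the whole buffer can be walked as
//   for (const auto &p : neigh_pos)
//     NeighborBlock blk = DecodeNeighborBlock(neighbor_list, p.first, p.second);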