From e30dfa5e1a7672f89ccdaa7cc5c8b61eb5d5a8ee Mon Sep 17 00:00:00 2001 From: Weiyue Su Date: Thu, 18 Mar 2021 16:42:35 +0800 Subject: [PATCH 1/3] sample with srand --- paddle/fluid/distributed/table/weighted_sampler.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/distributed/table/weighted_sampler.cc b/paddle/fluid/distributed/table/weighted_sampler.cc index 09ecdc2b642e4a..b2753399db5d66 100644 --- a/paddle/fluid/distributed/table/weighted_sampler.cc +++ b/paddle/fluid/distributed/table/weighted_sampler.cc @@ -41,6 +41,9 @@ std::vector WeightedSampler::sample_k(int k) { float subtract; std::unordered_map subtract_weight_map; std::unordered_map subtract_count_map; + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); while (k--) { float query_weight = rand() % 100000 / 100000.0; query_weight *= weight - subtract_weight_map[this]; From 90f30ce85db7b31abb9fe15c4ecd837af423ed64 Mon Sep 17 00:00:00 2001 From: Weiyue Su Date: Fri, 19 Mar 2021 10:26:18 +0800 Subject: [PATCH 2/3] random sample --- .../distributed/table/common_graph_table.cc | 7 +-- paddle/fluid/distributed/table/graph_node.cc | 15 ++++-- paddle/fluid/distributed/table/graph_node.h | 4 +- .../distributed/table/weighted_sampler.cc | 47 +++++++++++++++++-- .../distributed/table/weighted_sampler.h | 24 ++++++++-- 5 files changed, 81 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 70c43e70990990..2af7c7338cbee3 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -133,6 +133,7 @@ int32_t GraphTable::load_nodes(const std::string &path) { int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { auto paths = paddle::string::split_string(path, ";"); int count = 0; + std::string sample_type = "random"; for (auto path : paths) { std::ifstream file(path); @@ -146,9 +147,10 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { if (reverse_edge) { std::swap(src_id, dst_id); } - float weight = 0; + float weight = 1; if (values.size() == 3) { weight = std::stof(values[2]); + sample_type = "weighted"; } size_t src_shard_id = src_id % shard_num; @@ -171,8 +173,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { for (auto &shard : shards) { auto bucket = shard.get_bucket(); for (int i = 0; i < bucket.size(); i++) { - bucket[i]->build_sampler(); - } + bucket[i]->build_sampler(sample_type); } } return 0; } diff --git a/paddle/fluid/distributed/table/graph_node.cc b/paddle/fluid/distributed/table/graph_node.cc index 8408ea7aeafed1..4d557ca14182ba 100644 --- a/paddle/fluid/distributed/table/graph_node.cc +++ b/paddle/fluid/distributed/table/graph_node.cc @@ -22,10 +22,15 @@ int GraphNode::int_size = sizeof(int); int GraphNode::get_size(bool need_feature) { return id_size + int_size + (need_feature ? feature.size() : 0); } -void GraphNode::build_sampler() { - sampler = new WeightedSampler(); - GraphEdge** arr = edges.data(); - sampler->build((WeightedObject**)arr, 0, edges.size()); +void GraphNode::build_sampler(std::string sample_type) { + if (sample_type == "random"){ + sampler = new RandomSampler(); + } else if (sample_type == "weighted"){ + sampler = new WeightedSampler(); + } + //GraphEdge** arr = edges.data(); + //sampler->build((WeightedObject**)arr, 0, edges.size()); + sampler->build((std::vector*)&edges); } void GraphNode::to_buffer(char* buffer, bool need_feature) { int size = get_size(need_feature); @@ -51,4 +56,4 @@ void GraphNode::recover_from_buffer(char* buffer) { // type = GraphNodeType(int_state); } } -} \ No newline at end of file +} diff --git a/paddle/fluid/distributed/table/graph_node.h b/paddle/fluid/distributed/table/graph_node.h index dbd387b2d4519a..f5169129d9f47f 100644 --- a/paddle/fluid/distributed/table/graph_node.h +++ b/paddle/fluid/distributed/table/graph_node.h @@ -40,7 +40,7 @@ class GraphNode { void set_feature(std::string feature) { this->feature = feature; } std::string get_feature() { return feature; } virtual int get_size(bool need_feature); - virtual void build_sampler(); + virtual void build_sampler(std::string sample_type); virtual void to_buffer(char *buffer, bool need_feature); virtual void recover_from_buffer(char *buffer); virtual void add_edge(GraphEdge *edge) { edges.push_back(edge); } @@ -58,7 +58,7 @@ class GraphNode { protected: uint64_t id; std::string feature; - WeightedSampler *sampler; + Sampler *sampler; std::vector edges; }; } diff --git a/paddle/fluid/distributed/table/weighted_sampler.cc b/paddle/fluid/distributed/table/weighted_sampler.cc index b2753399db5d66..52d44068a21dcd 100644 --- a/paddle/fluid/distributed/table/weighted_sampler.cc +++ b/paddle/fluid/distributed/table/weighted_sampler.cc @@ -14,9 +14,50 @@ #include "paddle/fluid/distributed/table/weighted_sampler.h" #include +#include namespace paddle { namespace distributed { -void WeightedSampler::build(WeightedObject **v, int start, int end) { + +void RandomSampler::build(std::vector* edges) { + this->edges = edges; +} + +std::vector RandomSampler::sample_k(int k) { + int n = edges->size(); + if (k > n){ + k = n; + } + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + std::vector sample_result; + std::unordered_map replace_map; + while(k--){ + int rand_int = rand() % n; + auto tmp = replace_map.find(rand_int); + if(tmp == replace_map.end()){ + sample_result.push_back(edges->at(rand_int)); + }else{ + sample_result.push_back(edges->at(tmp->second)); + } + + tmp = replace_map.find(n - 1); + if(tmp == replace_map.end()){ + replace_map[rand_int] = n - 1; + }else{ + replace_map[rand_int] = tmp->second; + } + --n; + } + return sample_result; +} + +void WeightedSampler::build(std::vector* edges) { + WeightedObject** v = edges->data(); + return build_one(v, 0, edges->size()); +} + +void WeightedSampler::build_one(WeightedObject **v, int start, int end) { count = 0; if (start + 1 == end) { left = right = NULL; @@ -27,8 +68,8 @@ void WeightedSampler::build(WeightedObject **v, int start, int end) { } else { left = new WeightedSampler(); right = new WeightedSampler(); - left->build(v, start, start + (end - start) / 2); - right->build(v, start + (end - start) / 2, end); + left->build_one(v, start, start + (end - start) / 2); + right->build_one(v, start + (end - start) / 2, end); weight = left->weight + right->weight; count = left->count + right->count; } diff --git a/paddle/fluid/distributed/table/weighted_sampler.h b/paddle/fluid/distributed/table/weighted_sampler.h index 9ed2cc04649de8..4d4640a33f5744 100644 --- a/paddle/fluid/distributed/table/weighted_sampler.h +++ b/paddle/fluid/distributed/table/weighted_sampler.h @@ -18,6 +18,7 @@ #include namespace paddle { namespace distributed { + class WeightedObject { public: WeightedObject() {} @@ -26,14 +27,31 @@ class WeightedObject { virtual float get_weight() = 0; }; -class WeightedSampler { +class Sampler { +public: + virtual ~Sampler() {} + virtual void build(std::vector* edges) = 0; + virtual std::vector sample_k(int k) = 0; +}; + +class RandomSampler: public Sampler { +public: + virtual ~RandomSampler() {} + virtual void build(std::vector* edges); + virtual std::vector sample_k(int k); + std::vector* edges; +}; + +class WeightedSampler: public Sampler { public: + virtual ~WeightedSampler() {} WeightedSampler *left, *right; WeightedObject *object; int count; float weight; - void build(WeightedObject **v, int start, int end); - std::vector sample_k(int k); + virtual void build(std::vector* edges); + virtual void build_one(WeightedObject **v, int start, int end); + virtual std::vector sample_k(int k); private: WeightedObject *sample( From ec2555ab1229dc2569304cb02f9ee6efdf32627b Mon Sep 17 00:00:00 2001 From: Weiyue Su Date: Fri, 19 Mar 2021 12:07:50 +0800 Subject: [PATCH 3/3] destruct weighted sampler --- .../distributed/table/weighted_sampler.cc | 41 +++++++++++++++---- .../distributed/table/weighted_sampler.h | 3 +- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/distributed/table/weighted_sampler.cc b/paddle/fluid/distributed/table/weighted_sampler.cc index 52d44068a21dcd..707d7ab0ba7d42 100644 --- a/paddle/fluid/distributed/table/weighted_sampler.cc +++ b/paddle/fluid/distributed/table/weighted_sampler.cc @@ -34,25 +34,50 @@ std::vector RandomSampler::sample_k(int k) { std::unordered_map replace_map; while(k--){ int rand_int = rand() % n; - auto tmp = replace_map.find(rand_int); - if(tmp == replace_map.end()){ + auto iter = replace_map.find(rand_int); + if(iter == replace_map.end()){ sample_result.push_back(edges->at(rand_int)); }else{ - sample_result.push_back(edges->at(tmp->second)); + sample_result.push_back(edges->at(iter->second)); } - tmp = replace_map.find(n - 1); - if(tmp == replace_map.end()){ + iter = replace_map.find(n - 1); + if(iter == replace_map.end()){ replace_map[rand_int] = n - 1; }else{ - replace_map[rand_int] = tmp->second; + replace_map[rand_int] = iter->second; } --n; } return sample_result; } +WeightedSampler::WeightedSampler(){ + left = nullptr; + right = nullptr; + object = nullptr; +} + +WeightedSampler::~WeightedSampler() { + if(left != nullptr){ + delete left; + left = nullptr; + } + if(right != nullptr){ + delete right; + right = nullptr; + } +} + void WeightedSampler::build(std::vector* edges) { + if(left != nullptr){ + delete left; + left = nullptr; + } + if(right != nullptr){ + delete right; + right = nullptr; + } WeightedObject** v = edges->data(); return build_one(v, 0, edges->size()); } @@ -60,7 +85,7 @@ void WeightedSampler::build(std::vector* edges) { void WeightedSampler::build_one(WeightedObject **v, int start, int end) { count = 0; if (start + 1 == end) { - left = right = NULL; + left = right = nullptr; weight = v[start]->get_weight(); object = v[start]; count = 1; @@ -98,7 +123,7 @@ WeightedObject *WeightedSampler::sample( std::unordered_map &subtract_weight_map, std::unordered_map &subtract_count_map, float &subtract) { - if (left == NULL) { + if (left == nullptr) { subtract_weight_map[this] = weight; subtract = weight; subtract_count_map[this] = 1; diff --git a/paddle/fluid/distributed/table/weighted_sampler.h b/paddle/fluid/distributed/table/weighted_sampler.h index 4d4640a33f5744..4a7d08c1404a93 100644 --- a/paddle/fluid/distributed/table/weighted_sampler.h +++ b/paddle/fluid/distributed/table/weighted_sampler.h @@ -44,7 +44,8 @@ class RandomSampler: public Sampler { class WeightedSampler: public Sampler { public: - virtual ~WeightedSampler() {} + WeightedSampler(); + virtual ~WeightedSampler(); WeightedSampler *left, *right; WeightedObject *object; int count;