From e6234d1002eb529455fafde168f30ee98fe7cc0f Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Fri, 1 Mar 2019 20:05:15 -0800 Subject: [PATCH 1/6] support multiple trials --- examples/nasjob-example-RL.yaml | 2 + .../SuggestionParam.py | 54 +++---- pkg/suggestion/nasrl_service.py | 145 ++++++++++-------- 3 files changed, 115 insertions(+), 86 deletions(-) diff --git a/examples/nasjob-example-RL.yaml b/examples/nasjob-example-RL.yaml index 3942aed635e..3e2946436d8 100644 --- a/examples/nasjob-example-RL.yaml +++ b/examples/nasjob-example-RL.yaml @@ -150,6 +150,8 @@ spec: suggestionSpec: suggestionAlgorithm: "nasrl" suggestionParameters: + - name: "num_trials" + value: "1" - name: "lstm_num_cells" value: "64" - name: "lstm_num_layers" diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py b/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py index ae9f1d19777..fc75b073062 100644 --- a/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py +++ b/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py @@ -1,34 +1,36 @@ def parseSuggestionParam(params_raw): param_standard = { - "lstm_num_cells": ['value', int, [1, 'inf']], - "lstm_num_layers": ['value', int, [1, 'inf']], - "lstm_keep_prob": ['value', float, [0.0, 1.0]], - "optimizer": ['categorical', str, ["adam", "momentum", "sgd"]], - "init_learning_rate": ['value', float, [1e-6, 1.0]], - "lr_decay_start": ['value', int, [0, 'inf']], - "lr_decay_every": ['value', int, [1, 'inf']], - "lr_decay_rate": ['value', float, [0.0, 1.0]], - "skip-target": ['value', float, [0.0, 1.0]], - "skip-weight": ['value', float, [0.0, 'inf']], - "l2_reg": ['value', float, [0.0, 'inf']], - "entropy_weight": ['value', float, [0.0, 'inf']], - "baseline_decay": ['value', float, [0.0, 1.0]], + "num_trials": ['value', int, [1, 'inf']], + "lstm_num_cells": ['value', int, [1, 'inf']], + "lstm_num_layers": ['value', int, [1, 'inf']], + "lstm_keep_prob": ['value', float, [0.0, 
1.0]], + "optimizer": ['categorical', str, ["adam", "momentum", "sgd"]], + "init_learning_rate": ['value', float, [1e-6, 1.0]], + "lr_decay_start": ['value', int, [0, 'inf']], + "lr_decay_every": ['value', int, [1, 'inf']], + "lr_decay_rate": ['value', float, [0.0, 1.0]], + "skip-target": ['value', float, [0.0, 1.0]], + "skip-weight": ['value', float, [0.0, 'inf']], + "l2_reg": ['value', float, [0.0, 'inf']], + "entropy_weight": ['value', float, [0.0, 'inf']], + "baseline_decay": ['value', float, [0.0, 1.0]], } suggestion_params = { - "lstm_num_cells": 64, - "lstm_num_layers": 1, - "lstm_keep_prob": 1.0, - "optimizer": "adam", - "init_learning_rate": 1e-3, - "lr_decay_start": 0, - "lr_decay_every": 1000, - "lr_decay_rate": 0.9, - "skip-target": 0.4, - "skip-weight": 0.8, - "l2_reg": 0, - "entropy_weight": 1e-4, - "baseline_decay": 0.9999 + "num_trials": 1, + "lstm_num_cells": 64, + "lstm_num_layers": 1, + "lstm_keep_prob": 1.0, + "optimizer": "adam", + "init_learning_rate": 1e-3, + "lr_decay_start": 0, + "lr_decay_every": 1000, + "lr_decay_rate": 0.9, + "skip-target": 0.4, + "skip-weight": 0.8, + "l2_reg": 0, + "entropy_weight": 1e-4, + "baseline_decay": 0.9999 } def checktype(param_name, param_value, check_mode, supposed_type, supposed_range=None): diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py index d571784ad5e..d447180b103 100644 --- a/pkg/suggestion/nasrl_service.py +++ b/pkg/suggestion/nasrl_service.py @@ -10,9 +10,11 @@ import json import os + MANAGER_ADDRESS = "vizier-core" MANAGER_PORT = 6789 + class NAS_RL_StudyJob(object): def __init__(self, request, logger): self.logger = logger @@ -20,7 +22,7 @@ def __init__(self, request, logger): self.param_id = request.param_id self.study_name = None self.tf_graph = tf.Graph() - self.prev_trial_id = None + self.prev_trial_ids = list() self.ctrl_cache_file = "ctrl_cache/{}/{}.ckpt".format(request.study_id, request.study_id) self.ctrl_step = 0 self.is_first_run = True @@ -33,11 +35,13 @@ 
def __init__(self, request, logger): self.search_space = None self.opt_direction = None self.objective_name = None + self.num_trials = 1 self.logger.info("-" * 100 + "\nSetting Up Suggestion for StudyJob ID {}\n".format(request.study_id) + "-" * 100) self._get_study_param() self._get_suggestion_param() self._setup_controller() + self.num_trials = self.suggestion_config["num_trials"] self.logger.info("Suggestion for StudyJob {} (ID: {}) has been initialized.\n".format(self.study_name, self.study_id)) def _get_study_param(self): @@ -188,7 +192,10 @@ def GetSuggestions(self, request, context): self.logger.info("First time running suggestion for {}. Random architecture will be given.".format(study.study_name)) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) - arc = sess.run(controller_ops["sample_arc"]) + candidates = list() + for _ in range(study.num_trials): + candidates.append(sess.run(controller_ops["sample_arc"])) + # TODO: will use PVC to store the checkpoint to protect against unexpected suggestion pod restart saver.save(sess, study.ctrl_cache_file) @@ -199,90 +206,108 @@ def GetSuggestions(self, request, context): saver.restore(sess, study.ctrl_cache_file) valid_acc = ctrl.reward - result = self.GetEvaluationResult(study) + results = self.GetEvaluationResult(study) + avg_result = sum(results) / len(results) + + self.logger.info("Evaluation results of previous trials: {}".format(str(results)[1:-1])) + self.logger.info("The average is {}".format(avg_result)) # This lstm cell is designed to maximize the metrics # However, if the user want to minimize the metrics, we can take the negative of the result if study.opt_direction == api_pb2.MINIMIZE: - result = -result + avg_result = -avg_result loss, entropy, lr, gn, bl, skip, _ = sess.run( fetches=run_ops, - feed_dict={valid_acc: result}) + feed_dict={valid_acc: avg_result}) self.logger.info("Suggetion updated. 
LSTM Controller Reward: {}".format(loss)) - arc = sess.run(controller_ops["sample_arc"]) + candidates = list() + for _ in range(study.num_trials): + candidates.append(sess.run(controller_ops["sample_arc"])) saver.save(sess, study.ctrl_cache_file) - - arc = arc.tolist() - organized_arc = [0 for _ in range(study.num_layers)] - record = 0 - for l in range(study.num_layers): - organized_arc[l] = arc[record: record + l + 1] - record += l + 1 - - nn_config = dict() - nn_config['num_layers'] = study.num_layers - nn_config['input_size'] = study.input_size - nn_config['output_size'] = study.output_size - nn_config['embedding'] = dict() - for l in range(study.num_layers): - opt = organized_arc[l][0] - nn_config['embedding'][opt] = study.search_space[opt].get_dict() - - organized_arc_json = json.dumps(organized_arc) - nn_config_json = json.dumps(nn_config) - - organized_arc_str = str(organized_arc_json).replace('\"', '\'') - nn_config_str = str(nn_config_json).replace('\"', '\'') - - self.logger.info("\nNew Neural Network Architecture (internal representation):") - self.logger.info(organized_arc_json) - self.logger.info("\nCorresponding Seach Space Description:") - self.logger.info(nn_config_str) - self.logger.info("") - - trials = [] - trials.append(api_pb2.Trial( - study_id=request.study_id, - parameter_set=[ - api_pb2.Parameter( - name="architecture", - value=organized_arc_str, - parameter_type= api_pb2.CATEGORICAL), - api_pb2.Parameter( - name="nn_config", - value=nn_config_str, - parameter_type= api_pb2.CATEGORICAL) - ], + + organized_candidates = list() + trials = list() + + for i in range(study.num_trials): + arc = candidates[i].tolist() + organized_arc = [0 for _ in range(study.num_layers)] + record = 0 + for l in range(study.num_layers): + organized_arc[l] = arc[record: record + l + 1] + record += l + 1 + organized_candidates.append(organized_arc) + + nn_config = dict() + nn_config['num_layers'] = study.num_layers + nn_config['input_size'] = study.input_size + 
nn_config['output_size'] = study.output_size + nn_config['embedding'] = dict() + for l in range(study.num_layers): + opt = organized_arc[l][0] + nn_config['embedding'][opt] = study.search_space[opt].get_dict() + + organized_arc_json = json.dumps(organized_arc) + nn_config_json = json.dumps(nn_config) + + organized_arc_str = str(organized_arc_json).replace('\"', '\'') + nn_config_str = str(nn_config_json).replace('\"', '\'') + + self.logger.info("\nNeural Network Architecture Candidate #{} (internal representation):".format(i)) + self.logger.info(organized_arc_json) + self.logger.info("\nCorresponding Seach Space Description:") + self.logger.info(nn_config_str) + + trials.append(api_pb2.Trial( + study_id=request.study_id, + parameter_set=[ + api_pb2.Parameter( + name="architecture", + value=organized_arc_str, + parameter_type= api_pb2.CATEGORICAL), + api_pb2.Parameter( + name="nn_config", + value=nn_config_str, + parameter_type= api_pb2.CATEGORICAL) + ], + ) ) - ) + self.logger.info("") channel = grpc.beta.implementations.insecure_channel(MANAGER_ADDRESS, MANAGER_PORT) with api_pb2.beta_create_Manager_stub(channel) as client: for i, t in enumerate(trials): ctrep = client.CreateTrial(api_pb2.CreateTrialRequest(trial=t), 10) trials[i].trial_id = ctrep.trial_id - self.logger.info("Trial {} Created\n".format(ctrep.trial_id)) - study.prev_trial_id = ctrep.trial_id - + self.logger.info("Trial {} Created".format(ctrep.trial_id)) + study.prev_trial_ids.append(ctrep.trial_id) + self.logger.info("") + study.ctrl_step += 1 return api_pb2.GetSuggestionsReply(trials=trials) def GetEvaluationResult(self, study): - worker_list = [] channel = grpc.beta.implementations.insecure_channel(MANAGER_ADDRESS, MANAGER_PORT) with api_pb2.beta_create_Manager_stub(channel) as client: - gwfrep = client.GetWorkerFullInfo(api_pb2.GetWorkerFullInfoRequest(study_id=study.study_id, trial_id=study.prev_trial_id, only_latest_log=True), 10) - worker_list = gwfrep.worker_full_infos + gwfrep = 
client.GetWorkerFullInfo(api_pb2.GetWorkerFullInfoRequest(study_id=study.study_id, only_latest_log=True), 10) + trials_list = gwfrep.worker_full_infos + + completed = True + for trial in trials_list: + completed = completed and (trial.Worker.status == api_pb2.COMPLETED) + + + if completed: + metrics = list() - for w in worker_list: - if w.Worker.status == api_pb2.COMPLETED: - for ml in w.metrics_logs: + for t in trials_list: + for ml in t.metrics_logs: if ml.name == study.objective_name: - self.logger.info("Evaluation result of previous candidate: {}".format(ml.values[-1].value)) - return float(ml.values[-1].value) + metrics.append(float(ml.values[-1].value)) + return metrics + # TODO: add support for multiple trials From deba168f4ea0142832a7b1ae90cb8adea6ff55bd Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Fri, 1 Mar 2019 23:49:49 -0800 Subject: [PATCH 2/6] adjust To Do --- pkg/suggestion/NAS_Reinforcement_Learning/README.md | 6 +++--- pkg/suggestion/nasrl_service.py | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/README.md b/pkg/suggestion/NAS_Reinforcement_Learning/README.md index 89b9735f1ff..d946e7ffbdf 100644 --- a/pkg/suggestion/NAS_Reinforcement_Learning/README.md +++ b/pkg/suggestion/NAS_Reinforcement_Learning/README.md @@ -122,6 +122,6 @@ This neural architecture can be visualized as ![a neural netowrk architecure example](example.png) ## To Do -1. Add support for multiple trials -2. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell` -3. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts +1. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell` +2. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts +3. Add `RequestCount` into API so that the suggestion can clean the information of completed studies. 
diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py index d447180b103..86b02372cf9 100644 --- a/pkg/suggestion/nasrl_service.py +++ b/pkg/suggestion/nasrl_service.py @@ -309,5 +309,3 @@ def GetEvaluationResult(self, study): metrics.append(float(ml.values[-1].value)) return metrics - - # TODO: add support for multiple trials From ef3ff9614d45b643954cdcca81e2d02510321499 Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Sat, 2 Mar 2019 00:15:20 -0800 Subject: [PATCH 3/6] language improvement in README.md --- pkg/suggestion/NAS_Reinforcement_Learning/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/README.md b/pkg/suggestion/NAS_Reinforcement_Learning/README.md index d946e7ffbdf..25be4cbd88f 100644 --- a/pkg/suggestion/NAS_Reinforcement_Learning/README.md +++ b/pkg/suggestion/NAS_Reinforcement_Learning/README.md @@ -25,9 +25,8 @@ If n = 12, m = 6, the definition of an architecture will be like: There are n rows, the ith row has i elements and describes the ith layer. Please notice that layer 0 is the input and is not included in this definition. -In each row: -The first integer ranges from 0 to m-1, indicates the operation in this layer. -The next (i-1) integers is either 0 or 1. The kth (k>=2) integer indicates whether (k-2)th layer has a skip connection with this layer. (There will always be a connection from (k-1)th layer to kth layer) +In each row, the first integer ranges from 0 to m-1 and indicates the operation in this layer. +Starting from the second position, the kth integer is a boolean value that indicates whether (k-2)th layer has a skip connection with this layer. (There will always be a connection from (k-1)th layer to kth layer) ## Output of `GetSuggestion()` The output of `GetSuggestion()` consists of two parts: `architecture` and `nn_config`. 
From d3289041299d920589bcf0da1bd1fafbc840f87b Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Mon, 4 Mar 2019 20:49:21 -0800 Subject: [PATCH 4/6] fix several problems --- examples/nasjob-example-RL.yaml | 3 +- .../NAS_Reinforcement_Learning/Controller.py | 4 +- .../SuggestionParam.py | 2 - pkg/suggestion/nasrl_service.py | 42 ++++++++++--------- 4 files changed, 26 insertions(+), 25 deletions(-) diff --git a/examples/nasjob-example-RL.yaml b/examples/nasjob-example-RL.yaml index 3e2946436d8..a64215857d9 100644 --- a/examples/nasjob-example-RL.yaml +++ b/examples/nasjob-example-RL.yaml @@ -149,9 +149,8 @@ spec: restartPolicy: Never suggestionSpec: suggestionAlgorithm: "nasrl" + requestNumber: 3 suggestionParameters: - - name: "num_trials" - value: "1" - name: "lstm_num_cells" value: "64" - name: "lstm_num_layers" diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/Controller.py b/pkg/suggestion/NAS_Reinforcement_Learning/Controller.py index 2c87868a573..937dc130b1a 100755 --- a/pkg/suggestion/NAS_Reinforcement_Learning/Controller.py +++ b/pkg/suggestion/NAS_Reinforcement_Learning/Controller.py @@ -31,7 +31,7 @@ def __init__(self, logger=None): self.logger = logger - self.logger.info("Building Controller") + self.logger.info(">>> Building Controller") self.num_layers = num_layers self.num_operations = num_operations @@ -87,7 +87,7 @@ def _create_params(self): def _build_sampler(self): """Build the sampler ops and the log_prob ops.""" - self.logger.info("Building Controller Sampler") + self.logger.info(">>> Building Controller Sampler") anchors = [] anchors_w_1 = [] diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py b/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py index fc75b073062..ace456782b3 100644 --- a/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py +++ b/pkg/suggestion/NAS_Reinforcement_Learning/SuggestionParam.py @@ -1,6 +1,5 @@ def parseSuggestionParam(params_raw): 
param_standard = { - "num_trials": ['value', int, [1, 'inf']], "lstm_num_cells": ['value', int, [1, 'inf']], "lstm_num_layers": ['value', int, [1, 'inf']], "lstm_keep_prob": ['value', float, [0.0, 1.0]], @@ -17,7 +16,6 @@ def parseSuggestionParam(params_raw): } suggestion_params = { - "num_trials": 1, "lstm_num_cells": 64, "lstm_num_layers": 1, "lstm_keep_prob": 1.0, diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py index 86b02372cf9..8e860ec1fa3 100644 --- a/pkg/suggestion/nasrl_service.py +++ b/pkg/suggestion/nasrl_service.py @@ -20,6 +20,7 @@ def __init__(self, request, logger): self.logger = logger self.study_id = request.study_id self.param_id = request.param_id + self.num_trials = request.request_number self.study_name = None self.tf_graph = tf.Graph() self.prev_trial_ids = list() @@ -35,14 +36,12 @@ def __init__(self, request, logger): self.search_space = None self.opt_direction = None self.objective_name = None - self.num_trials = 1 self.logger.info("-" * 100 + "\nSetting Up Suggestion for StudyJob ID {}\n".format(request.study_id) + "-" * 100) self._get_study_param() self._get_suggestion_param() self._setup_controller() - self.num_trials = self.suggestion_config["num_trials"] - self.logger.info("Suggestion for StudyJob {} (ID: {}) has been initialized.\n".format(self.study_name, self.study_id)) + self.logger.info(">>> Suggestion for StudyJob {} (ID: {}) has been initialized.\n".format(self.study_name, self.study_id)) def _get_study_param(self): # this function need to @@ -115,7 +114,7 @@ def print_search_space(self): self.logger.warning("Error! 
The Suggestion has not yet been initialized!") return - self.logger.info("Search Space for StudyJob {} (ID: {}):".format(self.study_name, self.study_id)) + self.logger.info(">>> Search Space for StudyJob {} (ID: {}):".format(self.study_name, self.study_id)) for opt in self.search_space: opt.print_op(self.logger) self.logger.info("There are {} operations in total.\n".format(self.num_operations)) @@ -125,12 +124,13 @@ def print_suggestion_params(self): self.logger.warning("Error! The Suggestion has not yet been initialized!") return - self.logger.info("Parameters of LSTM Controller for StudyJob {} (ID: {}):".format(self.study_name, self.study_id)) + self.logger.info(">>> Parameters of LSTM Controller for StudyJob {} (ID: {}):".format(self.study_name, self.study_id)) for spec in self.suggestion_config: if len(spec) > 13: self.logger.info("{}: \t{}".format(spec, self.suggestion_config[spec])) else: self.logger.info("{}: \t\t{}".format(spec, self.suggestion_config[spec])) + self.logger.info("RequestNumber:\t\t{}".format(self.num_trials)) self.logger.info("") @@ -189,7 +189,7 @@ def GetSuggestions(self, request, context): controller_ops["train_op"]] if study.is_first_run: - self.logger.info("First time running suggestion for {}. Random architecture will be given.".format(study.study_name)) + self.logger.info(">>> First time running suggestion for {}. Random architecture will be given.".format(study.study_name)) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) candidates = list() @@ -209,8 +209,7 @@ def GetSuggestions(self, request, context): results = self.GetEvaluationResult(study) avg_result = sum(results) / len(results) - self.logger.info("Evaluation results of previous trials: {}".format(str(results)[1:-1])) - self.logger.info("The average is {}".format(avg_result)) + self.logger.info(">>> Evaluation results of previous trials: {}. 
The average is {}".format(str(results)[1:-1], avg_result)) # This lstm cell is designed to maximize the metrics # However, if the user want to minimize the metrics, we can take the negative of the result @@ -220,7 +219,7 @@ def GetSuggestions(self, request, context): loss, entropy, lr, gn, bl, skip, _ = sess.run( fetches=run_ops, feed_dict={valid_acc: avg_result}) - self.logger.info("Suggetion updated. LSTM Controller Reward: {}".format(loss)) + self.logger.info(">>> Suggetion updated. LSTM Controller Reward: {}".format(loss)) candidates = list() for _ in range(study.num_trials): @@ -255,9 +254,9 @@ def GetSuggestions(self, request, context): organized_arc_str = str(organized_arc_json).replace('\"', '\'') nn_config_str = str(nn_config_json).replace('\"', '\'') - self.logger.info("\nNeural Network Architecture Candidate #{} (internal representation):".format(i)) + self.logger.info("\n>>> Neural Network Architecture Candidate #{} (internal representation):".format(i)) self.logger.info(organized_arc_json) - self.logger.info("\nCorresponding Seach Space Description:") + self.logger.info("\n>>> Corresponding Seach Space Description:") self.logger.info(nn_config_str) trials.append(api_pb2.Trial( @@ -275,14 +274,18 @@ def GetSuggestions(self, request, context): ) ) + self.prev_trial_ids = list() self.logger.info("") channel = grpc.beta.implementations.insecure_channel(MANAGER_ADDRESS, MANAGER_PORT) with api_pb2.beta_create_Manager_stub(channel) as client: for i, t in enumerate(trials): ctrep = client.CreateTrial(api_pb2.CreateTrialRequest(trial=t), 10) trials[i].trial_id = ctrep.trial_id - self.logger.info("Trial {} Created".format(ctrep.trial_id)) - study.prev_trial_ids.append(ctrep.trial_id) + self.prev_trial_ids.append(ctrep.trial_id) + + self.logger.info(">>> {} Trials were created:".format(study.num_trials)) + for t in self.prev_trial_ids: + self.logger.info(t) self.logger.info("") study.ctrl_step += 1 @@ -296,16 +299,17 @@ def GetEvaluationResult(self, study): 
trials_list = gwfrep.worker_full_infos completed = True - for trial in trials_list: - completed = completed and (trial.Worker.status == api_pb2.COMPLETED) + for t in trials_list: + if t.Worker.trial_id in self.prev_trial_ids: + completed = completed and (t.Worker.status == api_pb2.COMPLETED) - if completed: metrics = list() for t in trials_list: - for ml in t.metrics_logs: - if ml.name == study.objective_name: - metrics.append(float(ml.values[-1].value)) + if t.Worker.trial_id in self.prev_trial_ids: + for ml in t.metrics_logs: + if ml.name == study.objective_name: + metrics.append(float(ml.values[-1].value)) return metrics From 2f415f348f463356ed35198489b0b590990d5e6a Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Tue, 5 Mar 2019 13:46:39 -0800 Subject: [PATCH 5/6] fix a potential problem --- pkg/suggestion/nasrl_service.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py index 8e860ec1fa3..d60baec0f4c 100644 --- a/pkg/suggestion/nasrl_service.py +++ b/pkg/suggestion/nasrl_service.py @@ -298,12 +298,12 @@ def GetEvaluationResult(self, study): gwfrep = client.GetWorkerFullInfo(api_pb2.GetWorkerFullInfoRequest(study_id=study.study_id, only_latest_log=True), 10) trials_list = gwfrep.worker_full_infos - completed = True + completed_count = 0 for t in trials_list: - if t.Worker.trial_id in self.prev_trial_ids: - completed = completed and (t.Worker.status == api_pb2.COMPLETED) + if t.Worker.trial_id in self.prev_trial_ids and t.Worker.status == api_pb2.COMPLETED: + completed_count += 1 - if completed: + if completed_count == study.num_trials: metrics = list() for t in trials_list: From 8986a7f9d62ab597d5539fcf1896ae9ab5fbd658 Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Tue, 5 Mar 2019 15:48:55 -0800 Subject: [PATCH 6/6] handle the GetEvaluationResult() return None problem --- pkg/suggestion/nasrl_service.py | 45 
++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py index d60baec0f4c..ebba3e13524 100644 --- a/pkg/suggestion/nasrl_service.py +++ b/pkg/suggestion/nasrl_service.py @@ -9,6 +9,7 @@ from logging import getLogger, StreamHandler, INFO, DEBUG import json import os +import time MANAGER_ADDRESS = "vizier-core" @@ -206,19 +207,23 @@ def GetSuggestions(self, request, context): saver.restore(sess, study.ctrl_cache_file) valid_acc = ctrl.reward - results = self.GetEvaluationResult(study) - avg_result = sum(results) / len(results) + result = self.GetEvaluationResult(study) - self.logger.info(">>> Evaluation results of previous trials: {}. The average is {}".format(str(results)[1:-1], avg_result)) + # In some rare cases, GetEvaluationResult() may return None + # if GetSuggestions() is called before all the trials are completed + while result is None: + self.logger.warning(">>> GetEvaluationResult() returns None") + time.sleep(20) + result = self.GetEvaluationResult(study) - # This lstm cell is designed to maximize the metrics - # However, if the user want to minimize the metrics, we can take the negative of the result + # This LSTM network is designed to maximize the metrics + # However, if the user wants to minimize the metrics, we can take the negative of the result if study.opt_direction == api_pb2.MINIMIZE: - avg_result = -avg_result + result = -result loss, entropy, lr, gn, bl, skip, _ = sess.run( fetches=run_ops, - feed_dict={valid_acc: avg_result}) + feed_dict={valid_acc: result}) self.logger.info(">>> Suggetion updated. 
LSTM Controller Reward: {}".format(loss)) candidates = list() @@ -254,7 +259,7 @@ def GetSuggestions(self, request, context): organized_arc_str = str(organized_arc_json).replace('\"', '\'') nn_config_str = str(nn_config_json).replace('\"', '\'') - self.logger.info("\n>>> Neural Network Architecture Candidate #{} (internal representation):".format(i)) + self.logger.info("\n>>> New Neural Network Architecture Candidate #{} (internal representation):".format(i)) self.logger.info(organized_arc_json) self.logger.info("\n>>> Corresponding Seach Space Description:") self.logger.info(nn_config_str) @@ -298,18 +303,18 @@ def GetEvaluationResult(self, study): gwfrep = client.GetWorkerFullInfo(api_pb2.GetWorkerFullInfoRequest(study_id=study.study_id, only_latest_log=True), 10) trials_list = gwfrep.worker_full_infos - completed_count = 0 + completed_trials = dict() for t in trials_list: if t.Worker.trial_id in self.prev_trial_ids and t.Worker.status == api_pb2.COMPLETED: - completed_count += 1 + for ml in t.metrics_logs: + if ml.name == study.objective_name: + completed_trials[t.Worker.trial_id] = float(ml.values[-1].value) - if completed_count == study.num_trials: - metrics = list() - - for t in trials_list: - if t.Worker.trial_id in self.prev_trial_ids: - for ml in t.metrics_logs: - if ml.name == study.objective_name: - metrics.append(float(ml.values[-1].value)) - - return metrics + if len(completed_trials) == study.num_trials: + self.logger.info(">>> Evaluation results of previous trials:") + for k in completed_trials: + self.logger.info("{}: {}".format(k, completed_trials[k])) + avg_metrics = sum(completed_trials.values()) / study.num_trials + self.logger.info("The average is {}\n".format(avg_metrics)) + + return avg_metrics \ No newline at end of file