From de54e9c094821e8bc91023666ed6bac2ee08fdaa Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Thu, 7 Mar 2019 18:47:16 -0800 Subject: [PATCH 1/7] add fault tolerance for trial failure --- pkg/suggestion/nasrl_service.py | 53 ++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py index 06b8d8de4ae..23c6d2ccccd 100644 --- a/pkg/suggestion/nasrl_service.py +++ b/pkg/suggestion/nasrl_service.py @@ -14,6 +14,8 @@ MANAGER_ADDRESS = "vizier-core" MANAGER_PORT = 6789 +RECALL_LIMIT = 10 +RESPAWN_LIMIT = 10 class NAS_RL_StudyJob(object): @@ -21,10 +23,13 @@ def __init__(self, request, logger): self.logger = logger self.study_id = request.study_id self.param_id = request.param_id - self.num_trials = request.request_number + self.num_trials = 1 + if request.request_number > 0: + self.num_trials = request.request_number self.study_name = None self.tf_graph = tf.Graph() self.prev_trial_ids = list() + self.prev_trials = None self.ctrl_cache_file = "ctrl_cache/{}/{}.ckpt".format(request.study_id, request.study_id) self.ctrl_step = 0 self.is_first_run = True @@ -37,6 +42,7 @@ def __init__(self, request, logger): self.search_space = None self.opt_direction = None self.objective_name = None + self.respawn_count = 0 self.logger.info("-" * 100 + "\nSetting Up Suggestion for StudyJob ID {}\n".format(request.study_id) + "-" * 100) self._get_study_param() @@ -272,13 +278,36 @@ def GetSuggestions(self, request, context): valid_acc = ctrl.reward result = self.GetEvaluationResult(study) - # In some rare cases, GetEvaluationResult() may return None - # if GetSuggestions() is called before all the trials are completed + + # Sometimes training container may fail and GetEvaluationResult() will return None + # In this case, the Suggestion will: + # 1. Try to call GetEvaluationResult() again + # 2. 
If calling GetEvaluationResult() for RECALL_LIMIT times all return None, + # then respawn the previous trials + # 3. If respawning the trials for RESPAWAN_LIMIT times still cannot collect valid results, + # then fail the task becuase it may indicate that the training container has errors. + + recall_count = 0 while result is None: - self.logger.warning(">>> GetEvaluationResult() returns None") + if study.respawn_count >= RESPAWN_LIMIT: + self.logger.warning(">>> Suggestion has spawned trials for {} times, but they all failed.".format(RESPAWN_LIMIT)) + self.logger.warning(">>> Please check whether the training container is correctly implemented") + self.logger.info(">>> StudyJob {} failed".format(study.study_name)) + return [] + + if recall_count >= RECALL_LIMIT: + self.logger.warning(">>> GetEvaluationResult() returns None for {} times. Previous trials probably failed".format(RECALL_LIMIT)) + self.logger.info(">>> Respawn the previous trials") + study.respawn_count += 1 + return self.SpawnTrials(study, study.prev_trials) + + self.logger.warning(">>> GetEvaluationResult() returns None. It will be called again after 20 seconds") time.sleep(20) - result = self.GetEvaluationResult(study) + recall_count += 1 + result = self.GetEvaluationResult(study) + + study.respawn_count = 0 # This LSTM network is designed to maximize the metrics # However, if the user wants to minimize the metrics, we can take the negative of the result if study.opt_direction == api_pb2.MINIMIZE: @@ -287,7 +316,7 @@ def GetSuggestions(self, request, context): loss, entropy, lr, gn, bl, skip, _ = sess.run( fetches=run_ops, feed_dict={valid_acc: result}) - self.logger.info(">>> Suggetion updated. LSTM Controller Reward: {}".format(loss)) + self.logger.info(">>> Suggestion updated. 
LSTM Controller Reward: {}".format(loss)) candidates = list() for _ in range(study.num_trials): @@ -342,17 +371,21 @@ def GetSuggestions(self, request, context): ) ) - self.prev_trial_ids = list() + return self.SpawnTrials(study, trials) + + def SpawnTrials(self, study, trials): + study.prev_trials = trials + study.prev_trial_ids = list() self.logger.info("") channel = grpc.beta.implementations.insecure_channel(MANAGER_ADDRESS, MANAGER_PORT) with api_pb2.beta_create_Manager_stub(channel) as client: for i, t in enumerate(trials): ctrep = client.CreateTrial(api_pb2.CreateTrialRequest(trial=t), 10) trials[i].trial_id = ctrep.trial_id - self.prev_trial_ids.append(ctrep.trial_id) + study.prev_trial_ids.append(ctrep.trial_id) self.logger.info(">>> {} Trials were created:".format(study.num_trials)) - for t in self.prev_trial_ids: + for t in study.prev_trial_ids: self.logger.info(t) self.logger.info("") @@ -368,7 +401,7 @@ def GetEvaluationResult(self, study): completed_trials = dict() for t in trials_list: - if t.Worker.trial_id in self.prev_trial_ids and t.Worker.status == api_pb2.COMPLETED: + if t.Worker.trial_id in study.prev_trial_ids and t.Worker.status == api_pb2.COMPLETED: for ml in t.metrics_logs: if ml.name == study.objective_name: completed_trials[t.Worker.trial_id] = float(ml.values[-1].value) From 6a572f6638c5b597775e886ee978c3c4837a544a Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Thu, 7 Mar 2019 18:56:16 -0800 Subject: [PATCH 2/7] fix a small typo --- pkg/suggestion/nasrl_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py index 23c6d2ccccd..f38bcf03ab9 100644 --- a/pkg/suggestion/nasrl_service.py +++ b/pkg/suggestion/nasrl_service.py @@ -285,7 +285,7 @@ def GetSuggestions(self, request, context): # 2. If calling GetEvaluationResult() for RECALL_LIMIT times all return None, # then respawn the previous trials # 3. 
If respawning the trials for RESPAWAN_LIMIT times still cannot collect valid results, - # then fail the task becuase it may indicate that the training container has errors. + # then fail the task because it may indicate that the training container has errors. recall_count = 0 while result is None: From 62031e5dfa1ffe46e69c0971c0640e3d4f7b464a Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Mon, 11 Mar 2019 11:53:18 -0700 Subject: [PATCH 3/7] fix a typo --- pkg/suggestion/nasrl_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py index f38bcf03ab9..c63a58ae0c3 100644 --- a/pkg/suggestion/nasrl_service.py +++ b/pkg/suggestion/nasrl_service.py @@ -284,7 +284,7 @@ def GetSuggestions(self, request, context): # 1. Try to call GetEvaluationResult() again # 2. If calling GetEvaluationResult() for RECALL_LIMIT times all return None, # then respawn the previous trials - # 3. If respawning the trials for RESPAWAN_LIMIT times still cannot collect valid results, + # 3. If respawning the trials for RESPAWN_LIMIT times still cannot collect valid results, # then fail the task because it may indicate that the training container has errors. 
recall_count = 0 From ae902d78fa34d0b1a71d52768d910990f790e495 Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Tue, 12 Mar 2019 15:05:20 -0700 Subject: [PATCH 4/7] improve fault processing strategy --- pkg/suggestion/nasrl_service.py | 49 +++++++++++++++++---------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py index c63a58ae0c3..8ba34646875 100644 --- a/pkg/suggestion/nasrl_service.py +++ b/pkg/suggestion/nasrl_service.py @@ -14,7 +14,7 @@ MANAGER_ADDRESS = "vizier-core" MANAGER_PORT = 6789 -RECALL_LIMIT = 10 +RESPAWN_SLEEP = 20 RESPAWN_LIMIT = 10 @@ -281,33 +281,29 @@ def GetSuggestions(self, request, context): # Sometimes training container may fail and GetEvaluationResult() will return None # In this case, the Suggestion will: - # 1. Try to call GetEvaluationResult() again - # 2. If calling GetEvaluationResult() for RECALL_LIMIT times all return None, - # then respawn the previous trials - # 3. If respawning the trials for RESPAWN_LIMIT times still cannot collect valid results, + # 1. Firstly try to respawn the previous trials after waiting for RESPAWN_SLEEP seconds + # 2. If respawning the trials for RESPAWN_LIMIT times still cannot collect valid results, # then fail the task because it may indicate that the training container has errors. - recall_count = 0 - while result is None: + if result is None: if study.respawn_count >= RESPAWN_LIMIT: self.logger.warning(">>> Suggestion has spawned trials for {} times, but they all failed.".format(RESPAWN_LIMIT)) self.logger.warning(">>> Please check whether the training container is correctly implemented") self.logger.info(">>> StudyJob {} failed".format(study.study_name)) return [] - - if recall_count >= RECALL_LIMIT: - self.logger.warning(">>> GetEvaluationResult() returns None for {} times. 
Previous trials probably failed".format(RECALL_LIMIT)) + + else: + self.logger.warning(">>> GetEvaluationResult() returns None. All the previous trials failed") + + self.logger.info(">>> Sleep for {} seconds".format(RESPAWN_SLEEP)) + time.sleep(RESPAWN_SLEEP) + self.logger.info(">>> Respawn the previous trials") study.respawn_count += 1 return self.SpawnTrials(study, study.prev_trials) - self.logger.warning(">>> GetEvaluationResult() returns None. It will be called again after 20 seconds") - time.sleep(20) - recall_count += 1 - result = self.GetEvaluationResult(study) - - study.respawn_count = 0 + # This LSTM network is designed to maximize the metrics # However, if the user wants to minimize the metrics, we can take the negative of the result if study.opt_direction == api_pb2.MINIMIZE: @@ -405,12 +401,19 @@ def GetEvaluationResult(self, study): for ml in t.metrics_logs: if ml.name == study.objective_name: completed_trials[t.Worker.trial_id] = float(ml.values[-1].value) - - if len(completed_trials) == study.num_trials: - self.logger.info(">>> Evaluation results of previous trials:") - for k in completed_trials: - self.logger.info("{}: {}".format(k, completed_trials[k])) - avg_metrics = sum(completed_trials.values()) / study.num_trials + + n_complete = len(completed_trials) + n_fail = study.num_trials - n_complete + + self.logger.info(">>> {} Trials succeeded, {} Trials failed:".format(n_complete, n_fail)) + for tid in study.prev_trial_ids: + if tid in completed_trials: + self.logger.info("{}: {}".format(tid, completed_trials[tid])) + else: + self.logger.info("{}: Failed".format(tid)) + + if n_complete > 0: + avg_metrics = sum(completed_trials.values()) / n_complete self.logger.info("The average is {}\n".format(avg_metrics)) - return avg_metrics \ No newline at end of file + return avg_metrics From c7dcc0f550e520b44bb20c4a8cb8e8c7f3e18e57 Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Tue, 12 Mar 2019 16:08:59 -0700 Subject: [PATCH 
5/7] add an important TODO --- pkg/suggestion/NAS_Reinforcement_Learning/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/README.md b/pkg/suggestion/NAS_Reinforcement_Learning/README.md index 25be4cbd88f..ebc4e6a0949 100644 --- a/pkg/suggestion/NAS_Reinforcement_Learning/README.md +++ b/pkg/suggestion/NAS_Reinforcement_Learning/README.md @@ -121,6 +121,7 @@ This neural architecture can be visualized as ![a neural netowrk architecure example](example.png) ## To Do -1. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell` -2. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts -3. Add `RequestCount` into API so that the suggestion can clean the information of completed studies. +1. Add 'micro' mode, which means searching for a nerual network cell instead of the whole nerual netowrk. +2. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell` +3. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts +4. Add `RequestCount` into API so that the suggestion can clean the information of completed studies. From fa19ab9a20572d994c02c429eb1a02cd65b8163f Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Tue, 12 Mar 2019 16:11:14 -0700 Subject: [PATCH 6/7] fix typo --- pkg/suggestion/NAS_Reinforcement_Learning/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/README.md b/pkg/suggestion/NAS_Reinforcement_Learning/README.md index ebc4e6a0949..9bb84cd4de0 100644 --- a/pkg/suggestion/NAS_Reinforcement_Learning/README.md +++ b/pkg/suggestion/NAS_Reinforcement_Learning/README.md @@ -121,7 +121,7 @@ This neural architecture can be visualized as ![a neural netowrk architecure example](example.png) ## To Do -1. 
Add 'micro' mode, which means searching for a nerual network cell instead of the whole nerual netowrk. +1. Add 'micro' mode, which means searching for a neural cell instead of the whole neural network. 2. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell` 3. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts 4. Add `RequestCount` into API so that the suggestion can clean the information of completed studies. From 9e582361f8f9ce36772e72b278bed33da8481d97 Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Tue, 12 Mar 2019 16:18:10 -0700 Subject: [PATCH 7/7] add some more TODOs --- pkg/suggestion/NAS_Reinforcement_Learning/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/README.md b/pkg/suggestion/NAS_Reinforcement_Learning/README.md index 9bb84cd4de0..a92acb6fa73 100644 --- a/pkg/suggestion/NAS_Reinforcement_Learning/README.md +++ b/pkg/suggestion/NAS_Reinforcement_Learning/README.md @@ -122,6 +122,8 @@ This neural architecture can be visualized as ## To Do 1. Add 'micro' mode, which means searching for a neural cell instead of the whole neural network. -2. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell` -3. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts -4. Add `RequestCount` into API so that the suggestion can clean the information of completed studies. +2. Add support for recurrent neural networks and build a training container for the Penn Treebank task. +3. Add parameter sharing, if possible. +4. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell` +5. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts +6. Add `RequestCount` into API so that the suggestion can clean the information of completed studies.