From de54e9c094821e8bc91023666ed6bac2ee08fdaa Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Thu, 7 Mar 2019 18:47:16 -0800 Subject: [PATCH 1/7] add fault tolerance for trial failure --- pkg/suggestion/nasrl_service.py | 53 ++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py index 06b8d8de4ae..23c6d2ccccd 100644 --- a/pkg/suggestion/nasrl_service.py +++ b/pkg/suggestion/nasrl_service.py @@ -14,6 +14,8 @@ MANAGER_ADDRESS = "vizier-core" MANAGER_PORT = 6789 +RECALL_LIMIT = 10 +RESPAWN_LIMIT = 10 class NAS_RL_StudyJob(object): @@ -21,10 +23,13 @@ def __init__(self, request, logger): self.logger = logger self.study_id = request.study_id self.param_id = request.param_id - self.num_trials = request.request_number + self.num_trials = 1 + if request.request_number > 0: + self.num_trials = request.request_number self.study_name = None self.tf_graph = tf.Graph() self.prev_trial_ids = list() + self.prev_trials = None self.ctrl_cache_file = "ctrl_cache/{}/{}.ckpt".format(request.study_id, request.study_id) self.ctrl_step = 0 self.is_first_run = True @@ -37,6 +42,7 @@ def __init__(self, request, logger): self.search_space = None self.opt_direction = None self.objective_name = None + self.respawn_count = 0 self.logger.info("-" * 100 + "\nSetting Up Suggestion for StudyJob ID {}\n".format(request.study_id) + "-" * 100) self._get_study_param() @@ -272,13 +278,36 @@ def GetSuggestions(self, request, context): valid_acc = ctrl.reward result = self.GetEvaluationResult(study) - # In some rare cases, GetEvaluationResult() may return None - # if GetSuggestions() is called before all the trials are completed + + # Sometimes training container may fail and GetEvaluationResult() will return None + # In this case, the Suggestion will: + # 1. Try to call GetEvaluationResult() again + # 2. 
If calling GetEvaluationResult() for RECALL_LIMIT times all return None, + # then respawn the previous trials + # 3. If respawning the trials for RESPAWAN_LIMIT times still cannot collect valid results, + # then fail the task becuase it may indicate that the training container has errors. + + recall_count = 0 while result is None: - self.logger.warning(">>> GetEvaluationResult() returns None") + if study.respawn_count >= RESPAWN_LIMIT: + self.logger.warning(">>> Suggestion has spawned trials for {} times, but they all failed.".format(RESPAWN_LIMIT)) + self.logger.warning(">>> Please check whether the training container is correctly implemented") + self.logger.info(">>> StudyJob {} failed".format(study.study_name)) + return [] + + if recall_count >= RECALL_LIMIT: + self.logger.warning(">>> GetEvaluationResult() returns None for {} times. Previous trials probably failed".format(RECALL_LIMIT)) + self.logger.info(">>> Respawn the previous trials") + study.respawn_count += 1 + return self.SpawnTrials(study, study.prev_trials) + + self.logger.warning(">>> GetEvaluationResult() returns None. It will be called again after 20 seconds") time.sleep(20) - result = self.GetEvaluationResult(study) + recall_count += 1 + result = self.GetEvaluationResult(study) + + study.respawn_count = 0 # This LSTM network is designed to maximize the metrics # However, if the user wants to minimize the metrics, we can take the negative of the result if study.opt_direction == api_pb2.MINIMIZE: @@ -287,7 +316,7 @@ def GetSuggestions(self, request, context): loss, entropy, lr, gn, bl, skip, _ = sess.run( fetches=run_ops, feed_dict={valid_acc: result}) - self.logger.info(">>> Suggetion updated. LSTM Controller Reward: {}".format(loss)) + self.logger.info(">>> Suggestion updated. 
LSTM Controller Reward: {}".format(loss)) candidates = list() for _ in range(study.num_trials): @@ -342,17 +371,21 @@ def GetSuggestions(self, request, context): ) ) - self.prev_trial_ids = list() + return self.SpawnTrials(study, trials) + + def SpawnTrials(self, study, trials): + study.prev_trials = trials + study.prev_trial_ids = list() self.logger.info("") channel = grpc.beta.implementations.insecure_channel(MANAGER_ADDRESS, MANAGER_PORT) with api_pb2.beta_create_Manager_stub(channel) as client: for i, t in enumerate(trials): ctrep = client.CreateTrial(api_pb2.CreateTrialRequest(trial=t), 10) trials[i].trial_id = ctrep.trial_id - self.prev_trial_ids.append(ctrep.trial_id) + study.prev_trial_ids.append(ctrep.trial_id) self.logger.info(">>> {} Trials were created:".format(study.num_trials)) - for t in self.prev_trial_ids: + for t in study.prev_trial_ids: self.logger.info(t) self.logger.info("") @@ -368,7 +401,7 @@ def GetEvaluationResult(self, study): completed_trials = dict() for t in trials_list: - if t.Worker.trial_id in self.prev_trial_ids and t.Worker.status == api_pb2.COMPLETED: + if t.Worker.trial_id in study.prev_trial_ids and t.Worker.status == api_pb2.COMPLETED: for ml in t.metrics_logs: if ml.name == study.objective_name: completed_trials[t.Worker.trial_id] = float(ml.values[-1].value) From 6a572f6638c5b597775e886ee978c3c4837a544a Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Thu, 7 Mar 2019 18:56:16 -0800 Subject: [PATCH 2/7] fix a small typo --- pkg/suggestion/nasrl_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py index 23c6d2ccccd..f38bcf03ab9 100644 --- a/pkg/suggestion/nasrl_service.py +++ b/pkg/suggestion/nasrl_service.py @@ -285,7 +285,7 @@ def GetSuggestions(self, request, context): # 2. If calling GetEvaluationResult() for RECALL_LIMIT times all return None, # then respawn the previous trials # 3. 
If respawning the trials for RESPAWAN_LIMIT times still cannot collect valid results, - # then fail the task becuase it may indicate that the training container has errors. + # then fail the task because it may indicate that the training container has errors. recall_count = 0 while result is None: From 62031e5dfa1ffe46e69c0971c0640e3d4f7b464a Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Mon, 11 Mar 2019 11:53:18 -0700 Subject: [PATCH 3/7] fix a typo --- pkg/suggestion/nasrl_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py index f38bcf03ab9..c63a58ae0c3 100644 --- a/pkg/suggestion/nasrl_service.py +++ b/pkg/suggestion/nasrl_service.py @@ -284,7 +284,7 @@ def GetSuggestions(self, request, context): # 1. Try to call GetEvaluationResult() again # 2. If calling GetEvaluationResult() for RECALL_LIMIT times all return None, # then respawn the previous trials - # 3. If respawning the trials for RESPAWAN_LIMIT times still cannot collect valid results, + # 3. If respawning the trials for RESPAWN_LIMIT times still cannot collect valid results, # then fail the task because it may indicate that the training container has errors. 
recall_count = 0 From ae902d78fa34d0b1a71d52768d910990f790e495 Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Tue, 12 Mar 2019 15:05:20 -0700 Subject: [PATCH 4/7] improve fault processing strategy --- pkg/suggestion/nasrl_service.py | 49 +++++++++++++++++---------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/pkg/suggestion/nasrl_service.py b/pkg/suggestion/nasrl_service.py index c63a58ae0c3..8ba34646875 100644 --- a/pkg/suggestion/nasrl_service.py +++ b/pkg/suggestion/nasrl_service.py @@ -14,7 +14,7 @@ MANAGER_ADDRESS = "vizier-core" MANAGER_PORT = 6789 -RECALL_LIMIT = 10 +RESPAWN_SLEEP = 20 RESPAWN_LIMIT = 10 @@ -281,33 +281,29 @@ def GetSuggestions(self, request, context): # Sometimes training container may fail and GetEvaluationResult() will return None # In this case, the Suggestion will: - # 1. Try to call GetEvaluationResult() again - # 2. If calling GetEvaluationResult() for RECALL_LIMIT times all return None, - # then respawn the previous trials - # 3. If respawning the trials for RESPAWN_LIMIT times still cannot collect valid results, + # 1. Firstly try to respawn the previous trials after waiting for RESPAWN_SLEEP seconds + # 2. If respawning the trials for RESPAWN_LIMIT times still cannot collect valid results, # then fail the task because it may indicate that the training container has errors. - recall_count = 0 - while result is None: + if result is None: if study.respawn_count >= RESPAWN_LIMIT: self.logger.warning(">>> Suggestion has spawned trials for {} times, but they all failed.".format(RESPAWN_LIMIT)) self.logger.warning(">>> Please check whether the training container is correctly implemented") self.logger.info(">>> StudyJob {} failed".format(study.study_name)) return [] - - if recall_count >= RECALL_LIMIT: - self.logger.warning(">>> GetEvaluationResult() returns None for {} times. 
Previous trials probably failed".format(RECALL_LIMIT)) + + else: + self.logger.warning(">>> GetEvaluationResult() returns None. All the previous trials failed") + + self.logger.info(">>> Sleep for {} seconds".format(RESPAWN_SLEEP)) + time.sleep(RESPAWN_SLEEP) + self.logger.info(">>> Respawn the previous trials") study.respawn_count += 1 return self.SpawnTrials(study, study.prev_trials) - self.logger.warning(">>> GetEvaluationResult() returns None. It will be called again after 20 seconds") - time.sleep(20) - recall_count += 1 - result = self.GetEvaluationResult(study) - - study.respawn_count = 0 + # This LSTM network is designed to maximize the metrics # However, if the user wants to minimize the metrics, we can take the negative of the result if study.opt_direction == api_pb2.MINIMIZE: @@ -405,12 +401,19 @@ def GetEvaluationResult(self, study): for ml in t.metrics_logs: if ml.name == study.objective_name: completed_trials[t.Worker.trial_id] = float(ml.values[-1].value) - - if len(completed_trials) == study.num_trials: - self.logger.info(">>> Evaluation results of previous trials:") - for k in completed_trials: - self.logger.info("{}: {}".format(k, completed_trials[k])) - avg_metrics = sum(completed_trials.values()) / study.num_trials + + n_complete = len(completed_trials) + n_fail = study.num_trials - n_complete + + self.logger.info(">>> {} Trials succeeded, {} Trials failed:".format(n_complete, n_fail)) + for tid in study.prev_trial_ids: + if tid in completed_trials: + self.logger.info("{}: {}".format(tid, completed_trials[tid])) + else: + self.logger.info("{}: Failed".format(tid)) + + if n_complete > 0: + avg_metrics = sum(completed_trials.values()) / n_complete self.logger.info("The average is {}\n".format(avg_metrics)) - return avg_metrics \ No newline at end of file + return avg_metrics From c7dcc0f550e520b44bb20c4a8cb8e8c7f3e18e57 Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Tue, 12 Mar 2019 16:08:59 -0700 Subject: [PATCH 
5/7] add an important TODO --- pkg/suggestion/NAS_Reinforcement_Learning/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/README.md b/pkg/suggestion/NAS_Reinforcement_Learning/README.md index 25be4cbd88f..ebc4e6a0949 100644 --- a/pkg/suggestion/NAS_Reinforcement_Learning/README.md +++ b/pkg/suggestion/NAS_Reinforcement_Learning/README.md @@ -121,6 +121,7 @@ This neural architecture can be visualized as ![a neural netowrk architecure example](example.png) ## To Do -1. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell` -2. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts -3. Add `RequestCount` into API so that the suggestion can clean the information of completed studies. +1. Add 'micro' mode, which means searching for a nerual network cell instead of the whole nerual netowrk. +2. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell` +3. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts +4. Add `RequestCount` into API so that the suggestion can clean the information of completed studies. From fa19ab9a20572d994c02c429eb1a02cd65b8163f Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Tue, 12 Mar 2019 16:11:14 -0700 Subject: [PATCH 6/7] fix typo --- pkg/suggestion/NAS_Reinforcement_Learning/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/README.md b/pkg/suggestion/NAS_Reinforcement_Learning/README.md index ebc4e6a0949..9bb84cd4de0 100644 --- a/pkg/suggestion/NAS_Reinforcement_Learning/README.md +++ b/pkg/suggestion/NAS_Reinforcement_Learning/README.md @@ -121,7 +121,7 @@ This neural architecture can be visualized as ![a neural netowrk architecure example](example.png) ## To Do -1. 
Add 'micro' mode, which means searching for a nerual network cell instead of the whole nerual netowrk. +1. Add 'micro' mode, which means searching for a neural cell instead of the whole neural network. 2. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell` 3. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts 4. Add `RequestCount` into API so that the suggestion can clean the information of completed studies. From 9e582361f8f9ce36772e72b278bed33da8481d97 Mon Sep 17 00:00:00 2001 From: DeeperMind <1155077043@link.cuhk.edu.hk> Date: Tue, 12 Mar 2019 16:18:10 -0700 Subject: [PATCH 7/7] add some more TODOs --- pkg/suggestion/NAS_Reinforcement_Learning/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pkg/suggestion/NAS_Reinforcement_Learning/README.md b/pkg/suggestion/NAS_Reinforcement_Learning/README.md index 9bb84cd4de0..a92acb6fa73 100644 --- a/pkg/suggestion/NAS_Reinforcement_Learning/README.md +++ b/pkg/suggestion/NAS_Reinforcement_Learning/README.md @@ -122,6 +122,8 @@ This neural architecture can be visualized as ## To Do 1. Add 'micro' mode, which means searching for a neural cell instead of the whole neural network. -2. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell` -3. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts -4. Add `RequestCount` into API so that the suggestion can clean the information of completed studies. +2. Add support for recurrent neural networks and build a training container for the Penn Treebank task. +3. Add parameter sharing, if possible. +4. Change LSTM cell from self defined functions in LSTM.py to `tf.nn.rnn_cell.LSTMCell` +5. Store the suggestion checkpoint to PVC to protect against unexpected nasrl service pod restarts +6. Add `RequestCount` into API so that the suggestion can clean the information of completed studies.