fix #1578 and some improvements (#2370)

Add shell support for SSH connections, so that remote scripts can be started with the user's environment. Minor fixes: 1. Fix gpu_metrics_collector to support pyenv. pyenv creates one extra process, so the original pgrep code always found extra processes and could not start gpu_metrics_collector. 2. Fix the NASUI failure in dev-install-node-modules by creating the subfolder every time. 3. Fix the Makefile to reduce mis-created links, and other minor issues. 4. Add a `watch` script (`tsc --watch`) for nni_manager for a better dev experience.
microsoft · Apr 26, 2020 · 1c6f1ef · 1c6f1ef
1 parent 1d893dd
commit 1c6f1ef
Show file tree

Hide file tree

Showing 7 changed files with 91 additions and 58 deletions.
diff --git a/Makefile b/Makefile
@@ -173,12 +173,12 @@ install-python-modules:
 dev-install-python-modules:
 	#$(_INFO) Installing Python SDK $(_END)
 	mkdir -p build
-	ln -sf ../src/sdk/pynni/nni build/nni
-	ln -sf ../src/sdk/pynni/nnicli build/nnicli
-	ln -sf ../tools/nni_annotation build/nni_annotation
-	ln -sf ../tools/nni_cmd build/nni_cmd
-	ln -sf ../tools/nni_trial_tool build/nni_trial_tool
-	ln -sf ../tools/nni_gpu_tool build/nni_gpu_tool
+	ln -sf ../src/sdk/pynni/nni build
+	ln -sf ../src/sdk/pycli/nnicli build
+	ln -sf ../tools/nni_annotation build
+	ln -sf ../tools/nni_cmd build
+	ln -sf ../tools/nni_trial_tool build
+	ln -sf ../tools/nni_gpu_tool build
 	cp setup.py build/
 	cp README.md build/
 	sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' build/setup.py
@@ -209,10 +209,12 @@ dev-install-node-modules:
 	ln -sf ${PWD}/src/nni_manager/dist $(NNI_PKG_FOLDER)
 	cp src/nni_manager/package.json $(NNI_PKG_FOLDER)
 	sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' $(NNI_PKG_FOLDER)/package.json
-	ln -sf ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER)/node_modules
-	ln -sf ${PWD}/src/webui/build $(NNI_PKG_FOLDER)/static
-	ln -sf ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER)/build
-	ln -sf ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER)/server.js
+	ln -sf ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER)
+	ln -sf ${PWD}/src/webui/build -t $(NNI_PKG_FOLDER)
+	mv $(NNI_PKG_FOLDER)/build $(NNI_PKG_FOLDER)/static
+	mkdir -p $(NASUI_PKG_FOLDER)
+	ln -sf ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER)
+	ln -sf ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER)
 
 .PHONY: install-scripts
 install-scripts:

diff --git a/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md b/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md
@@ -6,7 +6,7 @@
 
 For debugging NNI source code, your development environment should be under Ubuntu 16.04 (or above) system with python 3 and pip 3 installed, then follow the below steps.
 
-**1. Clone the source code**
+### 1. Clone the source code
 
 Run the command
 
@@ -16,7 +16,7 @@ git clone https://github.com/Microsoft/nni.git
 
 to clone the source code
 
-**2. Prepare the debug environment and install dependencies**
+### 2. Prepare the debug environment and install dependencies
 
 Change directory to the source code folder, then run the command
 
@@ -26,7 +26,7 @@ make install-dependencies
 
 to install the dependent tools for the environment
 
-**3. Build source code**
+### 3. Build source code
 
 Run the command
 
@@ -36,7 +36,7 @@ make build
 
 to build the source code
 
-**4. Install NNI to development environment**
+### 4. Install NNI to development environment
 
 Run the command
 
@@ -46,7 +46,7 @@ make dev-install
 
 to install the distribution content to development environment, and create cli scripts
 
-**5. Check if the environment is ready**
+### 5. Check if the environment is ready
 
 Now, you can try to start an experiment to check if your environment is ready.
 For example, run the command
@@ -57,9 +57,21 @@ nnictl create --config ~/nni/examples/trials/mnist-tfv1/config.yml
 
 And open WebUI to check if everything is OK
 
-**6. Redeploy**
+### 6. Redeploy
+
+After changing the code, you may need to redeploy, depending on what kind of code was changed.
+
+#### Python
+
+Python changes do not require a redeploy, but nnictl may need to be restarted.
+
+#### TypeScript
+
+* If `src/nni_manager` is changed, keep `yarn watch` running in that folder. It rebuilds the code automatically whenever a file changes.
+* If `src/webui` or `src/nasui` is changed, use **step 3** to rebuild code.
+
+The nnictl may need to be restarted.
 
-After the code changes, use **step 3** to rebuild your codes, then the changes will take effect immediately.
 
 ---
 Finally, we wish you a wonderful day.

diff --git a/src/nni_manager/package.json b/src/nni_manager/package.json
@@ -6,6 +6,7 @@
     "build": "tsc",
     "test": "nyc mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --exclude node_modules/**/**/*.test.ts --colors",
     "start": "node dist/main.js",
+    "watch": "tsc --watch",
     "eslint": "npx eslint ./ --ext .ts"
   },
   "license": "MIT",

diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
@@ -175,7 +175,7 @@ class RemoteMachineTrainingService implements TrainingService {
             if (trialJob.rmMeta === undefined) {
                 throw new Error(`rmMeta not set for submitted job ${trialJobId}`);
             }
-            const sshClient: Client | undefined  = this.trialSSHClientMap.get(trialJob.id);
+            const sshClient: Client | undefined = this.trialSSHClientMap.get(trialJob.id);
             if (sshClient === undefined) {
                 throw new Error(`Invalid job id: ${trialJobId}, cannot find ssh client`);
             }
@@ -324,7 +324,7 @@ class RemoteMachineTrainingService implements TrainingService {
                 }
                 // codeDir is not a valid directory, throw Error
                 if (!fs.lstatSync(remoteMachineTrailConfig.codeDir)
-                  .isDirectory()) {
+                    .isDirectory()) {
                     throw new Error(`codeDir ${remoteMachineTrailConfig.codeDir} is not a directory`);
                 }
 
@@ -442,7 +442,7 @@ class RemoteMachineTrainingService implements TrainingService {
 
         //Begin to execute gpu_metrics_collection scripts
         const script = getGpuMetricsCollectorBashScriptContent(remoteGpuScriptCollectorDir);
-        SSHClientUtility.remoteExeCommand(`bash -c '${script}'`, conn);
+        SSHClientUtility.remoteExeCommand(`bash -c '${script}'`, conn, true);
 
         const disposable: Rx.IDisposable = this.timer.subscribe(
             async (tick: number) => {
@@ -513,7 +513,7 @@ class RemoteMachineTrainingService implements TrainingService {
     }
 
     private async launchTrialOnScheduledMachine(trialJobId: string, trialWorkingFolder: string, form: TrialJobApplicationForm,
-                                                rmScheduleInfo: RemoteMachineScheduleInfo): Promise<void> {
+        rmScheduleInfo: RemoteMachineScheduleInfo): Promise<void> {
         if (this.trialConfig === undefined) {
             throw new Error('trial config is not initialized');
         }
@@ -588,7 +588,7 @@ class RemoteMachineTrainingService implements TrainingService {
         // Copy files in codeDir to remote working directory
         await SSHClientUtility.copyDirectoryToRemote(trialLocalTempFolder, trialWorkingFolder, sshClient, this.remoteOS);
         // Execute command in remote machine
-        SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(trialWorkingFolder, 'run.sh')}`, sshClient);
+        SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(trialWorkingFolder, 'run.sh')}`, sshClient, true);
     }
 
     private getRmMetaByHost(host: string): RemoteMachineMeta {
@@ -612,7 +612,7 @@ class RemoteMachineTrainingService implements TrainingService {
                 const trailReturnCode: string = await SSHClientUtility.getRemoteFileContent(trialReturnCodeFilePath, sshClient);
                 this.log.debug(`trailjob ${trialJob.id} return code: ${trailReturnCode}`);
                 const match: RegExpMatchArray | null = trailReturnCode.trim()
-                  .match(/^(\d+)\s+(\d+)$/);
+                    .match(/^(\d+)\s+(\d+)$/);
                 if (match !== null) {
                     const { 1: code, 2: timestamp } = match;
                     // Update trial job's status based on result code

diff --git a/src/nni_manager/training_service/remote_machine/sshClientUtility.ts b/src/nni_manager/training_service/remote_machine/sshClientUtility.ts
@@ -58,38 +58,50 @@ export namespace SSHClientUtility {
      * @param command the command to execute remotely
      * @param client SSH Client
      */
-    export function remoteExeCommand(command: string, client: Client): Promise<RemoteCommandResult> {
+    export function remoteExeCommand(command: string, client: Client, useShell: boolean = false): Promise<RemoteCommandResult> {
         const log: Logger = getLogger();
         log.debug(`remoteExeCommand: command: [${command}]`);
         const deferred: Deferred<RemoteCommandResult> = new Deferred<RemoteCommandResult>();
         let stdout: string = '';
         let stderr: string = '';
         let exitCode: number;
 
-        client.exec(command, (err: Error, channel: ClientChannel) => {
+        const callback = (err: Error, channel: ClientChannel): void => {
             if (err !== undefined && err !== null) {
                 log.error(`remoteExeCommand: ${err.message}`);
                 deferred.reject(err);
-
                 return;
             }
 
-            channel.on('data', (data: any, dataStderr: any) => {
-                if (dataStderr !== undefined && dataStderr !== null) {
-                    stderr += data.toString();
-                } else {
-                    stdout += data.toString();
-                }
-            })
-              .on('exit', (code: any, signal: any) => {
+            channel.on('data', (data: any) => {
+                stdout += data;
+            });
+            channel.on('exit', (code: any) => {
                 exitCode = <number>code;
+                log.debug(`remoteExeCommand exit(${exitCode})\nstdout: ${stdout}\nstderr: ${stderr}`);
                 deferred.resolve({
-                    stdout : stdout,
-                    stderr : stderr,
-                    exitCode : exitCode
+                    stdout: stdout,
+                    stderr: stderr,
+                    exitCode: exitCode
                 });
             });
-        });
+            channel.stderr.on('data', function (data) {
+                stderr += data;
+            });
+
+            if (useShell) {
+                channel.stdin.write(`${command}\n`);
+                channel.end("exit\n");
+            }
+
+            return;
+        };
+
+        if (useShell) {
+            client.shell(callback);
+        } else {
+            client.exec(command, callback);
+        }
 
         return deferred.promise;
     }
@@ -120,7 +132,7 @@ export namespace SSHClientUtility {
         sshClient.sftp((err: Error, sftp: SFTPWrapper) => {
             if (err !== undefined && err !== null) {
                 getLogger()
-                  .error(`getRemoteFileContent: ${err.message}`);
+                    .error(`getRemoteFileContent: ${err.message}`);
                 deferred.reject(new Error(`SFTP error: ${err.message}`));
 
                 return;
@@ -132,18 +144,18 @@ export namespace SSHClientUtility {
                 sftpStream.on('data', (data: Buffer | string) => {
                     dataBuffer += data;
                 })
-                  .on('error', (streamErr: Error) => {
-                    sftp.end();
-                    deferred.reject(new NNIError(NNIErrorNames.NOT_FOUND, streamErr.message));
-                })
-                  .on('end', () => {
-                    // sftp connection need to be released manually once operation is done
-                    sftp.end();
-                    deferred.resolve(dataBuffer);
-                });
+                    .on('error', (streamErr: Error) => {
+                        sftp.end();
+                        deferred.reject(new NNIError(NNIErrorNames.NOT_FOUND, streamErr.message));
+                    })
+                    .on('end', () => {
+                        // sftp connection need to be released manually once operation is done
+                        sftp.end();
+                        deferred.resolve(dataBuffer);
+                    });
             } catch (error) {
                 getLogger()
-                  .error(`getRemoteFileContent: ${error.message}`);
+                    .error(`getRemoteFileContent: ${error.message}`);
                 sftp.end();
                 deferred.reject(new Error(`SFTP error: ${error.message}`));
             }

diff --git a/src/webui/yarn.lock b/src/webui/yarn.lock
@@ -5593,7 +5593,7 @@ load-json-file@^4.0.0:
     pify "^3.0.0"
     strip-bom "^3.0.0"
 
-loader-fs-cache@>=1.0.3, loader-fs-cache@^1.0.0:
+loader-fs-cache@^1.0.0:
   version "1.0.3"
   resolved "https://registry.yarnpkg.com/loader-fs-cache/-/loader-fs-cache-1.0.3.tgz#f08657646d607078be2f0a032f8bd69dd6f277d9"
   integrity sha512-ldcgZpjNJj71n+2Mf6yetz+c9bM4xpKtNds4LbqXzU/PTdeAX0g3ytnU1AJMEcTk2Lex4Smpe3Q/eCTsvUBxbA==

diff --git a/tools/nni_gpu_tool/gpu_metrics_collector.py b/tools/nni_gpu_tool/gpu_metrics_collector.py
@@ -10,27 +10,31 @@
 
 from xml.dom import minidom
 
+
 def check_ready_to_run():
     if sys.platform == 'win32':
         pgrep_output = subprocess.check_output(
             'wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId')
         pidList = pgrep_output.decode("utf-8").strip().split()
-        pidList.pop(0) # remove the key word 'ProcessId'
+        pidList.pop(0)  # remove the key word 'ProcessId'
         pidList = list(map(int, pidList))
         pidList.remove(os.getpid())
         return not pidList
     else:
-        pgrep_output = subprocess.check_output('pgrep -fxu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
+        pgrep_output = subprocess.check_output('pgrep -afu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
         pidList = []
         for pid in pgrep_output.splitlines():
-            pidList.append(int(pid))
-        pidList.remove(os.getpid())
+            pid = pid.decode()
+            if "pgrep " in pid or pid.startswith('%s ' % os.getpid()) or pid.startswith('%s ' % os.getppid()):
+                continue
+            pidList.append(pid)
         return not pidList
 
+
 def main(argv):
     metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
     if check_ready_to_run() == False:
-        # GPU metrics collector is already running. Exit
+        print("GPU metrics collector is already running. exiting...")
         exit(2)
     cmd = 'nvidia-smi -q -x'.split()
     while(True):
@@ -44,6 +48,7 @@ def main(argv):
         # TODO: change to sleep time configurable via arguments
         time.sleep(5)
 
+
 def parse_nvidia_smi_result(smi, outputDir):
     try:
         old_umask = os.umask(0)
@@ -70,13 +75,14 @@ def parse_nvidia_smi_result(smi, outputDir):
                 outPut["gpuInfos"].append(gpuInfo)
             print(outPut)
             outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
-            outputFile.flush();
-    except:
+            outputFile.flush()
+    except Exception as error:
         # e_info = sys.exc_info()
-        print('xmldoc paring error')
+        print('gpu_metrics_collector error: %s' % error)
     finally:
         os.umask(old_umask)
 
+
 def gen_empty_gpu_metric(outputDir):
     try:
         old_umask = os.umask(0)