diff --git a/Makefile b/Makefile index 6935e6c67f..bba531ebda 100644 --- a/Makefile +++ b/Makefile @@ -173,12 +173,12 @@ install-python-modules: dev-install-python-modules: #$(_INFO) Installing Python SDK $(_END) mkdir -p build - ln -sf ../src/sdk/pynni/nni build/nni - ln -sf ../src/sdk/pynni/nnicli build/nnicli - ln -sf ../tools/nni_annotation build/nni_annotation - ln -sf ../tools/nni_cmd build/nni_cmd - ln -sf ../tools/nni_trial_tool build/nni_trial_tool - ln -sf ../tools/nni_gpu_tool build/nni_gpu_tool + ln -sf ../src/sdk/pynni/nni build + ln -sf ../src/sdk/pycli/nnicli build + ln -sf ../tools/nni_annotation build + ln -sf ../tools/nni_cmd build + ln -sf ../tools/nni_trial_tool build + ln -sf ../tools/nni_gpu_tool build cp setup.py build/ cp README.md build/ sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' build/setup.py @@ -209,10 +209,12 @@ dev-install-node-modules: ln -sf ${PWD}/src/nni_manager/dist $(NNI_PKG_FOLDER) cp src/nni_manager/package.json $(NNI_PKG_FOLDER) sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' $(NNI_PKG_FOLDER)/package.json - ln -sf ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER)/node_modules - ln -sf ${PWD}/src/webui/build $(NNI_PKG_FOLDER)/static - ln -sf ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER)/build - ln -sf ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER)/server.js + ln -sf ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER) + ln -sf ${PWD}/src/webui/build -t $(NNI_PKG_FOLDER) + mv $(NNI_PKG_FOLDER)/build $(NNI_PKG_FOLDER)/static + mkdir -p $(NASUI_PKG_FOLDER) + ln -sf ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER) + ln -sf ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER) .PHONY: install-scripts install-scripts: diff --git a/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md b/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md index a01d8cfe49..0cf9932530 100644 --- a/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md +++ b/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md @@ -6,7 +6,7 @@ For debugging NNI 
source code, your development environment should be under Ubuntu 16.04 (or above) system with python 3 and pip 3 installed, then follow the below steps. -**1. Clone the source code** +### 1. Clone the source code Run the command @@ -16,7 +16,7 @@ git clone https://github.com/Microsoft/nni.git to clone the source code -**2. Prepare the debug environment and install dependencies** +### 2. Prepare the debug environment and install dependencies Change directory to the source code folder, then run the command @@ -26,7 +26,7 @@ make install-dependencies to install the dependent tools for the environment -**3. Build source code** +### 3. Build source code Run the command @@ -36,7 +36,7 @@ make build to build the source code -**4. Install NNI to development environment** +### 4. Install NNI to development environment Run the command @@ -46,7 +46,7 @@ make dev-install to install the distribution content to development environment, and create cli scripts -**5. Check if the environment is ready** +### 5. Check if the environment is ready Now, you can try to start an experiment to check if your environment is ready. For example, run the command @@ -57,9 +57,21 @@ nnictl create --config ~/nni/examples/trials/mnist-tfv1/config.yml And open WebUI to check if everything is OK -**6. Redeploy** +### 6. Redeploy + +After changing the code, you may need to redeploy, depending on what kind of code was changed. + +#### Python + +Python code does not need to be redeployed, but nnictl may need to be restarted. + +#### TypeScript + +* If `src/nni_manager` is changed, run `yarn watch` in that folder and keep it running. It will rebuild the code instantly. +* If `src/webui` or `src/nasui` is changed, use **step 3** to rebuild the code. + +nnictl may need to be restarted. -After the code changes, use **step 3** to rebuild your codes, then the changes will take effect immediately. --- At last, wish you have a wonderful day. 
diff --git a/src/nni_manager/package.json b/src/nni_manager/package.json index d4b16c276c..3e89f675b2 100644 --- a/src/nni_manager/package.json +++ b/src/nni_manager/package.json @@ -6,6 +6,7 @@ "build": "tsc", "test": "nyc mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --exclude node_modules/**/**/*.test.ts --colors", "start": "node dist/main.js", + "watch": "tsc --watch", "eslint": "npx eslint ./ --ext .ts" }, "license": "MIT", diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index e88ae8aed2..d5a2bda6cd 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -175,7 +175,7 @@ class RemoteMachineTrainingService implements TrainingService { if (trialJob.rmMeta === undefined) { throw new Error(`rmMeta not set for submitted job ${trialJobId}`); } - const sshClient: Client | undefined = this.trialSSHClientMap.get(trialJob.id); + const sshClient: Client | undefined = this.trialSSHClientMap.get(trialJob.id); if (sshClient === undefined) { throw new Error(`Invalid job id: ${trialJobId}, cannot find ssh client`); } @@ -324,7 +324,7 @@ class RemoteMachineTrainingService implements TrainingService { } // codeDir is not a valid directory, throw Error if (!fs.lstatSync(remoteMachineTrailConfig.codeDir) - .isDirectory()) { + .isDirectory()) { throw new Error(`codeDir ${remoteMachineTrailConfig.codeDir} is not a directory`); } @@ -442,7 +442,7 @@ class RemoteMachineTrainingService implements TrainingService { //Begin to execute gpu_metrics_collection scripts const script = getGpuMetricsCollectorBashScriptContent(remoteGpuScriptCollectorDir); - SSHClientUtility.remoteExeCommand(`bash -c '${script}'`, conn); + SSHClientUtility.remoteExeCommand(`bash -c '${script}'`, conn, true); const disposable: Rx.IDisposable = 
this.timer.subscribe( async (tick: number) => { @@ -513,7 +513,7 @@ class RemoteMachineTrainingService implements TrainingService { } private async launchTrialOnScheduledMachine(trialJobId: string, trialWorkingFolder: string, form: TrialJobApplicationForm, - rmScheduleInfo: RemoteMachineScheduleInfo): Promise { + rmScheduleInfo: RemoteMachineScheduleInfo): Promise { if (this.trialConfig === undefined) { throw new Error('trial config is not initialized'); } @@ -588,7 +588,7 @@ class RemoteMachineTrainingService implements TrainingService { // Copy files in codeDir to remote working directory await SSHClientUtility.copyDirectoryToRemote(trialLocalTempFolder, trialWorkingFolder, sshClient, this.remoteOS); // Execute command in remote machine - SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(trialWorkingFolder, 'run.sh')}`, sshClient); + SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(trialWorkingFolder, 'run.sh')}`, sshClient, true); } private getRmMetaByHost(host: string): RemoteMachineMeta { @@ -612,7 +612,7 @@ class RemoteMachineTrainingService implements TrainingService { const trailReturnCode: string = await SSHClientUtility.getRemoteFileContent(trialReturnCodeFilePath, sshClient); this.log.debug(`trailjob ${trialJob.id} return code: ${trailReturnCode}`); const match: RegExpMatchArray | null = trailReturnCode.trim() - .match(/^(\d+)\s+(\d+)$/); + .match(/^(\d+)\s+(\d+)$/); if (match !== null) { const { 1: code, 2: timestamp } = match; // Update trial job's status based on result code diff --git a/src/nni_manager/training_service/remote_machine/sshClientUtility.ts b/src/nni_manager/training_service/remote_machine/sshClientUtility.ts index d261ff46c4..e3c96806dd 100644 --- a/src/nni_manager/training_service/remote_machine/sshClientUtility.ts +++ b/src/nni_manager/training_service/remote_machine/sshClientUtility.ts @@ -58,7 +58,7 @@ export namespace SSHClientUtility { * @param command the command to execute remotely * @param client SSH Client */ - 
export function remoteExeCommand(command: string, client: Client): Promise { + export function remoteExeCommand(command: string, client: Client, useShell: boolean = false): Promise { const log: Logger = getLogger(); log.debug(`remoteExeCommand: command: [${command}]`); const deferred: Deferred = new Deferred(); @@ -66,30 +66,43 @@ export namespace SSHClientUtility { let stderr: string = ''; let exitCode: number; - client.exec(command, (err: Error, channel: ClientChannel) => { + const callback = (err: Error, channel: ClientChannel): void => { if (err !== undefined && err !== null) { log.error(`remoteExeCommand: ${err.message}`); deferred.reject(err); return; } - channel.on('data', (data: any, dataStderr: any) => { - if (dataStderr !== undefined && dataStderr !== null) { - stderr += data.toString(); - } else { - stdout += data.toString(); - } - }) - .on('exit', (code: any, signal: any) => { + channel.on('data', (data: any) => { + stdout += data; + }); + channel.on('exit', (code: any) => { exitCode = code; + log.debug(`remoteExeCommand exit(${exitCode})\nstdout: ${stdout}\nstderr: ${stderr}`); deferred.resolve({ - stdout : stdout, - stderr : stderr, - exitCode : exitCode + stdout: stdout, + stderr: stderr, + exitCode: exitCode }); }); - }); + channel.stderr.on('data', function (data) { + stderr += data; + }); + + if (useShell) { + channel.stdin.write(`${command}\n`); + channel.end("exit\n"); + } + + return; + }; + + if (useShell) { + client.shell(callback); + } else { + client.exec(command, callback); + } return deferred.promise; } @@ -120,7 +133,7 @@ export namespace SSHClientUtility { sshClient.sftp((err: Error, sftp: SFTPWrapper) => { if (err !== undefined && err !== null) { getLogger() - .error(`getRemoteFileContent: ${err.message}`); + .error(`getRemoteFileContent: ${err.message}`); deferred.reject(new Error(`SFTP error: ${err.message}`)); return; @@ -132,18 +145,18 @@ export namespace SSHClientUtility { sftpStream.on('data', (data: Buffer | string) => { 
dataBuffer += data; }) - .on('error', (streamErr: Error) => { - sftp.end(); - deferred.reject(new NNIError(NNIErrorNames.NOT_FOUND, streamErr.message)); - }) - .on('end', () => { - // sftp connection need to be released manually once operation is done - sftp.end(); - deferred.resolve(dataBuffer); - }); + .on('error', (streamErr: Error) => { + sftp.end(); + deferred.reject(new NNIError(NNIErrorNames.NOT_FOUND, streamErr.message)); + }) + .on('end', () => { + // sftp connection need to be released manually once operation is done + sftp.end(); + deferred.resolve(dataBuffer); + }); } catch (error) { getLogger() - .error(`getRemoteFileContent: ${error.message}`); + .error(`getRemoteFileContent: ${error.message}`); sftp.end(); deferred.reject(new Error(`SFTP error: ${error.message}`)); } diff --git a/src/webui/yarn.lock b/src/webui/yarn.lock index 35e16a6891..3053a14da9 100644 --- a/src/webui/yarn.lock +++ b/src/webui/yarn.lock @@ -5593,7 +5593,7 @@ load-json-file@^4.0.0: pify "^3.0.0" strip-bom "^3.0.0" -loader-fs-cache@>=1.0.3, loader-fs-cache@^1.0.0: +loader-fs-cache@^1.0.0: version "1.0.3" resolved "https://registry.yarnpkg.com/loader-fs-cache/-/loader-fs-cache-1.0.3.tgz#f08657646d607078be2f0a032f8bd69dd6f277d9" integrity sha512-ldcgZpjNJj71n+2Mf6yetz+c9bM4xpKtNds4LbqXzU/PTdeAX0g3ytnU1AJMEcTk2Lex4Smpe3Q/eCTsvUBxbA== diff --git a/tools/nni_gpu_tool/gpu_metrics_collector.py b/tools/nni_gpu_tool/gpu_metrics_collector.py index 10f672946f..6b2a9e1351 100644 --- a/tools/nni_gpu_tool/gpu_metrics_collector.py +++ b/tools/nni_gpu_tool/gpu_metrics_collector.py @@ -10,27 +10,31 @@ from xml.dom import minidom + def check_ready_to_run(): if sys.platform == 'win32': pgrep_output = subprocess.check_output( 'wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId') pidList = pgrep_output.decode("utf-8").strip().split() - pidList.pop(0) # remove the key word 'ProcessId' + pidList.pop(0) # remove the key word 'ProcessId' 
pidList = list(map(int, pidList)) pidList.remove(os.getpid()) return not pidList else: - pgrep_output = subprocess.check_output('pgrep -fxu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True) + pgrep_output = subprocess.check_output('pgrep -afu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True) pidList = [] for pid in pgrep_output.splitlines(): - pidList.append(int(pid)) - pidList.remove(os.getpid()) + pid = pid.decode() + if "pgrep " in pid or pid.startswith('%s ' % os.getpid()) or pid.startswith('%s ' % os.getppid()): + continue + pidList.append(pid) return not pidList + def main(argv): metrics_output_dir = os.environ['METRIC_OUTPUT_DIR'] if check_ready_to_run() == False: - # GPU metrics collector is already running. Exit + print("GPU metrics collector is already running. exiting...") exit(2) cmd = 'nvidia-smi -q -x'.split() while(True): @@ -44,6 +48,7 @@ def main(argv): # TODO: change to sleep time configurable via arguments time.sleep(5) + def parse_nvidia_smi_result(smi, outputDir): try: old_umask = os.umask(0) @@ -70,13 +75,14 @@ def parse_nvidia_smi_result(smi, outputDir): outPut["gpuInfos"].append(gpuInfo) print(outPut) outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True))) - outputFile.flush(); - except: + outputFile.flush() + except Exception as error: # e_info = sys.exc_info() - print('xmldoc paring error') + print('gpu_metrics_collector error: %s' % error) finally: os.umask(old_umask) + def gen_empty_gpu_metric(outputDir): try: old_umask = os.umask(0)