Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
fix #1578 and some improvements (#2370)
Browse files Browse the repository at this point in the history
Add shell support for SSH connections, so that remote scripts can be started with the user's environment.

Minor fixes,

1. Fix gpu_metrics_collector to support pyenv. pyenv creates one more process, so the original pgrep code always found an extra process and could not start gpu_metrics_collector.
2. Fix NASUI failure on dev-install-node-modules by creating the subfolder every time.
3. Fix MakeFile to reduce mis-created links, and other minor issues.
4. Add node --watch for nni_manager for better dev experience.
  • Loading branch information
squirrelsc authored Apr 26, 2020
1 parent 1d893dd commit 1c6f1ef
Show file tree
Hide file tree
Showing 7 changed files with 91 additions and 58 deletions.
22 changes: 12 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -173,12 +173,12 @@ install-python-modules:
dev-install-python-modules:
#$(_INFO) Installing Python SDK $(_END)
mkdir -p build
ln -sf ../src/sdk/pynni/nni build/nni
ln -sf ../src/sdk/pynni/nnicli build/nnicli
ln -sf ../tools/nni_annotation build/nni_annotation
ln -sf ../tools/nni_cmd build/nni_cmd
ln -sf ../tools/nni_trial_tool build/nni_trial_tool
ln -sf ../tools/nni_gpu_tool build/nni_gpu_tool
ln -sf ../src/sdk/pynni/nni build
ln -sf ../src/sdk/pycli/nnicli build
ln -sf ../tools/nni_annotation build
ln -sf ../tools/nni_cmd build
ln -sf ../tools/nni_trial_tool build
ln -sf ../tools/nni_gpu_tool build
cp setup.py build/
cp README.md build/
sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' build/setup.py
Expand Down Expand Up @@ -209,10 +209,12 @@ dev-install-node-modules:
ln -sf ${PWD}/src/nni_manager/dist $(NNI_PKG_FOLDER)
cp src/nni_manager/package.json $(NNI_PKG_FOLDER)
sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' $(NNI_PKG_FOLDER)/package.json
ln -sf ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER)/node_modules
ln -sf ${PWD}/src/webui/build $(NNI_PKG_FOLDER)/static
ln -sf ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER)/build
ln -sf ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER)/server.js
ln -sf ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER)
ln -sf ${PWD}/src/webui/build -t $(NNI_PKG_FOLDER)
mv $(NNI_PKG_FOLDER)/build $(NNI_PKG_FOLDER)/static
mkdir -p $(NASUI_PKG_FOLDER)
ln -sf ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER)
ln -sf ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER)

.PHONY: install-scripts
install-scripts:
Expand Down
26 changes: 19 additions & 7 deletions docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

For debugging NNI source code, your development environment should be under Ubuntu 16.04 (or above) system with python 3 and pip 3 installed, then follow the below steps.

**1. Clone the source code**
### 1. Clone the source code

Run the command

Expand All @@ -16,7 +16,7 @@ git clone https://github.com/Microsoft/nni.git

to clone the source code

**2. Prepare the debug environment and install dependencies**
### 2. Prepare the debug environment and install dependencies

Change directory to the source code folder, then run the command

Expand All @@ -26,7 +26,7 @@ make install-dependencies

to install the dependent tools for the environment

**3. Build source code**
### 3. Build source code

Run the command

Expand All @@ -36,7 +36,7 @@ make build

to build the source code

**4. Install NNI to development environment**
### 4. Install NNI to development environment

Run the command

Expand All @@ -46,7 +46,7 @@ make dev-install

to install the distribution content to development environment, and create cli scripts

**5. Check if the environment is ready**
### 5. Check if the environment is ready

Now, you can try to start an experiment to check if your environment is ready.
For example, run the command
Expand All @@ -57,9 +57,21 @@ nnictl create --config ~/nni/examples/trials/mnist-tfv1/config.yml

And open WebUI to check if everything is OK

**6. Redeploy**
### 6. Redeploy

After changing the code, you may need to redeploy. Whether you do depends on what kind of code was changed.

#### Python

Python code doesn't need to be redeployed, but nnictl may need to be restarted.

#### TypeScript

* If `src/nni_manager` is changed, keep `yarn watch` running under this folder. It will rebuild the code instantly.
* If `src/webui` or `src/nasui` is changed, use **step 3** to rebuild code.

The nnictl may need to be restarted.

After the code changes, use **step 3** to rebuild your codes, then the changes will take effect immediately.

---
Finally, we wish you a wonderful day.
Expand Down
1 change: 1 addition & 0 deletions src/nni_manager/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"build": "tsc",
"test": "nyc mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --exclude node_modules/**/**/*.test.ts --colors",
"start": "node dist/main.js",
"watch": "tsc --watch",
"eslint": "npx eslint ./ --ext .ts"
},
"license": "MIT",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ class RemoteMachineTrainingService implements TrainingService {
if (trialJob.rmMeta === undefined) {
throw new Error(`rmMeta not set for submitted job ${trialJobId}`);
}
const sshClient: Client | undefined = this.trialSSHClientMap.get(trialJob.id);
const sshClient: Client | undefined = this.trialSSHClientMap.get(trialJob.id);
if (sshClient === undefined) {
throw new Error(`Invalid job id: ${trialJobId}, cannot find ssh client`);
}
Expand Down Expand Up @@ -324,7 +324,7 @@ class RemoteMachineTrainingService implements TrainingService {
}
// codeDir is not a valid directory, throw Error
if (!fs.lstatSync(remoteMachineTrailConfig.codeDir)
.isDirectory()) {
.isDirectory()) {
throw new Error(`codeDir ${remoteMachineTrailConfig.codeDir} is not a directory`);
}

Expand Down Expand Up @@ -442,7 +442,7 @@ class RemoteMachineTrainingService implements TrainingService {

//Begin to execute gpu_metrics_collection scripts
const script = getGpuMetricsCollectorBashScriptContent(remoteGpuScriptCollectorDir);
SSHClientUtility.remoteExeCommand(`bash -c '${script}'`, conn);
SSHClientUtility.remoteExeCommand(`bash -c '${script}'`, conn, true);

const disposable: Rx.IDisposable = this.timer.subscribe(
async (tick: number) => {
Expand Down Expand Up @@ -513,7 +513,7 @@ class RemoteMachineTrainingService implements TrainingService {
}

private async launchTrialOnScheduledMachine(trialJobId: string, trialWorkingFolder: string, form: TrialJobApplicationForm,
rmScheduleInfo: RemoteMachineScheduleInfo): Promise<void> {
rmScheduleInfo: RemoteMachineScheduleInfo): Promise<void> {
if (this.trialConfig === undefined) {
throw new Error('trial config is not initialized');
}
Expand Down Expand Up @@ -588,7 +588,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Copy files in codeDir to remote working directory
await SSHClientUtility.copyDirectoryToRemote(trialLocalTempFolder, trialWorkingFolder, sshClient, this.remoteOS);
// Execute command in remote machine
SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(trialWorkingFolder, 'run.sh')}`, sshClient);
SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(trialWorkingFolder, 'run.sh')}`, sshClient, true);
}

private getRmMetaByHost(host: string): RemoteMachineMeta {
Expand All @@ -612,7 +612,7 @@ class RemoteMachineTrainingService implements TrainingService {
const trailReturnCode: string = await SSHClientUtility.getRemoteFileContent(trialReturnCodeFilePath, sshClient);
this.log.debug(`trailjob ${trialJob.id} return code: ${trailReturnCode}`);
const match: RegExpMatchArray | null = trailReturnCode.trim()
.match(/^(\d+)\s+(\d+)$/);
.match(/^(\d+)\s+(\d+)$/);
if (match !== null) {
const { 1: code, 2: timestamp } = match;
// Update trial job's status based on result code
Expand Down
64 changes: 38 additions & 26 deletions src/nni_manager/training_service/remote_machine/sshClientUtility.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,38 +58,50 @@ export namespace SSHClientUtility {
* @param command the command to execute remotely
* @param client SSH Client
*/
export function remoteExeCommand(command: string, client: Client): Promise<RemoteCommandResult> {
export function remoteExeCommand(command: string, client: Client, useShell: boolean = false): Promise<RemoteCommandResult> {
const log: Logger = getLogger();
log.debug(`remoteExeCommand: command: [${command}]`);
const deferred: Deferred<RemoteCommandResult> = new Deferred<RemoteCommandResult>();
let stdout: string = '';
let stderr: string = '';
let exitCode: number;

client.exec(command, (err: Error, channel: ClientChannel) => {
const callback = (err: Error, channel: ClientChannel): void => {
if (err !== undefined && err !== null) {
log.error(`remoteExeCommand: ${err.message}`);
deferred.reject(err);

return;
}

channel.on('data', (data: any, dataStderr: any) => {
if (dataStderr !== undefined && dataStderr !== null) {
stderr += data.toString();
} else {
stdout += data.toString();
}
})
.on('exit', (code: any, signal: any) => {
channel.on('data', (data: any) => {
stdout += data;
});
channel.on('exit', (code: any) => {
exitCode = <number>code;
log.debug(`remoteExeCommand exit(${exitCode})\nstdout: ${stdout}\nstderr: ${stderr}`);
deferred.resolve({
stdout : stdout,
stderr : stderr,
exitCode : exitCode
stdout: stdout,
stderr: stderr,
exitCode: exitCode
});
});
});
channel.stderr.on('data', function (data) {
stderr += data;
});

if (useShell) {
channel.stdin.write(`${command}\n`);
channel.end("exit\n");
}

return;
};

if (useShell) {
client.shell(callback);
} else {
client.exec(command, callback);
}

return deferred.promise;
}
Expand Down Expand Up @@ -120,7 +132,7 @@ export namespace SSHClientUtility {
sshClient.sftp((err: Error, sftp: SFTPWrapper) => {
if (err !== undefined && err !== null) {
getLogger()
.error(`getRemoteFileContent: ${err.message}`);
.error(`getRemoteFileContent: ${err.message}`);
deferred.reject(new Error(`SFTP error: ${err.message}`));

return;
Expand All @@ -132,18 +144,18 @@ export namespace SSHClientUtility {
sftpStream.on('data', (data: Buffer | string) => {
dataBuffer += data;
})
.on('error', (streamErr: Error) => {
sftp.end();
deferred.reject(new NNIError(NNIErrorNames.NOT_FOUND, streamErr.message));
})
.on('end', () => {
// sftp connection need to be released manually once operation is done
sftp.end();
deferred.resolve(dataBuffer);
});
.on('error', (streamErr: Error) => {
sftp.end();
deferred.reject(new NNIError(NNIErrorNames.NOT_FOUND, streamErr.message));
})
.on('end', () => {
// sftp connection need to be released manually once operation is done
sftp.end();
deferred.resolve(dataBuffer);
});
} catch (error) {
getLogger()
.error(`getRemoteFileContent: ${error.message}`);
.error(`getRemoteFileContent: ${error.message}`);
sftp.end();
deferred.reject(new Error(`SFTP error: ${error.message}`));
}
Expand Down
2 changes: 1 addition & 1 deletion src/webui/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -5593,7 +5593,7 @@ load-json-file@^4.0.0:
pify "^3.0.0"
strip-bom "^3.0.0"

loader-fs-cache@>=1.0.3, loader-fs-cache@^1.0.0:
loader-fs-cache@^1.0.0:
version "1.0.3"
resolved "https://registry.yarnpkg.com/loader-fs-cache/-/loader-fs-cache-1.0.3.tgz#f08657646d607078be2f0a032f8bd69dd6f277d9"
integrity sha512-ldcgZpjNJj71n+2Mf6yetz+c9bM4xpKtNds4LbqXzU/PTdeAX0g3ytnU1AJMEcTk2Lex4Smpe3Q/eCTsvUBxbA==
Expand Down
22 changes: 14 additions & 8 deletions tools/nni_gpu_tool/gpu_metrics_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,31 @@

from xml.dom import minidom


def check_ready_to_run():
if sys.platform == 'win32':
pgrep_output = subprocess.check_output(
'wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId')
pidList = pgrep_output.decode("utf-8").strip().split()
pidList.pop(0) # remove the key word 'ProcessId'
pidList.pop(0) # remove the key word 'ProcessId'
pidList = list(map(int, pidList))
pidList.remove(os.getpid())
return not pidList
else:
pgrep_output = subprocess.check_output('pgrep -fxu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pgrep_output = subprocess.check_output('pgrep -afu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = []
for pid in pgrep_output.splitlines():
pidList.append(int(pid))
pidList.remove(os.getpid())
pid = pid.decode()
if "pgrep " in pid or pid.startswith('%s ' % os.getpid()) or pid.startswith('%s ' % os.getppid()):
continue
pidList.append(pid)
return not pidList


def main(argv):
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
if check_ready_to_run() == False:
# GPU metrics collector is already running. Exit
print("GPU metrics collector is already running. exiting...")
exit(2)
cmd = 'nvidia-smi -q -x'.split()
while(True):
Expand All @@ -44,6 +48,7 @@ def main(argv):
# TODO: change to sleep time configurable via arguments
time.sleep(5)


def parse_nvidia_smi_result(smi, outputDir):
try:
old_umask = os.umask(0)
Expand All @@ -70,13 +75,14 @@ def parse_nvidia_smi_result(smi, outputDir):
outPut["gpuInfos"].append(gpuInfo)
print(outPut)
outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
outputFile.flush();
except:
outputFile.flush()
except Exception as error:
# e_info = sys.exc_info()
print('xmldoc paring error')
print('gpu_metrics_collector error: %s' % error)
finally:
os.umask(old_umask)


def gen_empty_gpu_metric(outputDir):
try:
old_umask = os.umask(0)
Expand Down

0 comments on commit 1c6f1ef

Please sign in to comment.