From b336e187872f53d3a0f0281f9e93ee6805dc9be7 Mon Sep 17 00:00:00 2001 From: J-shang Date: Thu, 28 Jan 2021 08:34:46 +0000 Subject: [PATCH 01/20] nfs for pai --- .../common/trialConfigMetadataKey.ts | 3 +- .../training_service/reusable/environment.ts | 1 + .../environments/amlEnvironmentService.ts | 4 + .../environments/localEnvironmentService.ts | 4 + .../environments/openPaiEnvironmentService.ts | 15 ++- .../environments/remoteEnvironmentService.ts | 4 + .../reusable/sharedStorage.ts | 22 ++++ .../shared_storages/nfsStorageService.ts | 104 ++++++++++++++++++ .../reusable/test/utEnvironmentService.ts | 5 + .../reusable/trialDispatcher.ts | 56 +++++++++- 10 files changed, 209 insertions(+), 9 deletions(-) create mode 100644 ts/nni_manager/training_service/reusable/sharedStorage.ts create mode 100644 ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts diff --git a/ts/nni_manager/training_service/common/trialConfigMetadataKey.ts b/ts/nni_manager/training_service/common/trialConfigMetadataKey.ts index a080b80ad0..02f342b55a 100644 --- a/ts/nni_manager/training_service/common/trialConfigMetadataKey.ts +++ b/ts/nni_manager/training_service/common/trialConfigMetadataKey.ts @@ -26,5 +26,6 @@ export enum TrialConfigMetadataKey { LOG_COLLECTION = 'log_collection', // Used to set platform for hybrid in reuse mode, // temproarily change and will refactor config schema in the future - PLATFORM_LIST = 'platform_list' + PLATFORM_LIST = 'platform_list', + SHARED_STORAGE_CONFIG = 'shared_storage_config' } diff --git a/ts/nni_manager/training_service/reusable/environment.ts b/ts/nni_manager/training_service/reusable/environment.ts index 3f021676db..4e06275c49 100644 --- a/ts/nni_manager/training_service/reusable/environment.ts +++ b/ts/nni_manager/training_service/reusable/environment.ts @@ -125,6 +125,7 @@ export class EnvironmentInformation { export abstract class EnvironmentService { public abstract get hasStorageService(): boolean; + public abstract 
get useSharedStorage(): boolean; public abstract config(key: string, value: string): Promise; public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise; public abstract stopEnvironment(environment: EnvironmentInformation): Promise; diff --git a/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index 6a59b81c0e..0aa44b0f1f 100644 --- a/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -40,6 +40,10 @@ export class AMLEnvironmentService extends EnvironmentService { return false; } + public get useSharedStorage(): boolean { + return false; + } + public initCommandChannel(eventEmitter: EventEmitter): void { this.commandChannel = new AMLCommandChannel(eventEmitter); } diff --git a/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts index ad48ae4df9..45961acb60 100644 --- a/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts @@ -37,6 +37,10 @@ export class LocalEnvironmentService extends EnvironmentService { return false; } + public get useSharedStorage(): boolean { + return false; + } + public get getName(): string { return 'local'; } diff --git a/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index c41a18cdb5..bc168a9c4c 100644 --- a/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -14,6 +14,7 @@ import { TrialConfigMetadataKey } from 
'../../common/trialConfigMetadataKey'; import { PAIClusterConfig } from '../../pai/paiConfig'; import { NNIPAITrialConfig } from '../../pai/paiConfig'; import { EnvironmentInformation, EnvironmentService } from '../environment'; +import { SharedStorageService } from '../sharedStorage'; import { StorageService } from '../storageService'; @@ -45,6 +46,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService { return true; } + public get useSharedStorage(): boolean { + return true; + } + public get getName(): string { return 'pai'; } @@ -178,9 +183,15 @@ export class OpenPaiEnvironmentService extends EnvironmentService { } // Step 1. Prepare PAI job configuration - const environmentRoot = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}`; + let environmentRoot: string; + if (this.useSharedStorage) { + environmentRoot = component.get(SharedStorageService).remoteWorkingRoot; + environment.command = `${component.get(SharedStorageService).remoteMountCommand} && cd ${environmentRoot} && ${environment.command}`; + } else { + environmentRoot = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}`; + environment.command = `cd ${environmentRoot} && ${environment.command}`; + } environment.runnerWorkingFolder = `${environmentRoot}/envs/${environment.id}`; - environment.command = `cd ${environmentRoot} && ${environment.command}`; environment.trackingUrl = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${environment.envId}`; environment.useActiveGpu = this.paiClusterConfig.useActiveGpu; environment.maxTrialNumberPerGpu = this.paiClusterConfig.maxTrialNumPerGpu; diff --git a/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts index bc52a11565..763ae31281 100644 --- a/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts +++ 
b/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts @@ -63,6 +63,10 @@ export class RemoteEnvironmentService extends EnvironmentService { return false; } + public get useSharedStorage(): boolean { + return false; + } + public get getName(): string { return 'remote'; } diff --git a/ts/nni_manager/training_service/reusable/sharedStorage.ts b/ts/nni_manager/training_service/reusable/sharedStorage.ts new file mode 100644 index 0000000000..3565ee53f0 --- /dev/null +++ b/ts/nni_manager/training_service/reusable/sharedStorage.ts @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +'use strict'; + +import { StorageService } from './storageService' + +export type SharedStorageType = 'NFS' + +export interface SharedStorageConfig { + readonly storageType: SharedStorageType; + readonly localMountPoint: string; + readonly remoteMountPoint: string; +} + +export abstract class SharedStorageService { + public abstract config(key: string, value: string): Promise; + public abstract get storageService(): StorageService; + public abstract get localMountCommand(): string; + public abstract get remoteMountCommand(): string; + public abstract get remoteWorkingRoot(): string; +} diff --git a/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts new file mode 100644 index 0000000000..20812fd3d2 --- /dev/null +++ b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts @@ -0,0 +1,104 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +'use strict'; + +import * as cpp from 'child-process-promise'; +import * as path from 'path'; + +import { SharedStorageService, SharedStorageConfig, SharedStorageType } from '../sharedStorage' +import { MountedStorageService } from '../storages/mountedStorageService'; +import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; +import { getLogger, Logger } from '../../../common/log'; +import { getExperimentId } from '../../../common/experimentStartupInfo'; + +class NFSSharedStorageConfig implements SharedStorageConfig { + public storageType: SharedStorageType; + public localMountPoint: string; + public remoteMountPoint: string; + + public nfsServer: string; + public exportedDirectory: string; + public userMounted: boolean; + + constructor(storageType: SharedStorageType, localMountPoint: string, remoteMountPoint: string, + nfsServer: string, exportedDirectory: string, userMounted: boolean) { + this.storageType = storageType; + this.localMountPoint = localMountPoint; + this.remoteMountPoint = remoteMountPoint; + this.nfsServer = nfsServer; + this.exportedDirectory = exportedDirectory; + this.userMounted = userMounted; + } +} + +export class NFSSharedStorageService extends SharedStorageService { + private log: Logger; + private internalStorageService: MountedStorageService; + private experimentId: string; + + private storageType?: SharedStorageType; + private nfsServer?: string; + private exportedDirectory?: string; + + private localMountPoint?: string; + private remoteMountPoint?: string; + + constructor() { + super(); + this.log = getLogger(); + this.internalStorageService = new MountedStorageService(); + this.experimentId = getExperimentId(); + } + + public async config(key: string, value: string): Promise { + if (key === TrialConfigMetadataKey.SHARED_STORAGE_CONFIG) { + const nfsConfig = JSON.parse(value); + this.localMountPoint = nfsConfig.localMountPoint; + this.remoteMountPoint = nfsConfig.remoteMountPoint; + + this.storageType = 
nfsConfig.storageType; + this.nfsServer = nfsConfig.nfsServer; + this.exportedDirectory = nfsConfig.exportedDirectory; + if ( nfsConfig.userMounted === false ) { + await this.helpLocalMount(); + } + + this.internalStorageService.initialize(this.localMountPoint, path.join(this.localMountPoint, 'nni', this.experimentId)); + } + } + + public get storageService(): MountedStorageService { + return this.internalStorageService; + } + + public get localMountCommand(): string { + return `mkdir -p ${this.localMountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${this.localMountPoint}`; + } + + public get remoteMountCommand(): string { + return `mkdir -p ${this.remoteMountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${this.remoteMountPoint}`; + } + + public get remoteWorkingRoot(): string { + return `${this.remoteMountPoint}/nni/${this.experimentId}`; + } + + private async helpLocalMount(): Promise { + if (process.platform === 'win32') { + const errorMessage = `${this.storageType} Shared Storage: NNI not support auto mount ${this.storageType} under Windows yet.`; + this.log.error(errorMessage); + return Promise.reject(errorMessage); + } + + try { + await cpp.exec(this.localMountCommand); + } catch (error) { + const errorMessage: string = `${this.storageType} Shared Storage: Mount ${this.nfsServer}:${this.exportedDirectory} to ${this.localMountPoint} failed, error is ${error}`; + this.log.error(errorMessage); + return Promise.reject(errorMessage); + } + + return Promise.resolve(); + } +} diff --git a/ts/nni_manager/training_service/reusable/test/utEnvironmentService.ts b/ts/nni_manager/training_service/reusable/test/utEnvironmentService.ts index b4221d82bf..71410574f0 100644 --- a/ts/nni_manager/training_service/reusable/test/utEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/test/utEnvironmentService.ts @@ -17,6 +17,11 @@ export class UtEnvironmentService extends EnvironmentService { // storage service is tested by 
integration testing. return false; } + + public get useSharedStorage(): boolean { + return false; + } + public get environmentMaintenceLoopInterval(): number { return 1; } diff --git a/ts/nni_manager/training_service/reusable/trialDispatcher.ts b/ts/nni_manager/training_service/reusable/trialDispatcher.ts index 1316fad2a5..16ece45a5b 100644 --- a/ts/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/ts/nni_manager/training_service/reusable/trialDispatcher.ts @@ -7,6 +7,7 @@ import { EventEmitter } from 'events'; import * as fs from 'fs'; import * as path from 'path'; import { Writable } from 'stream'; +import { Container, Scope } from 'typescript-ioc'; import { String } from 'typescript-string-operations'; import * as component from '../../common/component'; import { NNIError, NNIErrorNames, MethodNotImplementedError } from '../../common/errors'; @@ -26,6 +27,8 @@ import { EnvironmentServiceFactory } from './environments/environmentServiceFact import { GpuScheduler } from './gpuScheduler'; import { MountedStorageService } from './storages/mountedStorageService'; import { StorageService } from './storageService'; +import { SharedStorageService, SharedStorageConfig } from './sharedStorage'; +import { NFSSharedStorageService } from './shared_storages/nfsStorageService' import { TrialDetail } from './trial'; @@ -74,6 +77,10 @@ class TrialDispatcher implements TrainingService { private isLoggedNoMoreEnvironment: boolean = false; private isLoggedNoGpuAvailable: boolean = false; + // uses to mark whether to use shared storage + private useSharedStorage: boolean = false; + private fileCopyCompleted: boolean = false; + constructor() { this.log = getLogger(); this.trials = new Map(); @@ -195,7 +202,14 @@ class TrialDispatcher implements TrainingService { this.log.info(`TrialDispatcher: copying code and settings.`); let storageService: StorageService; - if (environmentService.hasStorageService) { + if (environmentService.useSharedStorage) { + if 
(this.fileCopyCompleted) { + this.log.debug(`TrialDispatcher: file already copy to shared storage.`); + continue; + } + this.log.debug(`TrialDispatcher: use shared storage service.`); + storageService = component.get(SharedStorageService).storageService; + } else if (environmentService.hasStorageService) { this.log.debug(`TrialDispatcher: use existing storage service.`); storageService = component.get(StorageService); } else { @@ -223,6 +237,10 @@ class TrialDispatcher implements TrainingService { } await storageService.copyDirectory(trialToolsPath, envDir, true); } + + if (environmentService.useSharedStorage) { + this.fileCopyCompleted = true; + } } // start channel this.commandEmitter.on("command", (command: Command): void => { @@ -260,7 +278,6 @@ class TrialDispatcher implements TrainingService { break; case TrialConfigMetadataKey.VERSION_CHECK: this.enableVersionCheck = (value === 'true' || value === 'True'); - break; case TrialConfigMetadataKey.LOG_COLLECTION: this.logCollection = value; @@ -279,7 +296,7 @@ class TrialDispatcher implements TrainingService { // Validate to make sure codeDir doesn't have too many files await validateCodeDir(this.trialConfig.codeDir); break; - case TrialConfigMetadataKey.PLATFORM_LIST: { + case TrialConfigMetadataKey.PLATFORM_LIST: const platforms: string[] = value.split(","); for(const platform of platforms) { const environmentService: EnvironmentService = EnvironmentServiceFactory.createEnvironmentService(platform); @@ -289,7 +306,16 @@ class TrialDispatcher implements TrainingService { this.commandChannelSet.add(environmentService.getCommandChannel); this.environmentServiceList.push(environmentService); } - } + this.setClusterMetadata('shared_storage_config', `{"storageType":"NFS","localMountPoint":"/mnt/sharedfolder","remoteMountPoint":"nni-sharedfolder","nfsServer":"40.121.81.141","exportedDirectory":"/mnt/sharedfolder","userMounted":"true"}`); + break; + case TrialConfigMetadataKey.SHARED_STORAGE_CONFIG: + if 
(this.useSharedStorage === false) { + await this.initializeSharedStorage(key, value); + } else { + const errorMessage = `Already has set shared storage.`; + this.log.error(errorMessage); + } + break; } for(const environmentService of this.environmentServiceList) { await environmentService.config(key, value); @@ -621,7 +647,7 @@ class TrialDispatcher implements TrainingService { } } - + // Schedule a environment platform for environment private selectEnvironmentService(): EnvironmentService | undefined { const validEnvironmentServiceList = []; @@ -636,7 +662,7 @@ class TrialDispatcher implements TrainingService { // Random scheduler return randomSelect(validEnvironmentServiceList); } - + private async prefetchEnvironments (): Promise { for (const environmentService of this.environmentServiceList) { const number = environmentService.prefetchedEnvironmentCount; @@ -882,6 +908,24 @@ class TrialDispatcher implements TrainingService { } this.shouldUpdateTrials = true; } + + private async initializeSharedStorage(key: string, value: string): Promise { + const storageType = (JSON.parse(value)).storageType; + switch (storageType) { + case 'NFS': + Container.bind(SharedStorageService) + .to(NFSSharedStorageService) + .scope(Scope.Singleton); + break; + default: + const errorMessage = `Shared storage type '${storageType}' not support.`; + this.log.error(errorMessage) + return Promise.reject(errorMessage); + } + await component.get(SharedStorageService).config(key, value); + this.useSharedStorage = true; + return Promise.resolve(); + } } export { TrialDispatcher }; From 2440d2f50ce902a76ac32292420aefb8d4bf7e23 Mon Sep 17 00:00:00 2001 From: J-shang Date: Thu, 28 Jan 2021 23:59:56 +0800 Subject: [PATCH 02/20] add validation --- nni/tools/nnictl/config_schema.py | 3 ++- nni/tools/nnictl/launcher.py | 15 +++++++++++++++ .../rest_server/restValidationSchemas.ts | 8 ++++++++ .../reusable/shared_storages/nfsStorageService.ts | 4 ++-- .../training_service/reusable/trialDispatcher.ts | 
1 - 5 files changed, 27 insertions(+), 4 deletions(-) diff --git a/nni/tools/nnictl/config_schema.py b/nni/tools/nnictl/config_schema.py index baa02b26e9..96527a2c3c 100644 --- a/nni/tools/nnictl/config_schema.py +++ b/nni/tools/nnictl/config_schema.py @@ -142,7 +142,8 @@ def validate(self, data): Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), Optional('useActiveGpu'): setType('useActiveGpu', bool) - } + }, + Optional('sharedStorage'): setType('sharedStorage', dict) } common_trial_schema = { diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index 15ea58b47d..1465490000 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -314,6 +314,19 @@ def set_hybrid_config(experiment_config, port, config_file_name): #set trial_config return set_trial_config(experiment_config, port, config_file_name), err_message +def set_shared_storage(experiment_config, port, config_file_name): + if 'sharedStorage' in experiment_config: + response = rest_put(cluster_metadata_url(port), json.dumps({'shared_storage_config': experiment_config['sharedStorage']}), REST_TIME_OUT) + err_message = None + if not response or not response.status_code == 200: + if response is not None: + err_message = response.text + _, stderr_full_path = get_log_path(config_file_name) + with open(stderr_full_path, 'a+') as fout: + fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) + return False, err_message + return True, None + def set_experiment(experiment_config, mode, port, config_file_name): '''Call startExperiment (rest POST /experiment) with yaml file content''' request_data = dict() @@ -442,6 +455,8 @@ def set_platform_config(platform, experiment_config, port, config_file_name, res else: raise Exception(ERROR_INFO % 'Unsupported platform!') exit(1) + if config_result: + 
config_result, err_msg = set_shared_storage(experiment_config, port, config_file_name) if config_result: print_normal('Successfully set {0} config!'.format(platform)) else: diff --git a/ts/nni_manager/rest_server/restValidationSchemas.ts b/ts/nni_manager/rest_server/restValidationSchemas.ts index e7253beba4..2a3758874b 100644 --- a/ts/nni_manager/rest_server/restValidationSchemas.ts +++ b/ts/nni_manager/rest_server/restValidationSchemas.ts @@ -191,6 +191,14 @@ export namespace ValidationSchemas { }), remote_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase reuse: joi.boolean() + }), + shared_storage_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase + storageType: joi.string(), + localMountPoint: joi.string(), + remoteMountPoint: joi.string(), + nfsServer: joi.string(), + exportedDirectory: joi.string(), + userMounted: joi.boolean() }) } }; diff --git a/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts index 20812fd3d2..362b51d981 100644 --- a/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts +++ b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts @@ -73,11 +73,11 @@ export class NFSSharedStorageService extends SharedStorageService { } public get localMountCommand(): string { - return `mkdir -p ${this.localMountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${this.localMountPoint}`; + return `sudo apt-get update && sudo apt-get -y install nfs-common && mkdir -p ${this.localMountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${this.localMountPoint}`; } public get remoteMountCommand(): string { - return `mkdir -p ${this.remoteMountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${this.remoteMountPoint}`; + return `sudo apt-get update && sudo apt-get -y install nfs-common && mkdir -p ${this.remoteMountPoint} && 
sudo mount ${this.nfsServer}:${this.exportedDirectory} ${this.remoteMountPoint}`; } public get remoteWorkingRoot(): string { diff --git a/ts/nni_manager/training_service/reusable/trialDispatcher.ts b/ts/nni_manager/training_service/reusable/trialDispatcher.ts index 16ece45a5b..3aa966ce61 100644 --- a/ts/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/ts/nni_manager/training_service/reusable/trialDispatcher.ts @@ -306,7 +306,6 @@ class TrialDispatcher implements TrainingService { this.commandChannelSet.add(environmentService.getCommandChannel); this.environmentServiceList.push(environmentService); } - this.setClusterMetadata('shared_storage_config', `{"storageType":"NFS","localMountPoint":"/mnt/sharedfolder","remoteMountPoint":"nni-sharedfolder","nfsServer":"40.121.81.141","exportedDirectory":"/mnt/sharedfolder","userMounted":"true"}`); break; case TrialConfigMetadataKey.SHARED_STORAGE_CONFIG: if (this.useSharedStorage === false) { From f2e30fa15bff0cbcb4681542ea5e2ca32dba4860 Mon Sep 17 00:00:00 2001 From: J-shang Date: Fri, 29 Jan 2021 06:08:27 +0000 Subject: [PATCH 03/20] support local remote aml --- .../training_service/reusable/environment.ts | 3 +- .../environments/amlEnvironmentService.ts | 16 +++++++---- .../environments/localEnvironmentService.ts | 28 +++++++++++++------ .../environments/openPaiEnvironmentService.ts | 6 +--- .../environments/remoteEnvironmentService.ts | 20 +++++++++---- .../reusable/sharedStorage.ts | 1 + .../shared_storages/nfsStorageService.ts | 4 +++ .../reusable/trialDispatcher.ts | 6 ++-- 8 files changed, 56 insertions(+), 28 deletions(-) diff --git a/ts/nni_manager/training_service/reusable/environment.ts b/ts/nni_manager/training_service/reusable/environment.ts index 4e06275c49..ab2b75c5be 100644 --- a/ts/nni_manager/training_service/reusable/environment.ts +++ b/ts/nni_manager/training_service/reusable/environment.ts @@ -77,6 +77,8 @@ export class EnvironmentInformation { public environmentService?: 
EnvironmentService; + public useSharedStorage?: boolean; + constructor(id: string, name: string, envId?: string) { this.log = getLogger(); this.id = id; @@ -125,7 +127,6 @@ export class EnvironmentInformation { export abstract class EnvironmentService { public abstract get hasStorageService(): boolean; - public abstract get useSharedStorage(): boolean; public abstract config(key: string, value: string): Promise; public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise; public abstract stopEnvironment(environment: EnvironmentInformation): Promise; diff --git a/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index 0aa44b0f1f..c9e75c6c22 100644 --- a/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -16,6 +16,7 @@ import { AMLClusterConfig, AMLEnvironmentInformation, AMLTrialConfig } from '../ import { EnvironmentInformation, EnvironmentService } from '../environment'; import { EventEmitter } from "events"; import { AMLCommandChannel } from '../channels/amlCommandChannel'; +import { SharedStorageService } from '../sharedStorage' /** @@ -40,10 +41,6 @@ export class AMLEnvironmentService extends EnvironmentService { return false; } - public get useSharedStorage(): boolean { - return false; - } - public initCommandChannel(eventEmitter: EventEmitter): void { this.commandChannel = new AMLCommandChannel(eventEmitter); } @@ -118,7 +115,16 @@ export class AMLEnvironmentService extends EnvironmentService { } const amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation; const environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp"); - environment.command = `import os\nos.system('${amlEnvironment.command}')`; + if 
(!fs.existsSync(environmentLocalTempFolder)) { + await fs.promises.mkdir(environmentLocalTempFolder, {recursive: true}); + } + if (environment.useSharedStorage) { + const environmentRoot = component.get(SharedStorageService).remoteWorkingRoot; + const remoteMountCommand = component.get(SharedStorageService).remoteMountCommand; + environment.command = `import os\nos.system('${remoteMountCommand} && cd ${environmentRoot} && ${amlEnvironment.command}')`; + } else { + environment.command = `import os\nos.system('${amlEnvironment.command}')`; + } environment.useActiveGpu = this.amlClusterConfig.useActiveGpu; environment.maxTrialNumberPerGpu = this.amlClusterConfig.maxTrialNumPerGpu; await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command, { encoding: 'utf8' }); diff --git a/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts index 45961acb60..b76ac21f02 100644 --- a/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts @@ -14,6 +14,7 @@ import { EnvironmentInformation, EnvironmentService } from '../environment'; import { TrialConfig } from '../../common/trialConfig'; import { getExperimentRootDir, isAlive } from '../../../common/utils'; import { execMkdir, runScript, execCopydir } from '../../common/util'; +import { SharedStorageService } from '../sharedStorage' @component.Singleton export class LocalEnvironmentService extends EnvironmentService { @@ -97,18 +98,27 @@ export class LocalEnvironmentService extends EnvironmentService { throw new Error('Local trial config is not initialized'); } // Need refactor, this temp folder path is not appropriate, there are two expId in this path - const localTempFolder: string = path.join(this.experimentRootDir, this.experimentId, - "environment-temp", "envs"); 
- const localEnvCodeFolder: string = path.join(this.experimentRootDir, "envs"); + let localWorkingRoot: string; + if (this.useSharedStorage) { + localWorkingRoot = component.get(SharedStorageService).localWorkingRoot; + } else { + localWorkingRoot = this.experimentRootDir; + } + const localEnvCodeFolder: string = path.join(localWorkingRoot, "envs"); + if (!this.useSharedStorage) { + const localTempFolder: string = path.join(localWorkingRoot, this.experimentId, + "environment-temp", "envs"); + await execCopydir(localTempFolder, localEnvCodeFolder); + } environment.runnerWorkingFolder = path.join(localEnvCodeFolder, environment.id); await execMkdir(environment.runnerWorkingFolder); - await execCopydir(localTempFolder, localEnvCodeFolder); - environment.command = `cd ${this.experimentRootDir} && \ -${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \ -1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \ -&& echo $? \`date +%s%3N\` >${environment.runnerWorkingFolder}/code`; + + environment.command = `cd ${localWorkingRoot} && \ + ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \ + 1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \ + && echo $? 
\`date +%s%3N\` >${environment.runnerWorkingFolder}/code`; await fs.promises.writeFile(path.join(localEnvCodeFolder, 'nni_run.sh'), - environment.command, { encoding: 'utf8', mode: 0o777 }), + environment.command, { encoding: 'utf8', mode: 0o777 }), // Execute command in local machine runScript(path.join(localEnvCodeFolder, 'nni_run.sh')); environment.trackingUrl = `${environment.runnerWorkingFolder}`; diff --git a/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index bc168a9c4c..9ef189dfe4 100644 --- a/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -46,10 +46,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService { return true; } - public get useSharedStorage(): boolean { - return true; - } - public get getName(): string { return 'pai'; } @@ -184,7 +180,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService { // Step 1. 
Prepare PAI job configuration let environmentRoot: string; - if (this.useSharedStorage) { + if (environment.useSharedStorage) { environmentRoot = component.get(SharedStorageService).remoteWorkingRoot; environment.command = `${component.get(SharedStorageService).remoteMountCommand} && cd ${environmentRoot} && ${environment.command}`; } else { diff --git a/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts index 763ae31281..2c6b1e818e 100644 --- a/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts @@ -20,6 +20,7 @@ import { } from '../../remote_machine/remoteMachineData'; import { ShellExecutor } from 'training_service/remote_machine/shellExecutor'; import { RemoteMachineEnvironmentInformation } from '../remote/remoteConfig'; +import { SharedStorageService } from '../sharedStorage' @component.Singleton @@ -251,13 +252,20 @@ export class RemoteEnvironmentService extends EnvironmentService { } this.environmentExecutorManagerMap.set(environment.id, executorManager); const executor = await this.getExecutor(environment.id); - environment.runnerWorkingFolder = - executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), - 'envs', environment.id) + if (environment.useSharedStorage) { + const environmentRoot = component.get(SharedStorageService).remoteWorkingRoot; + environment.runnerWorkingFolder = executor.joinPath(environmentRoot, 'envs', environment.id) + const remoteMountCommand = component.get(SharedStorageService).remoteMountCommand; + await executor.executeScript(remoteMountCommand, false, false); + } else { + environment.runnerWorkingFolder = + executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), + 'envs', environment.id) + } environment.command = `cd ${environment.runnerWorkingFolder} && \ 
-${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \ -1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \ -&& echo $? \`date +%s%3N\` >${environment.runnerWorkingFolder}/code`; + ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \ + 1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \ + && echo $? \`date +%s%3N\` >${environment.runnerWorkingFolder}/code`; return Promise.resolve(true); } } diff --git a/ts/nni_manager/training_service/reusable/sharedStorage.ts b/ts/nni_manager/training_service/reusable/sharedStorage.ts index 3565ee53f0..b45dbddb10 100644 --- a/ts/nni_manager/training_service/reusable/sharedStorage.ts +++ b/ts/nni_manager/training_service/reusable/sharedStorage.ts @@ -18,5 +18,6 @@ export abstract class SharedStorageService { public abstract get storageService(): StorageService; public abstract get localMountCommand(): string; public abstract get remoteMountCommand(): string; + public abstract get localWorkingRoot(): string; public abstract get remoteWorkingRoot(): string; } diff --git a/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts index 362b51d981..ecd3bc514a 100644 --- a/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts +++ b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts @@ -80,6 +80,10 @@ export class NFSSharedStorageService extends SharedStorageService { return `sudo apt-get update && sudo apt-get -y install nfs-common && mkdir -p ${this.remoteMountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${this.remoteMountPoint}`; } + public get localWorkingRoot(): string { + return `${this.localMountPoint}/nni/${this.experimentId}`; + } + public get remoteWorkingRoot(): string { return 
`${this.remoteMountPoint}/nni/${this.experimentId}`; } diff --git a/ts/nni_manager/training_service/reusable/trialDispatcher.ts b/ts/nni_manager/training_service/reusable/trialDispatcher.ts index 3aa966ce61..636f8a16ca 100644 --- a/ts/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/ts/nni_manager/training_service/reusable/trialDispatcher.ts @@ -202,7 +202,7 @@ class TrialDispatcher implements TrainingService { this.log.info(`TrialDispatcher: copying code and settings.`); let storageService: StorageService; - if (environmentService.useSharedStorage) { + if (this.useSharedStorage) { if (this.fileCopyCompleted) { this.log.debug(`TrialDispatcher: file already copy to shared storage.`); continue; @@ -238,7 +238,7 @@ class TrialDispatcher implements TrainingService { await storageService.copyDirectory(trialToolsPath, envDir, true); } - if (environmentService.useSharedStorage) { + if (this.useSharedStorage) { this.fileCopyCompleted = true; } } @@ -686,6 +686,8 @@ class TrialDispatcher implements TrainingService { environment.command = `mkdir -p envs/${envId} && cd envs/${envId} && ${environment.command}`; + environment.useSharedStorage = this.useSharedStorage; + await environmentService.startEnvironment(environment); this.environments.set(environment.id, environment); From 6defbf92e60413fad8fa5661f976e8d62fffdf28 Mon Sep 17 00:00:00 2001 From: J-shang Date: Fri, 29 Jan 2021 06:31:07 +0000 Subject: [PATCH 04/20] add nfs example --- .../trials/mnist-sharedstorage/config_nfs.yml | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 examples/trials/mnist-sharedstorage/config_nfs.yml diff --git a/examples/trials/mnist-sharedstorage/config_nfs.yml b/examples/trials/mnist-sharedstorage/config_nfs.yml new file mode 100644 index 0000000000..00ebffd347 --- /dev/null +++ b/examples/trials/mnist-sharedstorage/config_nfs.yml @@ -0,0 +1,34 @@ +authorName: default +experimentName: example_mnist +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 
10 +trainingServicePlatform: aml +searchSpacePath: search_space.json +#choice: true, false +useAnnotation: false +tuner: + #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner + #SMAC (SMAC should be installed through nnictl) + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +trial: + command: python3 mnist.py + codeDir: . + image: msranni/nni +amlConfig: + subscriptionId: ${replace_to_your_subscriptionId} + resourceGroup: ${replace_to_your_resourceGroup} + workspaceName: ${replace_to_your_workspaceName} + computeTarget: ${replace_to_your_computeTarget} +sharedStorage: + storageType: NFS + localMountPoint: ${your/local/mount/point} + remoteMountPoint: ${your/remote/mount/point} + nfsServer: ${nfs-server-ip} + exportedDirectory: ${nfs/exported/directory} + # true means you have already mount this storage on localMountPoint + # false means nni will try to mount this storage on localMountPoint + userMounted: true \ No newline at end of file From 8ab828b20c871e712427e2b2275549a428b8e991 Mon Sep 17 00:00:00 2001 From: J-shang Date: Fri, 29 Jan 2021 06:54:02 +0000 Subject: [PATCH 05/20] fix bug --- .../reusable/environments/localEnvironmentService.ts | 8 ++------ .../reusable/environments/remoteEnvironmentService.ts | 4 ---- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts index b76ac21f02..c643bf9c45 100644 --- a/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts @@ -38,10 +38,6 @@ export class LocalEnvironmentService extends EnvironmentService { return false; } - public get useSharedStorage(): boolean { - return false; - } - public get getName(): string { return 'local'; } @@ -99,13 +95,13 @@ export class 
LocalEnvironmentService extends EnvironmentService { } // Need refactor, this temp folder path is not appropriate, there are two expId in this path let localWorkingRoot: string; - if (this.useSharedStorage) { + if (environment.useSharedStorage) { localWorkingRoot = component.get(SharedStorageService).localWorkingRoot; } else { localWorkingRoot = this.experimentRootDir; } const localEnvCodeFolder: string = path.join(localWorkingRoot, "envs"); - if (!this.useSharedStorage) { + if (!environment.useSharedStorage) { const localTempFolder: string = path.join(localWorkingRoot, this.experimentId, "environment-temp", "envs"); await execCopydir(localTempFolder, localEnvCodeFolder); diff --git a/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts index 2c6b1e818e..e1dbfb763d 100644 --- a/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts @@ -64,10 +64,6 @@ export class RemoteEnvironmentService extends EnvironmentService { return false; } - public get useSharedStorage(): boolean { - return false; - } - public get getName(): string { return 'remote'; } From 364eefbd8302d8eed5f8bbadc2bb22b69fc1227d Mon Sep 17 00:00:00 2001 From: J-shang Date: Fri, 29 Jan 2021 07:03:52 +0000 Subject: [PATCH 06/20] fix lint --- .../reusable/environments/amlEnvironmentService.ts | 7 +++---- ts/nni_manager/training_service/reusable/sharedStorage.ts | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index c9e75c6c22..4b06610f14 100644 --- a/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ 
b/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -118,13 +118,12 @@ export class AMLEnvironmentService extends EnvironmentService { if (!fs.existsSync(environmentLocalTempFolder)) { await fs.promises.mkdir(environmentLocalTempFolder, {recursive: true}); } - if (environment.useSharedStorage) { + if (amlEnvironment.useSharedStorage) { const environmentRoot = component.get(SharedStorageService).remoteWorkingRoot; const remoteMountCommand = component.get(SharedStorageService).remoteMountCommand; - environment.command = `import os\nos.system('${remoteMountCommand} && cd ${environmentRoot} && ${amlEnvironment.command}')`; - } else { - environment.command = `import os\nos.system('${amlEnvironment.command}')`; + amlEnvironment.command = `${remoteMountCommand} && cd ${environmentRoot} && ${amlEnvironment.command}`; } + environment.command = `import os\nos.system('${amlEnvironment.command}')`; environment.useActiveGpu = this.amlClusterConfig.useActiveGpu; environment.maxTrialNumberPerGpu = this.amlClusterConfig.maxTrialNumPerGpu; await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command, { encoding: 'utf8' }); diff --git a/ts/nni_manager/training_service/reusable/sharedStorage.ts b/ts/nni_manager/training_service/reusable/sharedStorage.ts index b45dbddb10..f0c0ff4445 100644 --- a/ts/nni_manager/training_service/reusable/sharedStorage.ts +++ b/ts/nni_manager/training_service/reusable/sharedStorage.ts @@ -5,7 +5,7 @@ import { StorageService } from './storageService' -export type SharedStorageType = 'NFS' +export type SharedStorageType = 'NFS' | 'AzureBlob' export interface SharedStorageConfig { readonly storageType: SharedStorageType; From ad4e819663cbfd8a518c27f9e3418d3c93210fd9 Mon Sep 17 00:00:00 2001 From: J-shang Date: Fri, 29 Jan 2021 07:47:31 +0000 Subject: [PATCH 07/20] update sharedstorage api --- examples/trials/mnist-sharedstorage/config_nfs.yml | 2 +- 
ts/nni_manager/rest_server/restValidationSchemas.ts | 2 +- .../reusable/environments/localEnvironmentService.ts | 9 ++++++--- .../training_service/reusable/sharedStorage.ts | 3 ++- .../reusable/shared_storages/nfsStorageService.ts | 12 ++++++++---- 5 files changed, 18 insertions(+), 10 deletions(-) diff --git a/examples/trials/mnist-sharedstorage/config_nfs.yml b/examples/trials/mnist-sharedstorage/config_nfs.yml index 00ebffd347..15453940bc 100644 --- a/examples/trials/mnist-sharedstorage/config_nfs.yml +++ b/examples/trials/mnist-sharedstorage/config_nfs.yml @@ -31,4 +31,4 @@ sharedStorage: exportedDirectory: ${nfs/exported/directory} # true means you have already mount this storage on localMountPoint # false means nni will try to mount this storage on localMountPoint - userMounted: true \ No newline at end of file + hasLocalMounted: true \ No newline at end of file diff --git a/ts/nni_manager/rest_server/restValidationSchemas.ts b/ts/nni_manager/rest_server/restValidationSchemas.ts index 2a3758874b..6d9c1b6510 100644 --- a/ts/nni_manager/rest_server/restValidationSchemas.ts +++ b/ts/nni_manager/rest_server/restValidationSchemas.ts @@ -198,7 +198,7 @@ export namespace ValidationSchemas { remoteMountPoint: joi.string(), nfsServer: joi.string(), exportedDirectory: joi.string(), - userMounted: joi.boolean() + hasLocalMounted: joi.boolean() }) } }; diff --git a/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts index c643bf9c45..475470ae92 100644 --- a/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts @@ -95,13 +95,16 @@ export class LocalEnvironmentService extends EnvironmentService { } // Need refactor, this temp folder path is not appropriate, there are two expId in this path let localWorkingRoot: string; - if 
(environment.useSharedStorage) { - localWorkingRoot = component.get(SharedStorageService).localWorkingRoot; + const sharedStorageService = component.get(SharedStorageService); + if (environment.useSharedStorage && sharedStorageService.canLocalMounted) { + localWorkingRoot = sharedStorageService.localWorkingRoot; } else { localWorkingRoot = this.experimentRootDir; } const localEnvCodeFolder: string = path.join(localWorkingRoot, "envs"); - if (!environment.useSharedStorage) { + if (environment.useSharedStorage && !sharedStorageService.canLocalMounted) { + await sharedStorageService.storageService.copyDirectoryBack("envs", localEnvCodeFolder) + } else if (!environment.useSharedStorage) { const localTempFolder: string = path.join(localWorkingRoot, this.experimentId, "environment-temp", "envs"); await execCopydir(localTempFolder, localEnvCodeFolder); diff --git a/ts/nni_manager/training_service/reusable/sharedStorage.ts b/ts/nni_manager/training_service/reusable/sharedStorage.ts index f0c0ff4445..3104285afe 100644 --- a/ts/nni_manager/training_service/reusable/sharedStorage.ts +++ b/ts/nni_manager/training_service/reusable/sharedStorage.ts @@ -9,12 +9,13 @@ export type SharedStorageType = 'NFS' | 'AzureBlob' export interface SharedStorageConfig { readonly storageType: SharedStorageType; - readonly localMountPoint: string; + readonly localMountPoint?: string; readonly remoteMountPoint: string; } export abstract class SharedStorageService { public abstract config(key: string, value: string): Promise; + public abstract get canLocalMounted(): boolean; public abstract get storageService(): StorageService; public abstract get localMountCommand(): string; public abstract get remoteMountCommand(): string; diff --git a/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts index ecd3bc514a..78be1f46b6 100644 --- 
a/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts +++ b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts @@ -19,16 +19,16 @@ class NFSSharedStorageConfig implements SharedStorageConfig { public nfsServer: string; public exportedDirectory: string; - public userMounted: boolean; + public hasLocalMounted: boolean; constructor(storageType: SharedStorageType, localMountPoint: string, remoteMountPoint: string, - nfsServer: string, exportedDirectory: string, userMounted: boolean) { + nfsServer: string, exportedDirectory: string, hasLocalMounted: boolean) { this.storageType = storageType; this.localMountPoint = localMountPoint; this.remoteMountPoint = remoteMountPoint; this.nfsServer = nfsServer; this.exportedDirectory = exportedDirectory; - this.userMounted = userMounted; + this.hasLocalMounted = hasLocalMounted; } } @@ -60,7 +60,7 @@ export class NFSSharedStorageService extends SharedStorageService { this.storageType = nfsConfig.storageType; this.nfsServer = nfsConfig.nfsServer; this.exportedDirectory = nfsConfig.exportedDirectory; - if ( nfsConfig.userMounted === false ) { + if ( nfsConfig.hasLocalMounted === false ) { await this.helpLocalMount(); } @@ -68,6 +68,10 @@ export class NFSSharedStorageService extends SharedStorageService { } } + public get canLocalMounted(): boolean{ + return true; + } + public get storageService(): MountedStorageService { return this.internalStorageService; } From 224903287b76ebe4128bbab47ec9d3cf187497b7 Mon Sep 17 00:00:00 2001 From: J-shang Date: Mon, 1 Feb 2021 03:24:52 +0000 Subject: [PATCH 08/20] fix lint --- .../reusable/environments/amlEnvironmentService.ts | 6 +++--- ts/nni_manager/training_service/reusable/trialDispatcher.ts | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index 
4b06610f14..982b0f1efc 100644 --- a/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -123,9 +123,9 @@ export class AMLEnvironmentService extends EnvironmentService { const remoteMountCommand = component.get(SharedStorageService).remoteMountCommand; amlEnvironment.command = `${remoteMountCommand} && cd ${environmentRoot} && ${amlEnvironment.command}`; } - environment.command = `import os\nos.system('${amlEnvironment.command}')`; - environment.useActiveGpu = this.amlClusterConfig.useActiveGpu; - environment.maxTrialNumberPerGpu = this.amlClusterConfig.maxTrialNumPerGpu; + amlEnvironment.command = `import os\nos.system('${amlEnvironment.command}')`; + amlEnvironment.useActiveGpu = this.amlClusterConfig.useActiveGpu; + amlEnvironment.maxTrialNumberPerGpu = this.amlClusterConfig.maxTrialNumPerGpu; await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command, { encoding: 'utf8' }); const amlClient = new AMLClient( this.amlClusterConfig.subscriptionId, diff --git a/ts/nni_manager/training_service/reusable/trialDispatcher.ts b/ts/nni_manager/training_service/reusable/trialDispatcher.ts index 636f8a16ca..eb86ebc837 100644 --- a/ts/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/ts/nni_manager/training_service/reusable/trialDispatcher.ts @@ -296,7 +296,7 @@ class TrialDispatcher implements TrainingService { // Validate to make sure codeDir doesn't have too many files await validateCodeDir(this.trialConfig.codeDir); break; - case TrialConfigMetadataKey.PLATFORM_LIST: + case TrialConfigMetadataKey.PLATFORM_LIST: { const platforms: string[] = value.split(","); for(const platform of platforms) { const environmentService: EnvironmentService = EnvironmentServiceFactory.createEnvironmentService(platform); @@ -307,6 +307,7 @@ class TrialDispatcher implements TrainingService { 
this.environmentServiceList.push(environmentService); } break; + } case TrialConfigMetadataKey.SHARED_STORAGE_CONFIG: if (this.useSharedStorage === false) { await this.initializeSharedStorage(key, value); @@ -918,10 +919,11 @@ class TrialDispatcher implements TrainingService { .to(NFSSharedStorageService) .scope(Scope.Singleton); break; - default: + default: { const errorMessage = `Shared storage type '${storageType}' not support.`; this.log.error(errorMessage) return Promise.reject(errorMessage); + } } await component.get(SharedStorageService).config(key, value); this.useSharedStorage = true; From 51205ec7e822943ec946ddd6e7ead07426518d2b Mon Sep 17 00:00:00 2001 From: J-shang Date: Tue, 2 Feb 2021 10:20:21 +0000 Subject: [PATCH 09/20] add azureblob --- .../mnist-sharedstorage/config_azureblob.yml | 36 +++ .../trials/mnist-sharedstorage/config_nfs.yml | 7 +- examples/trials/mnist-sharedstorage/mnist.py | 166 ++++++++++++++ .../mnist-sharedstorage/requirements.txt | 2 + .../mnist-sharedstorage/search_space.json | 6 + .../rest_server/restValidationSchemas.ts | 6 +- .../environments/amlEnvironmentService.ts | 2 +- .../reusable/sharedStorage.ts | 1 + .../azureblobStorageService.ts | 207 ++++++++++++++++++ .../shared_storages/nfsStorageService.ts | 20 +- .../reusable/trialDispatcher.ts | 6 + 11 files changed, 448 insertions(+), 11 deletions(-) create mode 100644 examples/trials/mnist-sharedstorage/config_azureblob.yml create mode 100644 examples/trials/mnist-sharedstorage/mnist.py create mode 100644 examples/trials/mnist-sharedstorage/requirements.txt create mode 100644 examples/trials/mnist-sharedstorage/search_space.json create mode 100644 ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts diff --git a/examples/trials/mnist-sharedstorage/config_azureblob.yml b/examples/trials/mnist-sharedstorage/config_azureblob.yml new file mode 100644 index 0000000000..604f33379d --- /dev/null +++ 
b/examples/trials/mnist-sharedstorage/config_azureblob.yml @@ -0,0 +1,36 @@ +authorName: default +experimentName: example_mnist +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 10 +trainingServicePlatform: aml +searchSpacePath: search_space.json +#choice: true, false +useAnnotation: false +tuner: + #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner + #SMAC (SMAC should be installed through nnictl) + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +trial: + command: python3 mnist.py + codeDir: . + image: msranni/nni +amlConfig: + subscriptionId: ${replace_to_your_subscriptionId} + resourceGroup: ${replace_to_your_resourceGroup} + workspaceName: ${replace_to_your_workspaceName} + computeTarget: ${replace_to_your_computeTarget} +sharedStorage: + storageType: AzureBlob + localMountPoint: ${your/local/mount/point} + remoteMountPoint: ${your/remote/mount/point} + resourceGroupName: ${replace_to_your_resourceGroupName} + storageAccountName: ${replace_to_your_storageAccountName} + containerName: ${replace_to_your_containerName} + # usermount means you have already mount this storage on localMountPoint + # nnimount means nni will try to mount this storage on localMountPoint + # nomount means storage will not mount in local machine, will support partial storages in the future + localMounted: nnimount \ No newline at end of file diff --git a/examples/trials/mnist-sharedstorage/config_nfs.yml b/examples/trials/mnist-sharedstorage/config_nfs.yml index 15453940bc..0a2872c920 100644 --- a/examples/trials/mnist-sharedstorage/config_nfs.yml +++ b/examples/trials/mnist-sharedstorage/config_nfs.yml @@ -29,6 +29,7 @@ sharedStorage: remoteMountPoint: ${your/remote/mount/point} nfsServer: ${nfs-server-ip} exportedDirectory: ${nfs/exported/directory} - # true means you have already mount this storage on localMountPoint - # false means nni will try to mount this storage on localMountPoint - hasLocalMounted: true \ No 
newline at end of file + # usermount means you have already mount this storage on localMountPoint + # nnimount means nni will try to mount this storage on localMountPoint + # nomount means storage will not mount in local machine, will support partial storages in the future + localMounted: nnimount \ No newline at end of file diff --git a/examples/trials/mnist-sharedstorage/mnist.py b/examples/trials/mnist-sharedstorage/mnist.py new file mode 100644 index 0000000000..a7ca27816a --- /dev/null +++ b/examples/trials/mnist-sharedstorage/mnist.py @@ -0,0 +1,166 @@ +""" +A deep MNIST classifier using convolutional layers. + +This file is a modification of the official pytorch mnist example: +https://github.com/pytorch/examples/blob/master/mnist/main.py +""" + +import os +import argparse +import logging +import nni +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from nni.utils import merge_parameter +from torchvision import datasets, transforms + +logger = logging.getLogger('mnist_AutoML') + + +class Net(nn.Module): + def __init__(self, hidden_size): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 20, 5, 1) + self.conv2 = nn.Conv2d(20, 50, 5, 1) + self.fc1 = nn.Linear(4*4*50, hidden_size) + self.fc2 = nn.Linear(hidden_size, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = F.max_pool2d(x, 2, 2) + x = F.relu(self.conv2(x)) + x = F.max_pool2d(x, 2, 2) + x = x.view(-1, 4*4*50) + x = F.relu(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + +def train(args, model, device, train_loader, optimizer, epoch): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + if (args['batch_num'] is not None) and batch_idx >= args['batch_num']: + break + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args['log_interval'] == 0: + 
logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. * batch_idx / len(train_loader), loss.item())) + + +def test(args, model, device, test_loader): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + # sum up batch loss + test_loss += F.nll_loss(output, target, reduction='sum').item() + # get the index of the max log-probability + pred = output.argmax(dim=1, keepdim=True) + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + + accuracy = 100. * correct / len(test_loader.dataset) + + logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), accuracy)) + + return accuracy + + +def main(args): + use_cuda = not args['no_cuda'] and torch.cuda.is_available() + + torch.manual_seed(args['seed']) + + device = torch.device("cuda" if use_cuda else "cpu") + + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} + + data_dir = args['data_dir'] + + train_loader = torch.utils.data.DataLoader( + datasets.MNIST(data_dir, train=True, download=True, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=args['batch_size'], shuffle=True, **kwargs) + test_loader = torch.utils.data.DataLoader( + datasets.MNIST(data_dir, train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=1000, shuffle=True, **kwargs) + + hidden_size = args['hidden_size'] + + model = Net(hidden_size=hidden_size).to(device) + optimizer = optim.SGD(model.parameters(), lr=args['lr'], + momentum=args['momentum']) + + for epoch in range(1, args['epochs'] + 1): + train(args, model, device, train_loader, optimizer, epoch) + test_acc = 
test(args, model, device, test_loader) + + # report intermediate result + nni.report_intermediate_result(test_acc) + logger.debug('test accuracy %g', test_acc) + logger.debug('Pipe send intermediate result done.') + + # report final result + nni.report_final_result(test_acc) + logger.debug('Final result is %g', test_acc) + logger.debug('Send final result done.') + + +def get_params(): + # Training settings + parser = argparse.ArgumentParser(description='PyTorch MNIST Example') + parser.add_argument("--data_dir", type=str, + default='./data', help="data directory") + parser.add_argument('--batch_size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument("--batch_num", type=int, default=None) + parser.add_argument("--hidden_size", type=int, default=512, metavar='N', + help='hidden layer size (default: 512)') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') + parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--no_cuda', action='store_true', default=False, + help='disables CUDA training') + parser.add_argument('--log_interval', type=int, default=1000, metavar='N', + help='how many batches to wait before logging training status') + + + args, _ = parser.parse_known_args() + return args + + +if __name__ == '__main__': + try: + # get parameters form tuner + tuner_params = nni.get_next_parameter() + logger.debug(tuner_params) + params = vars(merge_parameter(get_params(), tuner_params)) + print(params) + main(params) + except Exception as exception: + logger.exception(exception) + raise diff --git a/examples/trials/mnist-sharedstorage/requirements.txt 
b/examples/trials/mnist-sharedstorage/requirements.txt new file mode 100644 index 0000000000..01f6b72556 --- /dev/null +++ b/examples/trials/mnist-sharedstorage/requirements.txt @@ -0,0 +1,2 @@ +torch +torchvision diff --git a/examples/trials/mnist-sharedstorage/search_space.json b/examples/trials/mnist-sharedstorage/search_space.json new file mode 100644 index 0000000000..c26cdce369 --- /dev/null +++ b/examples/trials/mnist-sharedstorage/search_space.json @@ -0,0 +1,6 @@ +{ + "batch_size": {"_type":"choice", "_value": [16, 32, 64, 128]}, + "hidden_size":{"_type":"choice","_value":[128, 256, 512, 1024]}, + "lr":{"_type":"choice","_value":[0.0001, 0.001, 0.01, 0.1]}, + "momentum":{"_type":"uniform","_value":[0, 1]} +} diff --git a/ts/nni_manager/rest_server/restValidationSchemas.ts b/ts/nni_manager/rest_server/restValidationSchemas.ts index 6d9c1b6510..62d86bc888 100644 --- a/ts/nni_manager/rest_server/restValidationSchemas.ts +++ b/ts/nni_manager/rest_server/restValidationSchemas.ts @@ -198,7 +198,11 @@ export namespace ValidationSchemas { remoteMountPoint: joi.string(), nfsServer: joi.string(), exportedDirectory: joi.string(), - hasLocalMounted: joi.boolean() + storageAccountName: joi.string(), + storageAccountKey: joi.string(), + containerName: joi.string(), + resourceGroupName: joi.string(), + localMounted: joi.string() }) } }; diff --git a/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index dc36c5d62c..eda30dcc8a 100644 --- a/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -121,7 +121,7 @@ export class AMLEnvironmentService extends EnvironmentService { if (amlEnvironment.useSharedStorage) { const environmentRoot = component.get(SharedStorageService).remoteWorkingRoot; const remoteMountCommand = 
component.get(SharedStorageService).remoteMountCommand; - amlEnvironment.command = `${remoteMountCommand} && cd ${environmentRoot} && ${amlEnvironment.command}`; + amlEnvironment.command = `${remoteMountCommand} && cd ${environmentRoot} && ${amlEnvironment.command}`.replace(/"/g, `\\"`); } else { amlEnvironment.command = `mv envs outputs/envs && cd outputs && ${amlEnvironment.command}`; } diff --git a/ts/nni_manager/training_service/reusable/sharedStorage.ts b/ts/nni_manager/training_service/reusable/sharedStorage.ts index 3104285afe..06614706fc 100644 --- a/ts/nni_manager/training_service/reusable/sharedStorage.ts +++ b/ts/nni_manager/training_service/reusable/sharedStorage.ts @@ -6,6 +6,7 @@ import { StorageService } from './storageService' export type SharedStorageType = 'NFS' | 'AzureBlob' +export type LocalMountedType = 'usermount' | 'nnimount' | 'nomount' export interface SharedStorageConfig { readonly storageType: SharedStorageType; diff --git a/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts b/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts new file mode 100644 index 0000000000..505866f75a --- /dev/null +++ b/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts @@ -0,0 +1,207 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +'use strict'; + +import * as cpp from 'child-process-promise'; +import * as path from 'path'; + +import { SharedStorageService, SharedStorageConfig, SharedStorageType, LocalMountedType } from '../sharedStorage' +import { MountedStorageService } from '../storages/mountedStorageService'; +import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; +import { getLogger, Logger } from '../../../common/log'; +import { getExperimentId } from '../../../common/experimentStartupInfo'; + +const INSTALL_BLOBFUSE = ` +#!/bin/bash +if [ -n "$(command -v blobfuse)" ] +then + exit 0 +fi + +if [ -n "$(command -v apt-get)" ] +then + sudo apt-get update + sudo apt-get install -y lsb-release +elif [ -n "$(command -v yum)" ] +then + sudo yum install -y redhat-lsb +else +echo "Unknown package management." +fi + +id=$(lsb_release -a | grep "Distributor ID:" | cut -c16- | sed s/[[:space:]]//g) +version=$(lsb_release -a | grep "Release:" | cut -c9- | sed s/[[:space:]]//g) + +if [ $id = "Ubuntu" ] +then + wget https://packages.microsoft.com/config/ubuntu/$version/packages-microsoft-prod.deb + sudo dpkg -i packages-microsoft-prod.deb + sudo apt-get update + sudo apt-get install -y blobfuse fuse +elif [ $id = "CentOS" ] || [ $id = "RedHat" ] || [ $id = "rhel" ] +then + sudo rpm -Uvh https://packages.microsoft.com/config/rhel/$(echo $version | cut -c1)/packages-microsoft-prod.rpm + sudo yum install -y blobfuse fuse +else + echo "Not support distributor." 
+fi +` + +class AzureBlobSharedStorageConfig implements SharedStorageConfig { + public storageType: SharedStorageType; + public localMountPoint?: string; + public remoteMountPoint: string; + + public resourceGroupName?: string; + public storageAccountName: string; + public storageAccountKey?: string; + public containerName: string; + + public localMounted: LocalMountedType; + + constructor(storageType: SharedStorageType, remoteMountPoint: string, storageAccountName: string, containerName: string, + localMounted: LocalMountedType, localMountPoint?: string, resourceGroupName?: string, storageAccountKey?: string) { + this.storageType = storageType; + this.localMountPoint = localMountPoint; + this.remoteMountPoint = remoteMountPoint; + this.resourceGroupName = resourceGroupName; + this.storageAccountName = storageAccountName; + this.storageAccountKey = storageAccountKey; + this.containerName = containerName; + this.localMounted = localMounted; + } +} + +export class AzureBlobSharedStorageService extends SharedStorageService { + private log: Logger; + private internalStorageService: MountedStorageService; + private experimentId: string; + + private storageType?: SharedStorageType; + private storageAccountName?: string; + private storageAccountKey?: string; + private containerName?: string; + + private localMountPoint?: string; + private remoteMountPoint?: string; + + constructor() { + super(); + this.log = getLogger(); + this.internalStorageService = new MountedStorageService(); + this.experimentId = getExperimentId(); + } + + public async config(key: string, value: string): Promise { + if (key === TrialConfigMetadataKey.SHARED_STORAGE_CONFIG) { + const azureblobConfig = JSON.parse(value); + this.localMountPoint = azureblobConfig.localMountPoint; + this.remoteMountPoint = azureblobConfig.remoteMountPoint; + + this.storageType = azureblobConfig.storageType; + this.storageAccountName = azureblobConfig.storageAccountName; + this.containerName = 
azureblobConfig.containerName; + if (azureblobConfig.storageAccountKey !== undefined) { + this.storageAccountKey =azureblobConfig.storageAccountKey; + } else if (azureblobConfig.resourceGroupName !== undefined) { + await this.setAccountKey(azureblobConfig.resourceGroupName); + } else { + const errorMessage = `${this.storageType} Shared Storage: must set one of 'storageAccountKey' or 'resourceGroupName'.`; + this.log.error(errorMessage); + return Promise.reject(errorMessage); + } + + if (azureblobConfig.localMounted === 'nnimount') { + await this.helpLocalMount(); + } else if (azureblobConfig.localMounted === 'nomount') { + const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount' yet.`; + this.log.error(errorMessage); + return Promise.reject(errorMessage); + } + + if (this.canLocalMounted && this.localMountPoint) { + this.internalStorageService.initialize(this.localMountPoint, path.join(this.localMountPoint, 'nni', this.experimentId)); + } + } + } + + public get canLocalMounted(): boolean{ + return true; + } + + public get storageService(): MountedStorageService { + return this.internalStorageService; + } + + public get localMountCommand(): string { + if (this.localMountPoint) { + return this.getCommand(this.localMountPoint); + } else { + this.log.error(`${this.storageType} Shared Storage: localMountPoint is not initialized.`); + return ''; + } + } + + public get remoteMountCommand(): string { + if (this.remoteMountPoint) { + return this.getCommand(this.remoteMountPoint); + } else { + this.log.error(`${this.storageType} Shared Storage: remoteMountPoint is not initialized.`); + return ''; + } + } + + private getCommand(mountPoint: string): string { + const install = `rm -f nni_install_fuseblob.sh && touch nni_install_fuseblob.sh && echo "${INSTALL_BLOBFUSE.replace(/\$/g, `\\$`).replace(/\n/g, `\\n`).replace(/"/g, `\\"`)}" >> nni_install_fuseblob.sh && bash nni_install_fuseblob.sh`; + const prepare = `sudo mkdir 
/mnt/resource/nniblobfusetmp -p && rm -f nni_fuse_connection.cfg && touch nni_fuse_connection.cfg && echo "accountName ${this.storageAccountName}\\naccountKey ${this.storageAccountKey}\\ncontainerName ${this.containerName}" >> nni_fuse_connection.cfg`; + const mount = `mkdir -p ${mountPoint} && sudo blobfuse ${mountPoint} --tmp-path=/mnt/resource/nniblobfusetmp --config-file=$(pwd)/nni_fuse_connection.cfg -o attr_timeout=240 -o entry_timeout=240 -o negative_timeout=120 -o allow_other`; + const clean = `rm -f nni_install_fuseblob.sh nni_fuse_connection.cfg`; + return `${install} && ${prepare} && ${mount}`; + } + + public get localWorkingRoot(): string { + return `${this.localMountPoint}/nni/${this.experimentId}`; + } + + public get remoteWorkingRoot(): string { + return `${this.remoteMountPoint}/nni/${this.experimentId}`; + } + + private async helpLocalMount(): Promise { + if (process.platform === 'win32') { + const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} do not support mount under Windows yet.`; + this.log.error(errorMessage); + return Promise.reject(errorMessage); + } + + try { + this.log.debug(`Local mount command is: ${this.localMountCommand}`); + const result = await cpp.exec(this.localMountCommand); + if (result.stderr) { + throw new Error(result.stderr); + } + } catch (error) { + const errorMessage: string = `${this.storageType} Shared Storage: Mount ${this.storageAccountName}/${this.containerName} to ${this.localMountPoint} failed, error is ${error}`; + this.log.error(errorMessage); + return Promise.reject(errorMessage); + } + + return Promise.resolve(); + } + + private async setAccountKey(resourceGroupName: string): Promise { + try { + const result = await cpp.exec(`az storage account keys list --resource-group ${resourceGroupName} --account-name ${this.storageAccountName} --query "[0].value" | tr -d '"'`); + if (result.stderr) { + throw Error(result.stderr); + } else { + this.storageAccountKey = result.stdout.trim(); + } + } 
catch (error) { + const errorMessage: string = `${this.storageType} Shared Storage: get account key failed, error is ${error}`; + this.log.error(errorMessage); + return Promise.reject(errorMessage); + } + } +} \ No newline at end of file diff --git a/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts index 78be1f46b6..78dd72e418 100644 --- a/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts +++ b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts @@ -6,7 +6,7 @@ import * as cpp from 'child-process-promise'; import * as path from 'path'; -import { SharedStorageService, SharedStorageConfig, SharedStorageType } from '../sharedStorage' +import { SharedStorageService, SharedStorageConfig, SharedStorageType, LocalMountedType } from '../sharedStorage' import { MountedStorageService } from '../storages/mountedStorageService'; import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; import { getLogger, Logger } from '../../../common/log'; @@ -19,16 +19,16 @@ class NFSSharedStorageConfig implements SharedStorageConfig { public nfsServer: string; public exportedDirectory: string; - public hasLocalMounted: boolean; + public localMounted: LocalMountedType; constructor(storageType: SharedStorageType, localMountPoint: string, remoteMountPoint: string, - nfsServer: string, exportedDirectory: string, hasLocalMounted: boolean) { + nfsServer: string, exportedDirectory: string, localMounted: LocalMountedType) { this.storageType = storageType; this.localMountPoint = localMountPoint; this.remoteMountPoint = remoteMountPoint; this.nfsServer = nfsServer; this.exportedDirectory = exportedDirectory; - this.hasLocalMounted = hasLocalMounted; + this.localMounted = localMounted; } } @@ -60,12 +60,17 @@ export class NFSSharedStorageService extends SharedStorageService { this.storageType = nfsConfig.storageType; 
this.nfsServer = nfsConfig.nfsServer; this.exportedDirectory = nfsConfig.exportedDirectory; - if ( nfsConfig.hasLocalMounted === false ) { + if (nfsConfig.localMounted === 'nnimount') { await this.helpLocalMount(); + } else if (nfsConfig.localMounted === 'nomount') { + const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount'.`; + this.log.error(errorMessage); + return Promise.reject(errorMessage); } this.internalStorageService.initialize(this.localMountPoint, path.join(this.localMountPoint, 'nni', this.experimentId)); } + return Promise.resolve(); } public get canLocalMounted(): boolean{ @@ -100,7 +105,10 @@ export class NFSSharedStorageService extends SharedStorageService { } try { - await cpp.exec(this.localMountCommand); + const result = await cpp.exec(this.localMountCommand); + if (result.stderr) { + throw new Error(result.stderr); + } } catch (error) { const errorMessage: string = `${this.storageType} Shared Storage: Mount ${this.nfsServer}:${this.exportedDirectory} to ${this.localMountPoint} failed, error is ${error}`; this.log.error(errorMessage); diff --git a/ts/nni_manager/training_service/reusable/trialDispatcher.ts b/ts/nni_manager/training_service/reusable/trialDispatcher.ts index cbf8566686..c00b6610c5 100644 --- a/ts/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/ts/nni_manager/training_service/reusable/trialDispatcher.ts @@ -29,6 +29,7 @@ import { MountedStorageService } from './storages/mountedStorageService'; import { StorageService } from './storageService'; import { SharedStorageService, SharedStorageConfig } from './sharedStorage'; import { NFSSharedStorageService } from './shared_storages/nfsStorageService' +import { AzureBlobSharedStorageService } from './shared_storages/azureblobStorageService' import { TrialDetail } from './trial'; @@ -917,6 +918,11 @@ class TrialDispatcher implements TrainingService { .to(NFSSharedStorageService) .scope(Scope.Singleton); break; + case 'AzureBlob': + 
Container.bind(SharedStorageService) + .to(AzureBlobSharedStorageService) + .scope(Scope.Singleton); + break; default: { const errorMessage = `Shared storage type '${storageType}' not support.`; this.log.error(errorMessage) From 0e4e872d85d340fe98d3f1809a339a9bdebeeaf3 Mon Sep 17 00:00:00 2001 From: J-shang Date: Wed, 3 Feb 2021 01:29:24 +0000 Subject: [PATCH 10/20] fix lint --- .../reusable/shared_storages/azureblobStorageService.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts b/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts index 505866f75a..7915a1c32e 100644 --- a/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts +++ b/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts @@ -157,7 +157,7 @@ export class AzureBlobSharedStorageService extends SharedStorageService { const prepare = `sudo mkdir /mnt/resource/nniblobfusetmp -p && rm -f nni_fuse_connection.cfg && touch nni_fuse_connection.cfg && echo "accountName ${this.storageAccountName}\\naccountKey ${this.storageAccountKey}\\ncontainerName ${this.containerName}" >> nni_fuse_connection.cfg`; const mount = `mkdir -p ${mountPoint} && sudo blobfuse ${mountPoint} --tmp-path=/mnt/resource/nniblobfusetmp --config-file=$(pwd)/nni_fuse_connection.cfg -o attr_timeout=240 -o entry_timeout=240 -o negative_timeout=120 -o allow_other`; const clean = `rm -f nni_install_fuseblob.sh nni_fuse_connection.cfg`; - return `${install} && ${prepare} && ${mount}`; + return `${install} && ${prepare} && ${mount} && ${clean}`; } public get localWorkingRoot(): string { From 033c579dcd0f44292cf5a6279722f1ce831b9e1a Mon Sep 17 00:00:00 2001 From: J-shang Date: Wed, 3 Feb 2021 15:56:11 +0000 Subject: [PATCH 11/20] update nfs install script --- .../azureblobStorageService.ts | 10 +++-- .../shared_storages/nfsStorageService.ts | 39 
++++++++++++++++++- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts b/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts index 7915a1c32e..4554f9ebaf 100644 --- a/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts +++ b/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts @@ -27,11 +27,12 @@ elif [ -n "$(command -v yum)" ] then sudo yum install -y redhat-lsb else -echo "Unknown package management." + echo "Unknown package management." + exit 1 fi -id=$(lsb_release -a | grep "Distributor ID:" | cut -c16- | sed s/[[:space:]]//g) -version=$(lsb_release -a | grep "Release:" | cut -c9- | sed s/[[:space:]]//g) +id=$(lsb_release -i | cut -c16- | sed s/[[:space:]]//g) +version=$(lsb_release -r | cut -c9- | sed s/[[:space:]]//g) if [ $id = "Ubuntu" ] then @@ -39,12 +40,13 @@ then sudo dpkg -i packages-microsoft-prod.deb sudo apt-get update sudo apt-get install -y blobfuse fuse -elif [ $id = "CentOS" ] || [ $id = "RedHat" ] || [ $id = "rhel" ] +elif [ $id = "CentOS" ] || [ $id = "RHEL" ] then sudo rpm -Uvh https://packages.microsoft.com/config/rhel/$(echo $version | cut -c1)/packages-microsoft-prod.rpm sudo yum install -y blobfuse fuse else echo "Not support distributor." 
+ exit 1 fi ` diff --git a/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts index 78dd72e418..609b5a7a23 100644 --- a/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts +++ b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts @@ -12,6 +12,24 @@ import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; import { getLogger, Logger } from '../../../common/log'; import { getExperimentId } from '../../../common/experimentStartupInfo'; +const INSTALL_NFS_CLIENT = ` +#!/bin/bash +if [ -n "$(command -v apt-get)" ] +then + sudo apt-get update + sudo apt-get install -y nfs-common +elif [ -n "$(command -v yum)" ] +then + sudo yum install -y nfs-utils +elif [ -n "$(command -v dnf)" ] +then + sudo dnf install -y nfs-utils +else + echo "Unknown package management." + exit 1 +fi +` + class NFSSharedStorageConfig implements SharedStorageConfig { public storageType: SharedStorageType; public localMountPoint: string; @@ -82,11 +100,28 @@ export class NFSSharedStorageService extends SharedStorageService { } public get localMountCommand(): string { - return `sudo apt-get update && sudo apt-get -y install nfs-common && mkdir -p ${this.localMountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${this.localMountPoint}`; + if (this.localMountPoint) { + return this.getCommand(this.localMountPoint); + } else { + this.log.error(`${this.storageType} Shared Storage: localMountPoint is not initialized.`); + return ''; + } } public get remoteMountCommand(): string { - return `sudo apt-get update && sudo apt-get -y install nfs-common && mkdir -p ${this.remoteMountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${this.remoteMountPoint}`; + if (this.remoteMountPoint) { + return this.getCommand(this.remoteMountPoint); + } else { + this.log.error(`${this.storageType} Shared Storage: 
remoteMountPoint is not initialized.`); + return ''; + } + } + + private getCommand(mountPoint: string): string { + const install = `rm -f nni_install_nfsclient.sh && touch nni_install_nfsclient.sh && echo "${INSTALL_NFS_CLIENT.replace(/\$/g, `\\$`).replace(/\n/g, `\\n`).replace(/"/g, `\\"`)}" >> nni_install_nfsclient.sh && bash nni_install_nfsclient.sh`; + const mount = `mkdir -p ${mountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${mountPoint}`; + const clean = `rm -f nni_install_nfsclient.sh`; + return `${install} && ${mount} && ${clean}`; } public get localWorkingRoot(): string { From 766325039e5aa450e4e385f9d2cc81d35a4350f8 Mon Sep 17 00:00:00 2001 From: J-shang Date: Fri, 5 Feb 2021 08:24:18 +0000 Subject: [PATCH 12/20] fix json can not stringify obj --- ts/nni_manager/core/nnimanager.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ts/nni_manager/core/nnimanager.ts b/ts/nni_manager/core/nnimanager.ts index 9d703a2d5a..e9fe057f0c 100644 --- a/ts/nni_manager/core/nnimanager.ts +++ b/ts/nni_manager/core/nnimanager.ts @@ -631,8 +631,9 @@ class NNIManager implements Manager { this.currSubmittedTrialNum++; this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(form); + const Snapshot: TrialJobDetail = Object.assign({}, trialJobDetail); await this.storeExperimentProfile(); - this.trialJobs.set(trialJobDetail.id, Object.assign({}, trialJobDetail)); + this.trialJobs.set(trialJobDetail.id, Snapshot); const trialJobDetailSnapshot: TrialJobDetail | undefined = this.trialJobs.get(trialJobDetail.id); if (trialJobDetailSnapshot != undefined) { await this.dataStore.storeTrialJobEvent( From 5413d7ebeba5f6baf92c0c4cdcd40ebd868b3062 Mon Sep 17 00:00:00 2001 From: J-shang Date: Thu, 18 Feb 2021 08:35:07 +0000 Subject: [PATCH 13/20] update config schema --- nni/tools/nnictl/config_schema.py | 12 +++++++++++- 
.../shared_storages/azureblobStorageService.ts | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/nni/tools/nnictl/config_schema.py b/nni/tools/nnictl/config_schema.py index 96527a2c3c..5007732d17 100644 --- a/nni/tools/nnictl/config_schema.py +++ b/nni/tools/nnictl/config_schema.py @@ -143,7 +143,17 @@ def validate(self, data): Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), Optional('useActiveGpu'): setType('useActiveGpu', bool) }, - Optional('sharedStorage'): setType('sharedStorage', dict) + Optional('sharedStorage'): { + 'storageType': setChoice('storageType', 'NFS', 'AzureBlob'), + Optional('localMountPoint'): setType('localMountPoint', str), + Optional('remoteMountPoint'): setType('remoteMountPoint', str), + Optional('nfsServer'): setType('nfsServer', str), + Optional('storageAccountName'): setType('storageAccountName', str), + Optional('storageAccountKey'): setType('storageAccountKey', str), + Optional('containerName'): setType('containerName', str), + Optional('resourceGroupName'): setType('resourceGroupName', str), + Optional('localMounted'): setChoice('localMounted', 'usermount', 'nnimount', 'nomount') + } } common_trial_schema = { diff --git a/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts b/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts index 4554f9ebaf..dd16eb12a1 100644 --- a/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts +++ b/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts @@ -206,4 +206,4 @@ export class AzureBlobSharedStorageService extends SharedStorageService { return Promise.reject(errorMessage); } } -} \ No newline at end of file +} From b511c2959f4364399621c193bef41c34cdbd13a6 Mon Sep 17 00:00:00 2001 From: J-shang Date: Fri, 19 Feb 2021 06:51:19 +0000 Subject: [PATCH 14/20] fix local ts --- .../reusable/environments/localEnvironmentService.ts | 9 ++++----- 1 
file changed, 4 insertions(+), 5 deletions(-) diff --git a/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts index b8861b6625..1ddeb5188a 100644 --- a/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts @@ -120,18 +120,17 @@ export class LocalEnvironmentService extends EnvironmentService { throw new Error('Local trial config is not initialized'); } // Need refactor, this temp folder path is not appropriate, there are two expId in this path - let localWorkingRoot: string; const sharedStorageService = component.get(SharedStorageService); if (environment.useSharedStorage && sharedStorageService.canLocalMounted) { - localWorkingRoot = sharedStorageService.localWorkingRoot; + this.experimentRootDir = sharedStorageService.localWorkingRoot; } else { - localWorkingRoot = this.experimentRootDir; + this.experimentRootDir = getExperimentRootDir(); } - const localEnvCodeFolder: string = path.join(localWorkingRoot, "envs"); + const localEnvCodeFolder: string = path.join(this.experimentRootDir, "envs"); if (environment.useSharedStorage && !sharedStorageService.canLocalMounted) { await sharedStorageService.storageService.copyDirectoryBack("envs", localEnvCodeFolder) } else if (!environment.useSharedStorage) { - const localTempFolder: string = path.join(localWorkingRoot, this.experimentId, + const localTempFolder: string = path.join(this.experimentRootDir, this.experimentId, "environment-temp", "envs"); await execCopydir(localTempFolder, localEnvCodeFolder); } From 5b438ff4ef6f82a538bde3c062f00bf17a491022 Mon Sep 17 00:00:00 2001 From: J-shang Date: Tue, 23 Feb 2021 06:57:48 +0000 Subject: [PATCH 15/20] update doc --- docs/en_US/Tutorial/HowToUseSharedStorage.rst | 48 +++++++++++++++++++ .../mnist-sharedstorage/config_azureblob.yml | 4 +- 2 files changed, 
51 insertions(+), 1 deletion(-) create mode 100644 docs/en_US/Tutorial/HowToUseSharedStorage.rst diff --git a/docs/en_US/Tutorial/HowToUseSharedStorage.rst b/docs/en_US/Tutorial/HowToUseSharedStorage.rst new file mode 100644 index 0000000000..d51273bbeb --- /dev/null +++ b/docs/en_US/Tutorial/HowToUseSharedStorage.rst @@ -0,0 +1,48 @@ +**How to Use Shared Storage** +============================= + +If you want to use your own storage during using NNI, shared storage can satisfy you. +Instead of using training service native storage, shared storage can bring you more convenience. +All the information generated by the experiment will be stored under `/nni` folder in your shared storage. +All the output produced by the trial will be located under `/nni/{EXPERIMENT_ID}/trials/{TRIAL_ID}/nnioutput` folder in your shared storage. +This saves you from finding for experiment-related information in various places. +Remember that your trial working directory is `/nni/{EXPERIMENT_ID}/trials/{TRIAL_ID}`, so if you upload your data in this shared storage, you can open it like a local file in your trial code without downloading it. +And we will develop more practical features in the future based on shared storage. + +Use +--- +If you want to use AzureBlob, add below to your config. Full config file see :githublink:`mnist-sharedstorage/config_azureblob.yml `. + +.. code-block:: yaml + sharedStorage: + storageType: AzureBlob + localMountPoint: ${your/local/mount/point} + remoteMountPoint: ${your/remote/mount/point} + storageAccountName: ${replace_to_your_storageAccountName} + storageAccountKey: ${replace_to_your_storageAccountKey} + # If you did not set storageAccountKey, you need use `az login` with Azure CLI at first and set resourceGroupName. 
+ # resourceGroupName: ${replace_to_your_resourceGroupName} + containerName: ${replace_to_your_containerName} + # usermount means you have already mount this storage on localMountPoint + # nnimount means nni will try to mount this storage on localMountPoint + # nomount means storage will not mount in local machine, will support partial storages in the future + localMounted: nnimount + +If you want to use NFS, add below to your config. Full config file see :githublink:`mnist-sharedstorage/config_nfs.yml `. + +.. code-block:: yaml + sharedStorage: + storageType: NFS + localMountPoint: ${your/local/mount/point} + remoteMountPoint: ${your/remote/mount/point} + nfsServer: ${nfs-server-ip} + exportedDirectory: ${nfs/exported/directory} + # usermount means you have already mount this storage on localMountPoint + # nnimount means nni will try to mount this storage on localMountPoint + # nomount means storage will not mount in local machine, will support partial storages in the future + localMounted: nnimount + +Suggestion +---------- +Shared storage is currently in the experimental stage. We suggest use AzureBlob under Ubuntu/CentOS/RHEL, and NFS under Ubuntu/CentOS/RHEL/Fedora/Debian for remote. +And make sure your local machine can mount NFS or fuse AzureBlob and has `sudo` permission on your remote runtime. We only support shared storage under training service with reuse mode for now. 
diff --git a/examples/trials/mnist-sharedstorage/config_azureblob.yml b/examples/trials/mnist-sharedstorage/config_azureblob.yml index 604f33379d..16994b3a0a 100644 --- a/examples/trials/mnist-sharedstorage/config_azureblob.yml +++ b/examples/trials/mnist-sharedstorage/config_azureblob.yml @@ -27,8 +27,10 @@ sharedStorage: storageType: AzureBlob localMountPoint: ${your/local/mount/point} remoteMountPoint: ${your/remote/mount/point} - resourceGroupName: ${replace_to_your_resourceGroupName} storageAccountName: ${replace_to_your_storageAccountName} + storageAccountKey: ${replace_to_your_storageAccountKey} + # If you did not set storageAccountKey, you need use `az login` with Azure CLI at first and set resourceGroupName. + # resourceGroupName: ${replace_to_your_resourceGroupName} containerName: ${replace_to_your_containerName} # usermount means you have already mount this storage on localMountPoint # nnimount means nni will try to mount this storage on localMountPoint From 8aded978587cb8f518f46b4006e1178e390ee06a Mon Sep 17 00:00:00 2001 From: J-shang Date: Tue, 23 Feb 2021 06:59:25 +0000 Subject: [PATCH 16/20] update doc --- docs/en_US/reference.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/en_US/reference.rst b/docs/en_US/reference.rst index 8aa02f96b0..0c07ce8df6 100644 --- a/docs/en_US/reference.rst +++ b/docs/en_US/reference.rst @@ -12,3 +12,4 @@ References SDK API References Supported Framework Library Launch from python + Shared Storage From 8d91d9bb692d0fc5ecf89e8803982f0797cd72d6 Mon Sep 17 00:00:00 2001 From: J-shang Date: Tue, 23 Feb 2021 07:05:08 +0000 Subject: [PATCH 17/20] update doc --- docs/en_US/Tutorial/HowToUseSharedStorage.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/en_US/Tutorial/HowToUseSharedStorage.rst b/docs/en_US/Tutorial/HowToUseSharedStorage.rst index d51273bbeb..3fad85daff 100644 --- a/docs/en_US/Tutorial/HowToUseSharedStorage.rst +++ b/docs/en_US/Tutorial/HowToUseSharedStorage.rst @@ 
-3,10 +3,10 @@ If you want to use your own storage during using NNI, shared storage can satisfy you. Instead of using training service native storage, shared storage can bring you more convenience. -All the information generated by the experiment will be stored under `/nni` folder in your shared storage. -All the output produced by the trial will be located under `/nni/{EXPERIMENT_ID}/trials/{TRIAL_ID}/nnioutput` folder in your shared storage. +All the information generated by the experiment will be stored under ``/nni`` folder in your shared storage. +All the output produced by the trial will be located under ``/nni/{EXPERIMENT_ID}/trials/{TRIAL_ID}/nnioutput`` folder in your shared storage. This saves you from finding for experiment-related information in various places. -Remember that your trial working directory is `/nni/{EXPERIMENT_ID}/trials/{TRIAL_ID}`, so if you upload your data in this shared storage, you can open it like a local file in your trial code without downloading it. +Remember that your trial working directory is ``/nni/{EXPERIMENT_ID}/trials/{TRIAL_ID}``, so if you upload your data in this shared storage, you can open it like a local file in your trial code without downloading it. And we will develop more practical features in the future based on shared storage. Use @@ -45,4 +45,4 @@ If you want to use NFS, add below to your config. Full config file see :githubli Suggestion ---------- Shared storage is currently in the experimental stage. We suggest use AzureBlob under Ubuntu/CentOS/RHEL, and NFS under Ubuntu/CentOS/RHEL/Fedora/Debian for remote. -And make sure your local machine can mount NFS or fuse AzureBlob and has `sudo` permission on your remote runtime. We only support shared storage under training service with reuse mode for now. +And make sure your local machine can mount NFS or fuse AzureBlob and has ``sudo`` permission on your remote runtime. We only support shared storage under training service with reuse mode for now. 
From 6077b6d129d68c998fa5722db0b20fe83072caa5 Mon Sep 17 00:00:00 2001 From: J-shang Date: Tue, 23 Feb 2021 09:15:47 +0000 Subject: [PATCH 18/20] fix lint --- docs/en_US/Tutorial/HowToUseSharedStorage.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/en_US/Tutorial/HowToUseSharedStorage.rst b/docs/en_US/Tutorial/HowToUseSharedStorage.rst index 3fad85daff..c26c354ca3 100644 --- a/docs/en_US/Tutorial/HowToUseSharedStorage.rst +++ b/docs/en_US/Tutorial/HowToUseSharedStorage.rst @@ -14,6 +14,7 @@ Use If you want to use AzureBlob, add below to your config. Full config file see :githublink:`mnist-sharedstorage/config_azureblob.yml `. .. code-block:: yaml + sharedStorage: storageType: AzureBlob localMountPoint: ${your/local/mount/point} @@ -31,6 +32,7 @@ If you want to use AzureBlob, add below to your config. Full config file see :gi If you want to use NFS, add below to your config. Full config file see :githublink:`mnist-sharedstorage/config_nfs.yml `. .. code-block:: yaml + sharedStorage: storageType: NFS localMountPoint: ${your/local/mount/point} From adefe1f27a794f0fb92c086cdfc7f2ae4b0e2e82 Mon Sep 17 00:00:00 2001 From: J-shang Date: Wed, 24 Feb 2021 08:52:34 +0000 Subject: [PATCH 19/20] update doc --- docs/en_US/Tutorial/HowToUseSharedStorage.rst | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/en_US/Tutorial/HowToUseSharedStorage.rst b/docs/en_US/Tutorial/HowToUseSharedStorage.rst index c26c354ca3..18ab42f4f0 100644 --- a/docs/en_US/Tutorial/HowToUseSharedStorage.rst +++ b/docs/en_US/Tutorial/HowToUseSharedStorage.rst @@ -9,8 +9,12 @@ This saves you from finding for experiment-related information in various places Remember that your trial working directory is ``/nni/{EXPERIMENT_ID}/trials/{TRIAL_ID}``, so if you upload your data in this shared storage, you can open it like a local file in your trial code without downloading it. And we will develop more practical features in the future based on shared storage. 
-Use ---- +.. note:: + Shared storage is currently in the experimental stage. We suggest use AzureBlob under Ubuntu/CentOS/RHEL, and NFS under Ubuntu/CentOS/RHEL/Fedora/Debian for remote. + And make sure your local machine can mount NFS or fuse AzureBlob and has ``sudo`` permission on your remote runtime. We only support shared storage under training service with reuse mode for now. + +Example +------- If you want to use AzureBlob, add below to your config. Full config file see :githublink:`mnist-sharedstorage/config_azureblob.yml `. .. code-block:: yaml @@ -43,8 +47,3 @@ If you want to use NFS, add below to your config. Full config file see :githubli # nnimount means nni will try to mount this storage on localMountPoint # nomount means storage will not mount in local machine, will support partial storages in the future localMounted: nnimount - -Suggestion ----------- -Shared storage is currently in the experimental stage. We suggest use AzureBlob under Ubuntu/CentOS/RHEL, and NFS under Ubuntu/CentOS/RHEL/Fedora/Debian for remote. -And make sure your local machine can mount NFS or fuse AzureBlob and has ``sudo`` permission on your remote runtime. We only support shared storage under training service with reuse mode for now. From a6bde142f763e06e6aa4ea14cb9aac5b33f14e12 Mon Sep 17 00:00:00 2001 From: J-shang Date: Thu, 25 Feb 2021 04:54:58 +0000 Subject: [PATCH 20/20] add nfs judgment and update doc --- docs/en_US/Tutorial/ExperimentConfig.rst | 73 +++++++++++++++++++ .../shared_storages/nfsStorageService.ts | 5 ++ 2 files changed, 78 insertions(+) diff --git a/docs/en_US/Tutorial/ExperimentConfig.rst b/docs/en_US/Tutorial/ExperimentConfig.rst index 5da4dfcc4c..1a084c8202 100644 --- a/docs/en_US/Tutorial/ExperimentConfig.rst +++ b/docs/en_US/Tutorial/ExperimentConfig.rst @@ -823,6 +823,79 @@ Optional. Bool. default: ``false``. It's an experimental feature. If it's true, NNI will reuse OpenPAI jobs to run as many as possible trials. 
It can save time of creating new jobs. User needs to make sure each trial can run independent in same job, for example, avoid loading checkpoint from previous trials. +sharedStorage +^^^^^^^^^^^^^ + +storageType +^^^^^^^^^^^ + +Required. String. + +The type of the storage. ``NFS`` and ``AzureBlob`` are supported. + +localMountPoint +^^^^^^^^^^^^^^^ + +Required. String. + +The absolute local path where the storage has been or will be mounted. + +remoteMountPoint +^^^^^^^^^^^^^^^^ + +Required. String. + +The absolute path where the storage will be mounted on the remote machine. + +localMounted +^^^^^^^^^^^^ + +Required. String. + +One of ``usermount``, ``nnimount`` or ``nomount``. ``usermount`` means you have already mounted this storage on localMountPoint. ``nnimount`` means NNI will try to mount this storage on localMountPoint. ``nomount`` means the storage will not be mounted on the local machine; this mode will be supported for some storage types in the future. + +nfsServer +^^^^^^^^^ + +Optional. String. + +Required if using NFS storage. The NFS server host. + +exportedDirectory +^^^^^^^^^^^^^^^^^ + +Optional. String. + +Required if using NFS storage. The exported directory of the NFS server. + +storageAccountName +^^^^^^^^^^^^^^^^^^ + +Optional. String. + +Required if using AzureBlob storage. The Azure storage account name. + +storageAccountKey +^^^^^^^^^^^^^^^^^ + +Optional. String. + +Required if using AzureBlob storage and ``resourceGroupName`` is not set. The Azure storage account key. + +resourceGroupName +^^^^^^^^^^^^^^^^^ + +Optional. String. + +Required if using AzureBlob storage and ``storageAccountKey`` is not set. The resource group that the AzureBlob container belongs to. + +containerName +^^^^^^^^^^^^^ + +Optional. String. + +Required if using AzureBlob storage. The AzureBlob container name.
+ Examples -------- diff --git a/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts index 609b5a7a23..bd981f3441 100644 --- a/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts +++ b/ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts @@ -14,6 +14,11 @@ import { getExperimentId } from '../../../common/experimentStartupInfo'; const INSTALL_NFS_CLIENT = ` #!/bin/bash +if [ -n "$(command -v nfsstat)" ] +then + exit 0 +fi + if [ -n "$(command -v apt-get)" ] then sudo apt-get update