Merge pull request #4356 from aannabe/nxs_perlmutter
Nexus: Add Perlmutter to Nexus machines
ye-luo authored Dec 12, 2022
2 parents c607ee5 + 7241e2f commit 800fec3
Showing 2 changed files with 156 additions and 29 deletions.
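As context for the diff below, here is a minimal sketch of how a user script might target the new machine through the usual Nexus settings/job interface. The project name is a placeholder, and the exact keywords are assumptions inferred from the attributes the new class reads (queue, constraint, account, processes_per_node), not documented usage from this commit.

```python
from nexus import settings, job

# Select the new machine; the NERSC project name 'm1234' is a placeholder
settings(
    machine = 'perlmutter',
    account = 'm1234',
    )

# CPU side: 2 nodes, 128 MPI tasks per node, 6.5 h on the 'regular' queue
cpu_job = job(nodes=2, processes_per_node=128, hours=6, minutes=30,
              queue='regular', constraint='cpu')

# GPU side: 1 node, one task per GPU; the machine class appends '_g' to the account
gpu_job = job(nodes=1, processes_per_node=4, hours=2,
              queue='regular', constraint='gpu')
```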
138 changes: 132 additions & 6 deletions nexus/lib/machines.py
@@ -2121,11 +2121,6 @@ def write_job_header(self,job):
#end class NerscMachine


class Edison(NerscMachine):
name = 'edison'
#end class Edison


class Cori(NerscMachine):
name = 'cori'

@@ -2206,6 +2201,137 @@ def write_job_header(self,job):




class Perlmutter(NerscMachine):
name = 'perlmutter'

def pre_process_job(self,job):
# Set default queue and node type
if job.queue is None:
job.queue = 'regular'
#end if
if job.constraint is None:
job.constraint = 'cpu'
#end if
        # Account for the dual CPU/GPU nature of Perlmutter
if 'cpu' in job.constraint:
self.nodes = 3072
self.procs_per_node = 2
self.cores_per_node = 128
self.ram_per_node = 512
elif 'gpu' in job.constraint:
self.nodes = 1536
self.procs_per_node = 1
self.cores_per_node = 64
self.ram_per_node = 256
self.gpus_per_node = 4
else:
self.error('SLURM input "constraint" must contain either "cpu" or "gpu" on Perlmutter\nyou provided: {0}'.format(job.constraint))
#end if
#end def pre_process_job

def write_job_header(self,job):
self.pre_process_job(job) # sync machine view with job

# Check if the user gave reasonable processes_per_node
if 'cpu' in job.constraint:
            if job.processes_per_node > self.cores_per_node:
                self.error('processes_per_node cannot be greater than the number of physical cores per node (128)\nyou provided: {0}'.format(job.processes_per_node))
#end if
elif 'gpu' in job.constraint:
if job.processes_per_node > self.gpus_per_node:
                self.error('processes_per_node cannot be greater than the number of GPUs per node (4)\nyou provided: {0}'.format(job.processes_per_node))
#end if
# Also check if the user forgot to include '_g' in the account name for GPU jobs
            if job.account is not None and '_g' not in job.account:
                job.account += '_g'
#end if
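            # Hypothetical example: an account of 'm1234' becomes 'm1234_g' here,
            # the form NERSC expects for allocations charged to the GPU nodes.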
#end if

# Check if the user gave reasonable queue inputs
if job.queue == 'debug':
base_partition = 1
max_partition = 8
max_time = 0.5
elif job.queue == 'regular':
base_partition = 1
max_partition = self.nodes
max_time = 12
elif job.queue == 'preempt':
base_partition = 1
max_partition = 128
max_time = 24
elif job.queue == 'overrun':
base_partition = 1
max_partition = self.nodes
max_time = 12
else:
            self.error('The requested queue "{0}" is not implemented\nvalid options are: debug, regular, preempt, overrun'.format(job.queue))
#end if
job.total_hours = job.days*24 + job.hours + job.minutes/60.0 + job.seconds/3600.0
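        # For instance, the 06:30:00 walltime used in the unit test below gives
        # total_hours = 6 + 30/60.0 = 6.5, well within the 12 hour limit of the 'regular' queue.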
if job.total_hours > max_time:
            self.error('The maximum runtime on the {0} queue must not exceed {1} hours\nyou requested: {2} hours'.format(job.queue,max_time,job.total_hours))
        #end if
        if job.nodes<base_partition:
            self.error('The number of nodes on the {0} queue must not be less than {1}\nyou requested: {2}'.format(job.queue,base_partition,job.nodes))
        elif job.nodes>max_partition:
            self.error('The number of nodes on the {0} queue must not be more than {1}\nyou requested: {2}'.format(job.queue,max_partition,job.nodes))
#end if

        # Use the user-specified cpus_per_task if given; otherwise spread the available logical CPUs evenly over the processes on each node
if job.cpus_per_task is not None:
cpus_per_task = job.cpus_per_task
else:
hyperthreads = 2 # Both CPU and GPU nodes use the same AMD EPYC 7763 (Milan) CPUs
cpus_per_task = int(floor(float(self.cores_per_node)/job.processes_per_node))*hyperthreads
#end if
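        # For instance, the default 'cpu' constraint (128 physical cores, 2 hyperthreads each)
        # with processes_per_node=128 yields cpus_per_task = floor(128/128)*2 = 2, matching the
        # '#SBATCH -c 2' line in the expected header of the unit test below.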

c='#!/bin/bash\n'
if job.account is not None:
c+= '#SBATCH -A '+job.account+'\n'
#end if
c+='#SBATCH -C '+str(job.constraint)+'\n'
c+='#SBATCH -q '+job.queue+'\n'
c+='#SBATCH -t '+job.sbatch_walltime()+'\n'
c+='#SBATCH -N '+str(job.nodes)+'\n'
c+='#SBATCH --ntasks-per-node={0}\n'.format(job.processes_per_node)
c+='#SBATCH -c '+str(cpus_per_task)+'\n'
c+='#SBATCH -J '+str(job.name)+'\n'
c+='#SBATCH -o '+job.outfile+'\n'
c+='#SBATCH -e '+job.errfile+'\n'

if 'gpu' in job.constraint:
gpus_per_task = int(floor(float(self.gpus_per_node)/job.processes_per_node))
c+='#SBATCH --gpus-per-task={0}\n'.format(gpus_per_task)
#end if
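        # For instance, on the GPU nodes (4 GPUs each) a job with processes_per_node=4 gets
        # gpus_per_task = floor(4/4) = 1, binding one GPU to each MPI task.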

if job.user_env:
c+='#SBATCH --export=ALL\n' # equiv to PBS -V
else:
c+='#SBATCH --export=NONE\n'
#end if
c+='''
echo $SLURM_SUBMIT_DIR
cd $SLURM_SUBMIT_DIR
'''
if job.threads>1:
c+='''
export OMP_PROC_BIND=true
export OMP_PLACES=threads
'''
#end if
if 'gpu' in job.constraint:
c+='''
export SLURM_CPU_BIND="cores"
'''
#end if
return c
#end def write_job_header
#end class Perlmutter




class BlueWatersXK(Supercomputer):

name = 'bluewaters_xk'
@@ -3454,7 +3580,6 @@ def specialized_bundle_commands(self,job,launcher,serial):
Kraken( 9408, 2, 6, 16, 100, 'aprun', 'qsub', 'qstat', 'qdel')
Golub( 512, 2, 6, 32, 1000, 'mpirun', 'qsub', 'qstat', 'qdel')
OIC5( 28, 2, 16, 128, 1000, 'mpirun', 'qsub', 'qstat', 'qdel')
Edison( 664, 2, 12, 64, 100, 'srun', 'sbatch', 'squeue', 'scancel')
Cori( 9688, 1, 68, 96, 100, 'srun', 'sbatch', 'squeue', 'scancel')
BlueWatersXK( 3072, 1, 16, 32, 100, 'aprun', 'qsub', 'qstat', 'qdel')
BlueWatersXE(22640, 2, 16, 64, 100, 'aprun', 'qsub', 'qstat', 'qdel')
@@ -3486,6 +3611,7 @@ def specialized_bundle_commands(self,job,launcher,serial):
SuperMUC_NG( 6336, 1, 48, 96, 1000,'mpiexec', 'sbatch', 'sacct', 'scancel')
Archer2( 5860, 2, 64, 512, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Polaris( 560, 1, 32, 512, 8,'mpiexec', 'qsub', 'qstat', 'qdel')
Perlmutter( 3072, 2, 128, 512, 5000, 'srun', 'sbatch', 'squeue', 'scancel')
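# Reading the positional arguments against the Supercomputer constructor (an interpretation,
# not part of this diff): 3072 nodes, 2 sockets and 128 cores per node, 512 GB of RAM per node,
# a Nexus-side limit of 5000 simultaneously queued jobs, then the run, submit, status, and
# cancel commands (srun/sbatch/squeue/scancel).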


#machine accessor functions
47 changes: 24 additions & 23 deletions nexus/tests/unit/test_machines.py
@@ -1099,12 +1099,6 @@ def job_commands_equal(c1,c2):
('eclipse' , 'n2_t2' ) : 'srun test.x',
('eclipse' , 'n2_t2_e' ) : 'srun test.x',
('eclipse' , 'n2_t2_p2' ) : 'srun test.x',
('edison' , 'n1' ) : 'srun test.x',
('edison' , 'n1_p1' ) : 'srun test.x',
('edison' , 'n2' ) : 'srun test.x',
('edison' , 'n2_t2' ) : 'srun test.x',
('edison' , 'n2_t2_e' ) : 'srun test.x',
('edison' , 'n2_t2_p2' ) : 'srun test.x',
('eos' , 'n1' ) : 'aprun -n 16 test.x',
('eos' , 'n1_p1' ) : 'aprun -n 1 test.x',
('eos' , 'n2' ) : 'aprun -n 32 test.x',
@@ -1153,6 +1147,12 @@ def job_commands_equal(c1,c2):
('oic5' , 'n2_t2' ) : 'mpirun -np 32 test.x',
('oic5' , 'n2_t2_e' ) : 'mpirun -np 32 test.x',
('oic5' , 'n2_t2_p2' ) : 'mpirun -np 4 test.x',
('perlmutter' , 'n1' ) : 'srun test.x',
('perlmutter' , 'n1_p1' ) : 'srun test.x',
('perlmutter' , 'n2' ) : 'srun test.x',
('perlmutter' , 'n2_t2' ) : 'srun test.x',
('perlmutter' , 'n2_t2_e' ) : 'srun test.x',
('perlmutter' , 'n2_t2_p2' ) : 'srun test.x',
('polaris' , 'n1' ) : 'mpiexec --cpu-bind depth --depth=1 -n 32 --ppn 32 --env OMP_NUM_THREADS=1 test.x',
('polaris' , 'n1_p1' ) : 'mpiexec --cpu-bind depth --depth=1 -n 1 --ppn 1 --env OMP_NUM_THREADS=1 test.x',
('polaris' , 'n2' ) : 'mpiexec --cpu-bind depth --depth=1 -n 64 --ppn 32 --env OMP_NUM_THREADS=1 test.x',
@@ -1555,23 +1555,6 @@ def test_write_job():
#SBATCH -o test.out
#SBATCH -e test.err
export OMP_NUM_THREADS=1
export ENV_VAR=1
srun test.x''',
edison = '''#!/bin/bash
#SBATCH -p regular
#SBATCH -J jobname
#SBATCH -t 06:30:00
#SBATCH -N 2
#SBATCH --ntasks-per-node=24
#SBATCH --cpus-per-task=1
#SBATCH -o test.out
#SBATCH -e test.err
#SBATCH --export=ALL
echo $SLURM_SUBMIT_DIR
cd $SLURM_SUBMIT_DIR
export OMP_NUM_THREADS=1
export ENV_VAR=1
srun test.x''',
@@ -1702,6 +1685,24 @@ def test_write_job():
export OMP_NUM_THREADS=1
export ENV_VAR=1
mpirun -np 64 test.x''',
perlmutter = '''#!/bin/bash
#SBATCH -C cpu
#SBATCH -q regular
#SBATCH -t 06:30:00
#SBATCH -N 2
#SBATCH --ntasks-per-node=128
#SBATCH -c 2
#SBATCH -J jobname
#SBATCH -o test.out
#SBATCH -e test.err
#SBATCH --export=ALL
echo $SLURM_SUBMIT_DIR
cd $SLURM_SUBMIT_DIR
export OMP_NUM_THREADS=1
export ENV_VAR=1
srun test.x''',
polaris = '''#!/bin/sh
#PBS -l select=2:system=polaris
#PBS -l place=scatter
