diff --git a/buildstockbatch/hpc.py b/buildstockbatch/hpc.py
index 56876dcd..0932461c 100644
--- a/buildstockbatch/hpc.py
+++ b/buildstockbatch/hpc.py
@@ -489,6 +489,7 @@ def queue_sampling(
             "--time={}".format(cfg[cls.HPC_NAME].get("sampling", {}).get("time", 60)),
             "--account={}".format(cfg[cls.HPC_NAME]["account"]),
             "--nodes=1",
+            "--mem={}".format(cls.DEFAULT_NODE_MEMORY_MB),
             "--export={}".format(",".join(env.keys())),
             "--output=sampling.out",
             hpc_sh,
@@ -539,6 +540,7 @@ def queue_jobs(self, array_ids=None, hipri=False):
             "sbatch",
             "--account={}".format(account),
             "--time={}".format(walltime),
+            "--mem={}".format(self.DEFAULT_NODE_MEMORY_MB),
             "--export={}".format(",".join(export_vars)),
             "--array={}".format(array_spec),
             "--output=job.out-%a",
@@ -620,6 +622,7 @@ def queue_post_processing(self, after_jobids=[], upload_only=False, hipri=False)
 
         args = [
             "sbatch",
+            "--tmp=1000000",
             "--account={}".format(account),
             "--time={}".format(walltime),
             "--export={}".format(",".join(env_export.keys())),
@@ -627,6 +630,7 @@ def queue_post_processing(self, after_jobids=[], upload_only=False, hipri=False)
             "--output=postprocessing.out",
             "--nodes=1",
             ":",
+            "--tmp=1000000",
             "--mem={}".format(memory),
             "--output=dask_workers.out",
             "--nodes={}".format(n_workers),
@@ -746,6 +750,7 @@ class EagleBatch(SlurmBatch):
     CORES_PER_NODE = 36
     MIN_SIMS_PER_JOB = 36 * 2
     DEFAULT_POSTPROCESSING_NODE_MEMORY_MB = 85248
+    DEFAULT_NODE_MEMORY_MB = 85248  # standard node on Eagle
     DEFAULT_POSTPROCESSING_N_PROCS = 18
     DEFAULT_POSTPROCESSING_N_WORKERS = 2
 
@@ -776,7 +781,8 @@ class KestrelBatch(SlurmBatch):
     HPC_NAME = "kestrel"
     CORES_PER_NODE = 104
     MIN_SIMS_PER_JOB = 104 * 2
-    DEFAULT_POSTPROCESSING_NODE_MEMORY_MB = 250000  # Standard node
+    DEFAULT_POSTPROCESSING_NODE_MEMORY_MB = 247000  # Standard node
+    DEFAULT_NODE_MEMORY_MB = 247000  # standard node on Kestrel
     DEFAULT_POSTPROCESSING_N_PROCS = 52
     DEFAULT_POSTPROCESSING_N_WORKERS = 2
 
diff --git a/buildstockbatch/kestrel.sh b/buildstockbatch/kestrel.sh
index 5d4b9422..f2d2a074 100644
--- a/buildstockbatch/kestrel.sh
+++ b/buildstockbatch/kestrel.sh
@@ -12,6 +12,10 @@ df -h
 
 module load python apptainer
 source "$MY_PYTHON_ENV/bin/activate"
+# Default LOCAL_SCRATCH = /tmp/scratch
+# Setting to user-specific dir to avoid
+# issues with deleting previous buildstock run debris
+export LOCAL_SCRATCH=/tmp/scratch/$USER
 source /kfs2/shared-projects/buildstock/aws_credentials.sh
 
 time python -u -m buildstockbatch.hpc kestrel "$PROJECTFILE"
diff --git a/buildstockbatch/kestrel_postprocessing.sh b/buildstockbatch/kestrel_postprocessing.sh
index 6c86f5ef..c96703b3 100644
--- a/buildstockbatch/kestrel_postprocessing.sh
+++ b/buildstockbatch/kestrel_postprocessing.sh
@@ -11,6 +11,10 @@ df -h
 
 module load python apptainer
 source "$MY_PYTHON_ENV/bin/activate"
+# Default LOCAL_SCRATCH = /tmp/scratch
+# Setting to user-specific dir to avoid
+# issues with deleting previous buildstock run debris
+export LOCAL_SCRATCH=/tmp/scratch/$USER
 source /kfs2/shared-projects/buildstock/aws_credentials.sh
 
 export POSTPROCESS=1
@@ -30,6 +34,6 @@ pdsh -w $SLURM_JOB_NODELIST_PACK_GROUP_1 "free -h"
 pdsh -w $SLURM_JOB_NODELIST_PACK_GROUP_1 "df -i; df -h"
 
 $MY_PYTHON_ENV/bin/dask scheduler --scheduler-file $SCHEDULER_FILE &> $OUT_DIR/dask_scheduler.out &
-pdsh -w $SLURM_JOB_NODELIST_PACK_GROUP_1 "source /kfs2/shared-projects/buildstock/aws_credentials.sh; $MY_PYTHON_ENV/bin/dask worker --scheduler-file $SCHEDULER_FILE --local-directory /tmp/scratch/dask --nworkers ${NPROCS} --nthreads 1 --memory-limit ${MEMORY}MB" &> $OUT_DIR/dask_workers.out &
+pdsh -w $SLURM_JOB_NODELIST_PACK_GROUP_1 "source /kfs2/shared-projects/buildstock/aws_credentials.sh; $MY_PYTHON_ENV/bin/dask worker --scheduler-file $SCHEDULER_FILE --local-directory $LOCAL_SCRATCH/dask --nworkers ${NPROCS} --nthreads 1 --memory-limit ${MEMORY}MB" &> $OUT_DIR/dask_workers.out &
 
 time python -u -m buildstockbatch.hpc kestrel "$PROJECTFILE"