Create munge key ssm parameter if it doesn't already exist #22

Merged
26 changes: 24 additions & 2 deletions source/cdk/cdk_slurm_stack.py
@@ -56,6 +56,8 @@
from os import path
from os.path import dirname, realpath
from pprint import PrettyPrinter
import subprocess
from subprocess import check_output
import sys
from sys import exit
from tempfile import NamedTemporaryFile
@@ -1608,13 +1610,33 @@ def get_instance_template_vars(self, instance_role):
        return instance_template_vars

    def create_slurmctl(self):
        if self.config['slurm']['MungeKeySsmParameter']:
            ssm_client = boto3.client('ssm', region_name=self.config['Region'])
            response = ssm_client.describe_parameters(
                ParameterFilters = [
                    {
                        'Key': 'Name',
                        'Option': 'Equals',
                        'Values': [self.config['slurm']['MungeKeySsmParameter']]
                    }
                ]
            )['Parameters']
            if response:
                logger.info(f"{self.config['slurm']['MungeKeySsmParameter']} SSM parameter exists and will be used.")
                self.munge_key_ssm_parameter = ssm.StringParameter.from_string_parameter_name(
                    self, "MungeKeySsmParameter",
                    string_parameter_name = self.config['slurm']['MungeKeySsmParameter']
                )
            else:
                self.munge_key_ssm_parameter = None
                logger.info(f"{self.config['slurm']['MungeKeySsmParameter']} SSM parameter doesn't exist. Creating it so that IAM permissions can be granted to it.")
                # Generate a random 1024-byte munge key, base64-encoded without line wraps.
                output = check_output(['dd if=/dev/random bs=1 count=1024 | base64 -w 0'], shell=True, stderr=subprocess.DEVNULL, encoding='utf8', errors='ignore')
                munge_key = output.split('\n')[0]
                # print(f"output\n{output}")
# print(f"output\n{output}")
Contributor Author: Remove commented out code.

# print(f"munge_key:\n{munge_key}")
self.munge_key_ssm_parameter = ssm.StringParameter(
self, f"MungeKeySsmParamter",
parameter_name = f"{self.config['slurm']['MungeKeySsmParameter']}",
string_value = f"{munge_key}"
)

        self.slurmctl_role = iam.Role(self, "SlurmCtlRole",
            assumed_by=iam.CompositePrincipal(
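
A side note on the key generation above: shelling out to dd and base64 works, but the same 1024-byte base64-encoded key can be produced from the Python standard library with no shell dependency. A minimal sketch of that alternative — not what this PR does, just an equivalent:

import base64
import os

# Equivalent of `dd if=/dev/random bs=1 count=1024 | base64 -w 0`:
# 1024 bytes of OS-provided randomness, base64-encoded with no line wraps.
munge_key = base64.b64encode(os.urandom(1024)).decode('ascii')

os.urandom does not block waiting for entropy on modern kernels, which also sidesteps a potential hang when /dev/random is read on a freshly booted instance.
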
2 changes: 1 addition & 1 deletion source/cdk/config_schema.py
@@ -59,7 +59,7 @@
    'slurm': {
        Optional('SlurmVersion', default='21.08.8'): str,
        Optional('ClusterName'): str,
-        Optional('MungeKeySsmParameter', default='/slurm/munge_key'): str,
+        Optional('MungeKeySsmParameter', default='/slurm/munge_key'): str, # Will be created if it doesn't exist.
Contributor Author: Move comments that were removed from the config to this file.

        'SlurmCtl': {
            Optional('NumberOfControllers', default=1): And(Use(int), lambda n: 1 <= n <= 3),
            Optional('BaseHostname', default='slurmctl'): str,
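
The NumberOfControllers entry above also shows the schema library's coerce-then-constrain idiom: Use(int) converts the raw value, then the lambda bounds it. A small illustration of how that validates, assuming the same schema package imported at the top of config_schema.py:

from schema import And, Schema, SchemaError, Use

# Coerce to int, then require 1 <= n <= 3, as in config_schema.py.
controllers_schema = Schema(And(Use(int), lambda n: 1 <= n <= 3))

print(controllers_schema.validate('2'))  # coerced to int -> 2
try:
    controllers_schema.validate(5)       # out of range
except SchemaError as e:
    print(f"rejected: {e}")
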
220 changes: 8 additions & 212 deletions source/resources/config/default_config.yml
@@ -1,173 +1,31 @@
---
#====================================================================
# Sample configuration that creates a minimal Slurm cluster
#
# Shows all available configuration options
# Note that CentOS 8 has been discontinued and support has been removed.
# Uses arm64 architecture for SlurmCtl and SlurmDbd by default.
# No SlurmDbd in this configuration.

termination_protection: True # Enable (recommended) or Disable CloudFormation stack termination protection

#====================================================================
# Parameters that must be in the config file or on the command line.
# Command line values override values in the config file.
#====================================================================
StackName: slurmminimal
#Region: us-east-1
#SshKeyPair: name of your ec2 keypair
#VpcId: vpc-xxxxxxxxxxxxxxxxx

# SubnetId:
# Optional. If not specified then the first private subnet is chosen.
#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet1
#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet2
#SubnetId: subnet-xxxxxxxxxxxxxxxxx # PrivateSubnet3

# This is optional, but highly recommended
#ErrorSnsTopicArn: arn:aws:sns:{{region}}:{AccountId}:{TopicName}

#====================================================================
# Required Parameters
#
# Defaults and valid configuration options are in source/config_schema.py.
#====================================================================

# Domain: Optional
# Domain name for the Route 53 private hosted zone that will be used
# by the slurm cluster for DNS.
# By default will be {StackName}.local
# Alternately, provide HostedZoneId of an existing Route53 hosted zone to use.
# Cannot specify both Domain and HostedZoneId.
# Domain: "{{StackName}}.local"

# HostedZoneId: Optional
# ID of an existing hosted zone that will be used by the slurm cluster for DNS.
# Alternately, provide Domain name to use for a new Route53 hosted zone to use.
# Cannot specify both Domain and HostedZoneId.
# HostedZoneId:

TimeZone: 'US/Central'
StackName: slurmminimal

slurm:
  # High level configuration

  SlurmVersion: "21.08.5"

  # ClusterName:
  #   Optional
  #   Must be unique if multiple clusters deployed in the same VPC.
  #   Default: StackName
  # ClusterName: slurm

  # MungeKeySsmParameter
  #   SSM String Parameter with a base64 encoded munge key to use for the cluster.
  #   Use this if your submitters need to use more than 1 cluster.
  #MungeKeySsmParameter: "/slurm/munge_key"

  SlurmCtl:
    # For high availability configure multiple controllers
    NumberOfControllers: 1
    # The index will be appended to BaseHostname starting with 1.
    BaseHostname: slurmctl

    # architecture: x86_64 or arm64
    #architecture: x86_64
    #instance_type: "c5.large"
    architecture: arm64
    instance_type: "c6g.large"
    volume_size: 200 # Size of the EBS root disk

    # SuspendAction
    #   Set to stop or terminate.
    #   Stopped nodes will restart quicker, but you will continue to be charged for the EBS
    #   volumes attached to the instance.
    SuspendAction: stop
    #
    # MaxStoppedDuration
    #   In ISO 8601 duration format: https://en.wikipedia.org/wiki/ISO_8601#Durations
    #   Default: 1 hour = P0Y0M0DT1H0M0S
    #   Evaluated at least hourly
    MaxStoppedDuration: P0Y0M0DT1H0M0S

    CloudWatchPeriod: 5 # CloudWatch metric collection period in minutes. Default is 5. Set to 1 for finer resolution.
    #   Also used in the dashboard widgets.

  # The accounting database is required to enable fairshare scheduling
  # It is managed by the Slurm Database Daemon (slurmdbd) instance
  # This instance can be created as part of the cluster or can use an existing instance.
  # SlurmDbd:
  #   # It is recommended to get the basic cluster configured and working before enabling the accounting database
  #   UseSlurmDbd: False

  #   # Hostname:
  #   #   Hostname of the slurmdbd instance if CreateSlurmdbd is true.
  #   Hostname: slurmdbd

  #   # architecture: x86_64 or arm64
  #   #architecture: x86_64
  #   #instance_type: "m5.large"
  #   architecture: arm64
  #   instance_type: "m6g.large"
  #   volume_size: 200 # Size of the EBS root disk

  #   database:
  #     port: 3306

  # Federation:
  #   Name: slurmeda
  #   SlurmCtlSecurityGroups:
  #     SecurityGroupName: sg-xxxxxxxxxxxxxxxxx

  SlurmNodeAmis:
    instance_type:
      x86_64: m5.large
      arm64: m6g.large

    # Customized AMIs with file system mounts, packages, etc. configured.
    # If these aren't defined then the generic base AMIs are used.
    # Example in the comment below is the AWS FPGA Developer AMI
    #BaseAmis:
    #  us-east-1:
    #    Amazon: {2: {x86_64: {ImageId: ami-0efdec76678df9a64, RootDeviceSize: '+5'}}}
    #    CentOS: {7: {x86_64: {ImageId: ami-02155c6289e76719a, RootDeviceSize: 90}}}

  # External security groups that should be able to use the cluster
  # SubmitterSecurityGroupIds:
  #   soca-ComputeNodeSG: sg-xxxxxxxxxxxxxxxxx

  # SubmitterInstanceTags:
  #   'soca:ClusterId': ['soca-xyz']
  SlurmCtl: {}

  # InstanceConfig:
  #   Configure the instances used by the cluster
  #   A partition will be created for each combination of Base OS, Architecture, and Spot
  #
  #   UseSpot:
  #     Create both on-demand and spot nodes
  #     Default: true
  #   DefaultPartition:
  #     By default this will be the first OS/Architecture listed in BaseOsArchitecture.
  #     Add '_spot' to the end to make spot the default purchase option.
  #   NodesPerInstanceType:
  #     The number of nodes that will be defined for each instance type.
  #   Include*/Exclude*:
  #     Instance families and types are regular expressions with implicit '^' and '$' at the beginning and end.
  #     Exclude patterns are processed first and take precedence over any includes.
  #     An empty list is the same as '.*'. (A sketch of this filtering logic follows the Exclude patterns below.)
  #   MaxSizeOnly: If MaxSizeOnly is True then only the largest instance type in
  #     a family will be included unless specific instance types are included.
  #     Default: false
  InstanceConfig:
    UseSpot: true
    DefaultPartition: AlmaLinux_8_arm64_spot
    NodesPerInstanceType: 10
    BaseOsArchitecture:
      AlmaLinux: {8: [x86_64, arm64]}
      # Amazon: {2: [x86_64, arm64]}
      CentOS:
        7: [x86_64]
      # Amazon: {2: [x86_64, arm64]}
      # RedHat:
      #   7: [x86_64]
      #   8: [x86_64, arm64]
      # Rocky: {8: [x86_64, arm64]}
    Include:
      MaxSizeOnly: false
      InstanceFamilies:
@@ -180,68 +38,6 @@ slurm:
      - '.+\.(micro|nano)' # Not enough memory
      - '.*\.metal'
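
The Include/Exclude semantics documented above (implicit '^' and '$' anchors, excludes processed first, an empty list meaning '.*') boil down to a few lines of filtering. A hypothetical sketch of that logic — the real implementation lives elsewhere in this repo and may differ:

import re

def filter_instance_types(candidates, includes, excludes):
    # An empty include list is the same as '.*'; excludes take precedence.
    includes = includes or ['.*']
    def matches(patterns, name):
        # re.fullmatch provides the implicit '^' and '$' anchoring.
        return any(re.fullmatch(p, name) for p in patterns)
    return [c for c in candidates if not matches(excludes, c) and matches(includes, c)]

print(filter_instance_types(
    ['m5.large', 't3.micro', 'c5.metal'],
    includes=[],
    excludes=[r'.+\.(micro|nano)', r'.*\.metal']))
# -> ['m5.large']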

  # ElasticSearch:
  #   Configure the ElasticSearch/OpenSearch domain used by the slurm cluster
  #   If not specified then won't be created or used by the cluster.
  #   master_nodes: Defaults to 0
  #   data_nodes: Must be a multiple of number_of_azs
  # ElasticSearch:
  #   ebs_volume_size: 20
Reviewer: Why remove these commented out lines? Are they never going to be used? I think it's ok to keep them as a way to reference what can be added.

Contributor Author: Clean up dead code. They are an artifact from when I didn't have the schema. The options are now documented in the schema file. My intent is to allow people to only specify non-default values to keep the file simple.

Reviewer: Got it, looks good.

  #   ebs_volume_type: GP2
  #   enable_version_upgrade: False
Contributor Author: I'll add the comments into the config schema.

  #   number_of_azs: 2
  #   master_nodes: 3
  #   master_node_instance_type: m5.large.search
  #   data_nodes: 2
  #   data_node_instance_type: m5.large.search
  #   warm_nodes: 0
  #   warm_instance_type: ultrawarm.medium.search

  # JobCompType:
  #   Values:
  #     jobcomp/none
  #     jobcomp/elasticsearch
  #     jobcomp/filetxt
  JobCompType: jobcomp/filetxt
  #
  # JobCompLoc:
  #   Used with jobcomp/elasticsearch
  #   A complete URL endpoint with format <host>:<port>/<target>/_doc
  #JobCompLoc: http://{{EsDomain}}.{{Region}}.es.amazonaws.com/slurm/_doc

  # Configure your Storage options below
  # @todo support fsxn, test if efs will gate scaling of the cluster
  storage:
    # mount_path:
    #   Default is /opt/slurm/{{cluster_name}}
    #mount_path: ""
    provider: "efs" # efs or lustre
    #kms_key_arn:
    removal_policy: "DESTROY" # Choices: DESTROY, RETAIN, SNAPSHOT. RETAIN preserves the EFS file system even if you delete the stack; any other value deletes it when you delete the CFN stack.
    efs:
      use_efs_helper: false
      throughput_mode: "BURSTING" # Choices: BURSTING, PROVISIONED
      # provisioned_throughput_per_second: 1 # In MiB/s. Minimum value of 1
      performance_mode: "GENERAL_PURPOSE" # Choices: GENERAL_PURPOSE, MAX_IO
      encrypted: True # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-efs-filesystem.html#cfn-efs-filesystem-encrypted
      lifecycle_policy: "AFTER_30_DAYS" # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-efs-filesystem-lifecyclepolicy.html
    lustre:
      deployment_type: "SCRATCH_2" # Allowed values: PERSISTENT_1 | SCRATCH_1 | SCRATCH_2. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-deploymenttype
      drive_cache_type: "NONE" # Allowed values: NONE | READ. Required when storage_type is HDD. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-drivecachetype
      per_unit_storage_throughput: 50 # Allowed values: 12, 40 for HDD; 50, 100, 200 for SSD. Required for the PERSISTENT_1 deployment_type. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-fsx-filesystem-lustreconfiguration.html#cfn-fsx-filesystem-lustreconfiguration-perunitstoragethroughput
      storage_capacity: 1200 # For SCRATCH_2 and PERSISTENT_1 types, valid values are 1,200, 2,400, then increments of 2,400 GiB. For SCRATCH_1, valid values are 1,200, 2,400, 3,600, then increments of 3,600 GiB. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-fsx-filesystem.html#cfn-fsx-filesystem-storagecapacity
      storage_type: "SSD" # Allowed values: SSD or HDD. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-fsx-filesystem.html#cfn-fsx-filesystem-storagetype

    # ExtraMounts
    #   Additional mounts for compute nodes
    #   This example shows SOCA EFS file systems.
    #   This is required so the compute nodes have the same file structure as the remote desktops.
    #ExtraMounts:
    #  - dest: /apps
    #    src: fs-xxxxxxxx.efs.us-east-1.amazonaws.com:/
    #    type: nfs4
    #    options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport
    #  - dest: /data
    #    src: fs-xxxxxxxx.efs.us-east-1.amazonaws.com:/
    #    type: nfs4
    #    options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport
    provider: zfs
    zfs: {} # This causes the defaults from the schema to be applied.
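
Taken together, the deletions above implement the author's point from the review thread: because config_schema.py supplies defaults, a pared-down default_config.yml still validates into a fully populated configuration. A rough sketch of the mechanics, assuming the schema and PyYAML packages the project already uses, with a tiny stand-in for the real schema (the actual one in config_schema.py has many more fields):

import yaml
from schema import Optional, Schema

# Tiny stand-in for config_schema.py: omitted options get their defaults.
config_schema = Schema({
    'StackName': str,
    'slurm': {
        Optional('MungeKeySsmParameter', default='/slurm/munge_key'): str,
        Optional('SlurmCtl', default={}): dict,
    },
}, ignore_extra_keys=True)

minimal_yaml = """
StackName: slurmminimal
slurm:
  SlurmCtl: {}
"""

config = config_schema.validate(yaml.safe_load(minimal_yaml))
print(config['slurm']['MungeKeySsmParameter'])  # -> /slurm/munge_key

With the default filled in, create_slurmctl() then guarantees the SSM parameter exists, so a bare-bones config is enough to get a working munge key.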