diff --git a/.github/workflows/build-cml-ami.yml b/.github/workflows/build-cml-ami.yml new file mode 100644 index 00000000..1693d5b1 --- /dev/null +++ b/.github/workflows/build-cml-ami.yml @@ -0,0 +1,25 @@ +name: cml-ami +on: + push: + tags: + - 'ami*' + +jobs: + build-ami: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + + - name: Packer + env: + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_INSTANCE_TYPE: g2.2xlarge + run: | + sudo apt install unzip + + cd packer + packer validate ami.json + packer build ami.json + ./clone-amis.go diff --git a/.gitignore b/.gitignore index 4ddd3daf..11e631f8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ .DS_Store -terraform-provider-hashicups +examples bin # Local .terraform directories diff --git a/Makefile b/Makefile index 619afb60..2f3ca6c9 100644 --- a/Makefile +++ b/Makefile @@ -13,20 +13,6 @@ default: install build: go build -o ${BINARY} -# release: -# GOOS=darwin GOARCH=amd64 go build -o ./bin/${BINARY}_${VERSION}_darwin_amd64 -# GOOS=freebsd GOARCH=386 go build -o ./bin/${BINARY}_${VERSION}_freebsd_386 -# GOOS=freebsd GOARCH=amd64 go build -o ./bin/${BINARY}_${VERSION}_freebsd_amd64 -# GOOS=freebsd GOARCH=arm go build -o ./bin/${BINARY}_${VERSION}_freebsd_arm -# GOOS=linux GOARCH=386 go build -o ./bin/${BINARY}_${VERSION}_linux_386 -# GOOS=linux GOARCH=amd64 go build -o ./bin/${BINARY}_${VERSION}_linux_amd64 -# GOOS=linux GOARCH=arm go build -o ./bin/${BINARY}_${VERSION}_linux_arm -# GOOS=openbsd GOARCH=386 go build -o ./bin/${BINARY}_${VERSION}_openbsd_386 -# GOOS=openbsd GOARCH=amd64 go build -o ./bin/${BINARY}_${VERSION}_openbsd_amd64 -# GOOS=solaris GOARCH=amd64 go build -o ./bin/${BINARY}_${VERSION}_solaris_amd64 -# GOOS=windows GOARCH=386 go build -o ./bin/${BINARY}_${VERSION}_windows_386 -# GOOS=windows GOARCH=amd64 go build -o ./bin/${BINARY}_${VERSION}_windows_amd64 - install: build mkdir -p 
~/.terraform.d/plugins/${HOSTNAME}/${NAMESPACE}/${NAME}/${VERSION}/${OS_ARCH} mv ${BINARY} ~/.terraform.d/plugins/${HOSTNAME}/${NAMESPACE}/${NAME}/${VERSION}/${OS_ARCH} diff --git a/README.md b/README.md index 0c699b30..1258154b 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,67 @@ -# cml-terraform-provider -Terraform provider for CML +![Terraform Provider Iterative](https://user-images.githubusercontent.com/414967/98701372-7f60d700-2379-11eb-90d0-47b5eeb22658.png) -```sh -go mod vendor -make install +# Terraform Provider Iterative + +The Terraform Iterative provider is a plugin for Terraform that allows for the full lifecycle management of GPU or non GPU cloud resources with your favourite [vendor](#supported-vendors). The provider offers a simple and homogeneous way to deploy a GPU or a cluster of them reducing the complexity. + +# Usage + +```tf +terraform { + required_providers { + iterative = { + source = "iterative/iterative" + version = "0.5.0" + } + } +} + +provider "iterative" {} + +resource "iterative_machine" "machine" { + region = "us-west" + instance_name = "machine" + instance_hdd_size = "20" + instance_type = "m" + instance_gpu = "tesla" +} ``` + +## Argument reference + +| Variable | Values | Default | | +| ------- | ------ | -------- | ------------- | +| ```region``` | ```us-west``` ```us-east``` ```eu-west``` ```eu-north``` | ```us-west``` | Sets the collocation region | +| ```instance_name``` | | cml_{UID} | Sets the instance name and related resources like AWS key pair. | +| ```instance_hdd_size``` | | 10 | Sets the instance hard disk size in GB | +| ```instance_type``` | ```m```, ```l```, ```xl``` | ```m``` | Sets the instance computing size. You can also specify vendor specific machines in AWS e.g. ```t2.micro``` | +| ```instance_gpu``` | ``` ```, ```tesla```, ```k80``` | ``` ``` | Sets the desired GPU if the ```instance_type``` is one of our types. | +| ```key_public``` | | | Set up ssh access with your OpenSSH public key. 
If not provided, one will be automatically generated and returned in terraform.tfstate | +| aws_security_group | | ```cml``` | AWS specific variable to set up a specific security group. If specified, the instance will be launched with that sg, within the vpc that the specified sg belongs to. If not, a new sg called ```cml``` will be created under the default vpc | + + +# Supported vendors + + - AWS + +### AWS instance equivalences. +The instance type in AWS is calculated joining the ```instance_type``` and ```instance_gpu``` + +| type | gpu | aws | +| ------- | ------ | -------- | +| m | | m5.2xlarge | +| l | | m5.8xlarge | +| xl | | m5.16xlarge | +| m | k80 | p2.xlarge | +| l | k80 | p2.8xlarge | +| xl | k80 | p2.16xlarge | +| m | tesla | p3.xlarge | +| l | tesla | p3.8xlarge | +| xl | tesla | p3.16xlarge | + +| region | aws | +| ------- | ------ | +| us-west | us-west-1 | +| us-east | us-east-1 | +| eu-north | eu-north-1 | +| eu-west | eu-west-1 | diff --git a/cml/ami-test.json b/cml/ami-test.json new file mode 100644 index 00000000..f914e772 --- /dev/null +++ b/cml/ami-test.json @@ -0,0 +1,44 @@ +{ + "variables" : { + "instance_type" : "{{env `AWS_INSTANCE_TYPE`}}" + }, + "builders" : [ + { + "type" : "amazon-ebs", + "assume_role": { + "role_arn": "arn:aws:iam::260760892802:role/dvc-cml-packer", + "session_name": "cml-packer-session" + }, + "region" : "us-west-1", + "ami_name" : "iterative-cml-test", + "ami_description" : "CML (Continous Machine Learning)", + "ami_groups": ["all"], + "force_deregister": "true", + "force_delete_snapshot": "true", + "ssh_username" : "ubuntu", + "instance_type" : "g2.2xlarge", + "source_ami_filter": { + "filters": { + "virtualization-type": "hvm", + "name": "iterative-cml", + "root-device-type": "ebs" + }, + "owners": ["260760892802"], + "most_recent": true + }, + "run_tags" : { + "Author" : "iterative" + } + } + ], + "provisioners" : [ + { + "type": "shell", + "inline": [ + "nvidia-smi" + ], + "start_retry_timeout": "10m", + 
"expect_disconnect": true + } + ] +} diff --git a/cml/ami.json b/cml/ami.json new file mode 100644 index 00000000..fef1c0f1 --- /dev/null +++ b/cml/ami.json @@ -0,0 +1,50 @@ +{ + "variables" : { + "instance_type" : "{{env `AWS_INSTANCE_TYPE`}}" + }, + "builders" : [ + { + "type" : "amazon-ebs", + "assume_role": { + "role_arn": "arn:aws:iam::260760892802:role/dvc-cml-packer", + "session_name": "cml-packer-session" + }, + "region" : "us-west-1", + "ami_name" : "iterative-cml", + "ami_description" : "CML (Continous Machine Learning)", + "ami_groups": ["all"], + "force_deregister": "true", + "force_delete_snapshot": "true", + "ssh_username" : "ubuntu", + "instance_type" : "g2.2xlarge", + "source_ami_filter": { + "filters": { + "virtualization-type": "hvm", + "name": "ubuntu/images/*ubuntu-*-18.04-amd64-server-*", + "root-device-type": "ebs" + }, + "owners": ["099720109477"], + "most_recent": true + }, + "run_tags" : { + "Author" : "iterative" + } + } + ], + "provisioners" : [ + { + "type" : "shell", + "environment_vars": ["DEBIAN_FRONTEND=noninteractive"], + "script" : "./setup.sh" + }, + { + "type": "shell", + "inline": [ + "sudo shutdown -r now", + "sleep 60" + ], + "start_retry_timeout": "10m", + "expect_disconnect": true + } + ] +} diff --git a/cml/clone-amis.go b/cml/clone-amis.go new file mode 100755 index 00000000..c409c3c5 --- /dev/null +++ b/cml/clone-amis.go @@ -0,0 +1,98 @@ +//usr/bin/env go run $0 "$@"; exit +package main + +import ( + "fmt" + "log" + "os" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/ec2" + "github.com/hashicorp/terraform-plugin-sdk/v2/diag" +) + +func main() { + region := "us-west-1" + amiName := "iterative-cml" + regions := []string{"us-east-1", "us-east-2", "us-west-2", "eu-central-1", "eu-west-1"} + + sess, sessError := session.NewSession(&aws.Config{ + Region: aws.String(region)}, + ) + if sessError != nil { + log.Printf("[ERROR] %s", sessError) + os.Exit(1) + } + + 
svc := ec2.New(sess) + + amiParams := &ec2.DescribeImagesInput{ + Filters: []*ec2.Filter{ + { + Name: aws.String("name"), + Values: []*string{aws.String(amiName)}, + }, + { + Name: aws.String("architecture"), + Values: []*string{aws.String("x86_64")}, + }, + }, + } + imagesRes, imagesErr := svc.DescribeImages(amiParams) + if imagesErr != nil { + diag.FromErr(imagesErr) + } + if len(imagesRes.Images) == 0 { + log.Printf("[ERROR] ami %s not found", amiName) + os.Exit(1) + } + + ami := imagesRes.Images[0] + amiID := *ami.ImageId + amiDesc := *ami.Description + + for _, value := range regions { + fmt.Println("Cloning", value) + + sess, _ := session.NewSession(&aws.Config{ + Region: aws.String(value)}, + ) + + svc := ec2.New(sess) + + copyResult, err := svc.CopyImage(&ec2.CopyImageInput{ + SourceImageId: aws.String(amiID), + SourceRegion: aws.String(region), + Name: aws.String(amiName), + Description: aws.String(amiDesc), + }) + if err != nil { + fmt.Println(err) + } + + svc.WaitUntilImageExists(&ec2.DescribeImagesInput{ + ImageIds: []*string{aws.String(*copyResult.ImageId)}, + Filters: []*ec2.Filter{ + { + Name: aws.String("state"), + Values: []*string{aws.String("available")}, + }, + }, + }) + + _, modifyErr := svc.ModifyImageAttribute(&ec2.ModifyImageAttributeInput{ + ImageId: aws.String(*copyResult.ImageId), + LaunchPermission: &ec2.LaunchPermissionModifications{ + Add: []*ec2.LaunchPermission{ + { + Group: aws.String("all"), + }, + }, + }, + }) + if modifyErr != nil { + fmt.Println(modifyErr) + } + } +} diff --git a/cml/setup.sh b/cml/setup.sh new file mode 100755 index 00000000..aa826b40 --- /dev/null +++ b/cml/setup.sh @@ -0,0 +1,13 @@ +#/bin/sh + +echo "APT::Get::Assume-Yes \"true\";" | sudo tee -a /etc/apt/apt.conf.d/90assumeyes + +curl -fsSL https://get.docker.com -o get-docker.sh && sh get-docker.sh && \ +sudo usermod -aG docker \${USER} +sudo setfacl --modify user:\${USER}:rw /var/run/docker.sock + +curl -s -L https://nvidia.GitHub.io/nvidia-docker/gpgkey | 
sudo apt-key add - && \ +curl -s -L https://nvidia.GitHub.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list && \ +sudo apt update && sudo apt install -y ubuntu-drivers-common && \ +sudo ubuntu-drivers autoinstall && \ +sudo apt install -y nvidia-container-toolkit diff --git a/examples/main.tf b/examples/main.tf index e9976f8c..a407946f 100644 --- a/examples/main.tf +++ b/examples/main.tf @@ -1,9 +1,10 @@ - terraform { required_providers { iterative = { - source = "DavidGOrtega/iterative" - version = "0.4.0" + #source = "DavidGOrtega/iterative" + #version = "0.4.0" + versions = ["0.3"] + source = "github.com/davidgortega/iterative" } } } @@ -11,8 +12,7 @@ terraform { provider "iterative" {} resource "iterative_machine" "machine" { - region = "us-east-1" - - - -} \ No newline at end of file + region = "us-west" + instance_type = "t2.micro" + instance_gpu = "tesla" +} diff --git a/iterative/resource_machine.go b/iterative/resource_machine.go index 2dfd25b1..dc024b3b 100644 --- a/iterative/resource_machine.go +++ b/iterative/resource_machine.go @@ -3,7 +3,6 @@ package iterative import ( "context" "fmt" - "log" "sort" "time" @@ -19,20 +18,26 @@ func resourceMachine() *schema.Resource { return &schema.Resource{ CreateContext: resourceMachineCreate, ReadContext: resourceMachineRead, - //UpdateContext: resourceMachineUpdate,s + //UpdateContext: resourceMachineUpdate, DeleteContext: resourceMachineDelete, Schema: map[string]*schema.Schema{ "region": &schema.Schema{ Type: schema.TypeString, Optional: true, ForceNew: true, - Default: "us-east-1", + Default: "us-west", + }, + "instance_name": &schema.Schema{ + Type: schema.TypeString, + Optional: true, + ForceNew: true, + Default: "", }, "instance_type": &schema.Schema{ Type: schema.TypeString, Optional: true, ForceNew: true, - Default: "t2.micro", + Default: "m", }, "instance_hdd_size": &schema.Schema{ Type: schema.TypeInt, @@ -40,22 +45,23 @@ func resourceMachine() 
*schema.Resource { ForceNew: true, Default: 10, }, - "instance_id": &schema.Schema{ + "instance_gpu": &schema.Schema{ Type: schema.TypeString, Optional: true, - Computed: true, + ForceNew: true, + Default: "", }, - "instance_ip": &schema.Schema{ + "instance_id": &schema.Schema{ Type: schema.TypeString, Optional: true, Computed: true, }, - "instance_launch_time": &schema.Schema{ + "instance_ip": &schema.Schema{ Type: schema.TypeString, Optional: true, Computed: true, }, - "key_name": &schema.Schema{ + "instance_launch_time": &schema.Schema{ Type: schema.TypeString, Optional: true, Computed: true, @@ -71,6 +77,11 @@ func resourceMachine() *schema.Resource { Optional: true, Computed: true, }, + "key_name": &schema.Schema{ + Type: schema.TypeString, + Optional: true, + Computed: true, + }, "aws_security_group": &schema.Schema{ Type: schema.TypeString, Optional: true, @@ -84,15 +95,24 @@ func resourceMachine() *schema.Resource { func resourceMachineCreate(ctx context.Context, d *schema.ResourceData, m interface{}) diag.Diagnostics { var diags diag.Diagnostics - svc, errClient := awsClient(d) + region := getRegion(d) + + sid, _ := shortid.New(1, shortid.DefaultABC, 2342) + id, _ := sid.Generate() + instanceName := d.Get("instance_name").(string) + if len(instanceName) == 0 { + instanceName = "cml_" + id + } + + instanceType := getInstanceType(d) + hddSize := d.Get("instance_hdd_size").(int) + + svc, errClient := awsClient(region) if errClient != nil { return diag.FromErr(errClient) } - sid, err := shortid.New(1, shortid.DefaultABC, 2342) - id, _ := sid.Generate() - - amiParams := &ec2.DescribeImagesInput{ + imagesRes, imagesErr := svc.DescribeImages(&ec2.DescribeImagesInput{ Filters: []*ec2.Filter{ { Name: aws.String("name"), @@ -103,10 +123,17 @@ func resourceMachineCreate(ctx context.Context, d *schema.ResourceData, m interf Values: []*string{aws.String("x86_64")}, }, }, - } - imagesRes, imagesErr := svc.DescribeImages(amiParams) + }) if imagesErr != nil { - 
diag.FromErr(imagesErr) + return diag.FromErr(imagesErr) + } + if len(imagesRes.Images) == 0 { + diags = append(diags, diag.Diagnostic{ + Severity: diag.Error, + Summary: "iterative-cml ami not found in region", + }) + + return diags } sort.Slice(imagesRes.Images, func(i, j int) bool { @@ -115,24 +142,15 @@ func resourceMachineCreate(ctx context.Context, d *schema.ResourceData, m interf return itime.Unix() > jtime.Unix() }) - /* diags = append(diags, diag.Diagnostic{ - Severity: diag.Error, - Summary: "Unable to create HashiCups client", - Detail: fmt.Sprint(len(imagesRes.Images)), - }) - - return diags */ - - instanceAmi := *imagesRes.Images[0].ImageId - instanceType := d.Get("instance_type").(string) keyPublic := d.Get("key_public").(string) - securityGroup := d.Get("aws_security_group").(string) - hddSize := d.Get("instance_hdd_size").(int) - pairName := "cml_" + id + instanceAmi := *imagesRes.Images[0].ImageId + pairName := instanceName + var keyMaterial string var vpcID string + var sgID string // key-pair if len(keyPublic) != 0 { @@ -158,8 +176,7 @@ func resourceMachineCreate(ctx context.Context, d *schema.ResourceData, m interf securityGroup = "cml" vpcsDesc, _ := svc.DescribeVpcs(&ec2.DescribeVpcsInput{}) - vpc := vpcsDesc.Vpcs[0] - vpcID = *vpc.VpcId + vpcID = *vpcsDesc.Vpcs[0].VpcId gpResult, ee := svc.CreateSecurityGroup(&ec2.CreateSecurityGroupInput{ GroupName: aws.String(securityGroup), @@ -204,12 +221,12 @@ func resourceMachineCreate(ctx context.Context, d *schema.ResourceData, m interf }, }, }) - if sgDescErr != nil { return diag.FromErr(sgDescErr) } - sgID := *sgDesc.SecurityGroups[0].GroupId + sgID = *sgDesc.SecurityGroups[0].GroupId + vpcID = *sgDesc.SecurityGroups[0].VpcId subDesc, _ := svc.DescribeSubnetsWithContext(ctx, &ec2.DescribeSubnetsInput{ Filters: []*ec2.Filter{ @@ -220,16 +237,13 @@ func resourceMachineCreate(ctx context.Context, d *schema.ResourceData, m interf }, }) - log.Printf("[ERROR] %s %s %s", instanceAmi, instanceType, sgID) + 
//launch instance runResult, err := svc.RunInstancesWithContext(ctx, &ec2.RunInstancesInput{ - ImageId: aws.String(instanceAmi), - KeyName: aws.String(pairName), - InstanceType: aws.String(instanceType), - MinCount: aws.Int64(1), - MaxCount: aws.Int64(1), - //SecurityGroups: []*string{ - //aws.String(securityGroup), - //}, + ImageId: aws.String(instanceAmi), + KeyName: aws.String(pairName), + InstanceType: aws.String(instanceType), + MinCount: aws.Int64(1), + MaxCount: aws.Int64(1), SubnetId: aws.String(*subDesc.Subnets[0].SubnetId), SecurityGroupIds: []*string{aws.String(sgID)}, BlockDeviceMappings: []*ec2.BlockDeviceMapping{ @@ -249,38 +263,33 @@ func resourceMachineCreate(ctx context.Context, d *schema.ResourceData, m interf if err != nil { diags = append(diags, diag.Diagnostic{ Severity: diag.Error, - Summary: "Unable to create HashiCups client", - Detail: fmt.Sprintf("[ERROR] %s %s %s", instanceAmi, instanceType, sgID), + Summary: "Unable to create instance", + Detail: fmt.Sprintf("[ERROR] Instance %s of type %s at region %s", instanceName, instanceType, region), }) diags = append(diags, diag.FromErr(err)[0]) - //return diag.FromErr(err) - return diags } - // Add tags to the created instance - _, errtag := svc.CreateTags(&ec2.CreateTagsInput{ - Resources: []*string{runResult.Instances[0].InstanceId}, + instanceID := *runResult.Instances[0].InstanceId + + // Add name to the created instance + _, errTag := svc.CreateTags(&ec2.CreateTagsInput{ + Resources: []*string{aws.String(instanceID)}, Tags: []*ec2.Tag{ { Key: aws.String("Name"), - Value: aws.String("cml"), + Value: aws.String(instanceName), }, }, }) - if errtag != nil { - return diag.FromErr(errtag) + if errTag != nil { + return diag.FromErr(errTag) } - instance := *runResult.Instances[0] - instanceID := *instance.InstanceId - - instanceIds := make([]*string, 1) - instanceIds[0] = &instanceID statusInput := ec2.DescribeInstancesInput{ - InstanceIds: instanceIds, + InstanceIds: 
[]*string{aws.String(instanceID)}, Filters: []*ec2.Filter{ { Name: aws.String("instance-state-name"), @@ -288,12 +297,14 @@ func resourceMachineCreate(ctx context.Context, d *schema.ResourceData, m interf }, }, } + svc.WaitUntilInstanceExistsWithContext(ctx, &statusInput) descResult, _ := svc.DescribeInstancesWithContext(ctx, &statusInput) instanceDesc := descResult.Reservations[0].Instances[0] d.SetId(instanceID) + d.Set("instance_name", instanceName) d.Set("instance_id", instanceID) d.Set("instance_ip", instanceDesc.PublicIpAddress) d.Set("instance_launch_time", instanceDesc.LaunchTime.Format(time.RFC3339)) @@ -314,7 +325,7 @@ func resourceMachineUpdate(ctx context.Context, d *schema.ResourceData, m interf func resourceMachineDelete(ctx context.Context, d *schema.ResourceData, m interface{}) diag.Diagnostics { var diags diag.Diagnostics - svc, _ := awsClient(d) + svc, _ := awsClient(getRegion(d)) pairName := d.Get("key_name").(string) instanceID := d.Get("instance_id").(string) @@ -339,12 +350,45 @@ func resourceMachineDelete(ctx context.Context, d *schema.ResourceData, m interf return diags } -func awsClient(d *schema.ResourceData) (*ec2.EC2, error) { - region := d.Get("region").(string) +func awsClient(region string) (*ec2.EC2, error) { sess, err := session.NewSession(&aws.Config{ Region: aws.String(region)}, ) svc := ec2.New(sess) - return svc, err } + +func getRegion(d *schema.ResourceData) string { + instanceRegions := make(map[string]string) + instanceRegions["us-east"] = "us-east-1" + instanceRegions["us-west"] = "us-west-1" + instanceRegions["eu-north"] = "eu-north-1" + instanceRegions["eu-west"] = "eu-west-1" + + region := d.Get("region").(string) + if val, ok := instanceRegions[region]; ok { + region = val + } + + return region +} + +func getInstanceType(d *schema.ResourceData) string { + instanceTypes := make(map[string]string) + instanceTypes["m"] = "m5.2xlarge" + instanceTypes["l"] = "m5.8xlarge" + instanceTypes["xl"] = "m5.16xlarge" + 
instanceTypes["mk80"] = "p2.xlarge" + instanceTypes["lk80"] = "p2.8xlarge" + instanceTypes["xlk80"] = "p2.16xlarge" + instanceTypes["mtesla"] = "p3.xlarge" + instanceTypes["ltesla"] = "p3.8xlarge" + instanceTypes["xltesla"] = "p3.16xlarge" + + instanceType := d.Get("instance_type").(string) + if val, ok := instanceTypes[instanceType+d.Get("instance_gpu").(string)]; ok { + instanceType = val + } + + return instanceType +}