Skip to content

Commit

Permalink
Merge pull request #777 from aws-quickstart/feature/neuron-addon
Browse files Browse the repository at this point in the history
Neuron Device Plugin Addon
  • Loading branch information
elamaran11 authored Feb 14, 2024
2 parents 2cfb718 + e93145a commit 7f6442a
Show file tree
Hide file tree
Showing 10 changed files with 220 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/addons/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ The framework currently supports the following add-ons.
| [`MetricsServerAddOn`](./metrics-server.md) | Adds metrics server (pre-req for HPA and other monitoring tools). |||
| [`NewRelicAddOn`](./newrelic.md) | Adds [New Relic](https://newrelic.com/) and [Pixie](https://pixielabs.ai/) observability for Amazon EKS. ||
| [`NginxAddOn`](./nginx.md) | Adds NGINX ingress controller ||| |
| [`NeuronPluginAddOn`](./neuron-plugin-addon.md) | Adds the AWS Neuron device plugin for Inferentia and Trainium instances || |
| [`OpaGatekeeperAddOn`](./opa-gatekeeper.md) | Adds OPA Gatekeeper |||
| [`ParalusAddOn`](./paralus.md) | Adds [Paralus](https://paralus.io/) |||
| [`PixieAddOn`](./pixie.md) | Adds [Pixie](https://px.dev) to the EKS Cluster. Pixie provides auto-telemetry for requests, metrics, application profiles, and more. ||
Expand Down
52 changes: 52 additions & 0 deletions docs/addons/neuron-plugin-addon.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Neuron Device Plugin Addon

[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the SDK used to run deep learning workloads on AWS Inferentia and AWS Trainium based instances. This addon will install the Neuron Device Plugin necessary to run the instances on Amazon EKS (and Blueprints). Note that you **must** use *inf1, inf2, trn1,* or *trn1n* instances.

## Usage

#### **`index.ts`**
```typescript
import 'source-map-support/register';
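import * as cdk from 'aws-cdk-lib';
import * as ec2 from 'aws-cdk-lib/aws-ec2';
import { KubernetesVersion } from 'aws-cdk-lib/aws-eks';
import * as blueprints from '@aws-quickstart/eks-blueprints';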

const app = new cdk.App();

const addOn = new blueprints.addons.NeuronPluginAddOn();

const clusterProvider = new blueprints.GenericClusterProvider({
version: KubernetesVersion.V1_27,
managedNodeGroups: [
inferentiaNodeGroup()
]
});

function inferentiaNodeGroup(): blueprints.ManagedNodeGroup {
return {
id: "mng1",
instanceTypes: [new ec2.InstanceType('inf1.2xlarge')],
desiredSize: 1,
maxSize: 2,
nodeGroupSubnets: { subnetType: ec2.SubnetType.PRIVATE_WITH_EGRESS },
};
}

const blueprint = blueprints.EksBlueprint.builder()
.clusterProvider(clusterProvider)
.addOns(addOn)
.build(app, 'my-stack-name');
```

Once deployed, you can see the plugin daemonset in the `kube-system` namespace.

```sh
$ kubectl get daemonset neuron-device-plugin-daemonset -n kube-system

NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
neuron-device-plugin-daemonset 1 1 1 1 1 <none> 20m
```

## Functionality

1. Deploys the plugin daemonset in `kube-system` namespace by default.
2. Provides a plugin for the blueprint to leverage the Inferentia or Trainium instances to use the Neuron SDK.
23 changes: 22 additions & 1 deletion examples/blueprint-construct/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ export default class BlueprintConstruct {
}),
new blueprints.ExternalsSecretsAddOn(),
new blueprints.EksPodIdentityAgentAddOn(),
new blueprints.NeuronPluginAddOn(),
];

// Instantiated for helm version check.
Expand All @@ -248,7 +249,8 @@ export default class BlueprintConstruct {
addGenericNodeGroup(),
addCustomNodeGroup(),
addWindowsNodeGroup(), // commented out to check the impact on e2e
addGpuNodeGroup()
addGpuNodeGroup(),
addInferentiaNodeGroup(),
]
});

Expand Down Expand Up @@ -413,4 +415,23 @@ function addGpuNodeGroup(): blueprints.ManagedNodeGroup {
};
}

/**
 * Builds the managed node group definition for the Inferentia (`inf1.2xlarge`)
 * nodes of the example blueprint.
 * @returns the managed node group configuration
 */
function addInferentiaNodeGroup(): blueprints.ManagedNodeGroup {
    // Tags applied to the node group.
    const inferentiaTags = {
        "Name": "Mng4",
        "Type": "Managed-InferentiaNode-Group",
        "LaunchTemplate": "Inferentia",
        "kubernetes.io/cluster/blueprint-construct-dev": "owned"
    };

    const nodeGroup: blueprints.ManagedNodeGroup = {
        id: "mng4-inferentia",
        instanceTypes: [new ec2.InstanceType('inf1.2xlarge')],
        desiredSize: 1,
        minSize: 1,
        // Uses the role registered under the "node-role" named resource.
        nodeRole: blueprints.getNamedResource("node-role") as iam.Role,
        diskSize: 50,
        tags: inferentiaTags
    };

    return nodeGroup;
}



1 change: 1 addition & 0 deletions lib/addons/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ export * from './emr-on-eks';
export * from './aws-batch-on-eks';
export * from './upbound-universal-crossplane';
export * from './apache-airflow';
export * from './neuron';
export * from './eks-pod-identity-agent';

export class Constants {
Expand Down
39 changes: 39 additions & 0 deletions lib/addons/neuron/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import { Construct } from "constructs";

import { ClusterAddOn, ClusterInfo } from "../../spi";
import { KubectlProvider, ManifestDeployment } from "../helm-addon/kubectl-provider";
import { loadExternalYaml } from "../../utils/yaml-utils";

// Upstream manifests for the Neuron device plugin and its RBAC resources.
// Both URLs point at the `master` branch of aws-neuron/aws-neuron-sdk, so the
// content they serve is not pinned to a specific release.
const PLUGIN_URL = "https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/master/src/k8/k8s-neuron-device-plugin.yml";
const RBAC_URL = "https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/master/src/k8/k8s-neuron-device-plugin-rbac.yml";

/**
 * Add-on that applies the Neuron device plugin manifest and its RBAC manifest
 * to the cluster.
 */
export class NeuronPluginAddOn implements ClusterAddOn {
    /**
     * Loads both manifests from their URLs and registers them with the cluster.
     * @param clusterInfo cluster the manifests are applied to
     * @returns a promise resolved with the construct created for the plugin manifest
     */
    deploy(clusterInfo: ClusterInfo): Promise<Construct> {
        const provider = new KubectlProvider(clusterInfo);

        // RBAC resources are loaded first; no namespace is set for them.
        const rbacDeployment: ManifestDeployment = {
            name: "neuron-rbac-manifest",
            namespace: "",
            manifest: loadExternalYaml(RBAC_URL),
            values: {}
        };

        // The plugin manifest targets the kube-system namespace.
        const pluginDeployment: ManifestDeployment = {
            name: "neuron-plugin-manifest",
            namespace: "kube-system",
            manifest: loadExternalYaml(PLUGIN_URL),
            values: {}
        };

        const rbacConstruct = provider.addManifest(rbacDeployment);
        const pluginConstruct = provider.addManifest(pluginDeployment);

        // The plugin must only be applied after the RBAC manifest.
        pluginConstruct.node.addDependency(rbacConstruct);

        return Promise.resolve(pluginConstruct);
    }
}
32 changes: 32 additions & 0 deletions lib/utils/yaml-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ export function applyYamlFromDir(dir: string, cluster: eks.ICluster, namespaceMa
});
}

/**
* Reads the YAML document from a local path.
* @param path YAML document path
* @returns YAML document string
*/
export function readYamlDocument(path: string): string {
try {
const doc = fs.readFileSync(path, 'utf8');
Expand All @@ -35,18 +40,45 @@ export function readYamlDocument(path: string): string {
}
}

/**
 * Loads a local manifest file and parses every `---`-separated section of it
 * as its own YAML document, as expected in a Kubernetes manifest file.
 * @param path YAML document path
 * @returns a list of parsed YAML documents, one entry per section
 */
export function loadMultiResourceYaml(path: string): any {
    const contents = readYamlDocument(path);
    const parsed: any[] = [];
    // Every section produced by the split is parsed, in file order.
    for (const section of contents.split("---")) {
        parsed.push(loadYaml(section));
    }
    return parsed;
}

/**
 * Parses a string as a single YAML document.
 * @param document YAML text to parse
 * @returns the parsed YAML document
 */
export function loadYaml(document: string): any {
    const parsed = yaml.load(document);
    return parsed;
}

/**
 * Reads a YAML document from a URL and parses it as multiple YAML documents
 * separated by `---`, as expected in a Kubernetes manifest file.
 *
 * Note: the content fetched from the URL is not validated, so the caller must
 * ensure the URL serves a valid manifest.
 * @param url YAML document URL
 * @returns a list of parsed YAML documents
 */
export function loadExternalYaml(url: string): any {
    // `sync-request` is loaded with require() rather than a top-level import because the
    // import was causing open handles that prevented jest from completing.
    // The lint suppression is limited to the require line so the rest of the file stays linted.
    // eslint-disable-next-line
    const request = require('sync-request');
    // Synchronous GET; the response body is parsed as-is.
    const response = request('GET', url);
    return yaml.loadAll(response.getBody().toString());
}

/**
 * Converts an object into its YAML text representation.
 * @param document object to serialize
 * @returns the YAML document as a string
 */
export function serializeYaml(document: any): string {
    const serialized = yaml.dump(document);
    return serialized;
}
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ nav:
- Metrics Server: 'addons/metrics-server.md'
- New Relic: 'addons/newrelic.md'
- Nginx: 'addons/nginx.md'
- Neuron: 'addons/neuron-plugin-addon.md'
- OPA Gatekeeper: 'addons/opa-gatekeeper.md'
- Paralus: 'addons/paralus.md'
- Pixie: 'addons/pixie.md'
Expand Down
6 changes: 6 additions & 0 deletions test/utils/multi-yaml-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
kind: ClusterRole
---
kind: Deployment
---
kind: Pod
1 change: 1 addition & 0 deletions test/utils/yaml-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
apiVersion: apps/v1
65 changes: 65 additions & 0 deletions test/utils/yaml-utils.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import * as yaml from "../../lib/utils/yaml-utils";

/**
 * Unit tests for the helpers in lib/utils/yaml-utils.
 */
describe('Unit tests for yaml utils', () => {

    test("The YAML Document file is read correctly", () => {
        const doc = yaml.readYamlDocument(__dirname +'/yaml-test.yaml');

        expect(doc).toBe("apiVersion: apps/v1");
    });

    test("The YAML Document file is serialized correctly", () => {
        const sample = {"apiVersion":"apps/v1","resource":"Deployment"};

        const serialized = yaml.serializeYaml(sample);

        // Compare the full text rather than only its length, so that a wrong
        // output that happens to have the same length is still caught.
        expect(serialized).toBe("apiVersion: apps/v1\nresource: Deployment\n");
    });

    test("The YAML Document with multiple resources is read correctly", () => {
        const doc = yaml.loadMultiResourceYaml(__dirname +'/multi-yaml-test.yaml');

        const firstPart = { "kind": "ClusterRole" };
        const secondPart = { "kind": "Deployment" };
        const lastPart = { "kind": "Pod" };

        // The fixture starts with `---`, so index 0 holds the content before the
        // first separator and the three resources follow at indexes 1 to 3.
        expect(doc.length).toBe(4);
        expect(doc[1]).toStrictEqual(firstPart);
        expect(doc[2]).toStrictEqual(secondPart);
        expect(doc[3]).toStrictEqual(lastPart);
    });

    // Requires network access: the document is fetched from raw.githubusercontent.com.
    test("External YAML Document is read correctly", () => {
        const doc = yaml.loadExternalYaml('https://raw.githubusercontent.com/kubernetes/examples/master/guestbook/legacy/frontend-controller.yaml');
        const part = {
            apiVersion: "v1",
            kind: "ReplicationController",
            metadata: {name: "frontend"},
            spec: {
                replicas: 3,
                template: {
                    metadata: {
                        labels: {app: "guestbook", tier: "frontend"}
                    },
                    spec: {
                        containers: [{
                            name: "php-redis",
                            image: "gcr.io/google_samples/gb-frontend:v4",
                            resources: {
                                requests: {
                                    cpu: "100m",
                                    memory: "100Mi"
                                }
                            },
                            env: [{name: "GET_HOSTS_FROM", value: "dns"}],
                            ports:[{containerPort: 80}]
                        }]
                    }
                }
            }
        };

        expect(doc.length).toBe(1);
        expect(doc[0]).toStrictEqual(part);
    });
});

0 comments on commit 7f6442a

Please sign in to comment.