From c26313e937684643de827a59432e930fda7eacf5 Mon Sep 17 00:00:00 2001 From: yiyione Date: Mon, 1 Feb 2021 14:37:54 +0800 Subject: [PATCH 1/3] [CI] Remove travis and update GitHub Actions to replace it (#5276) * Remove .travis file * Add frameworklauncher test to git action * Add run coveralls in GitHub action * Fix coverall in GitHub Action * Fix coverall in GitHub Action * Move coverall outside rest-server test to avoid post coverage twice * Fix coverall * change build-status badge from travis to GitHub Action * update name of swagger-validate --- .github/workflows/continuous-integration.yml | 51 ++++++++ .github/workflows/lint.yml | 22 ++++ .travis.yml | 118 ------------------- README.md | 2 +- docs/index.md | 4 +- docs_zh_CN/index.md | 2 +- src/rest-server/package.json | 2 +- 7 files changed, 78 insertions(+), 123 deletions(-) delete mode 100644 .travis.yml diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml index ae276f2789..697f3941c3 100644 --- a/.github/workflows/continuous-integration.yml +++ b/.github/workflows/continuous-integration.yml @@ -112,6 +112,36 @@ jobs: yarn install --frozen-lockfiles yarn test + code-coverage: + name: Run code coverage of rest-server on node-${{ matrix.node }}-${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + node: [10] + os: [ubuntu-latest] + + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Use Node ${{ matrix.node }} + uses: actions/setup-node@v1 + with: + node-version: ${{ matrix.node }} + - name: yarn install and test + run: | + cd src/rest-server + yarn install --frozen-lockfiles + yarn test + mkdir ./coverage + yarn run coveralls + + - name: Coveralls GitHub Action + uses: coverallsapp/github-action@v1.1.1 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + # Path to lcov file + path-to-lcov: ./src/rest-server/coverage/lcov.info + webportal: name: Test webportal on node-${{ matrix.node }}-${{ matrix.os }} runs-on: ${{ matrix.os }} 
@@ -160,3 +190,24 @@ jobs: cd contrib/submit-job-v2 yarn install --frozen-lockfiles yarn build + + frameworklauncher: + name: Test frameworklauncher on java-${{ matrix.java }}-${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + java: [8.0] + os: [ubuntu-16.04, ubuntu-latest] + + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Setup java + uses: actions/setup-java@v1 + with: + java-version: ${{ matrix.java }} + - name: Test frameworklauncher + run: | + cd subprojects/frameworklauncher/yarn + mvn install -DskipTests=true -Dmaven.javadoc.skip=true -B -V + mvn clean test jacoco:report diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index cfa51fd105..b561c5a4f9 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -64,3 +64,25 @@ jobs: - name: Lint run: | pylint contrib/kubespray/script --rcfile=contrib/kubespray/script/pylintrc + + swagger-validate: + name: Validate swagger + runs-on: ${{ matrix.os }} + strategy: + matrix: + node: [10] + os: [ubuntu-16.04] + + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Use Node ${{ matrix.node }} + uses: actions/setup-node@v1 + with: + node-version: ${{ matrix.node }} + - name: Install swagger-cli + run: | + npm install -g @apidevtools/swagger-cli + - name: validate + run: | + swagger-cli validate src/rest-server/docs/swagger.yaml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 5b04aae912..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,118 +0,0 @@ -sudo: required -dict: xenial -matrix: - include: - - language: go - go: 1.13.8 - before_install: - - cd src/watchdog/src - install: - - echo "Skipping default install phase" - script: - - go build ./cmd/watchdog/main.go - - go test ./pkg/watchdog - - - language: python - python: 2.7 - install: - - pip install dnspython==1.13.0 paramiko pyyaml jinja2 python-etcd kubernetes GitPython - script: - - python -m unittest 
deployment.clusterObjectModel.test.test_cluster_object_model - - python -m unittest deployment.clusterObjectModel.test.test_template_generate - - python -m unittest deployment.clusterObjectModel.test.test_forward_compatibility - - cd deployment - - python -m unittest discover test/ - - - language: python - python: 3.7 - before_install: - - cd src/job-exporter/test - install: - - pip install prometheus_client - script: - - python3 -m unittest discover . - - - language: python - python: 3.6 - before_install: - - cd src/yarn-exporter/test - install: - - pip install prometheus_client twisted requests - script: - - python3 -m unittest discover . - - - language: python - python: 3.8 - install: - - pip install markdown==2.6.11 - script: - - python src/utilities/doc_checker.py . - - - language: java - dist: trusty - jdk: oraclejdk8 - before_install: - - cd subprojects/frameworklauncher/yarn - install: - - mvn install -DskipTests=true -Dmaven.javadoc.skip=true -B -V - script: - - mvn clean test jacoco:report - - - language: java - jdk: openjdk8 - before_install: - - cd subprojects/frameworklauncher/yarn - install: - - mvn install -DskipTests=true -Dmaven.javadoc.skip=true -B -V - script: - - mvn clean test jacoco:report - - - language: node_js - node_js: lts/dubnium - env: NODE_ENV=test - before_install: - - cd src/rest-server - install: - - yarn install - script: - - npm test - - npm run coveralls - - - language: node_js - node_js: lts/carbon - before_install: - - cd src/webportal - install: - - yarn install - - npm run build - script: - - npm test - - - language: node_js - node_js: lts/dubnium - before_install: - - cd src/webportal - install: - - yarn install --ignore-engines - - npm run build - script: - - npm test - - - language: node_js - node_js: node - before_install: cd contrib/submit-simple-job - install: npm install - script: npm test - - - language: node_js - node_js: lts/dubnium - before_install: cd contrib/submit-job-v2 - install: yarn --frozen-lockfiles - 
script: yarn build - - - language: node_js - node_js: node - install: - - npm install -g @apidevtools/swagger-cli - script: - - swagger-cli validate src/rest-server/docs/swagger.yaml diff --git a/README.md b/README.md index 815719c4b9..b408d117af 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [logo]: ./pailogo.jpg "OpenPAI" -[![Build Status](https://travis-ci.org/microsoft/pai.svg?branch=master)](https://travis-ci.org/microsoft/pai) +[![Build Status](https://github.com/microsoft/pai/workflows/CI/badge.svg)](https://github.com/microsoft/pai/actions) [![Join the chat at https://gitter.im/Microsoft/pai](https://badges.gitter.im/Microsoft/pai.svg)](https://gitter.im/Microsoft/pai?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![Version](https://img.shields.io/github/release/Microsoft/pai.svg)](https://github.com/Microsoft/pai/releases/latest) diff --git a/docs/index.md b/docs/index.md index 97d699c09d..8b5e9d8c28 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,6 +1,6 @@ # OpenPAI Handbook -[![Build Status](https://travis-ci.org/microsoft/pai.svg?branch=master)](https://travis-ci.org/microsoft/pai) +[![Build Status](https://github.com/microsoft/pai/workflows/CI/badge.svg)](https://github.com/microsoft/pai/actions) [![Join the chat at https://gitter.im/Microsoft/pai](https://badges.gitter.im/Microsoft/pai.svg)](https://gitter.im/Microsoft/pai?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![Version](https://img.shields.io/github/release/Microsoft/pai.svg)](https://github.com/Microsoft/pai/releases/latest) @@ -16,4 +16,4 @@ To set up a new cluster, learn how to manage cluster on OpenPAI, please follow [ To view a general introduction of OpenPAI, please refer to the [Github Readme](https://github.com/microsoft/pai/blob/master/README.md). -For any issue/bug/feature request, please submit it to [GitHub](https://github.com/microsoft/pai). 
\ No newline at end of file +For any issue/bug/feature request, please submit it to [GitHub](https://github.com/microsoft/pai). diff --git a/docs_zh_CN/index.md b/docs_zh_CN/index.md index e026e3f51b..1e1a53a72e 100644 --- a/docs_zh_CN/index.md +++ b/docs_zh_CN/index.md @@ -1,6 +1,6 @@ # OpenPAI手册 -[![Build Status](https://travis-ci.org/microsoft/pai.svg?branch=master)](https://travis-ci.org/microsoft/pai) +[![Build Status](https://github.com/microsoft/pai/workflows/CI/badge.svg)](https://github.com/microsoft/pai/actions) [![Join the chat at https://gitter.im/Microsoft/pai](https://badges.gitter.im/Microsoft/pai.svg)](https://gitter.im/Microsoft/pai?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![Version](https://img.shields.io/github/release/Microsoft/pai.svg)](https://github.com/Microsoft/pai/releases/latest) diff --git a/src/rest-server/package.json b/src/rest-server/package.json index 75e9d39a7e..bc4ef7a500 100644 --- a/src/rest-server/package.json +++ b/src/rest-server/package.json @@ -67,7 +67,7 @@ "@pai": "src" }, "scripts": { - "coveralls": "nyc report --reporter=text-lcov | coveralls ..", + "coveralls": "nyc report --reporter=text-lcov > coverage/lcov.info", "lint": "eslint --ext .js ./src", "mocha": "mocha --file ./test/setup --ui bdd --recursive --timeout 1000 --exit", "preinstall": "npx ncp ../database-controller/sdk openpaidbsdk || echo skip copying openpaidbsdk", From 06eb934e4acdcdb25c5a7c1dbf633ff4619ba568 Mon Sep 17 00:00:00 2001 From: Guoxin Date: Tue, 2 Feb 2021 12:46:38 +0800 Subject: [PATCH 2/3] fix add/remove node doc issue (#5269) --- .../how-to-add-and-remove-nodes.md | 40 +++++++++---------- .../cluster-admin/installation-guide.md | 4 +- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/docs/manual/cluster-admin/how-to-add-and-remove-nodes.md b/docs/manual/cluster-admin/how-to-add-and-remove-nodes.md index 511de0bc63..eaf1e86060 100644 --- a/docs/manual/cluster-admin/how-to-add-and-remove-nodes.md 
+++ b/docs/manual/cluster-admin/how-to-add-and-remove-nodes.md @@ -14,7 +14,7 @@ Log in to your dev box machine, find [the pre-kept folder `~/pai-deploy`](./inst Find the file `~/pai-deploy/kubespray/inventory/pai/hosts.yml`, and follow the steps below to modify it. -Supposing you want to add 2 worker nodes into your cluster and their hostnames are `a` and `b`. Add these 2 nodes into the `hosts.yml`. An example: +Supposing you want to add 2 worker nodes into your cluster and their hostnames are `new-worker-node-0` and `new-worker-node-1`. Add these 2 nodes into the `hosts.yml`. An example: ```yaml all: @@ -35,7 +35,7 @@ all: ... ############# Example start ################### - a: + new-worker-node-0: ip: x.x.x.x access_ip: x.x.x.x ansible_host: x.x.x.x @@ -43,7 +43,7 @@ all: ansible_ssh_pass: "your-password-here" ansible_become_pass: "your-password-here" ansible_ssh_extra_args: '-o StrictHostKeyChecking=no' - b: + new-worker-node-1: ip: x.x.x.x access_ip: x.x.x.x ansible_host: x.x.x.x @@ -65,8 +65,8 @@ all: origin4: ############# Example start ################### - a: - b: + new-worker-node-0: + new-worker-node-1: ############## Example end #################### gpu: @@ -75,8 +75,8 @@ all: ############# Example start ################### #### If the worker doesn't have GPU, please don't add them here. - a: - b: + new-worker-node-0: + new-worker-node-1: ############## Example end #################### etcd: @@ -95,25 +95,25 @@ all: Go into folder `~/pai-deploy/kubespray/`, run: ```bash -ansible-playbook -i inventory/pai/hosts.yml scale.yml -b --become-user=root -e "node=a,b" -e "@inventory/pai/openpai.yml" +ansible-playbook -i inventory/pai/hosts.yml cluster.yml -b --become-user=root --limit=new-worker-node-0,new-worker-node-1 -e "@inventory/pai/openpai.yml" ``` -The nodes to add are specified with `-e` flag. +The nodes to add are specified with the `--limit` flag. 
### Update OpenPAI Service Configuration Find your [service configuration file `layout.yaml` and `services-configuration.yaml`](./basic-management-operations.md#pai-service-management-and-paictl) in `~/pai-deploy/cluster-cfg`. -- Add the new node into `machine-list` field in `layout.yaml` +- Add the new node into `machine-list` field in `layout.yaml`, create a new `machine-sku` if necessary. Refer to [layout.yaml](./installation-guide.md#layoutyaml-format) for schema requirements. ```yaml machine-list: - - hostname: a + - hostname: new-worker-node-0 hostip: x.x.x.x machine-type: xxx-sku pai-worker: "true" - - hostname: b + - hostname: new-worker-node-1 hostip: x.x.x.x machine-type: xxx-sku pai-worker: "true" @@ -121,12 +121,12 @@ machine-list: - If you are using hived scheduler, you should modify its setting in `services-configuration.yaml` properly. Please refer to [how to set up virtual clusters](./how-to-set-up-virtual-clusters.md) and the [hived scheduler doc](https://github.com/microsoft/hivedscheduler/blob/master/doc/user-manual.md) for details. If you are using Kubernetes default scheduler, you can skip this step. -- Stop the service, push the latest configuration, and then start services: +- Stop the service, push the latest configuration, and then start related services: ```bash -./paictl.py service stop -n cluster-configuration hivedscheduler rest-server +./paictl.py service stop -n cluster-configuration hivedscheduler rest-server job-exporter ./paictl.py config push -p -m service -./paictl.py service start -n cluster-configuration hivedscheduler rest-server +./paictl.py service start -n cluster-configuration hivedscheduler rest-server job-exporter ``` If you have configured any PV/PVC storage, please confirm the added worker node meets the PV's requirements. See [Confirm Worker Nodes Environment](./how-to-set-up-storage.md#confirm-environment-on-worker-nodes) for details. 
@@ -139,17 +139,17 @@ To remove nodes from the cluster, there is no need to modify `hosts.yml`. Go into `~/pai-deploy/kubespray/`, run ```bash -ansible-playbook -i inventory/pai/hosts.yml remove-node.yml -b --become-user=root -e "node=a,b" -e "@inventory/pai/openpai.yml" +ansible-playbook -i inventory/pai/hosts.yml remove-node.yml -b --become-user=root -e "node=worker-node-to-remove-0,worker-node-to-remove-1" -e "@inventory/pai/openpai.yml" ``` -The nodes to remove are specified with `-e` flag. +The nodes to remove are specified with the `-e` flag. Modify the `layout.yaml` and `services-configuration.yaml`. -Stop the service, push the latest configuration, and then start services: +Stop the service, push the latest configuration, and then start related services: ```bash -./paictl.py service stop -n cluster-configuration hivedscheduler rest-server +./paictl.py service stop -n cluster-configuration hivedscheduler rest-server job-exporter ./paictl.py config push -p -m service -./paictl.py service start -n cluster-configuration hivedscheduler rest-server +./paictl.py service start -n cluster-configuration hivedscheduler rest-server job-exporter ``` diff --git a/docs/manual/cluster-admin/installation-guide.md b/docs/manual/cluster-admin/installation-guide.md index 49a85ee467..c0beb4601e 100644 --- a/docs/manual/cluster-admin/installation-guide.md +++ b/docs/manual/cluster-admin/installation-guide.md @@ -170,9 +170,7 @@ Please edit `layout.yaml` and a `config.yaml` file under `/contrib These two files spedify the cluster layout and the customized configuration, respectively. The following is the format and example of these 2 files. -#### Tips for China Users - -If you are a China user, before you edit these files, please refer to [here](./configuration-for-china.md) first. +**Tips for China Users**: If you are a China user, before you edit these files, please refer to [here](./configuration-for-china.md) first. 
#### `layout.yaml` format From 30f21b391db32ee0211bb024e20ae113f9317452 Mon Sep 17 00:00:00 2001 From: Guoxin Date: Tue, 2 Feb 2021 16:16:41 +0800 Subject: [PATCH 3/3] generate an application token for every job (#5282) - create application token in rest-server, get a k8s secret definition - create a token object in db, (add a column in table Framework named tokenSecretDef...), let dbc create the secret with the def - mount the token-secrets to initContainers & job containers - revoke the token in dbc, (remove the token from DB) : realized in src/dbc --- src/database-controller/sdk/index.js | 3 +- .../src/common/framework.js | 30 ++++++++++- src/database-controller/src/poller/index.js | 2 + .../src/write-merger/handler.js | 7 ++- .../deploy/rest-server.yaml.template | 2 + src/rest-server/src/config/launcher.js | 2 + src/rest-server/src/models/v2/job/k8s.js | 50 +++++++++++++++++-- 7 files changed, 90 insertions(+), 6 deletions(-) diff --git a/src/database-controller/sdk/index.js b/src/database-controller/sdk/index.js index dc6bebcade..5b72ba4f17 100644 --- a/src/database-controller/sdk/index.js +++ b/src/database-controller/sdk/index.js @@ -39,11 +39,12 @@ class DatabaseModel { type: Sequelize.DATE, allowNull: false, }, - // `dockerSecretDef`, `configSecretDef`, and `priorityClassDef` is the definition of job add-ons. + // `dockerSecretDef`, `configSecretDef`, `tokenSecretDef` and `priorityClassDef` is the definition of job add-ons. // They are generated by rest-server and recorded into database by write-merger. // These add-ons are created by poller or the short-cut in write-merger. 
dockerSecretDef: Sequelize.TEXT, configSecretDef: Sequelize.TEXT, + tokenSecretDef: Sequelize.TEXT, priorityClassDef: Sequelize.TEXT, retries: Sequelize.INTEGER, retryDelayTime: Sequelize.INTEGER, diff --git a/src/database-controller/src/common/framework.js b/src/database-controller/src/common/framework.js index a78f1b8a2d..c874ba4e92 100644 --- a/src/database-controller/src/common/framework.js +++ b/src/database-controller/src/common/framework.js @@ -293,12 +293,13 @@ class Snapshot { } // Class Add-ons handles creation/patching/deletion of job add-ons. -// Currently there are 3 types of add-ons: configSecret, priorityClass, and dockerSecret. +// Currently there are 4 types of add-ons: configSecret, priorityClass, dockerSecret, and tokenSecret. class AddOns { constructor( configSecretDef = null, priorityClassDef = null, dockerSecretDef = null, + tokenSecretDef = null, ) { if (configSecretDef !== null && !(configSecretDef instanceof Object)) { this._configSecretDef = JSON.parse(configSecretDef); @@ -315,6 +316,11 @@ class AddOns { } else { this._dockerSecretDef = dockerSecretDef; } + if (tokenSecretDef !== null && !(tokenSecretDef instanceof Object)) { + this._tokenSecretDef = JSON.parse(tokenSecretDef); + } else { + this._tokenSecretDef = tokenSecretDef; + } } async create() { @@ -357,6 +363,19 @@ class AddOns { } } } + if (this._tokenSecretDef) { + try { + await k8s.createSecret(this._tokenSecretDef); + } catch (err) { + if (err.response && err.response.statusCode === 409) { + logger.warn( + `Secret ${this._tokenSecretDef.metadata.name} already exists.`, + ); + } else { + throw err; + } + } + } } silentPatch(frameworkResponse) { @@ -369,6 +388,10 @@ class AddOns { k8s .patchSecretOwnerToFramework(this._dockerSecretDef, frameworkResponse) .catch(logError); + this._tokenSecretDef && + k8s + .patchSecretOwnerToFramework(this._tokenSecretDef, frameworkResponse) + .catch(logError); } silentDelete() { @@ -381,6 +404,8 @@ class AddOns { .catch(logError); 
this._dockerSecretDef && k8s.deleteSecret(this._dockerSecretDef.metadata.name).catch(logError); + this._tokenSecretDef && + k8s.deleteSecret(this._tokenSecretDef.metadata.name).catch(logError); } getUpdate() { @@ -394,6 +419,9 @@ class AddOns { if (this._dockerSecretDef) { update.dockerSecretDef = JSON.stringify(this._dockerSecretDef); } + if (this._tokenSecretDef) { + update.tokenSecretDef = JSON.stringify(this._tokenSecretDef); + } return update; } } diff --git a/src/database-controller/src/poller/index.js b/src/database-controller/src/poller/index.js index baafe53635..c78ef5487b 100644 --- a/src/database-controller/src/poller/index.js +++ b/src/database-controller/src/poller/index.js @@ -113,6 +113,7 @@ async function poll() { 'configSecretDef', 'priorityClassDef', 'dockerSecretDef', + 'tokenSecretDef', 'snapshot', 'subState', 'requestSynced', @@ -132,6 +133,7 @@ async function poll() { framework.configSecretDef, framework.priorityClassDef, framework.dockerSecretDef, + framework.tokenSecretDef, ); if (framework.subState === 'Completed') { deleteHandler(snapshot, pollingTs); diff --git a/src/database-controller/src/write-merger/handler.js b/src/database-controller/src/write-merger/handler.js index c6a46d095e..4fd368e0b0 100644 --- a/src/database-controller/src/write-merger/handler.js +++ b/src/database-controller/src/write-merger/handler.js @@ -188,6 +188,7 @@ async function patchFrameworkRequest(req, res, next) { 'configSecretDef', 'priorityClassDef', 'dockerSecretDef', + 'tokenSecretDef', ], where: { name: frameworkName }, }); @@ -203,6 +204,7 @@ async function patchFrameworkRequest(req, res, next) { oldFramework.configSecretDef, oldFramework.priorityClassDef, oldFramework.dockerSecretDef, + oldFramework.tokenSecretDef, ); return onModifyFrameworkRequest(oldSnapshot, snapshot, addOns); } @@ -216,7 +218,7 @@ async function patchFrameworkRequest(req, res, next) { async function putFrameworkRequest(req, res, next) { // The handler to handle PUT /frameworkRequest. 
// PUT means provide a full spec of framework request, and the corresponding request will be created or updated. - // Along with the framework request, user must provide other job add-ons, e.g. configSecretDef, priorityClassDef, dockerSecretDef. + // Along with the framework request, user must provide other job add-ons, e.g. configSecretDef, priorityClassDef, dockerSecretDef, tokenSecretDef. // If the framework doesn't exist in database, the record will be created. // If the framework already exists, the record will be updated, and all job add-ons will be ignored. (Job add-ons can't be changed). // If the framework request JSON is changed(or created), we will mark it as requestSynced=false. @@ -228,6 +230,7 @@ async function putFrameworkRequest(req, res, next) { configSecretDef, priorityClassDef, dockerSecretDef, + tokenSecretDef, } = req.body; const frameworkName = _.get(frameworkRequest, 'metadata.name'); if (!frameworkName) { @@ -259,6 +262,7 @@ async function putFrameworkRequest(req, res, next) { configSecretDef, priorityClassDef, dockerSecretDef, + tokenSecretDef, ); return onCreateFrameworkRequest(snapshot, submissionTime, addOns); } else { @@ -269,6 +273,7 @@ async function putFrameworkRequest(req, res, next) { oldFramework.configSecretDef, oldFramework.priorityClassDef, oldFramework.dockerSecretDef, + oldFramework.tokenSecretDef, ); return onModifyFrameworkRequest(oldSnapshot, snapshot, addOns); } diff --git a/src/rest-server/deploy/rest-server.yaml.template b/src/rest-server/deploy/rest-server.yaml.template index 7f0d82b545..0fa5b97963 100644 --- a/src/rest-server/deploy/rest-server.yaml.template +++ b/src/rest-server/deploy/rest-server.yaml.template @@ -97,6 +97,8 @@ spec: {%- else %} value: "{{ cluster_cfg['pylon']['uri']}}" {%- endif %} + - name: REST_SERVER_URI + value: "{{ cluster_cfg['rest-server']['uri']}}" {% if not cluster_cfg['authentication']['OIDC'] %} - name: AUTHN_METHOD value: basic diff --git a/src/rest-server/src/config/launcher.js 
b/src/rest-server/src/config/launcher.js index 27b787137f..1887944f79 100644 --- a/src/rest-server/src/config/launcher.js +++ b/src/rest-server/src/config/launcher.js @@ -22,6 +22,7 @@ const Joi = require('joi'); const k8sLauncherConfigSchema = Joi.object() .keys({ hivedWebserviceUri: Joi.string().uri().required(), + restServerUri: Joi.string().uri().required(), enabledPriorityClass: Joi.boolean().required(), apiVersion: Joi.string().required(), podGracefulDeletionTimeoutSec: Joi.number() @@ -55,6 +56,7 @@ const launcherType = process.env.LAUNCHER_TYPE; if (launcherType === 'k8s') { launcherConfig = { hivedWebserviceUri: process.env.HIVED_WEBSERVICE_URI, + restServerUri: process.env.REST_SERVER_URI, enabledPriorityClass: process.env.LAUNCHER_PRIORITY_CLASS === 'true', apiVersion: 'frameworkcontroller.microsoft.com/v1', podGracefulDeletionTimeoutSec: 600, diff --git a/src/rest-server/src/models/v2/job/k8s.js b/src/rest-server/src/models/v2/job/k8s.js index b8d0a518b5..5617b6a517 100644 --- a/src/rest-server/src/models/v2/job/k8s.js +++ b/src/rest-server/src/models/v2/job/k8s.js @@ -24,6 +24,7 @@ const launcherConfig = require('@pai/config/launcher'); const createError = require('@pai/utils/error'); const protocolSecret = require('@pai/utils/protocolSecret'); const userModel = require('@pai/models/v2/user'); +const tokenModel = require('@pai/models/token'); const storageModel = require('@pai/models/v2/storage'); const logger = require('@pai/config/logger'); const { apiserver } = require('@pai/config/kubernetes'); @@ -442,6 +443,10 @@ const generateTaskRole = ( name: 'KUBE_APISERVER_ADDRESS', value: apiserver.uri, }, + { + name: 'REST_SERVER_URI', + value: launcherConfig.restServerUri, + }, { name: 'GANG_ALLOCATION', value: gangAllocation, @@ -749,6 +754,7 @@ const generateFrameworkDescription = ( taskRoleDescription.task.pod.spec.priorityClassName = 'pai-job-minimal-priority'; } + // mount job secrets to initContainers & job container if exist if (config.secrets) { 
taskRoleDescription.task.pod.spec.volumes.push({ name: 'job-secrets', @@ -765,6 +771,21 @@ const generateFrameworkDescription = ( mountPath: '/usr/local/pai/secrets', }); } + // mount token-secrets to initContainers & job container + taskRoleDescription.task.pod.spec.volumes.push({ + name: 'token-secrets', + secret: { + secretName: `${encodeName(frameworkName)}-tokencred`, + }, + }); + taskRoleDescription.task.pod.spec.initContainers[0].volumeMounts.push({ + name: 'token-secrets', + mountPath: '/usr/local/pai/token-secrets', + }); + taskRoleDescription.task.pod.spec.containers[0].volumeMounts.push({ + name: 'token-secrets', + mountPath: '/usr/local/pai/token-secrets', + }); frameworkDescription.spec.taskRoles.push(taskRoleDescription); } frameworkDescription.metadata.annotations.totalGpuNumber = `${totalGpuNumber}`; @@ -830,6 +851,22 @@ const getConfigSecretDef = (frameworkName, secrets) => { }; }; +const getTokenSecretDef = (frameworkName, token) => { + const data = { + token: Buffer.from(token).toString('base64'), + }; + return { + apiVersion: 'v1', + kind: 'Secret', + metadata: { + name: `${encodeName(frameworkName)}-tokencred`, + namespace: 'default', + }, + data: data, + type: 'Opaque', + }; +}; + const list = async ( attributes, filters, @@ -1053,7 +1090,7 @@ const put = async (frameworkName, config, rawConfig) => { config, rawConfig, ); - // generate image pull secret + // generate the image pull secret definition const auths = Object.values(config.prerequisites.dockerimage) .filter((dockerimage) => dockerimage.auth != null) .map((dockerimage) => dockerimage.auth); @@ -1061,11 +1098,17 @@ const put = async (frameworkName, config, rawConfig) => { ? getDockerSecretDef(frameworkName, auths) : null; - // generate job config secret + // generate the job config secret definition const configSecretDef = config.secrets ? 
getConfigSecretDef(frameworkName, config.secrets) : null; + // create an application token + // TODO: need a mechanism to label this token as job specific token and revoke it if job is stopped / failed + const token = await tokenModel.create(userName, true); + // generate the application token secret definition + const tokenSecretDef = getTokenSecretDef(frameworkName, token); + // calculate pod priority // reference: https://github.com/microsoft/pai/issues/3704 // Truncate submissionTime to multiple of 1000. @@ -1088,7 +1131,7 @@ const put = async (frameworkName, config, rawConfig) => { priorityClassDef = getPriorityClassDef(frameworkName, podPriority); } - // send request to framework controller + // send request to DB controller let response; try { response = await axios({ @@ -1103,6 +1146,7 @@ const put = async (frameworkName, config, rawConfig) => { configSecretDef: configSecretDef, priorityClassDef: priorityClassDef, dockerSecretDef: dockerSecretDef, + tokenSecretDef: tokenSecretDef, }, headers: { 'Content-Type': 'application/json',