From 5fbefb87c6f9c31638c47319b58a16c36d4b78d4 Mon Sep 17 00:00:00 2001 From: AmberMsy <46340789+AmberMsy@users.noreply.github.com> Date: Wed, 11 Nov 2020 14:35:04 +0800 Subject: [PATCH 01/12] Update Stdout/Stderr/Stdout+Stderr button (#5063) * log web * change * Get Log * Url Token Refresh * merge log&log.1 * fix merge part * fix refresh part * fix by prettier * fix state Co-authored-by: Binyang Li --- .../components/task-role-container-list.jsx | 139 ++++++++++-------- .../job/job-view/fabric/job-detail/conn.js | 98 +++++------- 2 files changed, 114 insertions(+), 123 deletions(-) diff --git a/src/webportal/src/app/job/job-view/fabric/job-detail/components/task-role-container-list.jsx b/src/webportal/src/app/job/job-view/fabric/job-detail/components/task-role-container-list.jsx index 9a97167d28..f8d06fe418 100644 --- a/src/webportal/src/app/job/job-view/fabric/job-detail/components/task-role-container-list.jsx +++ b/src/webportal/src/app/job/job-view/fabric/job-detail/components/task-role-container-list.jsx @@ -48,7 +48,7 @@ import t from '../../../../../components/tachyons.scss'; import Context from './context'; import Timer from './timer'; -import { getContainerLog } from '../conn'; +import { getContainerLog, getContainerLogList } from '../conn'; import config from '../../../../../config/webportal.config'; import MonacoPanel from '../../../../../components/monaco-panel'; import StatusBadge from '../../../../../components/status-badge'; @@ -137,7 +137,10 @@ export default class TaskRoleContainerList extends React.Component { monacoProps: null, monacoTitle: '', monacoFooterButton: null, - logUrl: null, + fullLogUrls: null, + tailLogUrls: null, + logListUrl: null, + logType: null, items: props.tasks, ordering: { field: null, descending: false }, hideDialog: true, @@ -145,7 +148,7 @@ export default class TaskRoleContainerList extends React.Component { this.showSshInfo = this.showSshInfo.bind(this); this.onDismiss = this.onDismiss.bind(this); - this.showContainerLog 
= this.showContainerLog.bind(this); + this.showContainerTailLog = this.showContainerTailLog.bind(this); this.onRenderRow = this.onRenderRow.bind(this); this.logAutoRefresh = this.logAutoRefresh.bind(this); this.onColumnClick = this.onColumnClick.bind(this); @@ -159,12 +162,12 @@ export default class TaskRoleContainerList extends React.Component { } logAutoRefresh() { - const { logUrl } = this.state; - getContainerLog(logUrl) + const { fullLogUrls, tailLogUrls, logListUrl, logType } = this.state; + getContainerLog(tailLogUrls, fullLogUrls, logType) .then(({ text, fullLogLink }) => this.setState( prevState => - prevState.logUrl === logUrl && { + prevState.tailLogUrls[logType] === tailLogUrls[logType] && { monacoProps: { value: text }, monacoFooterButton: ( + .catch(err => { this.setState( prevState => - prevState.logUrl === logUrl && { + prevState.tailLogUrls[logType] === tailLogUrls[logType] && { monacoProps: { value: err.message }, }, - ), - ); + ); + if (err.message === '403') { + this.showContainerTailLog(logListUrl, logType); + } + }); } onDismiss() { @@ -194,7 +200,8 @@ export default class TaskRoleContainerList extends React.Component { monacoProps: null, monacoTitle: '', monacoFooterButton: null, - logUrl: null, + fullLogUrls: null, + tailLogUrls: null, }); } @@ -213,40 +220,52 @@ export default class TaskRoleContainerList extends React.Component { } } - showContainerLog(logUrl, logType) { - let title; - let logHint; - - if (config.logType === 'yarn') { - logHint = 'Last 4096 bytes'; - } else if (config.logType === 'log-manager') { - logHint = 'Last 16384 bytes'; - } else { - logHint = ''; - } - switch (logType) { - case 'stdout': - title = `Standard Output (${logHint})`; - break; - case 'stderr': - title = `Standard Error (${logHint})`; - break; - case 'stdall': - title = `User logs (${logHint}. 
Notice: The logs may out of order when merging stdout & stderr streams)`; - break; - default: - throw new Error(`Unsupported log type`); + convertObjectFormat(logUrls) { + const logs = {}; + for (const p in logUrls.locations) { + logs[logUrls.locations[p].name] = logUrls.locations[p].uri; } - this.setState( - { - monacoProps: { value: 'Loading...' }, - monacoTitle: title, - logUrl, - }, - () => { - this.logAutoRefresh(); // start immediately - }, - ); + return logs; + } + + showContainerTailLog(logListUrl, logType) { + let title; + let logHint = ''; + this.setState({ logListUrl: logListUrl }); + getContainerLogList(logListUrl) + .then(({ fullLogUrls, tailLogUrls }) => { + if (config.logType === 'log-manager') { + logHint = 'Last 16384 bytes'; + } + switch (logType) { + case 'stdout': + title = `Standard Output (${logHint})`; + break; + case 'stderr': + title = `Standard Error (${logHint})`; + break; + case 'all': + title = `User logs (${logHint}. Notice: The logs may out of order when merging stdout & stderr streams)`; + break; + default: + throw new Error(`Unsupported log type`); + } + this.setState( + { + monacoProps: { value: 'Loading...' 
}, + monacoTitle: title, + fullLogUrls: this.convertObjectFormat(fullLogUrls), + tailLogUrls: this.convertObjectFormat(tailLogUrls), + logType, + }, + () => { + this.logAutoRefresh(); // start immediately + }, + ); + }) + .catch(err => { + this.setState({ monacoProps: { value: err.message } }); + }); } showSshInfo(id, containerPorts, containerIp) { @@ -424,7 +443,7 @@ export default class TaskRoleContainerList extends React.Component { monacoTitle, monacoProps, monacoFooterButton, - logUrl, + tailLogUrls, items, } = this.state; const { showMoreDiagnostics } = this.props; @@ -443,7 +462,9 @@ export default class TaskRoleContainerList extends React.Component { {/* Timer */} {/* Monaco Editor Panel */} @@ -624,8 +645,8 @@ export default class TaskRoleContainerList extends React.Component { iconProps={{ iconName: 'TextDocument' }} text='Stdout' onClick={() => - this.showContainerLog( - `${item.containerLog}user.pai.stdout`, + this.showContainerTailLog( + `${config.restServerUri}${item.containerLog}`, 'stdout', ) } @@ -640,8 +661,8 @@ export default class TaskRoleContainerList extends React.Component { iconProps={{ iconName: 'Error' }} text='Stderr' onClick={() => - this.showContainerLog( - `${item.containerLog}user.pai.stderr`, + this.showContainerTailLog( + `${config.restServerUri}${item.containerLog}`, 'stderr', ) } @@ -662,23 +683,11 @@ export default class TaskRoleContainerList extends React.Component { iconProps: { iconName: 'TextDocument' }, disabled: isNil(item.containerId), onClick: () => - this.showContainerLog( - `${item.containerLog}user.pai.all`, - 'stdall', + this.showContainerTailLog( + `${config.restServerUri}${item.containerLog}`, + 'all', ), }, - { - key: 'trackingPage', - name: - config.launcherType === 'yarn' - ? 'Go to Yarn Tracking Page' - : 'Browse log folder', - iconProps: { iconName: 'Link' }, - href: isNil(item.containerLog) - ? 
item.containerLog - : item.containerLog.replace('/tail/', '/'), - target: '_blank', - }, ], }} disabled={isNil(item.containerId)} diff --git a/src/webportal/src/app/job/job-view/fabric/job-detail/conn.js b/src/webportal/src/app/job/job-view/fabric/job-detail/conn.js index 1a70e72082..72a7039436 100644 --- a/src/webportal/src/app/job/job-view/fabric/job-detail/conn.js +++ b/src/webportal/src/app/job/job-view/fabric/job-detail/conn.js @@ -12,7 +12,6 @@ import config from '../../../../config/webportal.config'; const params = new URLSearchParams(window.location.search); const userName = params.get('username'); const jobName = params.get('jobName'); -const absoluteUrlRegExp = /^[a-z][a-z\d+.-]*:/; const token = cookies.get('token'); const client = new PAIV2.OpenPAIClient({ @@ -157,77 +156,60 @@ export async function stopJob() { ); } -export async function getContainerLog(logUrl) { - const ret = { - fullLogLink: logUrl, - text: null, - }; - const res = await fetch(logUrl); - var text = await res.text(); - if (!res.ok) { - throw new Error(res.statusText); +export async function getContainerLogList(logListUrl) { + const res = await Promise.all([ + fetch(`${logListUrl}`, { + headers: { + Authorization: `Bearer ${token}`, + }, + }), + fetch(`${logListUrl}?tail-mode=true`, { + headers: { + Authorization: `Bearer ${token}`, + }, + }), + ]); + const resp = res.find(r => !r.ok); + if (resp) { + throw new Error('Log folder can not be retrieved'); } + const logUrls = await Promise.all(res.map(r => r.json())); + return { + fullLogUrls: logUrls[0], + tailLogUrls: logUrls[1], + }; +} - const contentType = res.headers.get('content-type'); - if (!contentType) { - throw new Error(`Log not available`); +export async function getContainerLog(tailLogUrls, fullLogUrls, logType) { + const res = await fetch(tailLogUrls[logType]); + if (!res.ok) { + throw new Error(res.status); } + let text = await res.text(); - // Check log type. The log type is in LOG_TYPE and should be yarn|log-manager. 
- if (config.logType === 'yarn') { - try { - const parser = new DOMParser(); - const doc = parser.parseFromString(text, 'text/html'); - const content = doc.getElementsByClassName('content')[0]; - const pre = content.getElementsByTagName('pre')[0]; - ret.text = pre.innerText; - // fetch full log link - if (pre.previousElementSibling) { - const link = pre.previousElementSibling.getElementsByTagName('a'); - if (link.length === 1) { - ret.fullLogLink = link[0].getAttribute('href'); - // relative link - if (ret.fullLogLink && !absoluteUrlRegExp.test(ret.fullLogLink)) { - let baseUrl = res.url; - // check base tag - const baseTags = doc.getElementsByTagName('base'); - // There can be only one element in a document. - if (baseTags.length > 0 && baseTags[0].hasAttribute('href')) { - baseUrl = baseTags[0].getAttribute('href'); - // relative base tag url - if (!absoluteUrlRegExp.test(baseUrl)) { - baseUrl = new URL(baseUrl, res.url); - } - } - const url = new URL(ret.fullLogLink, baseUrl); - ret.fullLogLink = url.href; - } - } - } - return ret; - } catch (e) { - throw new Error(`Log not available`); - } - } else if (config.logType === 'log-manager') { + // Check log type. The log type is in LOG_TYPE only support log-manager. 
+ if (config.logType === 'log-manager') { // Try to get roated log if currently log content is less than 15KB - if (text.length <= 15 * 1024) { - const fullLogUrl = logUrl.replace('/tail/', '/full/'); - const rotatedLogUrl = logUrl + '.1'; + if (text.length <= 15 * 1024 && tailLogUrls[logType + '.1']) { + const rotatedLogUrl = tailLogUrls[logType + '.1']; const rotatedLogRes = await fetch(rotatedLogUrl); - const fullLogRes = await fetch(fullLogUrl); + const fullLogRes = await fetch(fullLogUrls[logType]); const rotatedText = await rotatedLogRes.text(); const fullLog = await fullLogRes.text(); - if (rotatedLogRes.ok && rotatedText.trim() !== 'No such file!') { + if (rotatedLogRes.ok) { text = rotatedText - .concat('\n--------log is rotated, may be lost during this--------\n') + .concat( + '\n ------- log is rotated, may be lost during this ------- \n', + ) .concat(fullLog); } // get last 16KB text = text.slice(-16 * 1024); } - ret.text = text; - ret.fullLogLink = logUrl.replace('/tail/', '/full/'); - return ret; + return { + fullLogLink: fullLogUrls[logType], + text: text, + }; } else { throw new Error(`Log not available`); } From 1e9580e47265d90da01570227963fdc1e81e3572 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Wed, 11 Nov 2020 16:11:24 +0800 Subject: [PATCH 02/12] Update docs for Cluster Autoscaler on AKS Engine (#5057) Update docs for Cluster Autoscaler on AKS Engine. 
--- .../aks-engine/{config.yml => config.yaml} | 0 contrib/aks-engine/readme.md | 100 +++++++++++------- .../installation-faqs-and-troubleshooting.md | 6 +- 3 files changed, 65 insertions(+), 41 deletions(-) rename contrib/aks-engine/{config.yml => config.yaml} (100%) diff --git a/contrib/aks-engine/config.yml b/contrib/aks-engine/config.yaml similarity index 100% rename from contrib/aks-engine/config.yml rename to contrib/aks-engine/config.yaml diff --git a/contrib/aks-engine/readme.md b/contrib/aks-engine/readme.md index 7965a2d350..a28ceaffc5 100644 --- a/contrib/aks-engine/readme.md +++ b/contrib/aks-engine/readme.md @@ -1,56 +1,76 @@ -#### Install Necessary Package. +# Cluster Autoscaler on AKS Engine -- [ Install Azure CLI ](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) -- [ Install AKS-Engine ](https://github.com/Azure/aks-engine/blob/master/docs/tutorials/quickstart.md#install-the-aks-engine-binary) +[AKS Engine](https://github.com/Azure/aks-engine) is a tool to help you provision a self-managed Kubernetes cluster on Azure, +while [Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) is another tool that automatically adjusts the size of the Kubernetes cluster. +The Cluster Autoscaler on Azure dynamically scales Kubernetes worker nodes. -#### Create Resource Group +This contrib aims to help you deploy an OpenPAI cluster on Azure using AKS Engine, and run Cluster Autoscaler as a deployment in your cluster. 
-- Solution A [ Azure Portal ](https://docs.microsoft.com/en-us/azure/azure-resource-manager/management/manage-resource-groups-portal#create-resource-groups) (Recommended) -- Solution B [ Azure CLI ](https://docs.microsoft.com/en-us/azure/azure-resource-manager/management/manage-resource-groups-cli#create-resource-groups) -Remember the following parameters +## Preparations on Azure -- subscription id: ```${subscriptionId}``` -- resource groupname: ```${resourcegroup}``` -- location: ```${location}``` +1. Install Dependencies -#### Create Service Principle + 1. Install [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) + 2. Install [AKS Engine](https://github.com/Azure/aks-engine/blob/master/docs/tutorials/quickstart.md#install-the-aks-engine-binary) -```bash -az ad sp create-for-rbac --skip-assignment --name ${service-principal-name} -``` +2. Create resource group -If the command success, the output will like the following example. + There're two options to create resource group in your subscription: + * It's recommended to use [Azure Portal](https://docs.microsoft.com/en-us/azure/azure-resource-manager/management/manage-resource-groups-portal#create-resource-groups) + * You can also use [Azure CLI](https://docs.microsoft.com/en-us/azure/azure-resource-manager/management/manage-resource-groups-cli#create-resource-groups) -```json -{ - "appId": "559513bd-0c19-4c1a-87cd-851a26afd5fc", - "displayName": "${service-principal-name}", - "name": "http://${service-principal-name}", - "password": "e763725a-5eee-40e8-a466-dc88d980f415", - "tenant": "72f988bf-86f1-41af-91ab-2d7cd011db48" -} -``` -Remember the following parameters. 
+ Remember the following parameters which will be used later: + * subscription id `${subscriptionId}` + * resource groupname `${resourcegroup}` + * location `${location}` -- ```appId```: ```${appId}``` -- ```password```: ```${password}``` -- ```displayName```: ```${spName}``` -- ```tenant```: ```${tenant}``` - - -[The doc about this steps](https://docs.microsoft.com/en-us/azure/aks/kubernetes-service-principal#manually-create-a-service-principal) +3. Create Service Principal -#### Ask your subscription's admin to add the new service principal as the owner of the new resource group. + Run the following command: -Content as the title. Important and don't forget it. + ```sh + az ad sp create-for-rbac --skip-assignment --name ${service-principal-name} + ``` -#### Write Configuration + You will see the following output if it succeeds: -[Configuration example](config.yml) + ```json + { + "appId": "87432405-56b6-4d76-923b-39d1d75d19f7", + "displayName": "${service-principal-name}", + "name": "http://${service-principal-name}", + "password": "ff5b1601-1298-460d-a94f-fcc8b5ef96f0", + "tenant": "72e9b8a0-54c8-4742-8da6-1f5d1418c3c5" + } + ``` -#### Start Cluster + Remember the following parameters which will be used later: + * appId `${appId}` + * password `${password}` + * displayName `${spName}` + * tenant `${tenant}` -``` -python3 azure.py -c config.yml -``` + For more details on how to create service principal, please refer to [manually-create-a-service-principal document](https://docs.microsoft.com/en-us/azure/aks/kubernetes-service-principal#manually-create-a-service-principal). + +4. Add the service principal as the owner of the resource group. + + +## OpenPAI Deployment + +1. Prepare the [configuration file](./config.yaml), replace the variables with parameters in previous steps. +To use Cluster Autoscaler, specify the following lines in `openpai_worker_vmss`: + + ```yaml + openpai_worker_vmss: + ... + ca_enable: true + min_vm_count: 1 + max_vm_count: 10 + ``` + +2. 
Deploy Kubernetes cluster with AKS Engine, and deploy OpenPAI: + + ```sh + python3 azure.py -c config.yaml + ``` diff --git a/docs/manual/cluster-admin/installation-faqs-and-troubleshooting.md b/docs/manual/cluster-admin/installation-faqs-and-troubleshooting.md index f88cd95115..d20b453bac 100644 --- a/docs/manual/cluster-admin/installation-faqs-and-troubleshooting.md +++ b/docs/manual/cluster-admin/installation-faqs-and-troubleshooting.md @@ -68,7 +68,7 @@ To remove the network plugin, you could use following `ansible-playbook`: shell: systemctl restart kubelet args: executable: /bin/bash - + - name: restart docker shell: systemctl restart docker args: @@ -113,6 +113,10 @@ Please refer to the [official document](https://github.com/NVIDIA/nvidia-contain } ``` +#### How to deploy on [Azure Kubernetes Service (AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/) with [Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler)? + +Please refer to [this document](https://github.com/microsoft/pai/tree/master/contrib/aks-engine). + ## Troubleshooting #### Command `Apt install ` fails in the script. 
From 2ccdc1b958ad6068d7a6d38146372abba251c438 Mon Sep 17 00:00:00 2001 From: Binyang2014 Date: Wed, 11 Nov 2020 18:56:01 +0800 Subject: [PATCH 03/12] Add permission and new API in log-manager (#5046) Add permission check in log-manage Add log-manager API to retrieve log --- .../services-configuration.yaml.template | 4 + .../services-configuration.yaml.template | 4 + .../build/log-manager-cleaner.k8s.dockerfile | 24 ++++ .../log-manager-logrotate.k8s.dockerfile | 55 -------- .../build/log-manager-nginx.k8s.dockerfile | 10 +- src/log-manager/config/log-manager.md | 47 +++---- src/log-manager/config/log-manager.yaml | 4 + .../deploy/log-manager.yaml.template | 20 ++- src/log-manager/src/cleaner/entrypoint.sh | 43 ++++++ .../src/logrotate/docker-entrypoint.sh | 97 ------------- src/log-manager/src/logrotate/logrotate.conf | 14 -- src/log-manager/src/nginx/get_log_content.lua | 84 ++++++++++++ src/log-manager/src/nginx/guard.lua | 38 ++++++ src/log-manager/src/nginx/list_logs.lua | 74 ++++++++++ src/log-manager/src/nginx/nginx.conf | 128 +++++++++--------- src/log-manager/src/nginx/nginx.conf.default | 77 +++++++++++ src/log-manager/src/nginx/token.lua | 45 ++++++ 17 files changed, 508 insertions(+), 260 deletions(-) create mode 100644 src/log-manager/build/log-manager-cleaner.k8s.dockerfile delete mode 100644 src/log-manager/build/log-manager-logrotate.k8s.dockerfile create mode 100755 src/log-manager/src/cleaner/entrypoint.sh delete mode 100755 src/log-manager/src/logrotate/docker-entrypoint.sh delete mode 100644 src/log-manager/src/logrotate/logrotate.conf create mode 100644 src/log-manager/src/nginx/get_log_content.lua create mode 100644 src/log-manager/src/nginx/guard.lua create mode 100644 src/log-manager/src/nginx/list_logs.lua create mode 100644 src/log-manager/src/nginx/nginx.conf.default create mode 100644 src/log-manager/src/nginx/token.lua diff --git a/contrib/kubespray/quick-start/services-configuration.yaml.template 
b/contrib/kubespray/quick-start/services-configuration.yaml.template index 8a223b710e..f29b950bf1 100644 --- a/contrib/kubespray/quick-start/services-configuration.yaml.template +++ b/contrib/kubespray/quick-start/services-configuration.yaml.template @@ -238,6 +238,10 @@ authentication: # uncomment following section if you want to customize the port of log-manager # log-manager: # port: 9103 +# admin_name: "admin" +# admin_password: "admin" +# jwt_secret: "jwt_secret" +# token_expired_second: 120 # uncomment following section if you want to customize the port of storage-manager diff --git a/deployment/quick-start/services-configuration.yaml.template b/deployment/quick-start/services-configuration.yaml.template index b4e4958792..ef6fb089a3 100644 --- a/deployment/quick-start/services-configuration.yaml.template +++ b/deployment/quick-start/services-configuration.yaml.template @@ -128,6 +128,10 @@ rest-server: # uncomment following section if you want to customize the port of log-manager # log-manager: # port: 9103 +# admin_name: "admin" +# admin_password: "admin" +# jwt_secret: "jwt_secret" +# token_expired_second: 120 # uncomment following section if you want to customize the port of storage-manager # storage-manager: diff --git a/src/log-manager/build/log-manager-cleaner.k8s.dockerfile b/src/log-manager/build/log-manager-cleaner.k8s.dockerfile new file mode 100644 index 0000000000..245649d0a8 --- /dev/null +++ b/src/log-manager/build/log-manager-cleaner.k8s.dockerfile @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. 
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +FROM alpine:3.10 + +# install dev tools +RUN apk update && apk add --no-cache tini bash findutils +COPY src/cleaner/ /usr/bin/cleaner/ +ENTRYPOINT ["/sbin/tini","--","/usr/bin/cleaner/entrypoint.sh"] + diff --git a/src/log-manager/build/log-manager-logrotate.k8s.dockerfile b/src/log-manager/build/log-manager-logrotate.k8s.dockerfile deleted file mode 100644 index 598274c29b..0000000000 --- a/src/log-manager/build/log-manager-logrotate.k8s.dockerfile +++ /dev/null @@ -1,55 +0,0 @@ -# Original work Copyright (c) 2015 Steffen Bleul -# Modified work Copyright (c) Microsoft Corporation -# All rights reserved. 
-# -# MIT License -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and -# to permit persons to whom the Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING -# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -FROM alpine:3.10 - -# logrotate version (e.g. 
3.9.1-r0) -ARG LOGROTATE_VERSION=latest -# permissions -ARG CONTAINER_UID=1000 -ARG CONTAINER_GID=1000 - -# install dev tools -RUN export CONTAINER_USER=logrotate && \ - export CONTAINER_GROUP=logrotate && \ - addgroup -g $CONTAINER_GID logrotate && \ - adduser -u $CONTAINER_UID -G logrotate -h /usr/bin/logrotate.d -s /bin/bash -S logrotate && \ - apk add --update \ - tini \ - bash \ - tar \ - gzip \ - wget \ - tzdata && \ - if [ "${LOGROTATE_VERSION}" = "latest" ]; \ - then apk add logrotate ; \ - else apk add "logrotate=${LOGROTATE_VERSION}" ; \ - fi && \ - mkdir -p /usr/bin/logrotate.d && \ - wget --no-check-certificate -O /tmp/go-cron.tar.gz https://github.com/michaloo/go-cron/releases/download/v0.0.2/go-cron.tar.gz && \ - tar xvf /tmp/go-cron.tar.gz -C /usr/bin && \ - apk del \ - wget && \ - rm -rf /var/cache/apk/* && rm -rf /tmp/* - -COPY src/logrotate/ /usr/bin/logrotate.d/ -RUN chmod +x /usr/bin/logrotate.d/docker-entrypoint.sh - -ENTRYPOINT ["/sbin/tini","--","/usr/bin/logrotate.d/docker-entrypoint.sh"] -VOLUME ["/logrotate-status"] -CMD ["cron"] diff --git a/src/log-manager/build/log-manager-nginx.k8s.dockerfile b/src/log-manager/build/log-manager-nginx.k8s.dockerfile index d1d4fb59c7..a5896ac5cb 100644 --- a/src/log-manager/build/log-manager-nginx.k8s.dockerfile +++ b/src/log-manager/build/log-manager-nginx.k8s.dockerfile @@ -15,5 +15,11 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-FROM openresty/openresty:1.15.8.2-alpine -COPY src/nginx/nginx.conf /etc/nginx/conf.d/default.conf \ No newline at end of file +FROM openresty/openresty:1.15.8.3-2-alpine-fat + +RUN luarocks install lua-cjson && luarocks install lua-resty-jwt && \ + luarocks install luafilesystem + +COPY src/nginx/nginx.conf.default /etc/nginx/conf.d/default.conf +COPY src/nginx/nginx.conf /usr/local/openresty/nginx/conf/nginx.conf +COPY src/nginx/*.lua /etc/nginx/lua/ diff --git a/src/log-manager/config/log-manager.md b/src/log-manager/config/log-manager.md index afbf148d0c..194a7f45d5 100644 --- a/src/log-manager/config/log-manager.md +++ b/src/log-manager/config/log-manager.md @@ -1,46 +1,43 @@ ## Log-manager section parser -- [Default Configuration](#D_Config) -- [How to Configure](#HT_Config) -- [Generated Configuration](#G_Config) -- [Data Table](#T_config) +- [Default configuration](#default-configuration) +- [How to configure cluster section in service-configuration.yaml](#how-to-configure-cluster-section-in-service-configurationyaml) +- [Generated Configuration](#generated-configuration) +- [Table](#table) -#### Default configuration +#### Default configuration [log-manager default configuration](log-manager.yaml) -#### How to configure cluster section in service-configuration.yaml +#### How to configure cluster section in service-configuration.yaml All configurations in this section is optional. If you want to customized these value, you can configure it in service-configuration.yaml. For example, if you want to use different port than the default 9103, add following to your service-configuration.yaml as following: ```yaml log-manager: - port: new-value + port: new-value ``` -#### Generated Configuration +#### Generated Configuration Generated configuration means the object model after parsing. The parsed data will be presented by a yaml format. 
```yaml log-manager: - port: 9103 + port: 9103 + admin_name: admin + admin_password: admin + jwt_secret: "jwt_secret" + token_expired_second: 120 ``` -#### Table - - - - - - - - - - - - - - -
Data in Configuration FileData in Cluster Object ModelData in Jinja2 TemplateData type
log-manager.portcom["log-manager"]["port"]cluster_cfg["log-manager"]["port"]Int
+#### Table + +| Data in Configuration File | Data in Cluster Object Model | Data in Jinja2 Template | Data type | +|-----------------------------------|---------------------------------------------|----------------------------------------------------|-----------| +| log-manager.port | com["log-manager"]["port"] | cluster_cfg["log-manager"]["port"] | Int | +| log-manager.admin_name | com["log-manager"]["admin_name"] | cluster_cfg["log-manager"]["admin_name"] | String | +| log-manager.admin_password | com["log-manager"]["admin_password"] | cluster_cfg["log-manager"]["admin_password"] | String | +| log-manager.jwt_secret | com["log-manager"]["jwt_secret"] | cluster_cfg["log-manager"]["jwt_secret"] | String | +| log-manager.token_expired_second | com["log-manager"]["token_expired_second"] | cluster_cfg["log-manager"]["token_expired_second"] | Int | diff --git a/src/log-manager/config/log-manager.yaml b/src/log-manager/config/log-manager.yaml index 1fd2af76a5..302b5bcb51 100644 --- a/src/log-manager/config/log-manager.yaml +++ b/src/log-manager/config/log-manager.yaml @@ -18,3 +18,7 @@ service_type: "k8s" port: 9103 +admin_name: "admin" +admin_password: "admin" +jwt_secret: "jwt_secret" +token_expired_second: 120 diff --git a/src/log-manager/deploy/log-manager.yaml.template b/src/log-manager/deploy/log-manager.yaml.template index dfedc2ef5e..c20767863c 100644 --- a/src/log-manager/deploy/log-manager.yaml.template +++ b/src/log-manager/deploy/log-manager.yaml.template @@ -34,12 +34,9 @@ spec: priorityClassName: pai-daemon-priority hostNetwork: false containers: - - name: log-manager-logrotate - image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}log-manager-logrotate:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} + - name: log-cleaner + image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}log-manager-cleaner:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} imagePullPolicy: Always - env: - - name: LOGROTATE_CRONSCHEDULE - value: "*/10 
* * * *" volumeMounts: - name: pai-log mountPath: /usr/local/pai/logs @@ -77,6 +74,19 @@ spec: cpu: 0 memory: "128Mi" {%- endif %} + env: + - name: ADMIN_NAME + value: {{ cluster_cfg["log-manager"]["admin_name"] }} + - name: ADMIN_PASSWORD + value: {{ cluster_cfg["log-manager"]["admin_password"] }} + - name: JWT_SECRET + value: {{ cluster_cfg["log-manager"]["jwt_secret"] }} + - name: TOKEN_EXPIRED_SECOND + value: '{{ cluster_cfg["log-manager"]["token_expired_second"] }}' + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName volumes: - name: pai-log hostPath: diff --git a/src/log-manager/src/cleaner/entrypoint.sh b/src/log-manager/src/cleaner/entrypoint.sh new file mode 100755 index 0000000000..41adf9708b --- /dev/null +++ b/src/log-manager/src/cleaner/entrypoint.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +set -o errexit +set -o pipefail + +log_exist_time=30 # 30 day +if [ -n "${LOG_EXIST_TIME}" ]; then + log_exist_time=${LOG_EXIST_TIME} +fi + +cat > /etc/periodic/daily/remove_logs << EOF +#!/bin/bash +/usr/bin/pgrep -f ^find 2>&1 > /dev/null || find /usr/local/pai/logs/* -mtime +${log_exist_time} -type f -exec rm -fv {} \; +EOF + +cat > /etc/periodic/weekly/remove_log_dir << EOF +#!/bin/bash +"/usr/bin/pgrep -f ^find 2>&1 > /dev/null || find /usr/local/pai/logs/* -mtime +${log_exist_time} -type d -empty -exec rmdir -v {} \;" +EOF + +chmod a+x /etc/periodic/daily/remove_logs /etc/periodic/weekly/remove_log_dir + +echo "cron job added" + +crond -f -l 0 + diff --git a/src/log-manager/src/logrotate/docker-entrypoint.sh b/src/log-manager/src/logrotate/docker-entrypoint.sh deleted file mode 100755 index 1b03c1fd6c..0000000000 --- a/src/log-manager/src/logrotate/docker-entrypoint.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Original work Copyright (c) 2015 Steffen Bleul -# Modified work Copyright (c) Microsoft Corporation -# All rights reserved. -# -# MIT License - - -# A helper script for ENTRYPOINT. 
- -set -e - -[[ ${DEBUG} == true ]] && set -x - -if [ -n "${DELAYED_START}" ]; then - sleep ${DELAYED_START} -fi - -# Logrotate status file handling -readonly logrotate_logstatus=${LOGROTATE_STATUSFILE:-"/logrotate-status/logrotate.status"} - -# ----- Crontab Generation ------ - -logrotate_parameters="" - -if [ -n "${LOGROTATE_PARAMETERS}" ]; then - logrotate_parameters="-"${LOGROTATE_PARAMETERS} -fi - -syslogger_tag="" - -if [ -n "${SYSLOGGER_TAG}" ]; then -syslogger_tag=" -t "${SYSLOGGER_TAG} -fi - -syslogger_command="" - -if [ -n "${SYSLOGGER}" ]; then - syslogger_command="logger "${syslogger_tag} -fi - -logrotate_cronlog="" - -if [ -n "${LOGROTATE_LOGFILE}" ] && [ -z "${SYSLOGGER}"]; then - logrotate_cronlog=" 2>&1 | tee -a "${LOGROTATE_LOGFILE} -else - if [ -n "${SYSLOGGER}" ]; then - logrotate_cronlog=" 2>&1 | "${syslogger_command} - fi -fi - -logrotate_croninterval="1 0 0 * * *" - -if [ -n "${LOGROTATE_INTERVAL}" ]; then - case "$LOGROTATE_INTERVAL" in - hourly) - logrotate_croninterval='@hourly' - ;; - daily) - logrotate_croninterval='@daily' - ;; - weekly) - logrotate_croninterval='@weekly' - ;; - monthly) - logrotate_croninterval='@monthly' - ;; - yearly) - logrotate_croninterval='@yearly' - ;; - *) - logrotate_croninterval="1 0 0 * * *" - ;; - esac -fi - -if [ -n "${LOGROTATE_CRONSCHEDULE}" ]; then - logrotate_croninterval=${LOGROTATE_CRONSCHEDULE} -fi - -logrotate_cron_timetable="/usr/bin/pgrep -f ^/usr/sbin/logrotate 2>&1 > /dev/null || /usr/sbin/logrotate ${logrotate_parameters} --state=${logrotate_logstatus} /usr/bin/logrotate.d/logrotate.conf ${logrotate_cronlog}" - -log_exist_time=30 # 30 day -if [ -n "${LOG_EXIST_TIME}" ]; then - log_exist_time=${LOG_EXIST_TIME} -fi - -# ----- Cron Start ------ -exec /usr/bin/go-cron '@daily' /bin/bash -c "/usr/bin/pgrep -f ^find 2>&1 > /dev/null || find /usr/local/pai/logs/* -mtime +${log_exist_time} -type f -exec rm -fv {} \;"& -if [ "$1" = 'cron' ]; then - exec /usr/bin/go-cron "${logrotate_croninterval}" 
/bin/bash -c "${logrotate_cron_timetable}" -fi - -#----------------------- - -exec "$@" diff --git a/src/log-manager/src/logrotate/logrotate.conf b/src/log-manager/src/logrotate/logrotate.conf deleted file mode 100644 index fc6e36183d..0000000000 --- a/src/log-manager/src/logrotate/logrotate.conf +++ /dev/null @@ -1,14 +0,0 @@ -/usr/local/pai/logs/*/*/*/*/*.log -/usr/local/pai/logs/*/*/*/*/*.stdout -/usr/local/pai/logs/*/*/*/*/*.stderr -/usr/local/pai/logs/*/*/*/*/*.all -{ - nomail - rotate 1 - nocompress - missingok - notifempty - copytruncate - size 256M - maxage 30 -} diff --git a/src/log-manager/src/nginx/get_log_content.lua b/src/log-manager/src/nginx/get_log_content.lua new file mode 100644 index 0000000000..1399f547c7 --- /dev/null +++ b/src/log-manager/src/nginx/get_log_content.lua @@ -0,0 +1,84 @@ +-- Copyright (c) Microsoft Corporation +-- All rights reserved. +-- MIT License +-- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +-- documentation files (the "Software"), to deal in the Software without restriction, including without limitation +-- the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +-- to permit persons to whom the Software is furnished to do so, subject to the following conditions: +-- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +-- THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +-- BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +-- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +-- DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +-- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +local lfs = require "lfs" + +local function get_rotated_log(log_path) + for file in lfs.dir(log_path) do + local rotated_log_name = string.match(file, "^@.*%.s") + if rotated_log_name then + return rotated_log_name + end + end +end + + +local args = ngx.req.get_uri_args() +local username = args["username"] +local framework_name = args["framework-name"] +local taskrole = args["taskrole"] +local pod_uid = args["pod-uid"] +local token = args["token"] +local tail_mode = args["tail-mode"] + +if not token or not username or not taskrole or not framework_name or not pod_uid then + ngx.log(ngx.ERR, "some query parameters is nil") + ngx.status = ngx.HTTP_BAD_REQUEST + return ngx.exit(ngx.HTTP_OK) +end + +local path_prefix = "/usr/local/pai/logs/"..username.."/".. framework_name.."/".. taskrole.."/"..pod_uid.."/" +local log_name = ngx.var[1] + +local log_path = path_prefix..log_name +ngx.log(ngx.INFO, "get log name "..log_name) +if string.match(log_name, "^user%-.*$") then + -- we only keep one rotated log in log manager + if string.match(log_name, "%.1$") then + local parent_path = path_prefix..string.sub(log_name, 1, string.len(log_name) - 2) + local rotated_log_name = get_rotated_log(parent_path) + if not rotated_log_name then + ngx.status = ngx.HTTP_NOT_FOUND + return ngx.exit(ngx.HTTP_OK) + else + log_path = parent_path.."/"..rotated_log_name + end + else + log_path = path_prefix..log_name.."/current" + end +end + +ngx.log(ngx.INFO, "get log from path"..log_path) + +if lfs.attributes(log_path, "mode") ~= "file" then + ngx.log(ngx.ERR, log_path.." 
not exists") + ngx.status = ngx.HTTP_NOT_FOUND + return ngx.exit(ngx.HTTP_OK) +end + +local logs = assert(io.open(log_path, "rb")) -- no shell involved: log_path is built from query arguments +if (tail_mode == "true") then + -- tail mode: skip everything except the last 16KB (what "tail -c 16k" used to return) + logs:seek("set", math.max(logs:seek("end") - 16 * 1024, 0)) +end + +-- buffer size (8K); ngx.print sends each block verbatim, ngx.say would append a newline to it +local size = 2^13 +while true do + local block = logs:read(size) + if not block then break end + ngx.print(block) +end +logs:close() diff --git a/src/log-manager/src/nginx/guard.lua b/src/log-manager/src/nginx/guard.lua new file mode 100644 index 0000000000..fed14bb5ae --- /dev/null +++ b/src/log-manager/src/nginx/guard.lua @@ -0,0 +1,38 @@ +-- Copyright (c) Microsoft Corporation +-- All rights reserved. +-- MIT License +-- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +-- documentation files (the "Software"), to deal in the Software without restriction, including without limitation +-- the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +-- to permit persons to whom the Software is furnished to do so, subject to the following conditions: +-- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +-- THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +-- BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +-- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +-- DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +-- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +local jwt = require "resty.jwt" +local validators = require "resty.jwt-validators" + +local args = ngx.req.get_uri_args() +local jwt_token = args["token"] +if not jwt_token then + ngx.status = ngx.HTTP_FORBIDDEN + return ngx.exit(ngx.HTTP_OK) +end + +local jwt_secret = os.getenv("JWT_SECRET") +local node_name = os.getenv("NODE_NAME") + +local claim_spec = { + sub = validators.equals("log-manager-"..node_name), + exp = validators.is_not_expired() +} +local jwt_obj = jwt:verify(jwt_secret, jwt_token, claim_spec) + +if not jwt_obj["verified"] then + ngx.status = ngx.HTTP_FORBIDDEN + ngx.header["Access-Control-Allow-Origin"] = "*"; + return ngx.exit(ngx.HTTP_OK) +end diff --git a/src/log-manager/src/nginx/list_logs.lua b/src/log-manager/src/nginx/list_logs.lua new file mode 100644 index 0000000000..8acf9edf1f --- /dev/null +++ b/src/log-manager/src/nginx/list_logs.lua @@ -0,0 +1,74 @@ +-- Copyright (c) Microsoft Corporation +-- All rights reserved. +-- MIT License +-- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +-- documentation files (the "Software"), to deal in the Software without restriction, including without limitation +-- the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +-- to permit persons to whom the Software is furnished to do so, subject to the following conditions: +-- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +-- THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +-- BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +-- NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +-- DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +-- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +local cjson = require "cjson" +local lfs = require "lfs" + +local function has_file_with_pattern(path, pattern) + for file in lfs.dir(path) do + if string.match(file, pattern) then + return true + end + end + return false +end + +local function is_dir(path) + return lfs.attributes(path, "mode") == "directory" +end + +local args = ngx.req.get_uri_args() +local username = args["username"] +local framework_name = args["framework-name"] +local taskrole = args["taskrole"] +local pod_uid = args["pod-uid"] +local token = args["token"] + +if not token or not username or not taskrole or not framework_name or not pod_uid then + ngx.log(ngx.ERR, "some query parameters is nil") + ngx.status = ngx.HTTP_BAD_REQUEST + return ngx.exit(ngx.HTTP_OK) +end + +local log_query_param = "?username="..username.."&framework-name="..framework_name.. + "&pod-uid="..pod_uid.."&taskrole="..taskrole.."&token="..token +local path = "/usr/local/pai/logs/"..username.."/".. framework_name.."/".. 
taskrole.."/"..pod_uid.."/" +local path_prefix = "/api/v1/logs/" + +local ret = {} + +if not is_dir(path) then + ngx.log(ngx.ERR, "log folder not exists") + ngx.status = ngx.HTTP_NOT_FOUND + return ngx.exit(ngx.HTTP_OK) +end + +for file in lfs.dir(path) do + if not is_dir(path..file) then + if string.match(file, "^user%.pai%..*$") then + local sub_str = string.sub(file, string.len("user.pai.") + 1) + ret[sub_str] = path_prefix..file..log_query_param + else + ret[file] = path_prefix..file..log_query_param + end + elseif string.match(file, "^user-.*$") then + local sub_str = string.sub(file, string.len("user-") + 1) + ret[sub_str] = path_prefix..file..log_query_param + if has_file_with_pattern(path..file, "^@.*%.s") then + ret[sub_str..".1"] = path_prefix..file..".1"..log_query_param + end + end +end + +ngx.say(cjson.encode(ret)) diff --git a/src/log-manager/src/nginx/nginx.conf b/src/log-manager/src/nginx/nginx.conf index 29d6c11b87..5bda1eb6c9 100644 --- a/src/log-manager/src/nginx/nginx.conf +++ b/src/log-manager/src/nginx/nginx.conf @@ -15,70 +15,74 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -server { - listen 80; - server_name localhost; - client_max_body_size 0; # Disable checking of client request body size. 
- client_body_buffer_size 256M; - proxy_connect_timeout 60m; - proxy_send_timeout 60m; - proxy_read_timeout 60m; - send_timeout 60m; - - # - # Health check - # - location = /healthz { - default_type text/plain; - return 200 "Log manager ready."; - } - # - # Get all logs - # - location /log-manager { - add_header Access-Control-Allow-Origin *; - add_header Access-Control-Allow-Methods 'GET, POST, OPTIONS'; - default_type text/plain; - alias /usr/local/pai/logs; - autoindex on; - autoindex_exact_size off; - autoindex_localtime on; - } +# nginx.conf -- docker-openresty +# +# This file is installed to: +# `/usr/local/openresty/nginx/conf/nginx.conf` +# and is the file loaded by nginx at startup, +# unless the user specifies otherwise. +# +# It tracks the upstream OpenResty's `nginx.conf`, but removes the `server` +# section and adds this directive: +# `include /etc/nginx/conf.d/*.conf;` +# +# The `docker-openresty` file `nginx.vh.default.conf` is copied to +# `/etc/nginx/conf.d/default.conf`. It contains the `server section +# of the upstream `nginx.conf`. +# +# See https://github.com/openresty/docker-openresty/blob/master/README.md#nginx-config-files +# - # - # Get full/tailed log - # - location ~ ^/log-manager/(full|tail)/(.*)$ { - add_header Access-Control-Allow-Origin *; - add_header Access-Control-Allow-Methods 'GET, POST, OPTIONS'; - default_type text/plain; - content_by_lua_block{ - logpath = "/usr/local/pai/logs/"..ngx.var[2] - errcheck = " || echo No such file!" 
- if (ngx.var[1] == "tail") - then - logs = io.popen("tail -c 16k "..logpath..errcheck) - elseif (ngx.var[1] == "full") - then - logs = io.popen("cat "..logpath..errcheck) - end - for line in logs:lines() do - ngx.say(line) - end - } - } +#user nobody; +worker_processes 1; + +#error_log logs/error.log; +#error_log logs/error.log notice; +error_log logs/error.log info; + +#pid logs/nginx.pid; + + +events { + worker_connections 1024; +} + + +http { + include mime.types; + default_type application/octet-stream; + + #log_format main '$remote_addr - $remote_user [$time_local] "$request" ' + # '$status $body_bytes_sent "$http_referer" ' + # '"$http_user_agent" "$http_x_forwarded_for"'; + + #access_log logs/access.log main; + + # See Move default writable paths to a dedicated directory (#119) + # https://github.com/openresty/docker-openresty/issues/119 + client_body_temp_path /var/run/openresty/nginx-client-body; + proxy_temp_path /var/run/openresty/nginx-proxy; + fastcgi_temp_path /var/run/openresty/nginx-fastcgi; + uwsgi_temp_path /var/run/openresty/nginx-uwsgi; + scgi_temp_path /var/run/openresty/nginx-scgi; + + sendfile on; + #tcp_nopush on; + + #keepalive_timeout 0; + keepalive_timeout 65; + + #gzip on; - # - # Get compressed logs for old job - # - location /log-backup { - add_header Access-Control-Allow-Origin *; - add_header Access-Control-Allow-Methods 'GET, POST, OPTIONS'; - default_type text/plain; - alias /usr/local/pai/logs-backup; - autoindex on; - autoindex_exact_size off; - autoindex_localtime on; + init_by_lua_block { + local errlog = require "ngx.errlog" + errlog.set_filter_level(ngx.INFO) } + include /etc/nginx/conf.d/*.conf; } +env ADMIN_NAME; +env ADMIN_PASSWORD; +env JWT_SECRET; +env TOKEN_EXPIRED_SECOND; +env NODE_NAME; diff --git a/src/log-manager/src/nginx/nginx.conf.default b/src/log-manager/src/nginx/nginx.conf.default new file mode 100644 index 0000000000..2adea886bc --- /dev/null +++ b/src/log-manager/src/nginx/nginx.conf.default @@ -0,0 +1,77 
@@ +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +server { + listen 80; + server_name localhost; + client_max_body_size 0; # Disable checking of client request body size. 
+ client_body_buffer_size 256M; + proxy_connect_timeout 60m; + proxy_send_timeout 60m; + proxy_read_timeout 60m; + send_timeout 60m; + + # + # Health check + # + location = /healthz { + default_type text/plain; + return 200 "Log manager ready."; + } + + # + # Get log list + # + location /api/v1/logs { + add_header Access-Control-Allow-Origin *; + add_header Access-Control-Allow-Methods 'GET'; + limit_except GET { + deny all; + } + default_type application/json; + access_by_lua_file /etc/nginx/lua/guard.lua; + content_by_lua_file /etc/nginx/lua/list_logs.lua; + } + + # + # Get the token + # + location /api/v1/tokens { + add_header Access-Control-Allow-Origin *; + add_header Access-Control-Allow-Methods 'POST'; + limit_except POST { + deny all; + } + default_type application/json; + content_by_lua_file /etc/nginx/lua/token.lua; + } + + # + # Get full/tail log + # + location ~ ^/api/v1/logs/(.*)$ { + add_header Access-Control-Allow-Origin *; + add_header Access-Control-Allow-Methods 'GET'; + limit_except GET { + deny all; + } + default_type text/plain; + access_by_lua_file /etc/nginx/lua/guard.lua; + content_by_lua_file /etc/nginx/lua/get_log_content.lua; + } +} diff --git a/src/log-manager/src/nginx/token.lua b/src/log-manager/src/nginx/token.lua new file mode 100644 index 0000000000..bb92adc16c --- /dev/null +++ b/src/log-manager/src/nginx/token.lua @@ -0,0 +1,45 @@ +-- Copyright (c) Microsoft Corporation +-- All rights reserved. 
+-- MIT License +-- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +-- documentation files (the "Software"), to deal in the Software without restriction, including without limitation +-- the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +-- to permit persons to whom the Software is furnished to do so, subject to the following conditions: +-- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +-- THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +-- BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +-- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +-- DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +-- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +local cjson = require "cjson" +local jwt = require "resty.jwt" + +-- check username & password +local admin_name = os.getenv("ADMIN_NAME") +local admin_password = os.getenv("ADMIN_PASSWORD") +local token_expired_second = os.getenv("TOKEN_EXPIRED_SECOND") +ngx.req.read_body() +local ok, body = pcall(cjson.decode, ngx.req.get_body_data()) + +if not ok then + ngx.status = ngx.HTTP_BAD_REQUEST + return ngx.exit(ngx.HTTP_OK) +end + +if body["username"] ~= admin_name or body["password"] ~= admin_password then + ngx.status = ngx.HTTP_UNAUTHORIZED + return ngx.exit(ngx.HTTP_OK) +end + +-- sign jwt token +local jwt_secret = os.getenv("JWT_SECRET") +local node_name = os.getenv("NODE_NAME") +local jwt_token = jwt:sign( + jwt_secret, + { + header={typ="JWT", alg="HS256"}, + payload={sub="log-manager-"..node_name, iat=os.time(), exp=os.time() + tonumber(token_expired_second)} + } +) +ngx.say(cjson.encode({token=jwt_token})) From 7c180f0b4277dbca53f870bb06d3789fbc92dd2d Mon Sep 17 00:00:00 2001 From: Binyang2014 Date: Wed, 11 Nov 2020 18:58:05 +0800 Subject: [PATCH 04/12] New get Pod logs API (#5048) Add new API to get pod logs: GET /jobs/{job-name}/pods/{podUid}/logs RESP: { "locations": [ {"name": "stdout", "uri": "https://nodeIP/podUid/stdout.0?token=string"}, {"name": "stderr"," uri": "https://nodeIP/podUid/stderr?token=string"} ] } --- .../deploy/rest-server.yaml.template | 6 +- src/rest-server/docs/swagger.yaml | 76 ++++++++++++- src/rest-server/src/controllers/v2/job.js | 24 ++++- src/rest-server/src/models/v2/job/index.js | 5 +- src/rest-server/src/models/v2/job/k8s.js | 9 +- src/rest-server/src/models/v2/job/log.js | 101 ++++++++++++++++++ src/rest-server/src/routes/v2/job.js | 5 + src/rest-server/src/utils/error.d.ts | 1 + tests/jenkins/test_rest_server_js_sdk.sh | 2 +- 9 files changed, 217 insertions(+), 12 deletions(-) create mode 100644 src/rest-server/src/models/v2/job/log.js diff --git a/src/rest-server/deploy/rest-server.yaml.template 
b/src/rest-server/deploy/rest-server.yaml.template index f3b268612e..ae2c3f74f4 100644 --- a/src/rest-server/deploy/rest-server.yaml.template +++ b/src/rest-server/deploy/rest-server.yaml.template @@ -72,6 +72,10 @@ spec: value: {{ cluster_cfg['hivedscheduler']['webservice'] }} - name: LOG_MANAGER_PORT value: "{{ cluster_cfg['log-manager']['port'] }}" + - name: LOG_MANAGER_ADMIN_NAME + value: "{{ cluster_cfg['log-manager']['admin_name'] }}" + - name: LOG_MANAGER_ADMIN_PASSWORD + value: "{{ cluster_cfg['log-manager']['admin_password'] }}" {%- endif %} - name: RATE_LIMIT_API_PER_MIN value: "{{ cluster_cfg['rest-server']['rate-limit-api-per-min'] }}" @@ -89,7 +93,7 @@ spec: value: {{ cluster_cfg['rest-server']['jwt-expire-time'] }} - name: WEBPORTAL_URL {%- if "ssl" in cluster_cfg["pylon"] and cluster_cfg["pylon"]["ssl"] %} - value: "{{ cluster_cfg['pylon']['uriHttps']}}" + value: "{{ cluster_cfg['pylon']['uri-https']}}" {%- else %} value: "{{ cluster_cfg['pylon']['uri']}}" {%- endif %} diff --git a/src/rest-server/docs/swagger.yaml b/src/rest-server/docs/swagger.yaml index 504a50915f..4d2aec74e4 100644 --- a/src/rest-server/docs/swagger.yaml +++ b/src/rest-server/docs/swagger.yaml @@ -12,6 +12,7 @@ info: Version 2.1.1: add get event api Version 2.2.0: add get task status api; add jobAttempId filter to job status api and extend job detail schema Version 2.2.1: a user can add/delete tags to/from his/her own jobs + version 2.2.2: add get pod logs api license: name: MIT License url: "https://github.com/microsoft/pai/blob/master/LICENSE" @@ -1485,6 +1486,8 @@ paths: security: - bearerAuth: [] parameters: + - $ref: "#/components/parameters/user" + - $ref: "#/components/parameters/job" - name: type in: query description: filter events with type. Could be "Warning" or "Normal". 
@@ -1738,7 +1741,7 @@ paths: ssh: 37508 http: 24661 containerGpus: null - containerLog: http://10.151.40.34:9103/log-manager/tail/admin/9893b6e0f9f5997f4a82f23bef39bf32/taskrole/a9f9f1f2-5e43-4423-88c3-022433b8cd7c/ + containerLog: /api/v2/jobs/admin~admin_444da84f/pods/07cdd036-1a7c-11eb-830b-000d3ab25bb6/logs containerExitCode: -220 containerExitSpec: code: -220 @@ -1765,7 +1768,7 @@ paths: ssh: 37508 http: 24661 containerGpus: null - containerLog: http://10.151.40.34:9103/log-manager/tail/admin/9893b6e0f9f5997f4a82f23bef39bf32/taskrole/a9f9f1f2-5e43-4423-88c3-022433b8cd7c/ + containerLog: /api/v2/jobs/admin~admin_444da84f/pods/07cdd036-1a7c-11eb-830b-000d3ab25bb6/logs containerExitCode: -220 containerExitSpec: code: -220 @@ -1784,6 +1787,43 @@ paths: $ref: "#/components/responses/NoTaskError" "500": $ref: "#/components/responses/UnknownError" + "/api/v2/jobs/{user}~{job}/pods/{podUid}/logs": + get: + tags: + - job + summary: Get job pod log list. + description: Get job pod log list. + operationId: getPodLogs + security: + - bearerAuth: [] + parameters: + - $ref: "#/components/parameters/user" + - $ref: "#/components/parameters/job" + - $ref: "#/components/parameters/podUid" + - name: tail-mode + in: query + description: getting log content via tail mode. 
Could be "true" or "false" + schema: + type: boolean + responses: + "200": + description: Succeeded + content: + application/json: + schema: + $ref: "#/components/schemas/PodLogInfo" + example: + locations: + - name: stderr + uri: "https://master_ip/log-manager/node_ip/api/v1/logs/user.pai.stderr?username=user&framework-name=34775529adebae576fbc0bf48d835386&pod-uid=07cdd036-1a7c-11eb-830b-000d3ab25bb6&taskrole=taskrole&token=token" + - name: all + uri: "https://master_ip/log-manager/node_ip/api/v1/logs/user.pai.all?username=user&framework-name=34775529adebae576fbc0bf48d835386&pod-uid=07cdd036-1a7c-11eb-830b-000d3ab25bb6&taskrole=taskrole&token=token" + - name: stdout + uri: "https://master_ip/log-manager/node_ip/api/v1/logs/user.pai.stdout?username=user&framework-name=34775529adebae576fbc0bf48d835386&pod-uid=07cdd036-1a7c-11eb-830b-000d3ab25bb6&taskrole=taskrole&token=token" + "404": + $ref: "#/components/responses/NoPodLogsError" + "500": + $ref: "#/components/responses/UnknownError" /api/v2/kubernetes/nodes: get: tags: @@ -1917,6 +1957,13 @@ components: required: true schema: type: string + podUid: + name: podUid + in: path + description: job pod uid + required: true + schema: + type: string schemas: Response: type: object @@ -3067,6 +3114,20 @@ components: - storageConfig - email - extension + PodLogInfo: + type: object + properties: + locations: + type: array + items: + type: object + properties: + name: + type: string + description: log name. + uri: + type: string + description: log content link. responses: InvalidParametersError: description: InvalidParametersError @@ -3226,6 +3287,17 @@ components: value: code: NoJobSshInfoError message: "SSH info of job {job} is not found." + NoPodLogsError: + description: NoPodLogsError + content: + application/json: + schema: + $ref: "#/components/schemas/Response" + examples: + NoPodLogsError: + value: + code: NoPodLogsError + message: "Logs for pod {podUid} is not found." 
ConflictUserError: description: ConflictUserError content: diff --git a/src/rest-server/src/controllers/v2/job.js b/src/rest-server/src/controllers/v2/job.js index 0831bb5e62..7621038f3e 100644 --- a/src/rest-server/src/controllers/v2/job.js +++ b/src/rest-server/src/controllers/v2/job.js @@ -20,7 +20,8 @@ const status = require('statuses'); const asyncHandler = require('@pai/middlewares/v2/asyncHandler'); const createError = require('@pai/utils/error'); -const job = require('@pai/models/v2/job'); +const { job, log } = require('@pai/models/v2/job'); +const logger = require('@pai/config/logger'); const { Op } = require('sequelize'); const list = asyncHandler(async (req, res) => { @@ -288,6 +289,26 @@ const getEvents = asyncHandler(async (req, res) => { res.json(data); }); +const getLogs = asyncHandler(async (req, res) => { + try { + const data = await log.getLogListFromLogManager( + req.params.frameworkName, + req.params.podUid, + req.query['tail-mode'], + ); + res.json(data); + } catch (error) { + logger.error(`Got error when retrieving log list, error: ${error}`); + throw error.code === 'NoPodLogsError' + ? 
error + : createError( + 'Internal Server Error', + 'UnknownError', + 'Failed to get log list', + ); + } +}); + // module exports module.exports = { list, @@ -299,4 +320,5 @@ module.exports = { addTag, deleteTag, getEvents, + getLogs, }; diff --git a/src/rest-server/src/models/v2/job/index.js b/src/rest-server/src/models/v2/job/index.js index 1e5e3db7e0..09e4d86151 100644 --- a/src/rest-server/src/models/v2/job/index.js +++ b/src/rest-server/src/models/v2/job/index.js @@ -37,4 +37,7 @@ if (config.env !== 'test') { } })(); } -module.exports = require('@pai/models/v2/job/k8s'); +module.exports = { + job: require('@pai/models/v2/job/k8s'), + log: require('@pai/models/v2/job/log'), +}; diff --git a/src/rest-server/src/models/v2/job/k8s.js b/src/rest-server/src/models/v2/job/k8s.js index 8e5e697129..67d64f0d24 100644 --- a/src/rest-server/src/models/v2/job/k8s.js +++ b/src/rest-server/src/models/v2/job/k8s.js @@ -76,7 +76,7 @@ const convertFrameworkSummary = (framework) => { }; }; -const convertTaskDetail = async (taskStatus, ports, logPathPrefix) => { +const convertTaskDetail = async (taskStatus, ports, frameworkName) => { // get containerPorts const containerPorts = getContainerPorts( ports, @@ -103,7 +103,7 @@ const convertTaskDetail = async (taskStatus, ports, logPathPrefix) => { containerNodeName: taskStatus.attemptStatus.podNodeName, containerPorts, containerGpus, - containerLog: `http://${taskStatus.attemptStatus.podHostIP}:${process.env.LOG_MANAGER_PORT}/log-manager/tail/${logPathPrefix}/${taskStatus.attemptStatus.podUID}/`, + containerLog: `/api/v2/jobs/${frameworkName}/pods/${taskStatus.attemptStatus.podUID}/logs`, containerExitCode: completionStatus ? completionStatus.code : null, containerExitSpec: completionStatus ? generateExitSpec(completionStatus.code) @@ -158,9 +158,6 @@ const convertFrameworkDetail = async ( const virtualCluster = frameworkWithLatestAttempt.metadata.labels ? 
frameworkWithLatestAttempt.metadata.labels.virtualCluster : 'unknown'; - const logPathInfix = frameworkWithLatestAttempt.metadata.annotations - ? frameworkWithLatestAttempt.metadata.annotations.logPathInfix - : null; const latestAttemptStatus = frameworkWithLatestAttempt.status.attemptStatus; const latestAttemptCompletionStatus = latestAttemptStatus.completionStatus; @@ -291,7 +288,7 @@ const convertFrameworkDetail = async ( await convertTaskDetail( status, ports[taskRoleStatus.name], - `${userName}/${logPathInfix || jobName}/${taskRoleStatus.name}`, + `${userName}~${jobName}`, ), ), ); diff --git a/src/rest-server/src/models/v2/job/log.js b/src/rest-server/src/models/v2/job/log.js new file mode 100644 index 0000000000..556bf0a1b2 --- /dev/null +++ b/src/rest-server/src/models/v2/job/log.js @@ -0,0 +1,115 @@ +// Copyright (c) Microsoft Corporation +// All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +// to permit persons to whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +const axios = require('axios'); +const job = require('./k8s'); +const logger = require('@pai/config/logger'); +const createError = require('@pai/utils/error'); +const { encodeName } = require('@pai/models/v2/utils/name'); + +const LOG_MANAGER_PORT = process.env.LOG_MANAGER_PORT; +const WEBPORTAL_URL = process.env.WEBPORTAL_URL; + +const constructLogManagerPrefix = (nodeIp) => { + return `http://${nodeIp}:${LOG_MANAGER_PORT}/api/v1`; +}; + +const loginLogManager = async (nodeIp, username, password) => { + const prefix = constructLogManagerPrefix(nodeIp); + return axios.post(`${prefix}/tokens`, { + username: username, + password: password, + }); +}; + +/** + * Get the log list of a pod from the log manager on the node that runs it. + * + * @param {string} frameworkName - Job name in `{user}~{job}` form. + * @param {string} podUid - UID of the pod whose logs are requested. + * @param {string} [tailMode] - 'true' makes the returned links serve log tails. + * @returns {Promise<object>} `{ locations: [{ name, uri }] }`, one per log. + * @throws NoPodLogsError when the pod or its log folder cannot be found. + */ +const getLogListFromLogManager = async (frameworkName, podUid, tailMode) => { + const adminName = process.env.LOG_MANAGER_ADMIN_NAME; + const adminPassword = process.env.LOG_MANAGER_ADMIN_PASSWORD; + + const jobDetail = await job.get(frameworkName); + const noPodLogsErr = createError( + 'Not Found', + 'NoPodLogsError', + `Logs for pod ${podUid} is not found.`, + ); + // The pod belongs to a single task role, so look through every role and + // only report NoPodLogsError when none of them contains it. 
+ let nodeIp; + let taskRoleName; + for (const [key, taskRole] of Object.entries(jobDetail.taskRoles)) { + const status = taskRole.taskStatuses.find( + (status) => status.containerId === podUid, + ); + if (status) { + nodeIp = status.containerIp; + taskRoleName = key; + break; + } + } + if (taskRoleName === undefined) { + logger.error(`Failed to find pod which has pod uid ${podUid}`); + throw noPodLogsErr; + } + + let res = await loginLogManager(nodeIp, adminName, adminPassword); + const token = res.data.token; + + const prefix = constructLogManagerPrefix(nodeIp); + try { + const params = { + token: token, + username: jobDetail.jobStatus.username, + taskrole: taskRoleName, + }; + params['framework-name'] = encodeName(frameworkName); + params['pod-uid'] = podUid; + res = await axios.get(`${prefix}/logs`, { + params: params, + }); + } catch (err) { + if (err.response && err.response.status === 404) { + throw noPodLogsErr; + } + throw err; + } + const logList = res.data; + + const ret = { locations: [] }; 
+ const urlPrefix = `${WEBPORTAL_URL}/log-manager/${nodeIp}:${LOG_MANAGER_PORT}`; + const urlSuffix = tailMode === 'true' ? '&tail-mode=true' : ''; + for (const key in logList) { + ret.locations.push({ + name: key, + uri: `${urlPrefix}${logList[key]}${urlSuffix}`, + }); + } + + return ret; +}; + +module.exports = { + getLogListFromLogManager, +}; diff --git a/src/rest-server/src/routes/v2/job.js b/src/rest-server/src/routes/v2/job.js index 9498449ad9..c1ee6cfbcf 100644 --- a/src/rest-server/src/routes/v2/job.js +++ b/src/rest-server/src/routes/v2/job.js @@ -97,5 +97,10 @@ router /** GET /api/v2/jobs/:frameworkName/events - Get events of a framework */ .get(token.check, controller.getEvents); +router + .route('/:frameworkName/pods/:podUid/logs') + /** GET /api/v2/jobs/:frameworkName/pods/:podUid/logs - Get logs of a pod */ + .get(token.check, controller.getLogs); + // module exports module.exports = router; diff --git a/src/rest-server/src/utils/error.d.ts b/src/rest-server/src/utils/error.d.ts index 01d5d5e997..f384dcc281 100644 --- a/src/rest-server/src/utils/error.d.ts +++ b/src/rest-server/src/utils/error.d.ts @@ -49,6 +49,7 @@ declare type Code = 'UnauthorizedUserError' | 'NoEnoughQuotaError' | 'NotImplementedError' | + 'NoPodLogsError' | 'UnknownError'; declare function createError(status: Status, code: Code, message: string): HttpError; diff --git a/tests/jenkins/test_rest_server_js_sdk.sh b/tests/jenkins/test_rest_server_js_sdk.sh index a97b9426f6..a4f3dffeb1 100644 --- a/tests/jenkins/test_rest_server_js_sdk.sh +++ b/tests/jenkins/test_rest_server_js_sdk.sh @@ -49,7 +49,7 @@ EOT cp ${WORKSPACE}/src/rest-server/docs/swagger.yaml . node tests/common/apiTestCaseGenerator.js -- "swagger.yaml" ".tests/apiTestCase.json" sudo npm install -g mocha - mocha tests/api_tests/**/*.spec.js -t 20000 + mocha tests/api_tests/**/*.spec.js -t 1800000 cd ../.. 
;; From 8cda19243a944b3e6e418fbdc6ea68b2cf7ef703 Mon Sep 17 00:00:00 2001 From: vvfreesoul <285009003@qq.com> Date: Thu, 12 Nov 2020 10:36:26 +0800 Subject: [PATCH 05/12] HTTPS (#5076) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix如何设置HTTPS访问 * lint error fix * https test * https配置完整中文版本 * https配置完整中文版本fix * https配置完整中文版本fix --- .../basic-management-operations.md | 119 +++++++++++++++++- .../how-to-manage-users-and-groups.md | 55 +------- .../imgs/aad/openssl_CA_result.png | Bin 0 -> 4771 bytes .../cluster-admin/imgs/aad/openssl_result.png | Bin 8685 -> 13813 bytes 4 files changed, 118 insertions(+), 56 deletions(-) create mode 100644 docs_zh_CN/manual/cluster-admin/imgs/aad/openssl_CA_result.png diff --git a/docs_zh_CN/manual/cluster-admin/basic-management-operations.md b/docs_zh_CN/manual/cluster-admin/basic-management-operations.md index 8358d42daa..2486d28a59 100644 --- a/docs_zh_CN/manual/cluster-admin/basic-management-operations.md +++ b/docs_zh_CN/manual/cluster-admin/basic-management-operations.md @@ -40,7 +40,7 @@ Webportal上有一个k8s仪表板的快捷方式,如下图所示。 -要使用它,您首先应该为OpenPAI设置`https`访问(使用`http://`会使访问无效)。 然后,在dev box机器上,按照以下步骤操作: +要使用它,您首先应该为OpenPAI设置`https`访问(使用`http://`会使访问无效),请参考[这里](#how-to-set-up-https) 。然后,在dev box机器上,按照以下步骤操作: **步骤 1.** 将以下Yaml文本另存为`admin-user.yaml` @@ -151,4 +151,119 @@ cd /pai ./paictl.py service start ``` -您可以使用`exit`离开dev-box容器,并使用`sudo docker exec -it dev-box bash`重新进入它。如果您不再需要它,请使用`sudo docker stop dev-box`和`sudo docker rm dev-box`删除Docker容器。 \ No newline at end of file +您可以使用`exit`离开dev-box容器,并使用`sudo docker exec -it dev-box bash`重新进入它。如果您不再需要它,请使用`sudo docker stop dev-box`和`sudo docker rm dev-box`删除Docker容器。 + +##
如何设置HTTPS访问
+ +为pylon配置https证书您需要先获得数字证书,然后将数字证书相关文件保存到dev-box容器中,在dev-box内您可以找到`services-configuration.yaml`这个配置文件,然后您需要把已经保存的数字证书的文件路径配置到`services-configuration.yaml`文件中。您可以选择自签名证书或由CA机构颁发的证书,接下来将首先演示自签名证书的配置过程。两种证书的配置过程是近似的。 + +### 配置自签名证书 + + +#### 1. 进入dev-box容器 + +要使用[`paictl`](#pai-service-management-and-paictl),请通过以下方式进入容器: + +```bash +sudo docker exec -it dev-box bash +``` + +#### 2. 在dev-box容器中创建一个文件夹 +当您进入容器后,您需要创建一个文件夹,并在此文件夹下生成自签名证书,我们可以在home文件夹下创建ssl文件夹。 + +``` bash +mkdir /home/ssl +cd /home/ssl +``` +#### 3. 使用OpenSSL生成RSA私钥 +接下来输入的命令多次会用到FileName参数,您可以选择合适的文件名来替换FileName。 + +``` bash +openssl genrsa -des3 -out FileName.key 1024 +``` + +这步会需要您填一个密码。 + +#### 4. 生成证书请求 + +```bash +SUBJECT="/C=US/ST=Washington/CN=FileName" +openssl req -new -subj $SUBJECT -key FileName.key -out FileName.csr +``` + +#### 5. 生成证书 + +```bash +mv FileName.key FileName.origin.key +openssl rsa -in FileName.origin.key -out FileName.key +openssl x509 -req -days 3650 -in FileName.csr -signkey FileName.key -out FileName.crt +``` + +#### 6. 最后结果 + +在当前目录下,您将会发现有4个文件 + +
+paictl overview picture +
+ +#### 7. 设置services-configuration.yaml + + +如果您是第一次配置,dev-box容器内可能不存在`services-configuration.yaml`。您应该按照以下过程来更改配置文件并使其生效。关闭pylon service,将OpenPAI的配置文件`services-configuration.yaml`拉取到本地,更改配置文件,上传配置文件,重新启动pylon service。您需要的命令依次为: +```bash +./paictl.py service stop -n pylon +./paictl.py config pull -o <config-folder> +vim <config-folder>/services-configuration.yaml +./paictl.py config push -p <config-folder> -m service +./paictl.py service start -n pylon +``` + + +如果您的容器内已经有`services-configuration.yaml`,您可以省略拉取文件的过程。请注意配置文件中的master_ip就是您master machine的IP,而不是您dev box machine的IP。请按照以下格式来配置yaml文件: + +``` +pylon: + port: 80 + uri: "http://master_ip:80" + ssl: + crt_name: xxxxxx + crt_path: /path/to/xxxxxx + key_name: yyyyyy + key_path: /path/to/yyyyyy +``` +在我们刚才给出的样例中,配置文件内容应当为: +``` +pylon: + port: 80 + uri: "http://master_ip:80" + ssl: + crt_name: FileName.crt + crt_path: /home/ssl/FileName.crt + key_name: FileName.key + key_path: /home/ssl/FileName.key +``` +重启pylon service,可以通过https来访问OpenPAI。 + +#### 配置CA证书 +##### 1. 将CA证书保存到dev-box容器内 +要配置CA证书,您首先需要申请并导出您的CA证书,您最终会得到一个crt文件和一个key文件,然后将这两个文件保存到dev-box容器中,比如存储到dev-box容器的/home/ssl文件夹下。如图所示: + +
+paictl overview picture +
+ +##### 2. 设置services-configuration.yaml +这一步您可以按照`配置自签名证书`过程中的第7步进行配置,更改FileName字段即可。例如: + +``` +pylon: + port: 80 + uri: "http://master_ip:80" + ssl: + crt_name: n32.openpai.org_chain.crt + crt_path: /home/ssl/n32.openpai.org_chain.crt + key_name: n32.openpai.org_key.key + key_path: /home/ssl/n32.openpai.org_key.key +``` + diff --git a/docs_zh_CN/manual/cluster-admin/how-to-manage-users-and-groups.md b/docs_zh_CN/manual/cluster-admin/how-to-manage-users-and-groups.md index 09019a9c41..3eba96b8c5 100644 --- a/docs_zh_CN/manual/cluster-admin/how-to-manage-users-and-groups.md +++ b/docs_zh_CN/manual/cluster-admin/how-to-manage-users-and-groups.md @@ -25,61 +25,8 @@ #### 注意 -如果您在基础认证模式下已经设置过一些用户,您需要手动将他们迁移至AAD。一旦AAD认证设置成功,您将不能使用原先的基础认证。 +如果您在基础认证模式下已经设置过一些用户,您需要手动将他们迁移至AAD。一旦AAD认证设置成功,您将不能使用原先的基础认证。要设置AAD,请先按照[这里](./basic-management-operations.md#how-to-set-up-https)的说明为OpenPAI设置HTTPS访问。 -#### 设置HTTPS证书(以自签名证书为例) - -##### 1. 将您的域名保存到环境变量 - -```bash -DOMAIN={pylon address} -``` -##### 2. 使用OpenSSL生成RSA私钥 - -``` bash -openssl genrsa -des3 -out $DOMAIN.key 1024 -``` - -这步会需要您填一个密码。您可以跳过这步,按Enter键跳过,此时密码将会是一个空值。 - -##### 3. 生成证书请求 - -```bash -SUBJECT="/C=US/ST=Washington/CN=$DOMAIN" -openssl req -new -subj $SUBJECT -key $DOMAIN.key -out $DOMAIN.csr -``` - -##### 4. 生成证书 - -```bash -mv $DOMAIN.key $DOMAIN.origin.key -openssl rsa -in $DOMAIN.origin.key -out $DOMAIN.key -openssl x509 -req -days 3650 -in $DOMAIN.csr -signkey $DOMAIN.key -out $DOMAIN.crt -``` - -##### 5. 最后结果 - -在当前目录下,您将会发现有4个文件 - -
-paictl overview picture -
- -##### 6. 设置Pylon - -修改您的`services-configuration.yaml`。 如果您还不知道什么是`services-configuration.yaml`,请参考[这个文档](basic-management-operations.md#pai-service-management-and-paictl)。 - -``` -pylon: - port: 80 - uri: "http://master_ip:80" - ssl: - # self-sign - crt_name: xxxxxx - crt_path: /path/to/xxxxxx - key_name: yyyyyy - key_path: /path/to/yyyyyy -``` #### 设置AAD diff --git a/docs_zh_CN/manual/cluster-admin/imgs/aad/openssl_CA_result.png b/docs_zh_CN/manual/cluster-admin/imgs/aad/openssl_CA_result.png new file mode 100644 index 0000000000000000000000000000000000000000..3a2a23bc33901eaf26fe12b35ea0d946b6d8f952 GIT binary patch literal 4771 zcmcIm`#+QY|3?SpE>RA5WaLy%OQk}#oN}s&oNX(Lq-O8gWKQINRJEEEjh*#t_Z7;h5io97882^^v3<>BeLC z?|2JA+_@1u=>(;uA(D&TH~pcO*xfxDzGBSAO=;{E{;A+`Kj{xW@@LG>Fj^By%C#OS zJ1pW2y*Gupxv?`xZ4tCRzV9>Uw{~H4kKDBpLMO>K?8|L|o#?TKnpQVwfhp+Klpf9a zLDQug=M`?E$46WyVTw1Y|J&`PA&o&(C(g%_Hr|%l8-9T*w0p#*V}_jDtbc|kkB=hv}3TwL{Du1M1Ekgmh-;hq;S zH!m1WcA%=mkLcYR8CWzKJi4fL$Fc;r((4Qvvd4vOil9c4MuJ~#UIFK%_J{K^YSgyn zxkb=o^SLt81i)&m7WAz`%oFf3o^?deX?Q0>3p+@f+zWM|MWa66-5`-@Rf+eEA_)0J z+BKKII$dN!SRZ>7F?`_ih$Tcasla#&{e#4S?Bva)2p+fEoBrkwG zvKFU3et+o61F=m-KD3*>Z#3(~P`t~4W&_R<>qo1Su}k2J=dWD@{@T7#9;3t~`T?vB16 zHhY)9EL1na7#6T{_A93Zc!zOXMlN~&a z!l}7)K*ca6%wm&Aj<)~NGv|j1he#)zvALc}whC+Kwyf+njRYc&?>y8V_WFx-ns0h) zx|{7L%vN}VUH>SmgVWw_}e0Z zUL>_HuWgMXfl91ZzG@iwmUw7wH^hn}GHS#)f6Fd6mJ*{#ICEgT8T0@>bglHV%S$sl7VFuMpjXg?1bIL~YWEZBkRk<;3+rf0&A@nH+{T1kNRZ9SF!i0k;`Uw2yD6IyodW$N zu-UKx$Bc^a7wB45Tqit7mfkwlxl}o#9Y?4)k)#?vsy00TLt5zEXqsSCA|k}Vt-Q-b@>lGjc+1C^&W0`7+6e9Nrn*D{#GxDUw$ zQqJqx91=9^)J1o(u;^L?&js;<5hw@y^K>s}{A@qXtUMN9q;MCD~vU{Y1z ziJqmv`&*}2s82@U`e$PYeKRj4+J;d-JzdhQVdn0`1stLe4@09+BKUtY_>BX~;YgkP z#Pe?Dl-F6kf3y>h7k_lRjlB#yoI0;I7Kh^NYoKU^XO}g-7fq*%qHD)=g)SCf4tXTf zQcdtE#lK=h>81}g{9r}3IMKcV^cgvUmTM*b%>mzL)$d5inmu9|O}*H!EayLS6z4&( z!YZIXvC3fj0^z-=!V?KxKlC^~r!Z4iKto_auF>;>QAUrWE_Ph!AX^~isGP1*lU+8s zri(ZuGN}M_$k}em=(2SSOJd1W6xI;Q!6FD}_;8?Pc({HOa;mWw9n#PaD(Oe7B1!T~ 
z`&IeoUw4MT8WNO8)@bEl@J9M_{V_9xpnS!oxrDO?zx2jb>u`g`KHEXuAONa&|XL-LoW0$jUY>X-eTj z;xkh7WvQ&Ipf&$>@9MoUekcO_2VFtkz+G8P2z`)0wS+V^m&$Lu7j3EQTXj!n$1|;B zNTBA&G1wKuvqBRqxxPu7`jV;3wPKA{kZpOk>XKKub%xPy@JMbluGxGWY=muFRRd)t zXzbDxaX+i`q8xU0m|fcHn-e#EHy zg;8ophPn#fS+fL46QeM5R0%V%@9v3=dQG}%hgYxuM%O(jW;XP|y4xyKo3q0yo>EuN zL6Mt(!PvV};X*3S!Ka%7{%knTxDu8BNERJpMHMa{FT=UJ1PK4Ws#l3S>vibAozO{? zV;2v7zm0SGdL)R~BtxQaJ`auhZ^z6ea$@2-Hd<-jL4D6qg^GR|wGg?-GXcLB5tSQd z@YcFn!8A3L(>kpw0&ITlIznv~dY|RrMEz$OJvB#_-4=+{`1>1hT0ai)Gc{tB#b^qF zFD#C+Z@DCNa}Q{04rrbhJAvL11mDJ!lS}ZIjI*}GthX?WuxCdV6ZytlLqjTiEAz|} zNsjZty*p``E<%8pGYD?p@B$=xo7#FWeKAcC&AmD)o?vT1IG*p}Cq(}W?#wYLr;x!V zXeIs*pWY3mj+>|8=ZuDT!8myu?Cs`=qtlFHS$xQ?>!5=>JQR_W>IPH?PC_-u%I&Na z1H(4`LgynUO@W}Au9dYV@vr!I=9mXSe)h^J>A>fKs9+tzCQzvSCnCP#KJ{?{qX)=wVKi@U{`lhe< zxnmE39ot5na;BzcVaSL1U9=lF3Mm&S%_!L^Cl=}zh-aKywP;`RfB+Dnz%Al?)Ra$! zk-6&Dd2mYixfy*3SjJ-`n)!+V*bpH3fB|{iN?Yfp1WRerh`Cb&G1PGu`BL1hQ_>-d zhTP#$S2HcOqK9s;f{t~JtGjPD-fvMI3n@l^YifQPu9N*@yEkAi8}hJAJc(ESM(^c+ z#7auHGq+t7$j{Tj8LY(4(60bodY|F~EOxD}vzYB{7!|u@0@YhRAh2B@u5N7uRp`hh_* zsyDX4xj<$}ds?LGZ+&}=SWUYyVpP!y!{Ux588y`DwHJm1`HVGP`I$C#6Zn8d{d*jT z+Q~t)&(9Y7&!s24vOggcc=tvYynw%zDxpX0V)1aeYI z$8F2C74|i1tp;Y!o9|s37Gu4cmJm(Zt&_b|23C}(CB6`A7|Y%1o+*X2UXl1K*5XmS z+2}19G9^n~?a{eo_PaxMO|-Hkom!a@lj(fx+FY%9ba(S0eiYW)?)sRismhvRv?pnU zy7xrtRtF=4_Ky6jBUHfacU7f(*XXjpTi3n=fmo??_tZREOnW7^n4O-2k-vZ&j!&hLOD$? 
z=kDw738Z1W|0(U;3Y!$9sDjsMKrQckCq>xt2HaCG7o#tqQuqm?LqD4<#qH=dUp@A& zimqF-^m7B!1Io~q+OAt<#Ra-vzH?fVZPD|tVkuc~DVo$n^+!oa48p9UALg-T)p8o( z#y*mJ0^J>)Oz^A8fREdfdH56h2)iqJY=vSg*Qd(MdiM0x90M zg2X-;qnIm|0Aps|ps~mYW5Xq?H&u^Q*w>$5R#DQlG}`OT8kWJRi2*m`QZy37EoWgM zrZKCyp!0;ncKKbmPuY_xyg~3hmU3`Rr>4DZ!7uVCKC?Ui^`K+8{bO{13NaFOM+V&` zVV(jm&0Cuc`JEa6dy~5G^YK<{%s?egQ;xnNO6g>0#KIB*L)Jf7#fG~%EMIkY^=fVV zPuD*s>=stI^ngI27XRNh#k|Y76+#KOjq$GQr?Ya`7wpLIwf2da9T=Sw%=|gMi1M(U zL)R+Wn_{i#Fli{iziraFlv*1VfJgm~ZJ?T&$|5J%priEcpLl|iFxub~dZrciz5KU+ zBF+sYI_z|0T^bJhnmPZ zfw)`Z;BgF>zd`ss6COYzEqrGFY$2vlxYa|afG$_M%OtdnsVerNkI z%d`E3lqutZ4Dc^bpLrH~>gB!8_=a~Lq78K?it z4Q~_dud3=9{lsuTP{e0=@b9ge`xo{GLZ|b9;UG!cC3J@1MY<8f|C@4)ij)sPQ12R$ z`Eayzopj$%hs{QauzL<84a!HK3Fmn{A+{gheo!IS077Q+S|H{66bkK;5y!lXIvSMH z-c5_c(*mn-o9JO>CjIjF8g_MhKoNL!+@he}(mU6lsVNUIP1wwPjtCml-JNkdjd1&v z48pE?M9?HC%f&F?N8Ee@_ol?^9CofSY@0`3lWm}bFeu{5uND}t_B|zCHMJk;sli15 z*b#2KGr*haYH}_1H+--@SG8t6GYn)J_6+z-tqvau@?3AvgSmCZ3s7JVDcG&l1?O3CG5;<0D8s2!HjXv@}rb6@_t&x_jNRvpL zA0&lU!|Dg=i__cL;W5pzWyAgQM!AzEp>>&Ge1>@|He;Une5zC5Egj}{Vw?y(<4Xir zOX&F+TI_|T2RJ-cogG>1N!$cKoCbC-zvg^tF_HrGNb#TMiHSPiy7=@c(UU!hg9S68?47cVQAv~n9cyTWR8 zQ_`^`G^kSL(Uo;fDCXUkaD*FM*cG9J>V^653ONM6jQPEJb>~X_!HVWKDWWn1&3MK| zaH4Bn4Whv>1=cLl?`ai#3cS8UG literal 0 HcmV?d00001 diff --git a/docs_zh_CN/manual/cluster-admin/imgs/aad/openssl_result.png b/docs_zh_CN/manual/cluster-admin/imgs/aad/openssl_result.png index 24903b772cc1f71f24a11bf9d3b0c05140c9285d..74092a6e73f983af37106304e69db05972d96aac 100644 GIT binary patch literal 13813 zcmcJ$cUV*Fx2PLP2rWqONYe!8Sw#EzEX_o={MJm_ekYp_0gS)vH;Pd_P9oUfEo^68>W zZu_I?hhY{%`>-#o7T>c|Eno})a!vrbHov6Z-B`Z$3sQ`#V2RWN^5~1F!~2NCht#3` zRD{KE`&LDb$GhBW3?MaG4kidyC_fJsH8}i+r&p=BRl|G9FWi;0eH3cGEZR_hs!%un ziyRDZzR!o91Mzbld@@>Dtn0EO`j-qTRCsD{@Kb3;fPl^68AbLcK2My4!cnyr?$QaA zM#F3Dl{K^deB7Py*Hdc;l|jv6$o=x*KxH$F_g&t%J0?D?Ej+Y{=k0GjU90rxq6C`k z3$4vO=P?sod=epwy;1HUg}!&~*RO!>ir(4o8n#*m`d0E7N7i{P@gn5Dz?K^X@6dS?($Bo!NlqEQ;qiF%0WJcsR3!5 
zip0Tr&v~w2gYymqxuyoAd_`uSLetJ#`a>cgDm6j6?m z(ZXBqHB1o5%}_Grr$Sg0ssb*tbJ}}3GwbQkch17UJ|)&os7^`Qe4ffaDZ7T?dfH@o2>8i3Wze)vt4+*_Wf;<^Xq^R?c@(JjrG1lnok7*2YIvD5*^+K| zTYYi`#?P?uvD&(o)4J*$MLcUo9fw3d9Wbc#wN9(~(~ReDs?W9j+@@SY&!_l+n$8)| zW^#Nds1tI_={AcrQ14wTC1;N<4WZK!GC-9*>tn;-l|D*yNhFFE?2RFCK_vuxc_N{eiHu6zqtu&(11x1+X^I#X>NggsWA+NqtM)2^X znP*x*@@RcD2N-SJN~}FOsSFf$$@St8Ej{l34NNhZdH3GPF{q4S z_(IXLUu^rqm6@E$ot`D!rZ`AR)BMUSY*?q+npR^yN*JdSJ+j|`H~rO~q~prfA6vFR zt8!U|sJm5hr~=q2s_p8vU#KeyBJG$*Bd$=x>o4TZg!in~r(9sJpVRB`FtXGxKCR*u zPUN2aRDA%-GUMbMO)!hH{)8HL(DoXlsI>7g>1gkZN_w|r z>R(64WIf>4Mhic>EaSu`yDML%-+dsBYRnGpjv~sBu=)ip*&An8vEB|p^mnh`+YY*w z+i(NloiKdxxx0V)6hK3sRk+CeLhWo@!NJw<&$jEmX&)js)_g*A*{z2n-9z8CWe1UV zly<10GpnN`pM5&Bvz)s*99RAxHvN7@0c?`}p^&^OWi#5(sk@EwjT%LH>aizARi9xY zT%+`BDh-PkD<)*r*xf}_IV#A=ryb!S456Bq3_RB<*1KpR9QqR76m*p42fBONxwB$y z(QW?P^BcFFrj!Fd8l{$;q)LleN?Q1v7epQjVVZPpEo7zHI3U&7ZJ1MdW9GFu2BhuH zkFl(2aN_swzGKc7^-yFi>^23wILKm5Rv*>S;<)dK#Dp|367Cbsp^1#)`P53jOt21d z)Ejz=wmS{ZD_V;e=)kkrKw~s(W8|UUi0TS|fg$n>&N2ULOfUgvzSmrZlnkQ=JCv~> zHAO8fz;Xsp$u27I3SdH@SvSBy@6bl0sttA%s2;5BKKJ(f;O};jio>faXSaFeIq+9Y zS2(VoKcGM&(=8TjP^=WHe(k#qMWHb+7gFN_j_+ozt6N*n4)}1Ff)A1d@O8u4s0%x= zyoZkZNHRB9ZOZm0F$-ymq+a1_Q$H^8CY~$NDTMCnG$Ii7M~2E$csH zaVJ`tMgf!+>=g;FZ!h~IL`Jf{5yeu5`NK_$*>>Vu;7miw zJ=yDQoRDc_3Vw@TkZ<|jZU#mv;oIAVj8$H=gjm~&Cmv3Oo^qzQz>d~r;{Yx66oEGz zYl*Xl8TKonA7X`TPA|qb+`NhjX^#ok6S=A2o=A>MQ-@#D*LK8!4!6DIazZX|92Kgh z-Vd5DacVTZ-^~-;;;WRpgDw8(Az)4|td$8A1UxV2~O^cnVq^ zngC=W)YRI_Ckv4vP&WYtBmp8N=@z+mzK^tpfz(F${=vh(r_^VEwiom7gPNY*W+wfz zKr57tbVlIjNGNhW>$u8lG@I#{Iq6)@*Zv?RuNf4ui@blrv@;x0^Yr=O=h(i02_YSV zOoXe~(Fy+j`uHE>oQ5;}eZ7T0rxw5fT*LnU6sq!1n|!%JtJ9O@5^MLyT}#btthUZ? 
zt)NLXOZTrM0DYKqtN3e2s2dDos%Gud&At+RPx%3ursT|rDvNO9*05*gRBon5D5aY0 zWm2bw=5&?5)Eb#x6>uJ0d;Zx^F{Rd}@qHI-HgUvMKS*-m8BGG!RO3~zn$g$!7)M&w!S8wm2YZ&+SIM_#4m4=GXW<(+UnVaW0kMfNp!0x3tGREt$a?1gvZln%f6Z+4bGS?O5Wrwo@LlZs{gR2x`g-cny1yOxd9gzH$Z zrgJhvuHk=N*MjEgl+co<&YFodm^Z@&8OcMsj>f1ZslmM5F|=T5c3rNP!OZj7n?@n5 zVAf4j3QP%hYw&^nI=u1y+EMHXIuz41r1G z-p7VWxw}5TUy@u>KI}QOpwEo%7qg2##o?0606MjCv|KY{3SpImMJ-@o8fLRjboIt zVYueL=*oLd91g>rr5{mhWgCm5kd?p6 zyf94?zKy{42=QTN3cG(`TXjFrN7rZ43#Kq-H|E$u2y+u#mEnLk88Zc%BNmJozJgYS zCL}n@{Jv0+=oM@#EeIB)Ba^G~8jg@rGn*Rb(UT#!Ii)|7hs%WuhyGLtlqsxMQCBGJ6PbD69!fFMUn zkvb1@L1p2fZ^=zm8P+?8cz}BrmB8b9S(|y|J5E`Ui{yGSrEk2mCET1Cu+$RkuP}i4 zafiPq6Jq!!k6dn`>e)JmW4^s@^DkSwg)e1)*nn$ZC}H0kjn`R7bKF8uy=#2>U`Ebm zAS{bsJzz(Qi73=Gc=%H)?6XYMko<%MrGen;<|f&DH|KvSTz~4j=5oj8Y`{n`4F{th9{n2;Po};=cp#*B$_c}LT!b6#={^p%-#$D**%X`?Km>{ zhDbN08d9n`mrgx?`fXowtu=P`3ZqOeN1lu;02pBinX%)Ww~pvH?P<({sSbBX{ly~3 z^?6<`hWkR!-gyYeOj5}&{av^^;LAg^ODyG9g^Z6#;7xjricrY?Z^SK<7VThau_g=xKch123FU3q0Gog&DYO-nSv2g6{zrj1i#{L;B zr(>AcZ1~*dfn()*iV}xx>XClatpo)Yg3hFy?y`SPA?1S8T@2gOsLNb-%Ff6jBPGx( zf?SYMCN?D3PRLb4yG-yI{pDx<)QpyZ+>ND?Ue^08$^mrXe_&g8ffnhku`G%!g&9l= z4bHwKVN`d66M<>6`Ub&e;!P=gq?0SIkY(BI3UR_WPKBU|BGU7F>aphQc7h!S*>O#95$FPS^1&6uGtAmydnq|K7EJ zIGV7!U2&umf*A%mW5QhfJ@DL7Rp&To<*Azm!^o;+*q!4xs*(y)U(KX2irPjzPFLCf zObQdq5@GyI;o6rS%+^-=+TEY*X3{jc9Ar@}%Iu$emOpL`LaL@ngM~PHWNaFrWYdDL zkiQHd4*?mF>(=BCZsq34b|k}3L=I)|DpKg;yPsmHBN)*a>8ix(jtcSwf7OMxS~*Sq z!SU4p!tv?<;5Z#w!(Z>6BB6SrWCD*~U_w~W`z*5&XEmM4!%$->)M@NjcK-$C@<__P zN!t8}6LM1N%rTXFRL}8H91CgjW4B}L_<3jpBoFX|K`(_NEN&1_wPw=;7=WTFjKIcx z*~KuYd2X^^S6hgX0TodH_DZUAF2M%H;2^6K8;9%HJAgBqPj{pN(VKCoZ&Mu2F#L6KYRQ;R-1D zaVVMIp8wD7$T!(k3(sLW-w?Na&$AQm4GSpT#BYvX&FbGK(cL?-s~j6iKGn6lCfBgh z3&!QwWk~VD3+3{BPKm4#fwi~)!tLHaxLxi$;R6Nmcm{D>w{st4og%Nt5iF4WR>z_( z@fsAsd#&%7tJ(N_XLZpC_EllsD%&{6NHq+=a0Q_~nM4-pcb7FjZXzN6uXUOb_}XF38=feMn-R9gEFjWT7jOyPE%g@90|M!Q@c{Lu5bE1vL zqEre0b+P7`HGw@y00?O1{p2L0>af$Qd(c19`y4gzv+d({FpQo(u$KRO$p3@U{XjCh 
z6V8*S8askXD-_GFDpl`&3Yslo-c#5uYjAI_L^m~3RzL^}CC%tCIP&ei?DLuuv-pO8R zmgE7Js#KMfGJ%$THh4Yj@z+_iMvS1-LKaG#!o^?5zrY01p>GxCtX%llIm#dWMzpw( zPXT2$#>ws7`m4H&`-ywjS;ir2gPgX%`IMuainMcayDPh^Mc2z{694uW&cij*C?trb zwyrx{B~o2T)Foxmb>uO=tr8fasUOQIknwIXuX}U4s_dzaxNwd;PEIn^(~<7l|3U8R z_nJI2Mo#U<6kd0YP^yB9O|nHaEL|%(@qm8-mfLZs&{nG}1k|erm+c2HUE{NRjQ%`$ zs1jlh4Z#asMHe5BVo%g$Z|(6w%qbJTD#*`_G#<&eEt}V?5Ud@EvDQCeIq0As%B*|e zuaB(X;K?Qti2K@GsQ_ZGxH0>(x9$3Q#~`ycxw5YIq1SIbbANe38FcIZ7P|tLa_G=E z?2!Bl>m1jbE72)2@%__}kC$co7U@-uC%wi?NnZEYKVH{8zwzNcrwP-M(TS|cqDQYKjt>K%QPbIUPn8%9^#H{9%qck`=3%aQ2<5ya?hu@56^Z7uhTb|m*HSd z05hEp_mfn!)Ct_0??7?Z=5 z^1dmE3Q|(_SsMDzgERy6k6ua&Fqx5@HI{5vkummrhkI)W1@yWB_+y!%Vnrq4Y4b-l z9^vXbBQQ|b>JM19ZJ9bc#KyXwm0Qs2z4>XF#w95}EB zO~*tqM)W9iws=nPh1**DlEk+LNVMp_nV=f~j(mK8u$u^&}lHMlZ=s49dZHsC`yM~DUu2+k7k z+UQ#5e_>>ej?#~$S4tF1RtC_V75moc^2!v~&KetIHLc!7dYh^jIg*(M{ekHWi6EeA zeou37=0Y?zn!39o40$&(DJp1PA%5OPF?FXvsw>JulK6HD-+j>_{S=sk0hnK1I6mH` zuOKj271_S|6D+)E%K9-B9b`&9d1i2#Z@QO-d|ya-dtaBYQpHxa{Hg}jaNa2aR1kzR=x;v3HrW90!*(%95y48KErpd@(m! 
zlAhXB;o_gY<|d0a;$pGFFVD%{DU(c8uElaAE+#}kh__C-gGs&p!I z3M&@YKCh28Xv(MSB4df~RyHU;1*&Fy|2S3WhYn=#5PODQ zn{DkI$HPCg#-)qJp-%XC=Uc3FaS+v*?&dv!b{JX4aa>L}1yIg9g&KTx29X>^zacVp8vzX>K2;ZSq@5Tp<@{^m zQ;w^jdt`&&VYp33k5kvUeFEH2h_R@m>-q5QWZvYTxedFP!oPlN<|u=Moy&+I$gc1Q zVR(O&zAWpR#dSRZm|Zwtd*>+hRQ7RehiUHULih1mYsD^de2uF5uLU#?ZjiwoV(N{0 zpBSVz{2*C9*Vf=cWp;=i=;A&IK%E6j@wi<)uc0ROQgZz0&X)T-Uz(KmY(GuL3(h)e z?7@~%`Vk82f}u!1`+5aFyGkU1iqUasFf`@jwg6P{M^ zqD8o0?*@U*Dy0a)egZ}Ye=yf=36{g`3;Sbq%VK^TU8&zjSGq1_c>h^W(AV3|_v)%h z>&D$;3j?4Dq1+c2x;K%-0Z{9!=cmH8<24Ojn1o62ct?Ecp=!A?)wK-1_V&h{+rR0t zuKIbKpkI+Jd&abP-eb%!lC>R>`4*$x%@YoJvN%-7x+lw9EqsG|M7o!wwAwNh{cgPC zy(=&jDSfIDm~(gRDCuC}s`9fV{EyFO06a~q5N&CSdF{WDGKW3O z7rL8Om37Ouqdub1^yfvTHt4R{uALwCUk?c64G7}?$?f{hJLhHqC72Q4?xIVgci-Mp z>bCdH7sWPEmj>R4i z`{QSme?DK3?VY%cUU=+Cyw)d-zJ>=4Chqm-Ien%(9&@P#be1?&jVU~xlrb~EqeY54 z0a$Y!*~is&5D2eNpjc-whYvZgx9`ji+x-!>{Hd%ARwNluoCRY^0( zBUm+#XDp#{2#S>4<>DKhQZ!zA!Tdkm{V{;)cA&7$gPE>Zcd9MRvZ0gl*9*>rDS)b| zQ-PK&2)_jsLjlq_-h`WeV$=^+qBwJ|>beYPY4}w}h<-aodb6$SZ=Xs^9gq_Clvx+2 zjcEKX^Z$*GZqPF`Ywdc`td?f{b$F8czYYxu0|5`h0zPKCdHg>1EOmdzpmG0^wj@mp zn(6uf^r3a{nPGjso`=ZzKG}FBI3D~D!k$rOnU%)je5XF@vz3P5-?afz2MRRHM?SKi z$yROK+}w$Z<{LjlNqHn|^&bPDixn!bv!5DImdK`K!-`h+V#drjed^zU2$nl;uuGZugP%~SGkPKC58QNykE)>}|z#cAt z-szQs*SCdo3@}+3l^~H_{&}vWlvD(fM(DJKfGgZA-Y(tjm<^~bXqL4Lv@km%%hiWp zxJk=yp=?ruJZv*z5fw8!R$u*Z*m;x)%b_6^E0Agk!U3?)4c_W{p-61|-Q}2IXBqb8 zRF!jNMck1`z8Q-iwr#D+Yniq6@)r(e`A2&45MD#)%`AKwRZ55)5a%wVH{IhVxeoH= z?)}t+On+0HthEIL&T{ohdaNO>0J*BE)B{fz067d3Ra7DuZfkuTJ zGB_@%%jy*0$i`z_t$(&Y)K|qfSX+V?`?wr-3NP}bN)M+&RRS1qL5X_pVz3W&Ey8=xA9)O0Kk0E){`%e7zIYoU@_P6S zh9drPhUTFuE^PjevUZ@nf+h=kZPopv&sqFV_1fUlIswStFiMw>6>W z6n)vfQsp~@U+0j0eT1GK6S}Zk(-y9i<D`s5r3y9mqq7b(U2 z2&zY?{hJ|c9!cNW69)H)YnGrN40Enxd|1B$%CAeCq^G+`pS8Kvu>0-PU;Dudd;~f$ zr0g|_Z$0}EyKfnCvDj7GRoHOCPX5VVmkoQ#-}sO!!13Ohj|Bm}xkQ`2K`!HO$&Q+ z=1yjrHLnt8GCioRv+jYRgKD$Kk8=u`7Q`c#?b#=mP9ywgg3 zc6#%bmI!%DDvIaJvD=}wd5xq<0VN6A9_s#N>dHaCz+;g)O5{@KfgA*Xvb|iszld73 
zsxbIP?K$DPX#3xQxGgT=-TMDq93Izp5esPczO5OgyTAq8j%%=N-a~Ai+*YtojHda~ zADwOp=}ngjX%=A75IL8ps55F5^1Hs_@s;anwr#ka$@dSFB1gw+sv>f|H&T|z!$U4| z04_(6?X4``zXSXqaQNzf;V`Oom+q3)`gcDILM#pV(<99w%w4PCCoMa}3=UAhKEII= zu0b{6^zqS0M`zxw6EN^TbJ1rn8$O}AHL&sV1c;B4_Ny~dHm&0~^ZyT=ObJ*oN^yTS zA}msMvuPXCGCTv+dR%&&)w`klh@%DoF4W2bC>S+Hp=qig@G=L$-tqbt$yj^ zuV%o#%^z`*B$Tc|^b^F_zto>T%}#C&uA;sl%`NkQ%ZzZk&_TRT-bMk$gLGn42(YtQ zRe)6AP&VJ{UKxbV3q`&yS-%&q&Ydt03qdf2?xKpBt%A9&ITy6B6Lyb;2JLKO+jKN9 zo@*d37(5Gpag|lp&lm)}sjJzHWRe*9;3nW-AkFOKI z7nt=_)p4lZVN}Tl;Br?d5(Y4bjR9LY=Z9jtJ;QbN6hHRYd=N%5He+s^i{s}y_oS1lQ& zP8=XT=Z+YT@GGvwqF+xZKMh|BsIPW6TY4Lqy?w&kqC9^k+;$X)I4*y_Rik`u;Bbic zC!Pp+>~MN>-^54XpSSLG?g#-HX`Ck*q*8!^E?eN=Y#&a=b2VxIe`k9t^nbCv0;B#2 z78$>r&X)l9O1tz&4C}^<82XWMmTLEYlRc}~=%U^`w?PTgP(qjtHzMw4E1#5RtiD-Be4tzx>ZH3qltA#V!6@|o%3vndZZ@~Aqx`cOU zT1uDa+guB}9IR%qlQpkuc~}}>W6pIriEXRy+TNT*=ss|RoSf=ZWy3Giq>q;E$20T=ff#w)1dip&mTn7>d% z&`)f62NP0bkA?U-S5RwT*zIO0j6{!TH~XI?(ewy-m{9OF46Y?UW;z_04#Ks{-Z0 zInmL^y0F1%Qk4p#X^|mC%&LwEGmgR4A0|Io+x>ZZknC@}6@1Uta!Z}Fin%TH{FbM( z--L-pns?Necc=_sc*sG>){JEGIa}JG5cy}p;h`UgP3Ybs=_>3?Q)so@tUQ!?=qHl$ zK+fsfrazsIP`nC_Cwq;04<>fEhA|l`fA>8Neg?9SlEWt5 zE>({MgM5pDoEtIkkVV!yV-a zxV`)So92@lEd>4H{dza-*|d!Z`gR}XOc6u(FEkr8dOunw>h+y>Hwz{toWA#Qv+~=O z^rZLSUv@1}W?7{?;4W)=bX@ z&L?kd^HeqUpzq20*P#F$etTqPTze=X5U>p7fA+3nj+g{J;_3ncxg?LS-PO@`nL(d-|0Bm;1{$9R+*B9ZG7L)(K@+;OX01zixtFkiW;_vjNH36Vcs)d5p)& zEpIYk5Au0W@}$tvD-L{uUtz22TnF5XyAJMP(ZU{~4H*=cd)L)MI@Z1d8$ACrJT0Hu zR!s94vs|{B`QTDxH9_NK6U$c+6uP<4`qvN{T=8DYa{1>$uTbTxpaX+lZcZRBxDcqx z!S_Qr;5%4b)+O}Q^&TuNeeb@{d5x)Il?c(@jYiaV%<_toh{>=hl?(OA-oke0uy=;N z-}wqJzrf}yb}zC}xdgWUI&r4mzo@~-QW{qLj8p70T#?(t-a1gg@J%0ajh?9Ca(FS{ zKryby-4)Z6XxXA##XwefHReJEy~lL@1!etCaF15}G|@=MoOmUSOl34ONx`C*M@FFP zIjZI??II?@0jO9Sa|Sl9(!R?O?|HJ`<3-U zGixh|BeZ_g>l@AERgEtT1=+;d+t{K^#!W6Yp4bTor0n38->IfI$xs^W@! 
z&t9vyv`fDm@o7y;Jbd@y9X{=grI-cOaM_Cfsfj=06{5xUC=}#6b6r0thxqR@vbIa#vf<`0K9n zB;Jfrv=7T!H)2d~EIdoos73uu2Qd74tW2@#*Tk2?5E+v6&|K&*A4^aN^65cmJ}}o| z77oy+RFISx%sFuqwgIg6R7t_)+z%~v;{Y8Ao%xSEF`5(Lr3?1A*Je%+6j>as#3+f2 z&QC(>Do&0<{9(9#vOQnk?-~gLh9=siD#=$JxvlWfd21C2+1CEiKLeblH~KM^2hUi$h#`mZsbV1_%L&X*c$OqK^4c zeBXEJS?VMMkf21up zO*@Bd%-@;teNF+1SXfh_2YADPk342QUQDDXp z?baQ)?{@-rD2R7^-ehsZ3&W2>o7`pDlZ}B#qvt)%ymVC)Z3``xlzIu>gY~1&sLXoB zPH^f2`^=ATe9Sy;+WI#JgV#5vVNvqj7OlJ%GLWJ+vu#>P;jR*Pe z*RGH7nRA5_^~fsNgSu%Op6ud^@%5*FnmYaXJnofQ$Q4*$TQs}tjxUdpwgexAv-K7) zEe*B5SP9>emar&|{?ggH*ihV15$6u^oRxdlOp2!DeLiYVQ2J)@t^ijk5H(Mot1{N} zpNet!x#`a0M%Y5|a#n5=YVr()$xR;kJ@OoL?G|_5LfeyR$&nD(1C>hAL(5w&TZDMB zd=HNh@j2v`nx`})?u+@3wlf_IO~m$K%&~7>fijGCmyb4s{)b+InpKZyF2XGXNMIq7 zW-^PCSojF&rmu)Kto9(&U5tMnMF1r9+mbSqq5O7-ji>9UzSU2yd1|;3;9N*yS zsH&H4%@uo_MR5b+Oz)i`d5urKpK#Wty!nMAFdT5e*CgL^>cBqy`eiYDQ>7$N*mke2 zYEcGHJ~5i?%LscKSQ%3j8M5~Mbr&P)8CVRb{O9r^O34X#mqpX~`plNNf&aNWc;3Tr zN$PLE_emQHl_Yh5ZlX)Mp?6;oDmBgQ!-Ls6TJ>lpw!tzS1Hl0RA|=11i3tGi>6l?q2{ZF_ zl$y2;Y#5OdOW0G3HqT@fEA5uhTD5`<6%F|0%l959PN`d&sd4rA#F9H!AT~I zXEm3JH$stq-zEL$lmU?{X7!y?GpHleB)Y34ffhqsH21iUw5h$E`C=U7`@ zS@$$YC0^BRL%=?UI6uY~k5#^7b80hN-i1_l3ko~LHj!4n|Je|?;RSvG;k%@$$@rAd ZZ_tM7&H>>q>0h+~Iydw+OVnYZ{|imei2(or literal 8685 zcma)?2T&7V-|s=`z4t0ANUug|5v58K>AeZ4AOR@~goIwCsYovYK~Q=T0TDv)5Q>1* z5Q-py&|4tk2LI3VyzhPQy)*aBWV1WjGjlfQ?0!DqUlRMsK#P`&i;9Sdh*n2i{V@^I z6&1q1It3YF`&=kOknl$A^H}RXQN=LNAHo5No0`5F5m9v#^|>P{;h54(+tP=Kh_3ze zMa(MlFocLmFib~X&E&P+Ru0I6r^l}qX7aU&R{AB?7g-yDmvq8ZEH+bDUQUHw*((;H z*pCR4-HWsjEDHUg8vb135$VS~5|4ax?RyNF20AfpW1>!>*cKgZUco&E1*V^x_dfbP zZ+KZQlvo{Z;Wsp66k+B2&Q(e7?8wvK-{0-nf7>$&3|xHIq@)CT_1o2LVRLwJ_9_t@ zbC>`{7z2x#wbyfJmOVN2>j(jVvY=g{8UsnSZhCrp$k33!Ab3`zf~Gji+%cDi&x!m# zD3Bsdz@PEcV@gT8A2C5hv$L~1i{bQ3d}^B>z3YM$VZUxDAO1AnnQ^ad>!1h5Yk3@P zqV$Q?Ho@!rY-$XhbS_}94Y<8VT(@^`ucl{Fu)QOpd&(Ya`$Iw~kR4wX9YJVM$SLJK z%amSWq^*T{2Jh5ZI9E)tnx=aT9NZ1edBF+saB{lO4NPbs_r1Ue{|MsFjoM#B0LIn} zqt|e{Qq4cL0ZGT<{O7F&$%SGcRkX)k`%SF`ayJY&)LYsFG-DO1^BUs$@YW;0=ifuIH*8f;vOCP&&s 
zBPwb^w~O-!1K5j9K7wA6y1#{)zA;1-i`T$g%B2&Gd{kTr)A3{ryN*s7bk@sY-~?4zW|tGFWp zo#F^~J=s9*Yq9eVx49pB+`Gzk;mD1TWB0%O<8tXiI% zCN3jq6W_03oO8aTjH;8Z9z4glB0K|en?Any(+}BKkJ*B6r7VEDm)iS;{!&-ifww&VBN46v49!{UqCYNP5pEB!zIFOc^Q1Q1kP0(zT98betKR`#qV@LBLnKItH zOB&_)S~tWJXWwpPQYfqV(JYVWpqEhbvjf7$b?6X);8iq9j?dd+iXIxG_b!daa6y=C zpkzOjo?ZtmDb|#3XhUV_o(zA`ba3_sZX!VgCu53p6}8J(H-#T@47%pE>`G!H3-+-x zc|DG#e6DhF(r^CE07<30t!Sg2T-c(_i)H2us?sw^&7L_uWrm+KV%vC-Nnernpjwg% z(^JmAwqG$9wePsOLkBmM3*Dzv9MdICxZ&sn(OEYI|7$B`0#{2PMN(DuFQnGV4t#nW zbK=(^u{!nL_SWgT$bCnSJV`BzYc+TtnkVgOIQUf7v+)la-60``z^~2{RsIMKx{Ux{ zD0?DL# z7n%M&;lReXx>Ippl!$2~LCsakWG8Lc+vf~RvNUknCiI5I0s&3tyOVw zuvVFubKj%{drrx3Z#c7Ed2Gprq&Kb#Nx)ud{Rx3h_{4b?b?SvY7uvV`b2LnRimfk!2V(U>ZgyU=X9nzwg;~C&ony#;MpJJVI|AwC)I$Kb8)Y~gXNQI6p9H#ma99cDwFB>JC^NCT4$|a?aAy?@_O1{& zkVMw0RIN4Z9!csVm5$7;8w2){-lZB#lS%3VK{>gPuJenb>!%Ihhlai+mK8IWw&xG6 zt`>h{R=O$U-)d*Tbx5tZySxDsGQgiqR2O+MUu!wA`pJmoE4@#c?I}iHqG|n*7oER& zNjY74a=4vx7mLF;?z3Z1$U1;h$dEryGXyE)8j)uxc71Y+`b%@%8q{RbHl^_??p;T8ua$C zJaKpu;m1%}S!-vK^P={-)Y5Bi=uA0$s(Qcpt3ZNrZ%Y2$(^%>Acprhh-SPHiF=z2~ zeBB6){Bj}_KeI&V3kbuKW{r?6i?c@a!YG@@Ie8n41&PnF)+@JKH_bjVB7I&UaMN(m zTGfEo{yr}hb%ShPUBv$7sfj9M&3Q?~!iED@89VKb|Mp8Rw>J+q>Kz`voAt$L?j=W9 zqj0)W8!e?W6Ul{#-_hoDs%qt*-9~)#xK}#7?cGXa6A?E*Z?p<%zaE&jIAiO|qVfh- z=fLDj=jwRM3|<~@1qN4w<2WB*u;6`QgVabTz;Z|RMF4C4ov1yOH8d&$DkHe86X)hz zE410RpCcpS%6&&p((CITcb~6oSRKIOD_a{m^GDZ)b5Vcxlj7vGvb~0SgN^YdXn-zZ zmW#XO9&{+3>8vDe$I}KyB{KszYF&UZ{FHy%wnC}RJm%#2UEg9@ z>}?p%6{O>+h*PZvwNJX}w8dWuD(9Y8R0Q*K(}LN|23}V%dj)IE)z&veSQhtaK`*io z(GvbYG{Ri_b~8XjXz~<#ADn9me`AC1$fkr~kXJatSbh;#ePgX(Vq8hm^|Q((dA4_v zV#2p^vX=&@0iI1Gypq=CSAyaQC&s9w!{5H52jc!15*`Qqo%+7?vX}Z2qkSFV>9)3) z09r4`K)2lOJ0w1Gza4$RK%&M#>9e<@GxzcVDDXRV!k8f~ccD{b%C`tx)zCbcfr!DY!@7 zn;+frLHeBa*ZYy>+*7FC6#U)N)JiK;mmIzpPs8z2#pCauLC&B~YNn!E=BJhVW0a$O zEKX98!tWLJV%qapMMX0ySl#}o2sEE$OCt47m&f=8k-R!*xWkHcuIY7@5lo`qEBO7W zvlV!Ra;+yDX{Z8BO&-B8ifJ9SPo^~=8_f3wa7Scqrah;C_7^)M@_ZETFS{zhCl zj<{hYi7(VB=&`eT%PoiLqHKI;T>BfFwMNHVqNDlNQZW?sy@q5GR^g^)KdFDX=E=Ie 
zCQ*sT`Zd_DwVeGj%#%(}z=(eQ7wTv1iulI6f(;U@%{uZeG9pnE&-9K(#9dwMDi?8k zTI=LKVBqDQpRnUsM`ikX9T=M4-X(jyCzl#to!As$^dqm-Q)9_N4^EsQRqfaGcyF|= z*~v$WMUYoheUd-n4y>LpXj5kH4T@V#6agUxeEjc(CP!tJlGS5Ss>?%&=S1{8hYeZU zVr_?RjH0!i79Tdcp6CBz({wHV2KAQaQc;{boOAa650me5?CkC{s{;;>Re!uM?^yb< zlI9jn=E_CN)xlj+H|6MRx6NP#1EQAEgztTegry+gF3crul!2#3zJXo_ly?Y@wJ8Z zIpa<9M;`Msmce$yIpxBI7rSr1lH}^VA>tKXxSd}xx;49#*qq`Se<&mdD2QtnZPmo3 zl^oq#-Si8*|1K~f5q8A(wmDmV<6pzfKHn8vyBw#gyS(%kEhT#Q?$+@XI_7}f zfr_0s#xx!D;hEWFF!75%Y>I?YWVRX&sKBN!w_Cmcn3 zrvbOGWQ>5gsHB@FcpfaqqDMg&Ejb+@fH!~cSw_fxqA4@09_Z?~>YAwSJ_tCN1^xW7 z97}^WRzLl;|4lIT%qkVn3V2UeoKrdlf5k<&54iJQ`4zjva=~)d+6(;A8k0@aK>SCo z-C8!i_Q5uBacCF_%yqI3Vuqb}AeU3a^T8(*u-rW@bq%HV3|%87p*A*|+13niL04yW ze$EAboGrV{LG#;~TKU)5^W~*MM&Y?v{vV+K23EaBAJ7~&5r}tH^=~492OdEE2l0J5?Guz#NLMTZ&jJP-Y1V|Tf zv)?a!+Hzsp&Kt3?6z1J%cM~JM%3})sa0|^iW_QsKTT0dUI;E&U&aE24Z6dzl=eh{7 z^FE=C6}-P*g1LhxI?cB$m}N>OtBBPb8N{6zW=#3QmSdl(z~uh{txb+ zW)%L2OnZ3aHSTb#V~9g2hQlj4^>3Vx)_$Mv3sVkYs<(>fRBmA1#D6p%qw}2_eZ#R{ z$)ByqWWTOYPe}cvvQta1wY4q{a&9Bh-zkPaObIcwHgZ)DY`{dsfbWA}qq{j)8fX`Z>|Yt3G0- zW|^exE8SY;Qf_#iuT-W~h~lH*-VzgdPp7klPJX)yuAVx6TG1AY4wj#MGa3yzzM)fi z9?%^1$~t}2p>dMl=}&ykaGHhCr+2WDDgRv9p5lYi=I`X_w8Ac=u+Ed_SD3lB0;XAb zcf!isN=^}+kC}vhT}i*93V1%Z$lMR(H-FT<5W{}mX!1{N2pAfAA67pj8VU;h72X#p z;9{Os^!RI#$oItL^_=x9;PpfZ%^dO0!QDT5M`YnaSRP}1!$JL9@5@B%!cf!h9$Jgv zRpwm#X)npk{8I9_*|QCY8)?nT;S zP@Er!r?||J?ay91zEi62b9HH%O%O3m=0P$07I%gjPt?tJhQZ)d($6p0T5+K-m<%wp zA?B*SW)3YRt4X&V4qoN3X41|nC2;>~4L}#0dE*aZw)Ni;#~|}p+}#LkyQu@TSJp)3 z0(rJ+SJ}LH|9t`} z#Nh+E>)s8uSs)sp{flq{;L!5>tk(-f`-QLlhja-X(-aB}mQcl!;F*n^527ntW1D*N zcVzgy7+Mx2P&EGUWKK!^Xmq0fMpTBgo+S0)u9~_=CtG474()V`lKQ!Y$ zBn`RMcMF5{v-mR5QfwiWbQabBqzzIs7IIF|L2yk%i<8NOZWYh;=l1${2Ff=sviVLX z&4^1L7Ch-dQdFbQ6x(ShVYgS>x^0W0bQPOT#E2jKEBAewO2B#uMsUYCfGBC!T1D+B z-g&1N#CrEP7KFpT83?V~QCWNWt2t;IeGqZ&@I7+AmBu9lLBqJ1Az}ArpqU+9m!*lj zVtn3!osH=$^|Ps*`VZY|qHl^5mQrqyWES<{W;OTWtq{^w@&wHs(=g+>GWVAF(~kI$ ztga>z{vtD!PUrZ1i^ykvRL_P!TZMV&!V~#!Tui#{rC*qeRDl!iv`El~%;&XcW2Msd 
zbaJj_hOG(u7#sN*-A9XbI>-)$&1^l2H>i}nw^jNgqWlgSTf~Lu7Y zaqvtXuU>g#R^-s4dd1n#)KAuf0KpnC2>lPGcL$$P9lR_ za9f~dv*d@L!mN2U|A}@()0UN(tG#^Sj-zzQ|0UJU#Q8u`4x0TYe0ya!v? z0jlY-r0FSSe5F%I3ozW7KV1`89=MG{<3=$A1Fw{1dULB%L(Xt+?H&nx2Fs)}?$3%@~7 z$+o(tL6N|poDIs9a^$sD=dp5C(`{PNEH6xDto`X*3$EyAb&|B!7ZRq93dUip(;|Lo z%sr*@I9-`mgx#~A z`#!DYEdye4%Dg3c6G(-W4%O!tR( zx>t{TJ9&$;3pVJ}Ya}aqcv-hqPCn2hKKLmc-g0)_0G{y-?Kj3n@+2~msBKDHaK~Bi zdSyS^I5>?|Iav(%BGh{4;^>Z7{WgEe%%SNZr^j7V5Jpwpf$?0NCV(e#ukC1H`t)~$ zOqkh|n}p!j=K&(DG`A5>Wq$Auv1i8LeC9_FS8fVC6!kp3@(1TJ^P)jV_ma~!57dh$ z5caD=Rd5jy8Osm8L=(49XmPrgxma%)(CogiPJ)@$RJVr0c%imq_rm& zi{-+a=#d%y?GqbI`L|DW;XVN`f1w^17uVqOyH6O_DFIM}b)5`K@Pd`;)3N}drFRS0 zDG|HRLX!SWKX&ue5by*`fQu~Ccv;-b1(^u#{X$Bzu%Tw)a=|t9<6UZ>#NB%Jasjy7 z#<$KsCEoAun-6yf4bWvHS-Fye;!oWahD0Et6!#tce{G05=X$+5uVedeD~YqkH2$G5 zfKy%)I%r?m6Q6OA)gI2NXcAt^Zx`7zp{o2|+y-vJv2z8aWfHa69s)U<95)0eSO=aj z2#~(a6UBV;*S32>KDF4^MLq8*=%9GE@bLEbHRRRN+sdg7I^;KY4|TNtTYjPatD;8S z>@C;cP>Z>h&8H2gO<}604xUA6vGT!#Q~)_7#Yk}{fv3(4pEH5G`%)J0yUJxAkR8qz zx0u8-Elmiw&sxC?Wl{c~Ly&xWcmJKh2f7#8rUT~XB{c^0_VqoAvax5$W(pw7iK~x) z2%r=#=RPk7>VcwG&SodZRskw{U&$kP$U)^cZ1gf|*}nZI8icv$Q1~K#e~gKe3X zuXOe3xmWxHe35M)zUGWMQ%S@sbYcwBS(Xt-dhEw{?{+`Tc~C7&-BCK8jxn+r9DHv7 z?(Dy4?=RoZJ}GLFKX#(rF8VdZ@bjFY;PJ8+(@Y{2zJjXVrpkl3&gEzLUZl>AAigJmQWaD23;xQUDt^OwC`aMr-jL z{M#y$miO`o`=(o9vX)gauEfW;2&dF21 z-08ClN2bCj5ujnB|M?Al^~3-`|8nVO`!aDIQ!0goELP4Ly7 zhGy0`pY2xHhMQxoy<@F`nIq^V&GR)tP`AwM?hH;LXM+puA!}N=3(6X?`gRW1Y+xwz zSn711_J=^zUDp^QI4A1V5|h)(5i{k93p=8_?je?$n@4TjK*X5yb zRM3+G*IBWzg7`FK3?MJAhG@YaU}2;+mCIL=Y`q(Ls!=H6WS zKUC#`R>422BB8t!G#Y>XvUGcsX_OxTA&Zi;iJSGV5zxb7Hcq~1K9;_$TC2Wk6!{_I z=8$I-GT6^DIHl#_Ax_Pi_9^mz5A3DR`h>zfc=D;k4ENHkvne6(Zjns72_UtD_$q48 zJzk%(oidY7HW+BVHa1(CXN>hXA2W(2EPMUaL0auow4f8=k*U?v0;JzQdof3aa86Po zW0$h)RDr(t0##R^a(Zgq`~MdiO?f!PH)0-IWSY{g|hQ%9T}oQ}MD{-6G3H2=?vbu0ed$i9)UW^83Q1@slH_T`_lb+ciG zjVBkM_!hM!zmh**9tcID?TbgBteM-?#phh0xF@a*dYW=gAugQwcg}3d@=%}p+_`sPkM=Q0nbtN)Ard%L$s7t;y~3; z|G2?FDjMFf?C$E>&|fdB)R}?H&xNgnIIfx~;nK!>kV4mwC@E0|q1v 
z2QACIdwjgC?nX^~m5bZqF;`{@hsanzq}tbG2Q2WXg`mZja+|wuXkt6%U;8VoK@Ku8 zeC=sTPY(;2>IL`Ygkc543$mT`xN~fwVMg=B9YY=GbnLvUmhlA%#jzg(KyU?$uzMIJ zyfWuSH@UA;?^G4vrXNh(C7Ry6eU+%nP z0T;cKZ+fTA{+~|-3W{}ijMl7&pC=ogUtuI!5`D12#Wu{v$= z{%uk@IxV_R!R+SB4^MMEMe`mmxlpSW3w1A3tRsG-`+)El4I&*41NDmg4q^WbU@e4d From 49c8b0f02604200c5406f97e89aa3a77e87bed39 Mon Sep 17 00:00:00 2001 From: Guoxin Date: Thu, 12 Nov 2020 13:15:35 +0800 Subject: [PATCH 06/12] deployment process refine (#5077) - move requirement check before configuration setting; - move quick-start-config folder setting to quick-start-services.sh; - fix installation document branch checkout issue; change suggested version to v1.3.y; - refine uninstallation doc - fix git clone issue: clone to an existing folder may raise error, so the code pai/kubespray may be unsuccessfully cloned. --- contrib/kubespray/quick-start-kubespray.sh | 17 +++++++------ contrib/kubespray/quick-start-service.sh | 5 +++- .../script/configuration-kubespray.sh | 25 +++++++++++++++++++ ...iguration.sh => configuration-services.sh} | 7 ------ contrib/kubespray/script/environment.sh | 23 ++++++----------- contrib/kubespray/script/service-boot.sh | 15 +++-------- .../cluster-admin/how-to-uninstall-openpai.md | 4 +-- .../cluster-admin/installation-guide.md | 7 +++--- .../cluster-admin/installation-guide.md | 3 +-- 9 files changed, 54 insertions(+), 52 deletions(-) create mode 100644 contrib/kubespray/script/configuration-kubespray.sh rename contrib/kubespray/script/{configuration.sh => configuration-services.sh} (63%) diff --git a/contrib/kubespray/quick-start-kubespray.sh b/contrib/kubespray/quick-start-kubespray.sh index 42b6ad901b..08d5f09956 100644 --- a/contrib/kubespray/quick-start-kubespray.sh +++ b/contrib/kubespray/quick-start-kubespray.sh @@ -40,14 +40,10 @@ then exit 1 fi +# environment set up /bin/bash script/environment.sh -c ${CLUSTER_CONFIG} || exit $? 
-/bin/bash script/configuration.sh -m ${MASTER_LIST} -w ${WORKER_LIST} -c ${CLUSTER_CONFIG} || exit $? - -echo "Ping Test" - -ansible all -i ${HOME}/pai-deploy/cluster-cfg/hosts.yml -m ping || exit $? - +# check requirements /bin/bash requirement.sh -m ${MASTER_LIST} -w ${WORKER_LIST} -c ${CLUSTER_CONFIG} ret_code_check=$? if [ $ret_code_check -ne 0 ]; then @@ -60,8 +56,13 @@ if [ $ret_code_check -ne 0 ]; then fi fi -/bin/bash preinstall.sh -c ${CLUSTER_CONFIG} || exit $? +# prepare cluster-cfg folder +/bin/bash script/configuration-kubespray.sh -m ${MASTER_LIST} -w ${WORKER_LIST} -c ${CLUSTER_CONFIG} || exit $? -/bin/bash script/kubernetes-boot.sh || exit $? +echo "Ping Test" +ansible all -i ${HOME}/pai-deploy/cluster-cfg/hosts.yml -m ping || exit $? +/bin/bash preinstall.sh -c ${CLUSTER_CONFIG} || exit $? +# setup k8s cluster +/bin/bash script/kubernetes-boot.sh || exit $? diff --git a/contrib/kubespray/quick-start-service.sh b/contrib/kubespray/quick-start-service.sh index ca6c4177e3..e1eaf756b1 100644 --- a/contrib/kubespray/quick-start-service.sh +++ b/contrib/kubespray/quick-start-service.sh @@ -40,4 +40,7 @@ then exit 1 fi -/bin/bash script/service-boot.sh -c ${CLUSTER_CONFIG} \ No newline at end of file +# prepare quick-start-config folder +/bin/bash script/configuration-services.sh -m ${MASTER_LIST} -w ${WORKER_LIST} -c ${CLUSTER_CONFIG} || exit $? + +/bin/bash script/service-boot.sh -c ${CLUSTER_CONFIG} diff --git a/contrib/kubespray/script/configuration-kubespray.sh b/contrib/kubespray/script/configuration-kubespray.sh new file mode 100644 index 0000000000..d7e3d60228 --- /dev/null +++ b/contrib/kubespray/script/configuration-kubespray.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +while getopts "w:m:c:" opt; do + case $opt in + w) + WORKER_LIST=$OPTARG + ;; + m) + MASTER_LIST=$OPTARG + ;; + c) + CLUSTER_CONFIG=$OPTARG + ;; + \?) 
+ echo "Invalid option: -$OPTARG" + exit 1 + ;; + esac +done + +mkdir -p ${HOME}/pai-deploy/cluster-cfg +python3 ${HOME}/pai-deploy/pai/contrib/kubespray/script/k8s-generator.py -m ${MASTER_LIST} -w ${WORKER_LIST} -c ${CLUSTER_CONFIG} -o ${HOME}/pai-deploy/cluster-cfg || exit $? + +cp ${HOME}/pai-deploy/cluster-cfg/openpai.yml ${HOME}/pai-deploy/kubespray/inventory/pai/ +cp ${HOME}/pai-deploy/cluster-cfg/hosts.yml ${HOME}/pai-deploy/kubespray/inventory/pai/ diff --git a/contrib/kubespray/script/configuration.sh b/contrib/kubespray/script/configuration-services.sh similarity index 63% rename from contrib/kubespray/script/configuration.sh rename to contrib/kubespray/script/configuration-services.sh index 375afcdff9..6f5e8e1c8f 100644 --- a/contrib/kubespray/script/configuration.sh +++ b/contrib/kubespray/script/configuration-services.sh @@ -18,13 +18,6 @@ while getopts "w:m:c:" opt; do esac done -cd ${HOME}/pai-deploy/pai/contrib/kubespray -mkdir -p ${HOME}/pai-deploy/cluster-cfg -python3 ${HOME}/pai-deploy/pai/contrib/kubespray/script/k8s-generator.py -m ${MASTER_LIST} -w ${WORKER_LIST} -c ${CLUSTER_CONFIG} -o ${HOME}/pai-deploy/cluster-cfg || exit $? 
- -cp ${HOME}/pai-deploy/cluster-cfg/openpai.yml ${HOME}/pai-deploy/kubespray/inventory/pai/ -cp ${HOME}/pai-deploy/cluster-cfg/hosts.yml ${HOME}/pai-deploy/kubespray/inventory/pai/ - mkdir -p ${HOME}/pai-deploy/quick-start-config/ cp ${WORKER_LIST} ${HOME}/pai-deploy/quick-start-config/worker.csv cp ${MASTER_LIST} ${HOME}/pai-deploy/quick-start-config/master.csv diff --git a/contrib/kubespray/script/environment.sh b/contrib/kubespray/script/environment.sh index 379ac0b1bf..17a41d4abf 100644 --- a/contrib/kubespray/script/environment.sh +++ b/contrib/kubespray/script/environment.sh @@ -16,14 +16,10 @@ OPENPAI_BRANCH_NAME=`cat ${CLUSTER_CONFIG} | grep branch_name | tr -d "[:space:] echo "Create working folder in ${HOME}/pai-deploy" mkdir -p ${HOME}/pai-deploy/ -cd ${HOME}/pai-deploy -echo "Clone kubespray source code from github" -git clone https://github.com/kubernetes-sigs/kubespray.git - -echo "Checkout to the Release Branch" -cd kubespray -git checkout release-2.11 +echo "Clone kubespray source code from github to ${HOME}/pai-deploy" +sudo rm -rf ${HOME}/pai-deploy/kubespray +git clone -b release-2.11 https://github.com/kubernetes-sigs/kubespray.git ${HOME}/pai-deploy/kubespray echo "Copy inventory folder, and save it " cp -rfp ${HOME}/pai-deploy/kubespray/inventory/sample ${HOME}/pai-deploy/kubespray/inventory/pai @@ -44,13 +40,8 @@ echo "Install sshpass" sudo apt-get -y install sshpass echo "Install kubespray's requirements and ansible is included" -cd ${HOME}/pai-deploy/kubespray -sudo pip3 install -r requirements.txt - -echo "Clone OpenPAI source code from github" -cd ${HOME}/pai-deploy -git clone https://github.com/microsoft/pai.git -cd pai +sudo pip3 install -r ${HOME}/pai-deploy/kubespray/requirements.txt -echo "switch to the branch ${OPENPAI_BRANCH_NAME}" -git checkout ${OPENPAI_BRANCH_NAME} \ No newline at end of file +echo "Clone OpenPAI source code from github to ${HOME}/pai-deploy" +sudo rm -rf ${HOME}/pai-deploy/pai +git clone -b 
${OPENPAI_BRANCH_NAME} https://github.com/microsoft/pai.git ${HOME}/pai-deploy/pai diff --git a/contrib/kubespray/script/service-boot.sh b/contrib/kubespray/script/service-boot.sh index 25b50ea56b..aa4d7b921b 100644 --- a/contrib/kubespray/script/service-boot.sh +++ b/contrib/kubespray/script/service-boot.sh @@ -46,16 +46,6 @@ curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py python3 get-pip.py pip3 install kubernetes==11.0.0b2 jinja2 -cd /root - -git clone https://github.com/microsoft/pai.git -cd pai - -echo "branch name: ${OPENPAI_BRANCH_NAME}" - -git checkout ${OPENPAI_BRANCH_NAME} -git pull - echo "starting nvidia device plugin to detect nvidia gpu resource" svn cat https://github.com/NVIDIA/k8s-device-plugin.git/tags/1.0.0-beta4/nvidia-device-plugin.yml \ | kubectl apply --overwrite=true -f - || exit $? @@ -66,6 +56,7 @@ svn cat https://github.com/RadeonOpenCompute/k8s-device-plugin.git/trunk/k8s-ds- | kubectl apply --overwrite=true -f - || exit $? sleep 5 +git clone -b ${OPENPAI_BRANCH_NAME} https://github.com/microsoft/pai.git /root/pai python3 /root/pai/contrib/kubespray/script/openpai-generator.py -m /quick-start-config/master.csv -w /quick-start-config/worker.csv -c /quick-start-config/config.yml -o /cluster-configuration || exit $? kubectl delete ds nvidia-device-plugin-daemonset -n kube-system || exit $? @@ -79,10 +70,10 @@ pip3 install kubernetes kubectl create namespace pai-storage # 1. Push cluster config to cluster -echo -e "pai\n" | python paictl.py config push -p /cluster-configuration -m service +echo -e "pai\n" | python /root/pai/paictl.py config push -p /cluster-configuration -m service # 2. Start OpenPAI service -echo -e "pai\n" | python paictl.py service start +echo -e "pai\n" | python /root/pai/paictl.py service start EOF_DEV_BOX if [ $? 
-ne 0 ]; then diff --git a/docs/manual/cluster-admin/how-to-uninstall-openpai.md b/docs/manual/cluster-admin/how-to-uninstall-openpai.md index ba17aa4889..ba16fc133e 100644 --- a/docs/manual/cluster-admin/how-to-uninstall-openpai.md +++ b/docs/manual/cluster-admin/how-to-uninstall-openpai.md @@ -2,10 +2,10 @@ The uninstallation of OpenPAI is irreversible: all the data will be removed and you cannot find them back. If you need a backup, do it before uninstallation. -First, log in to the dev box machine and delete all PAI services: +First, log in to the dev box machine and delete all PAI services with [dev box container](./basic-management-operations.md#pai-service-management-and-paictl).: ```bash -./paictl.py service delete +sudo docker exec -it dev-box /pai/paictl.py service delete ``` Now all PAI services and data are deleted. If you want to destroy the Kubernetes cluster too, please go into [`~/pai-deploy/kubespray` folder](installation-guide.md#keep-a-folder), run: diff --git a/docs/manual/cluster-admin/installation-guide.md b/docs/manual/cluster-admin/installation-guide.md index 39fbe9cbb3..f82c02ad2c 100644 --- a/docs/manual/cluster-admin/installation-guide.md +++ b/docs/manual/cluster-admin/installation-guide.md @@ -102,8 +102,8 @@ openpai-004,10.0.0.4 ```yaml user: password: -branch_name: pai-1.0.y -docker_image_tag: v1.0.0 +branch_name: pai-1.3.y +docker_image_tag: v1.3.0 # Optional @@ -192,8 +192,7 @@ docker_image_tag: v1.0.0 On the dev box machine, use the following commands to clone the OpenPAI repo: ```bash -git clone https://github.com/microsoft/pai.git -git checkout pai-1.0.y # change to a different branch if you want to deploy a different version +git clone -b pai-1.3.y https://github.com/microsoft/pai.git # change to a different branch if you want to deploy a different version cd pai/contrib/kubespray ``` diff --git a/docs_zh_CN/manual/cluster-admin/installation-guide.md b/docs_zh_CN/manual/cluster-admin/installation-guide.md index 
6f821c6dd4..a48be8ed85 100644 --- a/docs_zh_CN/manual/cluster-admin/installation-guide.md +++ b/docs_zh_CN/manual/cluster-admin/installation-guide.md @@ -191,8 +191,7 @@ docker_image_tag: v1.0.0 在dev box机器上,使用下面的命令克隆OpenPAI的源代码。 ```bash -git clone https://github.com/microsoft/pai.git -git checkout pai-1.0.y # 如果您想要部署不同的版本,请切换到相应的branch。 +git clone -b pai-1.3.y https://github.com/microsoft/pai.git # 如果您想要部署不同的版本,请切换到相应的branch。 cd pai/contrib/kubespray ``` From ca72107155da3fb2bce6a003de305239c1f62104 Mon Sep 17 00:00:00 2001 From: Binyang2014 Date: Thu, 12 Nov 2020 13:36:14 +0800 Subject: [PATCH 07/12] change log-manager redirection (#5047) Remove yarn log-redirection config. update config for log-manager --- .../pylon-config/location.conf.template | 143 +----------------- 1 file changed, 1 insertion(+), 142 deletions(-) diff --git a/src/pylon/deploy/pylon-config/location.conf.template b/src/pylon/deploy/pylon-config/location.conf.template index b1e303203c..19b144b0e9 100644 --- a/src/pylon/deploy/pylon-config/location.conf.template +++ b/src/pylon/deploy/pylon-config/location.conf.template @@ -33,23 +33,6 @@ location ~ ^/rest-server/api(.*)$ { proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Real-IP $remote_addr; proxy_set_header Accept-Encoding ""; - subs_filter_types *; - subs_filter - \"containerLog\":\"http://([^/]*):8042/([^\"]*)\" - \"containerLog\":\"$scheme://$http_host/yarn/$1:8042/$2\" - r; - subs_filter - \"appTrackingUrl\":\"http://([^/]*):8088/([^\"]*)\" - \"appTrackingUrl\":\"$scheme://$http_host/yarn/$1:8088/$2\" - r; - subs_filter - \"privateKeyDirectDownloadLink\":\"http://([^/]*):5070/([^\"]*)\" - \"privateKeyDirectDownloadLink\":\"$scheme://$http_host/a/$1:5070/$2\" - r; - subs_filter - \"containerLog\":\"http://([^/]*):(\d+)/log-manager/([^\"]*)\" - \"containerLog\":\"/log-manager/$1:$2/$3\" - r; proxy_set_header If-None-Match $request_if_none_match; add_header Etag $response_etag; @@ -90,101 +73,9 @@ location ~ 
^/alert-manager(.*)$ { # log-manager server location ~ ^/log-manager/([^/]+):(\d+)/(.*)$ { - proxy_pass http://$1:$2/log-manager/$3$is_args$args; + proxy_pass http://$1:$2/$3$is_args$args; } -{% if CLUSTER_TYPE == 'yarn' %} -# -# YARN web portal -# - -location ~ ^/yarn$ { - # Add '/' to the end of the URL, otherwise there will be a 404 error. - return 301 $scheme://$http_host$request_uri/; -} -location ~ ^/yarn/([^/]+):(\d+)/(.*/)?([^/]*)$ { - set $target_host $1; - set $target_port $2; - set $target_path $3$4$is_args$args; - set $condition ""; - # - proxy_pass http://$target_host:$target_port/$target_path; - proxy_set_header Accept-Encoding $routing_accept_encoding; - proxy_intercept_errors on; - error_page 301 302 307 = @handle_yarn_redirect; - # - set $base $scheme://$http_host/yarn/$target_host:$target_port/; - # - subs_filter - "" - "" - o; - subs_filter - " src=\"/static" - " src=\"static" - g; - subs_filter - href=(['"])/ - href=$1 - r; - # - subs_filter - href=\'http://([^/]*):8042\' - href=\'$scheme://$http_host/yarn/$1:8042/node\' - r; - subs_filter - href=\'http://([^/]*):8042/([^\']*)\' - href=\'$scheme://$http_host/yarn/$1:8042/$2\' - r; - subs_filter - href=\'http://([^/]*):8088/([^\']*)\' - href=\'$scheme://$http_host/yarn/$1:8088/$2\' - r; - # - subs_filter - href=\"/([^/]*):8042\" - href=\"$scheme://$http_host/yarn/$1:8042/node\" - r; - subs_filter - href=\"http://([^/]*):8042\" - href=\"$scheme://$http_host/yarn/$1:8042/node\" - r; - subs_filter - href=\"http://([^/]*):8042/([^\"]*)\" - href=\"$scheme://$http_host/yarn/$1:8042/$2\" - r; - subs_filter - href=\"http://([^/]*):8088\" - href=\"$scheme://$http_host/yarn/$1:8088/cluster\" - r; - subs_filter - href=\"http://([^/]*):8088/([^\"]*)\" - href=\"$scheme://$http_host/yarn/$1:8088/$2\" - r; - subs_filter - url=http://([^/]*):8188/([^\"]*)\"> - url=$scheme://$http_host/yarn/$1:8188/$2\"> - r; - # - subs_filter - 8088/proxy/application - 8088/cluster/app/application - g; -} -location ~ ^/yarn(.*)$ 
{ - set $target_path $1$is_args$args; - set $yarn_root_url {{YARN_WEB_PORTAL_URI}}; - if ($yarn_root_url ~ ^http://([^/]+):(\d+)$) { - return 301 $scheme://$http_host/yarn/$1:$2$target_path; - } -} -location @handle_yarn_redirect { - if ($upstream_http_location ~ ^http://([^/]+):(\d+)/(.*/)?([^/]*)$) { - return 301 $scheme://$http_host/yarn/$1:$2/$3$4; - } -} -{% endif %} - # # Other web portals # @@ -198,38 +89,6 @@ location ~ ^/kubernetes-dashboard(.*)$ { proxy_pass {{K8S_DASHBOARD_URI}}$1$is_args$args; } -{% if CLUSTER_TYPE == 'yarn' %} -# WebHDFS dashboard -location ~ ^/webhdfs$ { - # Add '/' to the end of the URL, otherwise there will be a 404 error. - return 301 $scheme://$http_host$request_uri/; -} -location ~ ^/webhdfs(.*)$ { - proxy_pass {{WEBHDFS_URI}}$1$is_args$args; - set $base $scheme://$http_host/webhdfs/; - sub_filter_types *; - sub_filter_once off; - sub_filter - "href=\"/" - "href=\"$base"; - sub_filter - "src=\"/static" - "src=\"${base}static"; - sub_filter - "/jmx?qry=" - "${base}jmx?qry="; - sub_filter - "/webhdfs/v1" - "/webhdfs/api/v1"; - sub_filter - "'url': '/conf'" - "'url': '${base}conf'"; - sub_filter - "get('/startupProgress'" - "get('${base}startupProgress'"; -} -{% endif %} - # Grafana location ~ ^/grafana$ { # Add '/' to the end of the URL, otherwise there will be a 404 error. 
From da3abef491dbccc791fa5441a5562c39d5518fbb Mon Sep 17 00:00:00 2001 From: yiyione Date: Thu, 12 Nov 2020 13:59:03 +0800 Subject: [PATCH 08/12] [Marketplace] Deploy scripts and config (#5066) * update * update * update * update * add marketplace-db * update * update * fix * fix * add marketplace-restserver * update * update * fix * update pylon * update * update * update * update * update * update --- .../build/marketplace-db.k8s.dockerfile | 4 ++ src/marketplace-db/config/marketplace-db.yaml | 11 +++++ src/marketplace-db/config/marketplace_db.py | 27 ++++++++++++ src/marketplace-db/deploy/delete.sh | 10 +++++ .../deploy/marketplace-db.yaml.template | 43 +++++++++++++++++++ src/marketplace-db/deploy/refresh.sh | 11 +++++ src/marketplace-db/deploy/service.yaml | 20 +++++++++ src/marketplace-db/deploy/start.sh.template | 13 ++++++ src/marketplace-db/deploy/stop.sh | 6 +++ .../marketplace-restserver.k8s.dockerfile | 4 ++ .../config/marketplace-restserver.yaml | 11 +++++ .../config/marketplace_restserver.py | 28 ++++++++++++ src/marketplace-restserver/deploy/delete.sh | 10 +++++ .../marketplace-restserver.yaml.template | 39 +++++++++++++++++ src/marketplace-restserver/deploy/refresh.sh | 11 +++++ .../deploy/service.yaml | 20 +++++++++ .../deploy/start.sh.template | 13 ++++++ src/marketplace-restserver/deploy/stop.sh | 6 +++ .../marketplace-webportal.k8s.dockerfile | 4 ++ .../config/marketplace-webportal.yaml | 8 ++++ .../config/marketplace_webportal.py | 29 +++++++++++++ src/marketplace-webportal/deploy/delete.sh | 10 +++++ .../marketplace-webportal.yaml.template | 29 +++++++++++++ src/marketplace-webportal/deploy/refresh.sh | 11 +++++ src/marketplace-webportal/deploy/service.yaml | 20 +++++++++ .../deploy/start.sh.template | 13 ++++++ src/marketplace-webportal/deploy/stop.sh | 6 +++ .../pylon-config/location.conf.template | 32 ++++++++++++++ src/pylon/deploy/pylon.yaml.template | 8 ++++ 29 files changed, 457 insertions(+) create mode 100644 
src/marketplace-db/build/marketplace-db.k8s.dockerfile create mode 100644 src/marketplace-db/config/marketplace-db.yaml create mode 100644 src/marketplace-db/config/marketplace_db.py create mode 100644 src/marketplace-db/deploy/delete.sh create mode 100644 src/marketplace-db/deploy/marketplace-db.yaml.template create mode 100644 src/marketplace-db/deploy/refresh.sh create mode 100644 src/marketplace-db/deploy/service.yaml create mode 100644 src/marketplace-db/deploy/start.sh.template create mode 100644 src/marketplace-db/deploy/stop.sh create mode 100644 src/marketplace-restserver/build/marketplace-restserver.k8s.dockerfile create mode 100644 src/marketplace-restserver/config/marketplace-restserver.yaml create mode 100644 src/marketplace-restserver/config/marketplace_restserver.py create mode 100644 src/marketplace-restserver/deploy/delete.sh create mode 100644 src/marketplace-restserver/deploy/marketplace-restserver.yaml.template create mode 100644 src/marketplace-restserver/deploy/refresh.sh create mode 100644 src/marketplace-restserver/deploy/service.yaml create mode 100644 src/marketplace-restserver/deploy/start.sh.template create mode 100644 src/marketplace-restserver/deploy/stop.sh create mode 100644 src/marketplace-webportal/build/marketplace-webportal.k8s.dockerfile create mode 100644 src/marketplace-webportal/config/marketplace-webportal.yaml create mode 100644 src/marketplace-webportal/config/marketplace_webportal.py create mode 100644 src/marketplace-webportal/deploy/delete.sh create mode 100644 src/marketplace-webportal/deploy/marketplace-webportal.yaml.template create mode 100644 src/marketplace-webportal/deploy/refresh.sh create mode 100644 src/marketplace-webportal/deploy/service.yaml create mode 100644 src/marketplace-webportal/deploy/start.sh.template create mode 100644 src/marketplace-webportal/deploy/stop.sh diff --git a/src/marketplace-db/build/marketplace-db.k8s.dockerfile b/src/marketplace-db/build/marketplace-db.k8s.dockerfile new file mode 
100644 index 0000000000..9a57b59d2c --- /dev/null +++ b/src/marketplace-db/build/marketplace-db.k8s.dockerfile @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +FROM docker.io/postgres:12.0 diff --git a/src/marketplace-db/config/marketplace-db.yaml b/src/marketplace-db/config/marketplace-db.yaml new file mode 100644 index 0000000000..aa67b4bdca --- /dev/null +++ b/src/marketplace-db/config/marketplace-db.yaml @@ -0,0 +1,11 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +service_type: "k8s" + +user: user +passwd: passwd +db: marketplace +port: 9291 +max-connection: 1000 +data-path: /mnt/marketplace diff --git a/src/marketplace-db/config/marketplace_db.py b/src/marketplace-db/config/marketplace_db.py new file mode 100644 index 0000000000..c4da3f9b11 --- /dev/null +++ b/src/marketplace-db/config/marketplace_db.py @@ -0,0 +1,27 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import copy + +class MarketplaceDb(object): + def __init__(self, cluster_conf, service_conf, default_service_conf): + self.cluster_conf = cluster_conf + self.service_conf = dict(default_service_conf, **service_conf) + + def validation_pre(self): + machine_list = self.cluster_conf['machine-list'] + if len([host for host in machine_list if host.get('pai-master') == 'true']) < 1: + return False, '"pai-master=true" machine is required to deploy the marketplace-db service' + return True, None + + def run(self): + result = copy.deepcopy(self.service_conf) + machine_list = self.cluster_conf['machine-list'] + master_ip = [host['hostip'] for host in machine_list if host.get('pai-master') == 'true'][0] + result['host'] = master_ip + result['connection-str'] = 'postgresql://{}:{}@{}:{}/{}'.format( + result['user'], result['passwd'], result['host'], result['port'], result['db']) + return result + + def validation_post(self, conf): + return True, None diff --git a/src/marketplace-db/deploy/delete.sh 
b/src/marketplace-db/deploy/delete.sh new file mode 100644 index 0000000000..5ceb907e38 --- /dev/null +++ b/src/marketplace-db/deploy/delete.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +pushd $(dirname "$0") > /dev/null + +/bin/bash stop.sh || exit $? + +popd > /dev/null diff --git a/src/marketplace-db/deploy/marketplace-db.yaml.template b/src/marketplace-db/deploy/marketplace-db.yaml.template new file mode 100644 index 0000000000..abe51a8325 --- /dev/null +++ b/src/marketplace-db/deploy/marketplace-db.yaml.template @@ -0,0 +1,43 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: marketplace-db-ds +spec: + selector: + matchLabels: + app: marketplace-db + template: + metadata: + name: marketplace-db + labels: + app: marketplace-db + spec: + hostNetwork: true + containers: + - name: marketplace-db + image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}marketplace-db:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} + imagePullPolicy: Always + env: + - name: POSTGRES_USER + value: {{ cluster_cfg["marketplace-db"]["user"] }} + - name: POSTGRES_PASSWORD + value: {{ cluster_cfg["marketplace-db"]["passwd"] }} + - name: POSTGRES_DB + value: {{ cluster_cfg["marketplace-db"]["db"] }} + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + args: ['-c', 'port={{- cluster_cfg["marketplace-db"]["port"] }}', '-N', '{{ cluster_cfg["marketplace-db"]["max-connection"] }}'] + volumeMounts: + - name: marketplace-data-dir + mountPath: /var/lib/postgresql/data/pgdata + mountPropagation: "None" + volumes: + - name: marketplace-data-dir + hostPath: + path: '{{ cluster_cfg["marketplace-db"]["data-path"] }}' + type: DirectoryOrCreate + imagePullSecrets: + - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} diff --git a/src/marketplace-db/deploy/refresh.sh b/src/marketplace-db/deploy/refresh.sh new 
file mode 100644 index 0000000000..bd50ad3ded --- /dev/null +++ b/src/marketplace-db/deploy/refresh.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +pushd $(dirname "$0") > /dev/null + +bash stop.sh +bash start.sh + +popd > /dev/null diff --git a/src/marketplace-db/deploy/service.yaml b/src/marketplace-db/deploy/service.yaml new file mode 100644 index 0000000000..22f84dcd60 --- /dev/null +++ b/src/marketplace-db/deploy/service.yaml @@ -0,0 +1,20 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +cluster-type: + - k8s + +prerequisite: + - cluster-configuration + +template-list: + - marketplace-db.yaml + - start.sh + +start-script: start.sh +stop-script: stop.sh +delete-script: delete.sh +refresh-script: refresh.sh + +deploy-rules: + - in: pai-master diff --git a/src/marketplace-db/deploy/start.sh.template b/src/marketplace-db/deploy/start.sh.template new file mode 100644 index 0000000000..0251fdc7da --- /dev/null +++ b/src/marketplace-db/deploy/start.sh.template @@ -0,0 +1,13 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +pushd $(dirname "$0") > /dev/null + +kubectl apply --overwrite=true -f marketplace-db.yaml || exit $? + +# Wait until the service is ready. +PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.monitorTool.check_pod_ready_status -w -k app -v marketplace-db || exit $? + +popd > /dev/null diff --git a/src/marketplace-db/deploy/stop.sh b/src/marketplace-db/deploy/stop.sh new file mode 100644 index 0000000000..4f1c496084 --- /dev/null +++ b/src/marketplace-db/deploy/stop.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +kubectl delete --ignore-not-found --now "daemonset/marketplace-db-ds" diff --git a/src/marketplace-restserver/build/marketplace-restserver.k8s.dockerfile b/src/marketplace-restserver/build/marketplace-restserver.k8s.dockerfile new file mode 100644 index 0000000000..005c719740 --- /dev/null +++ b/src/marketplace-restserver/build/marketplace-restserver.k8s.dockerfile @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +FROM docker.io/openpai/pai-marketplace-restserver:v1.2.0 diff --git a/src/marketplace-restserver/config/marketplace-restserver.yaml b/src/marketplace-restserver/config/marketplace-restserver.yaml new file mode 100644 index 0000000000..03352991fe --- /dev/null +++ b/src/marketplace-restserver/config/marketplace-restserver.yaml @@ -0,0 +1,11 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +service_type: "k8s" + +db_user: user +db_password: passwd +db: marketplace +# db_host: postgres +db_port: 9291 +server-port: 9292 diff --git a/src/marketplace-restserver/config/marketplace_restserver.py b/src/marketplace-restserver/config/marketplace_restserver.py new file mode 100644 index 0000000000..457466ca90 --- /dev/null +++ b/src/marketplace-restserver/config/marketplace_restserver.py @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import copy + +class MarketplaceRestserver(object): + def __init__(self, cluster_conf, service_conf, default_service_conf): + self.cluster_conf = cluster_conf + self.service_conf = dict(default_service_conf, **service_conf) + + def validation_pre(self): + machine_list = self.cluster_conf['machine-list'] + if len([host for host in machine_list if host.get('pai-master') == 'true']) < 1: + return False, '"pai-master=true" machine is required to deploy the marketplace-restserver service' + return True, None + + def run(self): + result = copy.deepcopy(self.service_conf) + machine_list = self.cluster_conf['machine-list'] + server_port = self.service_conf['server-port'] + master_ip = [host['hostip'] for host in machine_list if host.get('pai-master') == 'true'][0] + result['uri'] = 'http://{0}:{1}'.format(master_ip, server_port) + if 'db_host' not in result: + result['db_host'] = master_ip + return result + + def validation_post(self, conf): + return True, None diff --git a/src/marketplace-restserver/deploy/delete.sh b/src/marketplace-restserver/deploy/delete.sh new file mode 100644 index 0000000000..5ceb907e38 --- /dev/null +++ b/src/marketplace-restserver/deploy/delete.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +pushd $(dirname "$0") > /dev/null + +/bin/bash stop.sh || exit $? + +popd > /dev/null diff --git a/src/marketplace-restserver/deploy/marketplace-restserver.yaml.template b/src/marketplace-restserver/deploy/marketplace-restserver.yaml.template new file mode 100644 index 0000000000..45cb1c735c --- /dev/null +++ b/src/marketplace-restserver/deploy/marketplace-restserver.yaml.template @@ -0,0 +1,39 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: marketplace-restserver-ds +spec: + selector: + matchLabels: + app: marketplace-restserver + template: + metadata: + name: marketplace-restserver + labels: + app: marketplace-restserver + spec: + hostNetwork: true + containers: + - name: marketplace-restserver + image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}marketplace-restserver:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} + imagePullPolicy: Always + env: + - name: DB_USERNAME + value: {{ cluster_cfg["marketplace-restserver"]["db_user"] }} + - name: DB_PASSWORD + value: {{ cluster_cfg["marketplace-restserver"]["db_password"] }} + - name: DATABASE + value: {{ cluster_cfg["marketplace-restserver"]["db"] }} + - name: DB_HOST + value: {{ cluster_cfg["marketplace-restserver"]["db_host"] }} + - name: DB_PORT + value: "{{ cluster_cfg["marketplace-restserver"]["db_port"] }}" + - name: NODE_ENV + value: production + - name: PORT + value: "{{ cluster_cfg["marketplace-restserver"]["server-port"] }}" + imagePullSecrets: + - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} diff --git a/src/marketplace-restserver/deploy/refresh.sh b/src/marketplace-restserver/deploy/refresh.sh new file mode 100644 index 0000000000..bd50ad3ded --- /dev/null +++ b/src/marketplace-restserver/deploy/refresh.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +pushd $(dirname "$0") > /dev/null + +bash stop.sh +bash start.sh + +popd > /dev/null diff --git a/src/marketplace-restserver/deploy/service.yaml b/src/marketplace-restserver/deploy/service.yaml new file mode 100644 index 0000000000..8b22188561 --- /dev/null +++ b/src/marketplace-restserver/deploy/service.yaml @@ -0,0 +1,20 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +cluster-type: + - k8s + +prerequisite: + - cluster-configuration + +template-list: + - marketplace-restserver.yaml + - start.sh + +start-script: start.sh +stop-script: stop.sh +delete-script: delete.sh +refresh-script: refresh.sh + +deploy-rules: + - in: pai-master diff --git a/src/marketplace-restserver/deploy/start.sh.template b/src/marketplace-restserver/deploy/start.sh.template new file mode 100644 index 0000000000..93d0d7753c --- /dev/null +++ b/src/marketplace-restserver/deploy/start.sh.template @@ -0,0 +1,13 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +pushd $(dirname "$0") > /dev/null + +kubectl apply --overwrite=true -f marketplace-restserver.yaml || exit $? + +# Wait until the service is ready. +PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.monitorTool.check_pod_ready_status -w -k app -v marketplace-restserver || exit $? + +popd > /dev/null diff --git a/src/marketplace-restserver/deploy/stop.sh b/src/marketplace-restserver/deploy/stop.sh new file mode 100644 index 0000000000..ea204f1378 --- /dev/null +++ b/src/marketplace-restserver/deploy/stop.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +kubectl delete --ignore-not-found --now "daemonset/marketplace-restserver-ds" diff --git a/src/marketplace-webportal/build/marketplace-webportal.k8s.dockerfile b/src/marketplace-webportal/build/marketplace-webportal.k8s.dockerfile new file mode 100644 index 0000000000..a2f12ff94a --- /dev/null +++ b/src/marketplace-webportal/build/marketplace-webportal.k8s.dockerfile @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +FROM docker.io/openpai/pai-marketplace-webportal:v1.2.0 diff --git a/src/marketplace-webportal/config/marketplace-webportal.yaml b/src/marketplace-webportal/config/marketplace-webportal.yaml new file mode 100644 index 0000000000..de69d0be87 --- /dev/null +++ b/src/marketplace-webportal/config/marketplace-webportal.yaml @@ -0,0 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +service_type: "k8s" + +# marketplace_api_uri: marketplace_api_uri +api-port: 9292 +server-port: 9293 diff --git a/src/marketplace-webportal/config/marketplace_webportal.py b/src/marketplace-webportal/config/marketplace_webportal.py new file mode 100644 index 0000000000..7a807e4b3c --- /dev/null +++ b/src/marketplace-webportal/config/marketplace_webportal.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import copy + +class MarketplaceWebportal(object): + def __init__(self, cluster_conf, service_conf, default_service_conf): + self.cluster_conf = cluster_conf + self.service_conf = dict(default_service_conf, **service_conf) + + def validation_pre(self): + machine_list = self.cluster_conf['machine-list'] + if len([host for host in machine_list if host.get('pai-master') == 'true']) < 1: + return False, '"pai-master=true" machine is required to deploy the marketplace-webportal service' + return True, None + + def run(self): + result = copy.deepcopy(self.service_conf) + machine_list = self.cluster_conf['machine-list'] + server_port = self.service_conf['server-port'] + api_port = self.service_conf['api-port'] + master_ip = [host['hostip'] for host in machine_list if host.get('pai-master') == 'true'][0] + result['uri'] = 'http://{0}:{1}'.format(master_ip, server_port) + if 'marketplace_api_uri' not in result: + result['marketplace_api_uri'] = 'http://{0}:{1}/api'.format(master_ip, api_port) + return result + + def validation_post(self, conf): + return True, None diff --git 
a/src/marketplace-webportal/deploy/delete.sh b/src/marketplace-webportal/deploy/delete.sh new file mode 100644 index 0000000000..5ceb907e38 --- /dev/null +++ b/src/marketplace-webportal/deploy/delete.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +pushd $(dirname "$0") > /dev/null + +/bin/bash stop.sh || exit $? + +popd > /dev/null diff --git a/src/marketplace-webportal/deploy/marketplace-webportal.yaml.template b/src/marketplace-webportal/deploy/marketplace-webportal.yaml.template new file mode 100644 index 0000000000..2f4876d7d4 --- /dev/null +++ b/src/marketplace-webportal/deploy/marketplace-webportal.yaml.template @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: marketplace-webportal-ds +spec: + selector: + matchLabels: + app: marketplace-webportal + template: + metadata: + name: marketplace-webportal + labels: + app: marketplace-webportal + spec: + hostNetwork: true + containers: + - name: marketplace-webportal + image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}marketplace-webportal:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} + imagePullPolicy: Always + env: + - name: MARKETPLACE_API_URL + value: {{ cluster_cfg["marketplace-webportal"]["marketplace_api_uri"] }} + - name: SERVER_PORT + value: "{{ cluster_cfg["marketplace-webportal"]["server-port"] }}" + imagePullSecrets: + - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} diff --git a/src/marketplace-webportal/deploy/refresh.sh b/src/marketplace-webportal/deploy/refresh.sh new file mode 100644 index 0000000000..bd50ad3ded --- /dev/null +++ b/src/marketplace-webportal/deploy/refresh.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +pushd $(dirname "$0") > /dev/null + +bash stop.sh +bash start.sh + +popd > /dev/null diff --git a/src/marketplace-webportal/deploy/service.yaml b/src/marketplace-webportal/deploy/service.yaml new file mode 100644 index 0000000000..d4f9879c06 --- /dev/null +++ b/src/marketplace-webportal/deploy/service.yaml @@ -0,0 +1,20 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +cluster-type: + - k8s + +prerequisite: + - cluster-configuration + +template-list: + - marketplace-webportal.yaml + - start.sh + +start-script: start.sh +stop-script: stop.sh +delete-script: delete.sh +refresh-script: refresh.sh + +deploy-rules: + - in: pai-master diff --git a/src/marketplace-webportal/deploy/start.sh.template b/src/marketplace-webportal/deploy/start.sh.template new file mode 100644 index 0000000000..acd1954dc8 --- /dev/null +++ b/src/marketplace-webportal/deploy/start.sh.template @@ -0,0 +1,13 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +pushd $(dirname "$0") > /dev/null + +kubectl apply --overwrite=true -f marketplace-webportal.yaml || exit $? + +# Wait until the service is ready. +PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.monitorTool.check_pod_ready_status -w -k app -v marketplace-webportal || exit $? + +popd > /dev/null diff --git a/src/marketplace-webportal/deploy/stop.sh b/src/marketplace-webportal/deploy/stop.sh new file mode 100644 index 0000000000..0eae959b5d --- /dev/null +++ b/src/marketplace-webportal/deploy/stop.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +kubectl delete --ignore-not-found --now "daemonset/marketplace-webportal-ds" diff --git a/src/pylon/deploy/pylon-config/location.conf.template b/src/pylon/deploy/pylon-config/location.conf.template index 19b144b0e9..a89e6d717e 100644 --- a/src/pylon/deploy/pylon-config/location.conf.template +++ b/src/pylon/deploy/pylon-config/location.conf.template @@ -143,6 +143,28 @@ location ~ ^/dshuttle/(.*)$ { } {%- endif %} +{%- if MARKETPLACE_API_URI %} +# marketplace rest server +location ~ ^/marketplace/api$ { + # Add '/' to the end of the URL, otherwise there will be a 404 error. + return 301 $scheme://$http_host$request_uri/; +} +location ~ ^/marketplace/api/(.*)$ { + proxy_pass {{MARKETPLACE_API_URI}}/$1$is_args$args; +} +{%- endif %} + +{%- if MARKETPLACE_WEBPORTAL_PLUGIN %} +# marketplace webportal plugin +location ~ ^/marketplace$ { + # Add '/' to the end of the URL, otherwise there will be a 404 error. + return 301 $scheme://$http_host$request_uri/; +} +location ~ ^/marketplace/(.*)$ { + proxy_pass {{MARKETPLACE_WEBPORTAL_PLUGIN}}/$1; +} +{%- endif %} + # PAI web portal location ~ ^(.*)$ { proxy_pass {{PAI_WEB_PORTAL_URI}}$1$is_args$args; @@ -186,6 +208,16 @@ location ~ ^(.*)$ { '{{DSHUTTLE_URI}}' '/dshuttle'; {%- endif %} +{%- if MARKETPLACE_API_URI %} + sub_filter + '{{MARKETPLACE_API_URI}}' + '/marketplace/api'; +{%- endif %} +{%- if MARKETPLACE_WEBPORTAL_PLUGIN %} + sub_filter + '{{MARKETPLACE_WEBPORTAL_PLUGIN}}' + '/marketplace'; +{%- endif %} proxy_set_header If-None-Match $request_if_none_match; add_header Etag $response_etag; diff --git a/src/pylon/deploy/pylon.yaml.template b/src/pylon/deploy/pylon.yaml.template index 65f14cd942..7f6e804d2d 100644 --- a/src/pylon/deploy/pylon.yaml.template +++ b/src/pylon/deploy/pylon.yaml.template @@ -82,6 +82,14 @@ spec: {% if cluster_cfg['cluster']['common']['dshuttle'] == 'true' %} - name: DSHUTTLE_URI value: {{ cluster_cfg['dshuttle']['uri'] }} +{% endif %} +{% if cluster_cfg['cluster']['common']['marketplace'] == 
'true' %} + - name: MARKETPLACE_API_URI + value: {{ cluster_cfg['marketplace-restserver']['uri']}} +{% endif %} +{% if cluster_cfg['cluster']['common']['marketplace'] == 'true' %} + - name: MARKETPLACE_WEBPORTAL_PLUGIN + value: {{ cluster_cfg['marketplace-webportal']['uri']}} {% endif %} ports: - name: pylon From 97641692c20657a18af526e5412eb1b6a4b079d0 Mon Sep 17 00:00:00 2001 From: Guoxin Date: Thu, 12 Nov 2020 20:21:10 +0800 Subject: [PATCH 09/12] Alert Email Template Refine (#5064) * add kill user job alert template * add troubleshooting link in general email template * allow customized templates * redefine actions schema in customized-receivers to facilitate parameters passing --- .../services-configuration.yaml.template | 18 ++++--- .../services-configuration.yaml.template | 17 ++++--- .../cluster-admin/how-to-use-alert-system.md | 49 +++++++++++++------ .../cluster-admin/how-to-use-alert-system.md | 32 ++++++------ .../services-configuration.yaml | 17 ++++--- src/alert-manager/config/alert-manager.md | 16 +++--- src/alert-manager/config/alert-manager.yaml | 4 +- src/alert-manager/config/alert_manager.py | 21 +++++++- .../alert-manager-configmap.yaml.template | 20 ++++---- .../alert-manager-deployment.yaml.template | 19 +++++++ .../general-templates/html.ejs | 8 ++- .../general-templates/subject.ejs | 0 .../kill-low-efficiency-job-alert/html.ejs | 34 +++++++++++++ .../kill-low-efficiency-job-alert/subject.ejs | 1 + src/alert-manager/deploy/service.yaml | 1 + .../deploy/{start.sh => start.sh.template} | 12 +++++ src/alert-manager/deploy/stop.sh | 2 + .../src/alert-handler/controllers/mail.js | 31 ++++++++++-- 18 files changed, 222 insertions(+), 80 deletions(-) rename src/alert-manager/{src/alert-handler/emails => deploy/alert-templates}/general-templates/html.ejs (93%) mode change 100755 => 100644 rename src/alert-manager/{src/alert-handler/emails => deploy/alert-templates}/general-templates/subject.ejs (100%) mode change 100755 => 100644 create mode 100644 
src/alert-manager/deploy/alert-templates/kill-low-efficiency-job-alert/html.ejs create mode 100644 src/alert-manager/deploy/alert-templates/kill-low-efficiency-job-alert/subject.ejs rename src/alert-manager/deploy/{start.sh => start.sh.template} (72%) mode change 100755 => 100644 diff --git a/contrib/kubespray/quick-start/services-configuration.yaml.template b/contrib/kubespray/quick-start/services-configuration.yaml.template index f29b950bf1..b215de327b 100644 --- a/contrib/kubespray/quick-start/services-configuration.yaml.template +++ b/contrib/kubespray/quick-start/services-configuration.yaml.template @@ -183,16 +183,18 @@ authentication: # - receiver: pai-email-admin-user-and-stop-job # match: # alertname: PAIJobGpuPercentLowerThan0_3For1h -# customized-receivers: +# customized-receivers: # receivers are combination of several actions # - name: "pai-email-admin-user-and-stop-job" # actions: -# - email-admin -# - email-user -# - stop-jobs -# - tag-jobs -# tags: -# - 'stopped-by-alert-manager' - +# # the email template for `email-admin` and `email-user` can be chosen from ['general-templates', 'kill-low-efficiency-job-alert'] +# # if no template specified, 'general-templates' will be used.
+# email-admin: +# email-user: +# template: 'kill-low-efficiency-job-alert' +# stop-jobs: # no parameters required for stop-jobs action +# tag-jobs: +# tags: +# - 'stopped-by-alert-manager' # uncomment following if you want to customize prometheus # prometheus: diff --git a/deployment/quick-start/services-configuration.yaml.template b/deployment/quick-start/services-configuration.yaml.template index ef6fb089a3..a76b198e8a 100644 --- a/deployment/quick-start/services-configuration.yaml.template +++ b/deployment/quick-start/services-configuration.yaml.template @@ -88,15 +88,18 @@ rest-server: # - receiver: pai-email-admin-user-and-stop-job # match: # alertname: PAIJobGpuPercentLowerThan0_3For1h -# customized-receivers: +# customized-receivers: # receivers are combination of several actions # - name: "pai-email-admin-user-and-stop-job" # actions: -# - email-admin -# - email-user -# - stop-jobs -# - tag-jobs -# tags: -# - 'stopped-by-alert-manager' +# # the email template for `email-admin` and `email-user` can be chosen from ['general-templates', 'kill-low-efficiency-job-alert'] +# # if no template specified, 'general-templates' will be used.
+# email-admin: +# email-user: +# template: 'kill-low-efficiency-job-alert' +# stop-jobs: # no parameters required for stop-jobs action +# tag-jobs: +# tags: +# - 'stopped-by-alert-manager' # uncomment following if you want to customize prometheus # prometheus: diff --git a/docs/manual/cluster-admin/how-to-use-alert-system.md b/docs/manual/cluster-admin/how-to-use-alert-system.md index 10e85308c8..22deee6fc1 100644 --- a/docs/manual/cluster-admin/how-to-use-alert-system.md +++ b/docs/manual/cluster-admin/how-to-use-alert-system.md @@ -94,13 +94,16 @@ alert-manager: alertname: PAIJobGpuPercentLowerThan0_3For1h customized-receivers: # receivers are combination of several actions - name: "pai-email-admin-user-and-stop-job" - actions: - - email-admin - - email-user - - stop-jobs - - tag-jobs - tags: - - 'stopped-by-alert-manager' + actions: + # the email template for `email-admin` and `email-user` can be chosen from ['general-templates', 'kill-low-efficiency-job-alert'] + # if no template specified, 'general-templates' will be used. + email-admin: + email-user: + template: 'kill-low-efficiency-job-alert' + stop-jobs: # no parameters required for stop-jobs action + tag-jobs: + tags: + - 'stopped-by-alert-manager' ``` @@ -152,13 +155,16 @@ alert-manager: alertname: PAIJobGpuPercentLowerThan0_3For1h customized-receivers: # receivers are combination of several actions - name: "pai-email-admin-user-and-stop-job" - actions: # We have provided so far these actions: email-admin, email-user, stop-jobs, tag-jobs - - email-admin - - email-user - - stop-jobs - - tag-jobs - tags: - - 'stopped-by-alert-manager' + actions: + # the email template for `email-admin` and `email-user` can be chosen from ['general-templates', 'kill-low-efficiency-job-alert'] + # if no template specified, 'general-templates' will be used.
+ email-admin: + email-user: + template: 'kill-low-efficiency-job-alert' + stop-jobs: # no parameters required for stop-jobs action + tag-jobs: + tags: + - 'stopped-by-alert-manager' ...... ``` @@ -173,8 +179,19 @@ For `routes` definition, we adopt the syntax of [Prometheus Alertmanager](https: For `receivers` definition, you can simply: - name the receiver in `name` field; -- list the actions to use in `actions`; -- list the tags in `tags` if `tag-jobs` is one of the actions. +- list the actions to use in `actions` and fill corresponding parameters for the actions: + - `email-admin`: + - template: Optional, can be chosen from ['general-templates', 'kill-low-efficiency-job-alert'], by default 'general-templates'. + - `email-user`: + - template: Optional, can be chosen from ['general-templates', 'kill-low-efficiency-job-alert'], by default 'general-templates'. + - `cordon-nodes`: No parameters required + - `stop-jobs`: No parameters required + - `tag-jobs`: + - tags: required, list of tags + +You can also add customized email templates by adding a template folder in `pai/src/alert-manager/deploy/alert-templates`. +Two files need to be present: one email body template file named `html.ejs` and one email subject template file named `subject.ejs`. +The folder name will be automatically passed as the template name.
Remember to push service config to the cluster and restart the `alert-manager` service after your modification with the following commands in the dev-box container: diff --git a/docs_zh_CN/manual/cluster-admin/how-to-use-alert-system.md b/docs_zh_CN/manual/cluster-admin/how-to-use-alert-system.md index 408781be7f..f37b4f2872 100644 --- a/docs_zh_CN/manual/cluster-admin/how-to-use-alert-system.md +++ b/docs_zh_CN/manual/cluster-admin/how-to-use-alert-system.md @@ -91,13 +91,14 @@ alert-manager: alertname: PAIJobGpuPercentLowerThan0_3For1h customized-receivers: - name: "pai-email-admin-user-and-stop-job" - actions: - - email-admin - - email-user - - stop-jobs - - tag-jobs - tags: - - 'stopped-by-alert-manager' + actions: + email-admin: + email-user: + template: 'kill-low-efficiency-job-alert' + stop-jobs: + tag-jobs: + tags: + - 'stopped-by-alert-manager' ``` @@ -142,15 +143,16 @@ alert-manager: - receiver: pai-email-admin-user-and-stop-job match: alertname: PAIJobGpuPercentLowerThan0_3For1h - customized-receivers:、 + customized-receivers: - name: "pai-email-admin-user-and-stop-job" - actions:、 - - email-admin - - email-user - - stop-jobs - - tag-jobs - tags: - - 'stopped-by-alert-manager' + actions: + email-admin: + email-user: + template: 'kill-low-efficiency-job-alert' + stop-jobs: + tag-jobs: + tags: + - 'stopped-by-alert-manager' ...... 
``` diff --git a/examples/cluster-configuration/services-configuration.yaml b/examples/cluster-configuration/services-configuration.yaml index edf605ecca..0d90a94fdd 100644 --- a/examples/cluster-configuration/services-configuration.yaml +++ b/examples/cluster-configuration/services-configuration.yaml @@ -122,15 +122,18 @@ rest-server: # - receiver: pai-email-admin-user-and-stop-job # match: # alertname: PAIJobGpuPercentLowerThan0_3For1h -# customized-receivers: +# customized-receivers: # receivers are combination of several actions # - name: "pai-email-admin-user-and-stop-job" # actions: -# - email-admin -# - email-user -# - stop-jobs -# - tag-jobs -# tags: -# - 'stopped-by-alert-manager' +# # the email template for `email-admin` and `email-user` can be chosen from ['general-templates', 'kill-low-efficiency-job-alert'] +# # if no template specified, 'general-templates' will be used. +# email-admin: +# email-user: +# template: 'kill-low-efficiency-job-alert' +# stop-jobs: # no parameters required for stop-jobs action +# tag-jobs: +# tags: +# - 'stopped-by-alert-manager' # uncomment following if you want to customize prometheus # prometheus: diff --git a/src/alert-manager/config/alert-manager.md b/src/alert-manager/config/alert-manager.md index df0d552325..56c5b3c670 100644 --- a/src/alert-manager/config/alert-manager.md +++ b/src/alert-manager/config/alert-manager.md @@ -55,12 +55,12 @@ After parsing, if you properly configured `email-configs` and `pai-bearer-token` alert-manager: port: 9093 actions-available: - - webportal-notification - - cordon-nodes - - email-admin - - email-user - - stop-jobs - - tag-jobs + - webportal-notification + - cordon-nodes + - email-admin + - email-user + - stop-jobs + - tag-jobs alert-handler: log-level: info port: 9095 @@ -85,8 +85,8 @@ if you didn't configured `alert-handler`, it will be like: alert-manager: port: 9093 actions-available: - - webportal-notification - - cordon-nodes + - webportal-notification + - cordon-nodes
alert-handler: log-level: info port: 9095 diff --git a/src/alert-manager/config/alert-manager.yaml b/src/alert-manager/config/alert-manager.yaml index 88b10eab25..3e5d2d0e22 100644 --- a/src/alert-manager/config/alert-manager.yaml +++ b/src/alert-manager/config/alert-manager.yaml @@ -21,8 +21,8 @@ service_type: "common" port: 9093 actions-available: - - webportal-notification - - cordon-nodes +- webportal-notification +- cordon-nodes alert-handler: log-level: 'info' port: 9095 diff --git a/src/alert-manager/config/alert_manager.py b/src/alert-manager/config/alert_manager.py index fa395f3e9d..aaa59cef0e 100644 --- a/src/alert-manager/config/alert_manager.py +++ b/src/alert-manager/config/alert_manager.py @@ -2,6 +2,7 @@ import copy import collections +import os def update_nested_dict(dict_original, dict_update): """ @@ -28,6 +29,21 @@ def get_master_ip(self): def validation_pre(self): return True, None + def get_email_templates(self): + # get all template folders + templates_path = os.path.abspath(os.path.join(os.path.abspath(__file__), '../../deploy/alert-templates')) + template_dirs = os.listdir(templates_path) + + # the template folder is valid if 'html.ejs' and 'subject.ejs' both exist in the dir + templates = [] + for dir_name in template_dirs: + template_path = os.path.join(templates_path, dir_name) + if os.path.isdir(template_path): + contents = os.listdir(template_path) + if set(['html.ejs', 'subject.ejs']).issubset(set(contents)): + templates.append(dir_name) + return templates + def run(self): result = update_nested_dict(self.default_service_conf, self.service_conf) @@ -43,8 +59,11 @@ def run(self): email_configured = True else: email_configured = False + + if email_configured: + result["alert-handler"]["email-configs"]["templates"] = self.get_email_templates() - # check if `pai-bearer-token` is properly configureds + # check if `pai-bearer-token` is properly configured if result.get("alert-handler") is not None and \ 
result["alert-handler"].get("pai-bearer-token") is not None: token_configured = True diff --git a/src/alert-manager/deploy/alert-manager-configmap.yaml.template b/src/alert-manager/deploy/alert-manager-configmap.yaml.template index 434bc084c9..69755da65d 100644 --- a/src/alert-manager/deploy/alert-manager-configmap.yaml.template +++ b/src/alert-manager/deploy/alert-manager-configmap.yaml.template @@ -77,27 +77,29 @@ data: {% for receiver in cluster_cfg["alert-manager"]["customized-receivers"] %} - name: {{ receiver.name}} webhook_configs: - {% if 'email-admin' in receiver.actions %} - - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin' + {% if receiver["actions"]["email-admin"] is defined %} + {% set template = receiver["actions"]["email-admin"]["template"] %} + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin/?template={{ template }}' send_resolved: true {% endif %} - {% if 'email-user' in receiver.actions %} - - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-user' - send_resolved: true + {% if receiver["actions"]["email-user"] is defined %} + {% set template = receiver["actions"]["email-user"]["template"] %} + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-user/?template={{ template }}' + send_resolved: false http_config: bearer_token: {{ cluster_cfg["alert-manager"]["alert-handler"]["pai-bearer-token"] }} {% endif %} - {% if 'stop-jobs' in receiver.actions %} + {% if receiver["actions"]["stop-jobs"] is defined %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/stop-jobs' send_resolved: false http_config: bearer_token: {{ cluster_cfg["alert-manager"]["alert-handler"]["pai-bearer-token"] }} {% endif %} - {% if 'tag-jobs' in receiver.actions %} - {% 
for tag in receiver["tags"] %} + {% if receiver["actions"]["tag-jobs"] is defined %} + {% for tag in receiver["actions"]["tag-jobs"]["tags"] %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/tag-jobs/{{ tag }}' send_resolved: false http_config: @@ -105,7 +107,7 @@ data: {% endfor %} {% endif %} - {% if 'cordon-nodes' in receiver.actions %} + {% if receiver["actions"]["cordon-nodes"] is defined %} - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/cordon-nodes' send_resolved: false {% endif %} diff --git a/src/alert-manager/deploy/alert-manager-deployment.yaml.template b/src/alert-manager/deploy/alert-manager-deployment.yaml.template index 3cca7feca0..2aea675d72 100755 --- a/src/alert-manager/deploy/alert-manager-deployment.yaml.template +++ b/src/alert-manager/deploy/alert-manager-deployment.yaml.template @@ -67,6 +67,8 @@ spec: value: {{ cluster_cfg["cluster"]["common"]["cluster-id"] }} - name: REST_SERVER_URI value: {{ cluster_cfg['rest-server']['uri'] }} + - name: WEBPORTAL_URI + value: {{ cluster_cfg['webportal']['uri'] }} - name: LOG_LEVEL value: {{ cluster_cfg["alert-manager"]["alert-handler"]["log-level"] }} {% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %} @@ -83,6 +85,9 @@ spec: - name: EMAIL_CONFIGS_SMTP_AUTH_PASSWORD value: {{ cluster_cfg["alert-manager"]["alert-handler"]["email-configs"]["smtp-auth-password"] }} {% endif %} + volumeMounts: + - name: templates-volume + mountPath: /etc/alerthandler/templates {% endif %} imagePullSecrets: - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} @@ -90,6 +95,20 @@ spec: - name: config-volume configMap: name: alertmanager-configmap +{% if cluster_cfg["alert-manager"]["alert-handler"]["configured"] %} +{% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %} + - name: templates-volume + configMap: + name: alert-templates + items: + {% for template in 
cluster_cfg["alert-manager"]["alert-handler"]["email-configs"]["templates"] -%} + - key: {{ template }}-html.ejs + path: {{ template }}/html.ejs + - key: {{ template }}-subject.ejs + path: {{ template }}/subject.ejs + {% endfor -%} +{% endif %} +{% endif %} - name: alertmanager emptyDir: {} tolerations: diff --git a/src/alert-manager/src/alert-handler/emails/general-templates/html.ejs b/src/alert-manager/deploy/alert-templates/general-templates/html.ejs old mode 100755 new mode 100644 similarity index 93% rename from src/alert-manager/src/alert-handler/emails/general-templates/html.ejs rename to src/alert-manager/deploy/alert-templates/general-templates/html.ejs index 98aa7e65e4..c96d35153e --- a/src/alert-manager/src/alert-handler/emails/general-templates/html.ejs +++ b/src/alert-manager/deploy/alert-templates/general-templates/html.ejs @@ -75,8 +75,12 @@ SOFTWARE. - style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; color: #FFF; text-decoration: none; line-height: 2em; font-weight: bold; text-align: center; cursor: pointer; display: inline-block; border-radius: 5px; text-transform: capitalize; background-color: #348eda; margin: 0; border-color: #348eda; border-style: solid; border-width: 10px 20px;">View - in AlertManager + style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; color: #FFF; text-decoration: none; line-height: 2em; font-weight: bold; text-align: center; cursor: pointer; display: inline-block; border-radius: 5px; text-transform: capitalize; background-color: #348eda; margin: 5px; border-color: #348eda; border-style: solid; border-width: 10px 20px;"> + View in AlertManager + + + style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; color: #FFF; text-decoration: none; line-height: 2em; font-weight: bold; text-align: center; cursor: pointer; display: inline-block; border-radius: 5px; 
text-transform: capitalize; background-color: #348eda; margin: 5px; border-color: #348eda; border-style: solid; border-width: 10px 20px;"> + Troubleshooting <% if (alerts.filter( element => element.status =="firing").length > 0) { %> diff --git a/src/alert-manager/src/alert-handler/emails/general-templates/subject.ejs b/src/alert-manager/deploy/alert-templates/general-templates/subject.ejs old mode 100755 new mode 100644 similarity index 100% rename from src/alert-manager/src/alert-handler/emails/general-templates/subject.ejs rename to src/alert-manager/deploy/alert-templates/general-templates/subject.ejs diff --git a/src/alert-manager/deploy/alert-templates/kill-low-efficiency-job-alert/html.ejs b/src/alert-manager/deploy/alert-templates/kill-low-efficiency-job-alert/html.ejs new file mode 100644 index 0000000000..b2b302ec09 --- /dev/null +++ b/src/alert-manager/deploy/alert-templates/kill-low-efficiency-job-alert/html.ejs @@ -0,0 +1,34 @@ + + + + + + Dear OpenPAI user: +
+
+ Your jobs in OpenPAI cluster <%= cluster_id %> have very low GPU utilization. + +

The jobs will be killed automatically by OpenPAI services.

+ + Related Job(s): +
+
+ <% alerts.filter( element => element.status =="firing").forEach(function(alert){ %> + /job-detail.html?username=<%= alert.labels.job_name.split("~")[0] %>&jobName=<%= alert.labels.job_name.split("~")[1] %>> + <%= alert.labels.job_name.split("~")[1] %> + : <%= alert.annotations.summary %> +
+ <% }); %> +
+
+ + Thanks for your understanding, +
+
+ OpenPAI Admin Team + + + \ No newline at end of file diff --git a/src/alert-manager/deploy/alert-templates/kill-low-efficiency-job-alert/subject.ejs b/src/alert-manager/deploy/alert-templates/kill-low-efficiency-job-alert/subject.ejs new file mode 100644 index 0000000000..495bde2706 --- /dev/null +++ b/src/alert-manager/deploy/alert-templates/kill-low-efficiency-job-alert/subject.ejs @@ -0,0 +1 @@ +<%= cluster_id %>: Low Efficiency Job Alert \ No newline at end of file diff --git a/src/alert-manager/deploy/service.yaml b/src/alert-manager/deploy/service.yaml index c87ab5ce01..160c3be7d2 100644 --- a/src/alert-manager/deploy/service.yaml +++ b/src/alert-manager/deploy/service.yaml @@ -26,6 +26,7 @@ prerequisite: template-list: - alert-manager-deployment.yaml - alert-manager-configmap.yaml + - start.sh start-script: start.sh stop-script: stop.sh diff --git a/src/alert-manager/deploy/start.sh b/src/alert-manager/deploy/start.sh.template old mode 100755 new mode 100644 similarity index 72% rename from src/alert-manager/deploy/start.sh rename to src/alert-manager/deploy/start.sh.template index 819f9f4fcb..81a4c7e409 --- a/src/alert-manager/deploy/start.sh +++ b/src/alert-manager/deploy/start.sh.template @@ -19,6 +19,18 @@ pushd $(dirname "$0") > /dev/null +# create configmap for alert-templates +{% if cluster_cfg["alert-manager"]["alert-handler"]["configured"] -%} +{% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] -%} +kubectl create configmap alert-templates \ +{% for template in cluster_cfg["alert-manager"]["alert-handler"]["email-configs"]["templates"] -%} +--from-file={{ template }}-html.ejs=alert-templates/{{ template }}/html.ejs \ +--from-file={{ template }}-subject.ejs=alert-templates/{{ template }}/subject.ejs \ +{% endfor -%} +--dry-run -o yaml | kubectl apply --overwrite=true -f - || exit $? +{% endif -%} +{% endif -%} + kubectl apply --overwrite=true -f rbac.yaml || exit $?
kubectl apply --overwrite=true -f alert-manager-configmap.yaml || exit $? kubectl apply --overwrite=true -f alert-manager-deployment.yaml || exit $? diff --git a/src/alert-manager/deploy/stop.sh b/src/alert-manager/deploy/stop.sh index ae257b931f..aac3c9bfd0 100644 --- a/src/alert-manager/deploy/stop.sh +++ b/src/alert-manager/deploy/stop.sh @@ -17,6 +17,8 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +kubectl delete --ignore-not-found --now configmap/alert-templates kubectl delete --ignore-not-found --now configmap/alertmanager kubectl delete --ignore-not-found --now deployment/alertmanager diff --git a/src/alert-manager/src/alert-handler/controllers/mail.js b/src/alert-manager/src/alert-handler/controllers/mail.js index 17014cd68d..5a9a970499 100755 --- a/src/alert-manager/src/alert-handler/controllers/mail.js +++ b/src/alert-manager/src/alert-handler/controllers/mail.js @@ -19,6 +19,7 @@ const axios = require('axios'); const nodemailer = require('nodemailer'); const Email = require('email-templates'); const logger = require('@alert-handler/common/logger'); +const path = require('path'); // create reusable transporter object using the default SMTP transport const transporter = nodemailer.createTransport({ @@ -44,11 +45,21 @@ const email = new Email({ }, }); +// OpenPAI handbook troubleshooting +const troubleshootingURL = + 'https://openpai.readthedocs.io/en/latest/manual/cluster-admin/troubleshooting.html'; + // send email to admin const sendEmailToAdmin = (req, res) => { + logger.info( + 'alert-handler received `send-email-to-admin` post request from alert-manager.', + ); + const template = req.params.template + ? 
req.params.template + : 'general-templates'; email .send({ - template: 'general-templates', + template: path.join('/etc/alerthandler/templates/', template), message: { to: process.env.EMAIL_CONFIGS_ADMIN_RECEIVER, }, @@ -57,6 +68,8 @@ const sendEmailToAdmin = (req, res) => { alerts: req.body.alerts, groupLabels: req.body.groupLabels, externalURL: req.body.externalURL, + webportalURL: process.env.WEBPORTAL_URI, + troubleshootingURL: troubleshootingURL, }, }) .then(() => { @@ -103,6 +116,9 @@ const getUserEmail = async (username, token) => { // send email to job user const sendEmailToUser = async (req, res) => { + logger.info( + 'alert-handler received `send-email-to-user` post request from alert-manager.', + ); // filter alerts which are firing and contain `job_name` as label const alerts = req.body.alerts.filter( (alert) => alert.status === 'firing' && 'job_name' in alert.labels, @@ -137,6 +153,9 @@ const sendEmailToUser = async (req, res) => { } }); + const template = req.params.template + ? 
req.params.template + : 'general-templates'; if (alertsGrouped) { // send emails to different users separately Promise.all( @@ -144,7 +163,7 @@ const sendEmailToUser = async (req, res) => { const userEmail = await getUserEmail(username, req.token); if (userEmail) { email.send({ - template: 'general-templates', + template: path.join('/etc/alerthandler/templates/', template), message: { to: userEmail, }, @@ -153,6 +172,8 @@ const sendEmailToUser = async (req, res) => { alerts: alertsGrouped[username], groupLabels: req.body.groupLabels, externalURL: req.body.externalURL, + webportalURL: process.env.WEBPORTAL_URI, + troubleshootingURL: troubleshootingURL, }, }); } else { @@ -161,15 +182,15 @@ const sendEmailToUser = async (req, res) => { }), ) .then((response) => { - logger.info('alert-handler successfully send emails'); + logger.info('alert-handler successfully send emails to users'); res.status(200).json({ - message: `alert-handler successfully send emails`, + message: `alert-handler successfully send emails to users`, }); }) .catch((error) => { logger.error(error); res.status(500).json({ - message: `alert-handler failed to send email`, + message: `alert-handler failed to send email to users`, }); }); } From f4e4b47d0d5e850c616e83afab0fe8f008b4c307 Mon Sep 17 00:00:00 2001 From: Mingliang Tao Date: Fri, 13 Nov 2020 09:49:20 +0800 Subject: [PATCH 10/12] Disable stop button immediately after click (#5079) --- .../app/job/job-view/fabric/JobList/index.jsx | 72 +++++++++---------- .../app/job/job-view/fabric/job-detail.jsx | 7 +- 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/src/webportal/src/app/job/job-view/fabric/JobList/index.jsx b/src/webportal/src/app/job/job-view/fabric/JobList/index.jsx index 8a1bc7ee9d..931129a8f4 100644 --- a/src/webportal/src/app/job/job-view/fabric/JobList/index.jsx +++ b/src/webportal/src/app/job/job-view/fabric/JobList/index.jsx @@ -12,7 +12,7 @@ import React, { useEffect, useRef, } from 'react'; -import { debounce, 
isEmpty } from 'lodash'; +import { debounce, isEmpty, cloneDeep } from 'lodash'; import { ColorClassNames, getTheme } from '@uifabric/styling'; import { Fabric } from 'office-ui-fabric-react/lib/Fabric'; @@ -39,7 +39,6 @@ export default function JobList() { const admin = userAuth.checkAdmin(); const username = cookies.get('user'); - const [allJobs, setAllJobs] = useState(null); const [selectedJobs, setSelectedJobs] = useState([]); const [error, setError] = useState(null); @@ -164,42 +163,43 @@ export default function JobList() { applyOrdering(ordering); }, [applyOrdering, ordering]); - const stopJob = useCallback( - (...jobs) => { - userAuth.checkToken(token => { - jobs.forEach(job => { - const { name, username } = job; - const client = new PAIV2.OpenPAIClient({ - rest_server_uri: new URL( - webportalConfig.restServerUri, - window.location.href, - ), - username: username, - token: token, - https: window.location.protocol === 'https:', - }); - client.job - .updateJobExecutionType(username, name, 'STOP') - .then(() => { - job.executionType = 'STOP'; - delete job._statusText; - delete job._statusIndex; - setAllJobs(allJobs.slice()); - }) - .catch(err => { - if (err.data.code === 'UnauthorizedUserError') { - alert(err.data.message); - clearToken(); - } else { - alert(err.data.message || err.message); - throw new Error(err.data.message || err.message); - } - }); + const stopJob = (...jobs) => { + userAuth.checkToken(token => { + jobs.forEach(job => { + const { name, username } = job; + const client = new PAIV2.OpenPAIClient({ + rest_server_uri: new URL( + webportalConfig.restServerUri, + window.location.href, + ), + username: username, + token: token, + https: window.location.protocol === 'https:', }); + client.job + .updateJobExecutionType(username, name, 'STOP') + .then(() => { + job.executionType = 'STOP'; + delete job._statusText; + delete job._statusIndex; + const newFilteredJobsInfo = cloneDeep(filteredJobsInfo); + setFilteredJobsInfo(newFilteredJobsInfo); + }) + 
.catch(err => { + if (err.data && err.data.code === 'UnauthorizedUserError') { + alert(err.data.message); + clearToken(); + } else if (err.data) { + alert(err.data.message); + throw new Error(err.data.message); + } else { + alert(err.message); + throw new Error(err.message); + } + }); }); - }, - [allJobs], - ); + }); + }; const getJobs = async query => { const token = userAuth.checkToken(); diff --git a/src/webportal/src/app/job/job-view/fabric/job-detail.jsx b/src/webportal/src/app/job/job-view/fabric/job-detail.jsx index 12befdb18f..9ed39fc3c6 100644 --- a/src/webportal/src/app/job/job-view/fabric/job-detail.jsx +++ b/src/webportal/src/app/job/job-view/fabric/job-detail.jsx @@ -15,7 +15,7 @@ // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import { capitalize, isEmpty, isNil, get } from 'lodash'; +import { capitalize, isEmpty, isNil, get, cloneDeep } from 'lodash'; import { DateTime, Interval } from 'luxon'; import { MessageBar, @@ -162,6 +162,11 @@ class JobDetail extends React.Component { async stop() { await stopJob(); + const newJobInfo = cloneDeep(this.state.jobInfo); + newJobInfo.jobStatus.executionType = 'STOP'; + this.setState({ + jobInfo: newJobInfo, + }); await this.reload(); } From 67c6ce17a69e2c327d9ce215c75ab7b90b8246f2 Mon Sep 17 00:00:00 2001 From: AmberMsy <46340789+AmberMsy@users.noreply.github.com> Date: Fri, 13 Nov 2020 10:25:51 +0800 Subject: [PATCH 11/12] Update CPU_500Task_MNIST.yaml (#5080) --- examples/mnist_500_tasks/yaml/CPU_500Task_MNIST.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/mnist_500_tasks/yaml/CPU_500Task_MNIST.yaml b/examples/mnist_500_tasks/yaml/CPU_500Task_MNIST.yaml index 7bdd5dffb7..c01c9d4ede 100644 --- a/examples/mnist_500_tasks/yaml/CPU_500Task_MNIST.yaml +++ b/examples/mnist_500_tasks/yaml/CPU_500Task_MNIST.yaml @@ -14,8 +14,8 @@ taskRoles: 
taskRetryCount: 0 dockerImage: docker_image_0 resourcePerInstance: - gpu: 1 - cpu: 5 + gpu: 0 + cpu: 1 memoryMB: 51200 commands: - >- From 5f12ee5f06ec0dd05a6c140a4880a7acf3d3dd53 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 13 Nov 2020 13:44:36 +0800 Subject: [PATCH 12/12] Support CPU jobs on GPU nodes in HiveD scheduler (#5075) * Support CPU jobs on GPU nodes in HiveD scheduler. * Add docs. --- .../how-to-set-up-virtual-clusters.md | 55 +++++++++++++++++++ src/rest-server/src/models/v2/job/k8s.js | 35 ++++++------ 2 files changed, 72 insertions(+), 18 deletions(-) diff --git a/docs/manual/cluster-admin/how-to-set-up-virtual-clusters.md b/docs/manual/cluster-admin/how-to-set-up-virtual-clusters.md index 6fdcdd2280..54f8669acb 100644 --- a/docs/manual/cluster-admin/how-to-set-up-virtual-clusters.md +++ b/docs/manual/cluster-admin/how-to-set-up-virtual-clusters.md @@ -253,6 +253,61 @@ hivedscheduler: In the above example, we set up 2 VCs: `default` and `v100`. The `default` VC has 2 K80 nodes, and `V100` VC has 3 V100 nodes. Every K80 node has 4 K80 GPUs and Every V100 nodes has 4 V100 GPUs. +## Configure CPU and GPU SKU on the Same Node + +If you want to configure both CPU and GPU SKU types on the same node, you can use the same `cellAddress` for different `cellTypes`. Here is an example:
+ +```yaml +hivedscheduler: + config: | + physicalCluster: + skuTypes: + GPU: + gpu: 1 + cpu: 4 + memory: 40960Mi + CPU: + gpu: 0 + cpu: 1 + memory: 10240Mi + cellTypes: + GPU-NODE: + childCellType: GPU + childCellNumber: 4 + isNodeLevel: true + GPU-NODE-POOL: + childCellType: GPU-NODE + childCellNumber: 2 + CPU-NODE: + childCellType: CPU + childCellNumber: 12 + isNodeLevel: true + CPU-NODE-POOL: + childCellType: CPU-NODE + childCellNumber: 2 + physicalCells: + - cellType: GPU-NODE-POOL + cellChildren: + - cellAddress: node1 + - cellAddress: node2 + - cellType: CPU-NODE-POOL + cellChildren: + - cellAddress: node1 + - cellAddress: node2 + virtualClusters: + default: + virtualCells: + - cellType: GPU-NODE-POOL.GPU-NODE + cellNumber: 2 + cpu: + virtualCells: + - cellType: CPU-NODE-POOL.CPU-NODE + cellNumber: 2 +``` + +Currently, mixing CPU and GPU types is only supported on an NVIDIA GPU node or an AMD GPU node; +the rare case of a single node with both NVIDIA and AMD cards is not supported. + ## Use Pinned Cell to Reserve Certain Node in a Virtual Cluster In some cases, you might want to reserve a certain node in a virtual cluster, and submit job to this node explicitly for debugging or quick testing. OpenPAI provides you with a way to "pin" a node to a virtual cluster. 
diff --git a/src/rest-server/src/models/v2/job/k8s.js b/src/rest-server/src/models/v2/job/k8s.js index 67d64f0d24..9c94d4fc45 100644 --- a/src/rest-server/src/models/v2/job/k8s.js +++ b/src/rest-server/src/models/v2/job/k8s.js @@ -626,10 +626,7 @@ const generateTaskRole = ( : frameworkTaskRole.taskNumber, }; // check cpu job - if ( - !launcherConfig.enabledHived && - config.taskRoles[taskRole].resourcePerInstance.gpu === 0 - ) { + if (config.taskRoles[taskRole].resourcePerInstance.gpu === 0) { frameworkTaskRole.task.pod.spec.containers[0].env.push({ name: 'NVIDIA_VISIBLE_DEVICES', value: 'none', @@ -647,24 +644,26 @@ const generateTaskRole = ( frameworkTaskRole.task.pod.metadata.annotations[ 'hivedscheduler.microsoft.com/pod-scheduling-spec' ] = yaml.safeDump(config.taskRoles[taskRole].hivedPodSpec); - frameworkTaskRole.task.pod.spec.containers[0].env.push( - { - name: 'NVIDIA_VISIBLE_DEVICES', - valueFrom: { - fieldRef: { - fieldPath: `metadata.annotations['hivedscheduler.microsoft.com/pod-leaf-cell-isolation']`, + if (config.taskRoles[taskRole].resourcePerInstance.gpu > 0) { + frameworkTaskRole.task.pod.spec.containers[0].env.push( + { + name: 'NVIDIA_VISIBLE_DEVICES', + valueFrom: { + fieldRef: { + fieldPath: `metadata.annotations['hivedscheduler.microsoft.com/pod-leaf-cell-isolation']`, + }, }, }, - }, - { - name: 'PAI_AMD_VISIBLE_DEVICES', - valueFrom: { - fieldRef: { - fieldPath: `metadata.annotations['hivedscheduler.microsoft.com/pod-leaf-cell-isolation']`, + { + name: 'PAI_AMD_VISIBLE_DEVICES', + valueFrom: { + fieldRef: { + fieldPath: `metadata.annotations['hivedscheduler.microsoft.com/pod-leaf-cell-isolation']`, + }, }, }, - }, - ); + ); + } } return frameworkTaskRole;