Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Cherry-pick commits into release branch #5612

Merged
merged 17 commits into from
Sep 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ OpenPAI provides end-to-end manuals for both cluster users and administrators.

### For cluster administrators

The [admin manual](https://openpai.readthedocs.io/en/latest/manual/cluster-admin/README.html) is a comprehensive guide for cluster administrators, it covers (but not limited to) the following contents:
The [admin manual](https://openpai.readthedocs.io/en/latest/manual/cluster-admin/index.html) is a comprehensive guide for cluster administrators. It covers (but is not limited to) the following contents:

- **Installation and upgrade**. The installation is based on Kubespray, and here are the [system requirements](https://openpai.readthedocs.io/en/latest/manual/cluster-admin/installation-guide.html#installation-requirements). OpenPAI provides an [installation guide](https://openpai.readthedocs.io/en/latest/manual/cluster-admin/installation-guide.html) to facilitate the installation.

Expand Down Expand Up @@ -140,7 +140,7 @@ The [admin manual](https://openpai.readthedocs.io/en/latest/manual/cluster-admin

### For cluster users

The [user manual](https://openpai.readthedocs.io/en/latest/manual/cluster-user/README.html) is a guidance for cluster users, who could train and serve deep learning (and other) tasks on OpenPAI.
The [user manual](https://openpai.readthedocs.io/en/latest/manual/cluster-user/index.html) is a guide for cluster users, who can train and serve deep learning (and other) tasks on OpenPAI.

- **Job submission and monitoring**. The [quick start tutorial](https://openpai.readthedocs.io/en/latest/manual/cluster-user/quick-start.html) is a good starting point for learning how to train models on OpenPAI. More examples, with support for multiple mainstream frameworks (out-of-the-box docker images), are available [here](https://openpai.readthedocs.io/en/latest/manual/cluster-user/docker-images-and-job-examples.html). OpenPAI also provides support for [good debuggability](https://openpai.readthedocs.io/en/latest/manual/cluster-user/how-to-debug-jobs.html) and [advanced job functionalities](https://openpai.readthedocs.io/en/latest/manual/cluster-user/advanced-jobs.html).

Expand Down
8 changes: 1 addition & 7 deletions contrib/kubespray/docker-cache-config-distribute.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,4 @@
roles:
- role: '../roles/docker-cache/install'
vars:
enable_docker_cache: true
docker_cache_host: "{{ hostvars[groups['kube-master'][0]]['ip'] }}:30500"
tasks:
- name: Restart service docker config from /etc/docker/daemon.json after update
ansible.builtin.systemd:
name: docker
state: restarted
docker_cache_host: "{{ hostvars[groups['kube-master'][0]]['ip'] }}:30500"
2 changes: 1 addition & 1 deletion contrib/kubespray/quick-start-kubespray.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ echo "Performing pre-installation..."
ansible-playbook -i ${HOME}/pai-deploy/cluster-cfg/hosts.yml pre-installation.yml || exit $?

echo "Performing docker-cache config distribution..."
ansible-playbook -i ${HOME}/pai-deploy/cluster-cfg/hosts.yml docker-cache-config-distribute.yml || exit $?
ansible-playbook -i ${HOME}/pai-deploy/cluster-cfg/hosts.yml docker-cache-config-distribute.yml -e "@${CLUSTER_CONFIG}" || exit $?

echo "Starting kubernetes..."
/bin/bash script/kubernetes-boot.sh || exit $?
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ cluster:
data-path: "/datastorage"
qos-switch: "{{ env["cfg"]["qos-switch"] | default('false') }}"
docker-data-root: "{{ env['cfg']['docker_data_root'] | default('/mnt/docker') }}"
prometheus-pushgateway: false
marketplace: "{{ env["cfg"]["enable_marketplace"] | default('false') }}"

# the docker registry to store docker images that contain system services like frameworklauncher, hadoop, etc.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@ def main():
backup_path = Path("/etc/docker/daemon.json.bk")

folder_path.mkdir(parents=True, exist_ok=True)
target_path.touch(mode=0o666)
backup_path.touch(mode=0o666)

with open(str(target_path)) as f:
current_config = json.load(f);

with open(str(backup_path), 'w') as f:
json.dump(current_config, f)
if target_path.exists() and target_path.stat().st_size:
backup_path.touch(mode=0o666)
with open(str(target_path)) as f:
current_config = json.load(f)
with open(str(backup_path), 'w') as f:
json.dump(current_config, f)
else:
target_path.touch(mode=0o666)
current_config = {}

docker_cache_mirror = "http://{}".format(args.host)
if "registry-mirrors" in current_config:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,8 @@
- python3
- /tmp/add_docker_cache_config.py
- "{{ docker_cache_host }}"

- name: Restart service docker config from /etc/docker/daemon.json after update
ansible.builtin.systemd:
name: docker
state: restarted
7 changes: 3 additions & 4 deletions contrib/kubespray/script/environment.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,11 @@ sudo python3 -m pip install -r script/requirements.txt
echo "Install sshpass"
sudo apt-get -y install sshpass

echo "Install kubespray's requirements and ansible is included"
sudo python3 -m pip install -r ${HOME}/pai-deploy/kubespray/requirements.txt

# ansible 2.7 doesn't support distribution info collection on Ubuntu 20.04
# Use ansible 2.9.7 as a workaround
# Reference: https://stackoverflow.com/questions/61460151/ansible-not-reporting-distribution-info-on-ubuntu-20-04
# We can upgrade kubespray version to avoid this issue in the future.
sudo python3 -m pip install ansible==2.9.7
sed -i 's/ansible==.*/ansible==2.9.7/' ${HOME}/pai-deploy/kubespray/requirements.txt

echo "Install kubespray's requirements and ansible is included"
sudo python3 -m pip install -r ${HOME}/pai-deploy/kubespray/requirements.txt
2 changes: 1 addition & 1 deletion contrib/kubespray/script/service-boot.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ echo "Starting OpenPAI service with dev-box..."
sudo docker exec -w /mnt/pai dev-box-quick-start /bin/bash ./contrib/kubespray/script/start-service-in-dev-box.sh

# print cluster info
WEBPORTAL_URL=http:$(kubectl config view -o jsonpath='{.clusters[].cluster.server}' | cut -d ":" -f 2)
WEBPORTAL_URL=http:$(sudo docker exec dev-box-quick-start kubectl config view -o jsonpath='{.clusters[].cluster.server}' | cut -d ":" -f 2)
echo ""
echo "OpenPAI is successfully deployed, please check the following information:"
echo "Kubernetes cluster config : ~/pai-deploy/kube/config"
Expand Down
12 changes: 6 additions & 6 deletions contrib/submit-job-v2/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3737,9 +3737,9 @@ path-key@^2.0.0, path-key@^2.0.1:
integrity sha1-QRyttXTFoUDTpLGRDUDYDMn0C0A=

path-parse@^1.0.6:
version "1.0.6"
resolved "https://registry.yarnpkg.com/path-parse/-/path-parse-1.0.6.tgz#d62dbb5679405d72c4737ec58600e9ddcf06d24c"
integrity sha512-GSmOT2EbHrINBf9SR7CDELwlJ8AENk3Qn7OikK4nFYAu3Ote2+JYNVvkpAEQm3/TLNEJFD/xZJjzyxg3KBWOzw==
version "1.0.7"
resolved "https://registry.yarnpkg.com/path-parse/-/path-parse-1.0.7.tgz#fbc114b60ca42b30d9daf5858e4bd68bbedb6735"
integrity sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==

path-to-regexp@0.1.7:
version "0.1.7"
Expand Down Expand Up @@ -3871,9 +3871,9 @@ postcss-value-parser@^3.3.0, postcss-value-parser@^3.3.1:
integrity sha512-pISE66AbVkp4fDQ7VHBwRNXzAAKJjw4Vw7nWI/+Q3vuly7SNfgYXvm6i5IgFylHGK5sP/xHAbB7N49OS4gWNyQ==

postcss@^7.0.14, postcss@^7.0.5, postcss@^7.0.6:
version "7.0.14"
resolved "https://registry.yarnpkg.com/postcss/-/postcss-7.0.14.tgz#4527ed6b1ca0d82c53ce5ec1a2041c2346bbd6e5"
integrity sha512-NsbD6XUUMZvBxtQAJuWDJeeC4QFsmWsfozWxCJPWf3M55K9iu2iMDaKqyoOdTJ1R4usBXuxlVFAIo8rZPQD4Bg==
version "7.0.36"
resolved "https://registry.yarnpkg.com/postcss/-/postcss-7.0.36.tgz#056f8cffa939662a8f5905950c07d5285644dfcb"
integrity sha512-BebJSIUMwJHRH0HAQoxN4u1CN86glsrwsW0q7T+/m44eXOUAxSNdHRkNZPYz5vVUbg17hFgOQDE7fZk7li3pZw==
dependencies:
chalk "^2.4.2"
source-map "^0.6.1"
Expand Down
29 changes: 0 additions & 29 deletions docs/manual/cluster-admin/configuration-for-china.md

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ Sometimes it is not fixable even you have the `python3-apt` package installed. I

#### Network-related Issues

If you are a China user, please refer to [here](./configuration-for-china.md).
If you are located in China, please refer to [this issue](https://github.com/microsoft/pai/issues/5592).

**Cannot download kubeadm or hyperkube**

Expand Down
2 changes: 1 addition & 1 deletion docs/manual/cluster-admin/installation-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ Please edit `layout.yaml` and a `config.yaml` file under `<pai-code-dir>/contrib
These two files specify the cluster layout and the customized configuration, respectively.
The following is the format and example of these 2 files.

**Tips for Chinese Users**: If you are in Mainland China, please refer to [here](./configuration-for-china.md) first before you edit these files.
**Tips for Chinese Users**: If you are in Mainland China, please read [this issue](https://github.com/microsoft/pai/issues/5592) first before you edit these files.

#### `layout.yaml` format

Expand Down
29 changes: 0 additions & 29 deletions docs_zh_CN/manual/cluster-admin/configuration-for-china.md

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ sudo chmod 644 /etc/hosts

#### 网络相关的问题

如果您是中国用户,请先参考[这个文档](./configuration-for-china.md).
如果您是中国用户,请先参考[这个issue](https://github.com/microsoft/pai/issues/5592).

**无法下载kubeadm或hyperkube二进制文件**

Expand Down
2 changes: 1 addition & 1 deletion docs_zh_CN/manual/cluster-admin/installation-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ git checkout v1.8.0

#### 关于中国用户的提示

如果您是中国用户,在编辑这两个文件前,请先阅读[这个文档](./configuration-for-china.md)。
在中国安装会有一些网络问题,在开始前,请先阅读[这个issue](https://github.com/microsoft/pai/issues/5592)。

#### <div id="layoutyaml-format">`layout.yaml` 格式示例</div>

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% set interval = cluster_cfg["prometheus"]["scrape_interval"]|default(30) * 10 %}
{% set interval = cluster_cfg["prometheus"]["scrape_interval"]|default(30) * 2 %}

{"dashboard": {
"annotations": {
Expand All @@ -14,7 +14,7 @@
}
]
},
"editable": false,
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"hideControls": true,
Expand Down Expand Up @@ -466,7 +466,7 @@
"steppedLine": false,
"targets": [
{
"expr": "100 - avg (irate(node_cpu_seconds_total{mode=\"idle\"}[{{interval}}s])) * 100",
"expr": "avg ((sum by (instance) (idelta(node_cpu_seconds_total{}[{{interval}}s])) > bool 0) * (100 - (avg by (instance)(irate(node_cpu_seconds_total{mode=\"idle\"}[{{interval}}s])) * 100)))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "cpu utilization",
Expand Down Expand Up @@ -553,14 +553,63 @@
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "free",
"refId": "D"
"refId": "B"
},
{
"expr": "sum(node_memory_Buffers_bytes) + sum(node_memory_Cached_bytes)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "buff/cache",
"refId": "B"
"refId": "C"
},
{
"expr": "sum(node_memory_bytes{type=\"physical_total\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "physical total",
"refId": "D"
},
{
"expr": "sum(node_memory_bytes{type=\"physical_available\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "physical available",
"refId": "E"
},
{
"expr": "sum(node_memory_bytes{type=\"committed\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "committed",
"refId": "F"
},
{
"expr": "sum(node_memory_bytes{type=\"commit_limit\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "commit limit",
"refId": "G"
},
{
"expr": "sum(node_memory_bytes{type=\"system_cache\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "system cache",
"refId": "H"
},
{
"expr": "sum(node_memory_bytes{type=\"kernel_paged\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "kernel paged",
"refId": "I"
},
{
"expr": "(node_memory_bytes{type=\"kernel_non_paged\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "kernel non paged",
"refId": "J"
}
],
"thresholds": [],
Expand Down Expand Up @@ -631,14 +680,14 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[{{interval}}s]))",
"expr": "sum(irate(node_network_receive_bytes_total{device!~\"lo\"}[{{interval}}s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "in",
"refId": "A"
},
{
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[{{interval}}s]))",
"expr": "sum(irate(node_network_transmit_bytes_total{device!~\"lo\"}[{{interval}}s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "out",
Expand Down Expand Up @@ -725,18 +774,25 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(node_disk_read_bytes_total[{{interval}}s]))",
"expr": "sum(irate(node_disk_read_bytes_total[{{interval}}s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "read",
"refId": "A"
},
{
"expr": "sum(rate(node_disk_written_bytes_total[{{interval}}s]))",
"expr": "sum(irate(node_disk_written_bytes_total[{{interval}}s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "write",
"refId": "B"
},
{
"expr": "sum(irate(node_disk_other_bytes_total[{{interval}}s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "other",
"refId": "C"
}
],
"thresholds": [],
Expand Down
Loading