Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the CI to build multi-platform container images #1956

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions .github/workflows/build-and-publish-images.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Reusable workflow for building and publishing Katib images.
name: Build And Publish Images

# Entry point: this workflow is only started by other workflows through
# `workflow_call`. Callers pass one component per invocation.
on:
  workflow_call:
    inputs:
      component-name:
        # Used as the image name under docker.io/kubeflowkatib/ and in step names.
        description: Name of the Katib component whose image is built
        required: true
        type: string
      platforms:
        description: >-
          Comma-separated target platforms,
          e.g. linux/amd64 or linux/amd64,linux/arm64
        required: true
        type: string
      dockerfile:
        description: Path to the Dockerfile, relative to the repository root
        required: true
        type: string
    secrets:
      # Both secrets are optional: they are only read by the Docker Login step,
      # which is skipped for runs that do not publish.
      DOCKERHUB_USERNAME:
        description: Docker Hub user name used to log in before pushing
        required: false
      DOCKERHUB_TOKEN:
        description: Docker Hub access token used to log in before pushing
        required: false

# Single job: build the component image and either push it (publishing refs of
# kubeflow/katib) or only build it as a test (every other run).
jobs:
  build-and-publish:
    name: Publish Image
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v2

      # Run this step only for the kubeflow/katib repository on a publishing
      # ref: branch master, branch release-*, or tag v*.
      # The condition must stay identical to the one on the publish step below.
      - name: Docker Login
        if: >-
          github.repository == 'kubeflow/katib' &&
          (github.ref == 'refs/heads/master' ||
          startsWith(github.ref, 'refs/heads/release-') ||
          startsWith(github.ref, 'refs/tags/v'))
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      # Build and push. Same guard as Docker Login: kubeflow/katib only, on
      # branch master, branch release-*, or tag v*.
      # The step id is referenced by the test-build step to detect a skip.
      - name: Publish Component ${{ inputs.component-name }}
        id: publish
        if: >-
          github.repository == 'kubeflow/katib' &&
          (github.ref == 'refs/heads/master' ||
          startsWith(github.ref, 'refs/heads/release-') ||
          startsWith(github.ref, 'refs/tags/v'))
        uses: ./.github/workflows/template-publish-image
        with:
          image: docker.io/kubeflowkatib/${{ inputs.component-name }}
          dockerfile: ${{ inputs.dockerfile }}
          platforms: ${{ inputs.platforms }}
          push: true

      # Fallback for runs where the publish step was skipped: run the same
      # build with push disabled, so the Dockerfile is still exercised.
      - name: Test Build For Component ${{ inputs.component-name }}
        if: steps.publish.outcome == 'skipped'
        uses: ./.github/workflows/template-publish-image
        with:
          image: docker.io/kubeflowkatib/${{ inputs.component-name }}
          dockerfile: ${{ inputs.dockerfile }}
          platforms: ${{ inputs.platforms }}
          push: false
1 change: 0 additions & 1 deletion .github/workflows/lint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,3 @@ jobs:

- name: Check YAML
run: make yamllint

29 changes: 10 additions & 19 deletions .github/workflows/publish-algorithm-images.yaml
Original file line number Diff line number Diff line change
@@ -1,29 +1,20 @@
name: Publish AutoML Algorithm Images

on:
push:
branches:
- master

env:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
- push
- pull_request

jobs:
algorithm:
name: Publish Image
# Trigger workflow only for kubeflow/katib repository.
if: github.repository == 'kubeflow/katib'
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2

- name: Publish Component ${{ matrix.component-name }}
uses: ./.github/workflows/template-publish-image
with:
image: docker.io/kubeflowkatib/${{ matrix.component-name }}
dockerfile: ${{ matrix.dockerfile }}
uses: ./.github/workflows/build-and-publish-images.yaml
with:
component-name: ${{ matrix.component-name }}
platforms: linux/amd64,linux/arm64
dockerfile: ${{ matrix.dockerfile }}
secrets:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}

strategy:
fail-fast: false
Expand Down
28 changes: 10 additions & 18 deletions .github/workflows/publish-core-images.yaml
Original file line number Diff line number Diff line change
@@ -1,29 +1,21 @@
name: Publish Katib Core Images

on:
push:
branches:
- master

env:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
- push
- pull_request

jobs:
core:
name: Publish Image
# Trigger workflow only for kubeflow/katib repository.
if: github.repository == 'kubeflow/katib'
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
uses: ./.github/workflows/build-and-publish-images.yaml
with:
component-name: ${{ matrix.component-name }}
platforms: linux/amd64,linux/arm64
dockerfile: ${{ matrix.dockerfile }}
secrets:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}

- name: Publish Component ${{ matrix.component-name }}
uses: ./.github/workflows/template-publish-image
with:
image: docker.io/kubeflowkatib/${{ matrix.component-name }}
dockerfile: ${{ matrix.dockerfile }}

strategy:
fail-fast: false
Expand Down
38 changes: 19 additions & 19 deletions .github/workflows/publish-trial-images.yaml
Original file line number Diff line number Diff line change
@@ -1,49 +1,49 @@
name: Publish Trial Images

on:
push:
branches:
- master

env:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
- push
- pull_request

jobs:
trial:
name: Publish Image
# Trigger workflow only for kubeflow/katib repository.
if: github.repository == 'kubeflow/katib'
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2

- name: Publish Trial ${{ matrix.trial-name }}
uses: ./.github/workflows/template-publish-image
with:
image: docker.io/kubeflowkatib/${{ matrix.trial-name }}
dockerfile: ${{ matrix.dockerfile }}
uses: ./.github/workflows/build-and-publish-images.yaml
with:
component-name: ${{ matrix.trial-name }}
platforms: ${{ matrix.platforms }}
dockerfile: ${{ matrix.dockerfile }}
secrets:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}

strategy:
fail-fast: false
matrix:
include:
- trial-name: mxnet-mnist
platforms: linux/amd64,linux/arm64
dockerfile: examples/v1beta1/trial-images/mxnet-mnist/Dockerfile
- trial-name: pytorch-mnist-cpu
platforms: linux/amd64,linux/arm64
dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu
- trial-name: pytorch-mnist-gpu
platforms: linux/amd64
Copy link
Member Author

@tenzen-y tenzen-y Sep 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because of the error below, we cannot build multi-platform container images with GPU support.
Once we switch to an AWS self-hosted runner, we will be able to build them.

System.IO.IOException: No space left on device : '/home/runner/runners/2.296.2/_diag/Worker_20220918-065331-utc.log'
   at System.IO.RandomAccess.WriteAtOffset(SafeFileHandle handle, ReadOnlySpan`1 buffer, Int64 fileOffset)
   at System.IO.Strategies.BufferedFileStreamStrategy.FlushWrite()
   at System.IO.StreamWriter.Flush(Boolean flushStream, Boolean flushEncoder)
   at System.Diagnostics.TextWriterTraceListener.Flush()
   at GitHub.Runner.Common.HostTraceListener.WriteHeader(String source, TraceEventType eventType, Int32 id)
   at GitHub.Runner.Common.HostTraceListener.TraceEvent(TraceEventCache eventCache, String source, TraceEventType eventType, Int32 id, String message)
   at System.Diagnostics.TraceSource.TraceEvent(TraceEventType eventType, Int32 id, String message)
   at GitHub.Runner.Worker.Worker.RunAsync(String pipeIn, String pipeOut)
   at GitHub.Runner.Worker.Program.MainAsync(IHostContext context, String[] args)
System.IO.IOException: No space left on device : '/home/runner/runners/2.296.2/_diag/Worker_20220918-065331-utc.log'
   at System.IO.RandomAccess.WriteAtOffset(SafeFileHandle handle, ReadOnlySpan`1 buffer, Int64 fileOffset)
   at System.IO.Strategies.BufferedFileStreamStrategy.FlushWrite()
   at System.IO.StreamWriter.Flush(Boolean flushStream, Boolean flushEncoder)
   at System.Diagnostics.TextWriterTraceListener.Flush()
   at GitHub.Runner.Common.HostTraceListener.WriteHeader(String source, TraceEventType eventType, Int32 id)
   at GitHub.Runner.Common.HostTraceListener.TraceEvent(TraceEventCache eventCache, String source, TraceEventType eventType, Int32 id, String message)
   at System.Diagnostics.TraceSource.TraceEvent(TraceEventType eventType, Int32 id, String message)
   at GitHub.Runner.Common.Tracing.Error(Exception exception)
   at GitHub.Runner.Worker.Program.MainAsync(IHostContext context, String[] args)
Unhandled exception. System.IO.IOException: No space left on device : '/home/runner/runners/2.296.2/_diag/Worker_20220918-065331-utc.log'
   at System.IO.RandomAccess.WriteAtOffset(SafeFileHandle handle, ReadOnlySpan`1 buffer, Int64 fileOffset)
   at System.IO.Strategies.BufferedFileStreamStrategy.FlushWrite()
   at System.IO.StreamWriter.Flush(Boolean flushStream, Boolean flushEncoder)
   at System.Diagnostics.TextWriterTraceListener.Flush()
   at System.Diagnostics.TraceSource.Flush()
   at GitHub.Runner.Common.TraceManager.Dispose(Boolean disposing)
   at GitHub.Runner.Common.TraceManager.Dispose()
   at GitHub.Runner.Common.HostContext.Dispose(Boolean disposing)
   at GitHub.Runner.Common.HostContext.Dispose()
   at GitHub.Runner.Worker.Program.Main(String[] args)

https://github.com/kubeflow/katib/actions/runs/3075994787

dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu
- trial-name: tf-mnist-with-summaries
platforms: linux/amd64,linux/arm64
dockerfile: examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile
- trial-name: enas-cnn-cifar10-gpu
platforms: linux/amd64
dockerfile: examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu
- trial-name: enas-cnn-cifar10-cpu
platforms: linux/amd64,linux/arm64
dockerfile: examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu
- trial-name: darts-cnn-cifar10-cpu
platforms: linux/amd64,linux/arm64
dockerfile: examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile.cpu
- trial-name: darts-cnn-cifar10-gpu
platforms: linux/amd64
dockerfile: examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile.gpu
- trial-name: simple-pbt
platforms: linux/amd64,linux/arm64
dockerfile: examples/v1beta1/trial-images/simple-pbt/Dockerfile
18 changes: 10 additions & 8 deletions .github/workflows/template-e2e-test/action.yaml
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
# Template for e2e tests.
# Composite action for e2e tests.
name: Run E2E Test
description: Run e2e test using the minikube cluster

inputs:
experiments:
required: true
type: string
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

description: comma delimited experiment name
Copy link
Member Author

@tenzen-y tenzen-y Sep 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

training-operator:
required: false
type: boolean
default: false
description: whether to deploy training-operator or not
default: "false"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

trial-images:
required: true
type: string
description: comma delimited trial image name
katib-ui:
required: true
type: boolean
default: false
description: whether to deploy katib-ui or not
default: "false"
database-type:
required: false
type: string
description: mysql or postgres
default: mysql

runs:
Expand Down
27 changes: 16 additions & 11 deletions .github/workflows/template-publish-image/action.yaml
Original file line number Diff line number Diff line change
@@ -1,25 +1,30 @@
# Template run for publishing Katib images.
# Composite action for publishing Katib images.
name: Build And Publish Container Images
description: Build MultiPlatform Supporting Container Images

inputs:
image:
required: true
type: string
description: image tag
dockerfile:
required: true
type: string
description: path for dockerfile
platforms:
required: true
description: linux/amd64 or linux/amd64,linux/arm64
push:
required: true
description: whether to push container images or not

runs:
using: composite
steps:
- name: Set up QEMU
uses: docker/setup-qemu-action@v2

- name: Set Up Docker Buildx
uses: docker/setup-buildx-action@v2

- name: Docker Login
uses: docker/login-action@v1
with:
username: ${{ env.DOCKERHUB_USERNAME }}
password: ${{ env.DOCKERHUB_TOKEN }}

- name: Add Docker Tags
id: meta
uses: docker/metadata-action@v3
Expand All @@ -34,8 +39,8 @@ runs:
with:
context: .
file: ${{ inputs.dockerfile }}
push: true
push: ${{ inputs.push }}
tags: ${{ steps.meta.outputs.tags }}
cache-from: type=gha
cache-to: type=gha,mode=max
platforms: linux/amd64
platforms: ${{ inputs.platforms }}
6 changes: 4 additions & 2 deletions .github/workflows/template-setup-e2e-test/action.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# Template for e2e tests.
# Composite action to setup e2e tests.
name: Setup E2E Test
description: setup env for e2e test using the minikube cluster

inputs:
kubernetes-version:
required: true
type: string
description: kubernetes version

runs:
using: composite
Expand Down