diff --git a/README.md b/README.md index 5065ed5..219502a 100644 --- a/README.md +++ b/README.md @@ -41,4 +41,12 @@ To deploy a signed certificate in cluster follow [trusted cluster cert](signed-c ### Object Storage This solution requires object storage to be in place either through S3 or using Noobaa. -If you are using Noobaa apply the following [tuning paramters](noobaa/README.md) \ No newline at end of file +If you are using Noobaa, apply the following [tuning parameters](noobaa/README.md) + +## How to run 🏃🏼 + +1. Create a K8s ConfigMap and a K8s Secret based on the target model server info. Use [kfp-model-server.yaml](./sdg/kfp-model-server.yaml). + +2. Use the pipeline.py file to generate pipeline.yaml, which is used to create the RHOAI pipeline. + +3. Create a run in RHOAI by providing the required input parameter values. \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index f4e03ff..4577292 100644 --- a/pipeline.py +++ b/pipeline.py @@ -11,9 +11,9 @@ def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): """Wrapper for KFP pipeline, which allows for mocking individual stages.""" if 'sdg' in mock: - from sdg.faked import git_clone_op, sdg_op + from sdg.faked import preflight_check_op, git_clone_op, sdg_op else: - from sdg import git_clone_op, sdg_op + from sdg import preflight_check_op, git_clone_op, sdg_op @dsl.pipeline( @@ -27,9 +27,15 @@ def pipeline( repo_branch: Optional[str] = None, repo_pr: Optional[int] = None, ): + preflight_check_task = preflight_check_op( + repo_branch=repo_branch, repo_pr=repo_pr + ) + use_config_map_as_env(preflight_check_task, K8S_NAME, dict(endpoint="endpoint", model="model")) + use_secret_as_env(preflight_check_task, K8S_NAME, {"api_key": "api_key"}) + git_clone_task = git_clone_op( repo_branch=repo_branch, repo_pr=repo_pr, repo_url=repo_url - ) + ).after(preflight_check_task) sdg_task = sdg_op( num_instructions_to_generate=num_instructions_to_generate, diff --git a/pipeline.yaml b/pipeline.yaml index c999fee..02870f6 
100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -23,6 +23,14 @@ components: artifactType: schemaTitle: system.Dataset schemaVersion: 0.0.1 + comp-preflight-check-op: + executorLabel: exec-preflight-check-op + inputDefinitions: + parameters: + repo_branch: + parameterType: STRING + repo_pr: + parameterType: NUMBER_INTEGER comp-sdg-op: executorLabel: exec-sdg-op inputDefinitions: @@ -49,23 +57,40 @@ deploymentSpec: exec-git-clone-op: container: args: - - 'git clone {{$.inputs.parameters[''repo_url'']}} {{$.outputs.artifacts[''taxonomy''].path}} - && cd {{$.outputs.artifacts[''taxonomy''].path}} && if [ ! -z "{{$.inputs.parameters[''repo_branch'']}}" - ]; then git fetch origin {{$.inputs.parameters[''repo_branch'']}} && git - checkout {{$.inputs.parameters[''repo_branch'']}}; elif [ ! -z "{{$.inputs.parameters[''repo_pr'']}}" - ]; then git fetch origin pull/{{$.inputs.parameters[''repo_pr'']}}/head:{{$.inputs.parameters[''repo_pr'']}} - && git checkout {{$.inputs.parameters[''repo_pr'']}}; fi ' + - --executor_input + - '{{$}}' + - --function_to_execute + - git_clone_op command: - - /bin/sh + - sh - -c - image: registry.access.redhat.com/ubi9/toolbox - exec-sdg-op: + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.8.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef git_clone_op(\n taxonomy: dsl.Output[dsl.Dataset],\n repo_branch:\ + \ str,\n repo_pr: Optional[int],\n repo_url: Optional[str],\n):\n\ + \ return\n\n" + image: registry.access.redhat.com/ubi9/python-311:latest + exec-preflight-check-op: container: args: - --executor_input - '{{$}}' - --function_to_execute - - sdg_op + - preflight_check_op command: - sh - -c @@ -79,6 +104,36 @@ deploymentSpec: - 'program_path=$(mktemp -d) + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef preflight_check_op(\n repo_branch: str,\n repo_pr: Optional[int],\n\ + ):\n pass\n\n" + image: registry.access.redhat.com/ubi9/python-311:latest + exec-sdg-op: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - sdg_op + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.8.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\ + \ python3 -m pip install --quiet --no-warn-script-location 'git+https://github.com/redhat-et/ilab-on-ocp.git#subdirectory=sdg/faked/fixtures'\ + \ && \"$0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + printf "%s" "$0" > "$program_path/ephemeral_component.py" _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" @@ -87,20 +142,10 @@ deploymentSpec: - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ \ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n taxonomy:\ \ dsl.Input[dsl.Dataset],\n sdg: dsl.Output[dsl.Dataset],\n repo_branch:\ - \ Optional[str],\n repo_pr: Optional[int],\n):\n import openai\n \ - \ from instructlab.sdg import generate_data\n from instructlab.sdg.utils.taxonomy\ - \ import read_taxonomy\n from os import getenv\n\n api_key = getenv(\"\ - api_key\")\n model = getenv(\"model\")\n endpoint = getenv(\"endpoint\"\ - )\n client = openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n \ - \ taxonomy_base = \"main\" if repo_branch or repo_pr else \"empty\"\n\n\ - \ print(\"Generating syntetic dataset for:\")\n print()\n print(read_taxonomy(taxonomy.path,\ - \ taxonomy_base))\n\n # generate_data has a magic word for its taxonomy_base\ - \ argument - `empty`\n # it allows generating from the whole repo, see:\n\ - \ # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ - \ generate_data(\n client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\ - \ output_dir=sdg.path,\n taxonomy=taxonomy.path,\n \ - \ taxonomy_base=taxonomy_base,\n 
model_name=model,\n )\n\n" - image: quay.io/tcoufal/ilab-sdg:latest + \ Optional[str],\n repo_pr: Optional[int],\n):\n import sys\n from\ + \ pathlib import Path\n import shutil\n\n shutil.copytree(Path(sys.prefix)\ + \ / \"sdg_fixtures\", sdg.path, dirs_exist_ok=True)\n return\n\n" + image: registry.access.redhat.com/ubi9/python-311:latest pipelineInfo: description: InstructLab pipeline displayName: InstructLab @@ -113,6 +158,8 @@ root: enableCache: true componentRef: name: comp-git-clone-op + dependentTasks: + - preflight-check-op inputs: parameters: repo_branch: @@ -123,6 +170,19 @@ root: componentInputParameter: repo_url taskInfo: name: git-clone-op + preflight-check-op: + cachingOptions: + enableCache: true + componentRef: + name: comp-preflight-check-op + inputs: + parameters: + repo_branch: + componentInputParameter: repo_branch + repo_pr: + componentInputParameter: repo_pr + taskInfo: + name: preflight-check-op sdg-op: cachingOptions: enableCache: true @@ -168,6 +228,19 @@ platforms: kubernetes: deploymentSpec: executors: + exec-preflight-check-op: + configMapAsEnv: + - configMapName: kfp-model-server + keyToEnv: + - configMapKey: endpoint + envVar: endpoint + - configMapKey: model + envVar: model + secretAsEnv: + - keyToEnv: + - envVar: api_key + secretKey: api_key + secretName: kfp-model-server exec-sdg-op: configMapAsEnv: - configMapName: kfp-model-server diff --git a/sdg/__init__.py b/sdg/__init__.py index 095fcb9..fac2005 100644 --- a/sdg/__init__.py +++ b/sdg/__init__.py @@ -1,4 +1,4 @@ -from .components import git_clone_op, sdg_op +from .components import preflight_check_op, git_clone_op, sdg_op from . 
import faked -__all__ = ["git_clone_op", "sdg_op", "faked"] +__all__ = ["preflight_check_op", "git_clone_op", "sdg_op", "faked"] diff --git a/sdg/components.py b/sdg/components.py index 2f716ec..460cf8d 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -5,6 +5,26 @@ IMAGE = "quay.io/tcoufal/ilab-sdg:latest" +@dsl.component(base_image=IMAGE) +def preflight_check_op( + repo_branch: str, + repo_pr: Optional[int], +): + from os import getenv + + if (not repo_branch) and (repo_pr is None or repo_pr <= 0 ): + raise Exception("Both taxonomy repo branch and taxonomy pull request number cannot be empty") + api_key = getenv("api_key") + model = getenv("model") + endpoint = getenv("endpoint") + + if not api_key: + raise Exception("Model Server Auth Key is missing in kfp-model-server secret") + if not model: + raise Exception("Model name is missing in kfp-model-server configMap") + if not endpoint: + raise Exception("Model Server endpoint URL is missing in kfp-model-server configMap") + @dsl.container_component def git_clone_op( taxonomy: dsl.Output[dsl.Dataset], diff --git a/sdg/faked/__init__.py b/sdg/faked/__init__.py index 7559c2d..c1db169 100644 --- a/sdg/faked/__init__.py +++ b/sdg/faked/__init__.py @@ -1,3 +1,3 @@ -from .components import git_clone_op, sdg_op +from .components import preflight_check_op, git_clone_op, sdg_op -__all__ = ["git_clone_op", "sdg_op"] +__all__ = ["preflight_check_op", "git_clone_op", "sdg_op"] diff --git a/sdg/faked/components.py b/sdg/faked/components.py index 7d45dd6..d4532bf 100644 --- a/sdg/faked/components.py +++ b/sdg/faked/components.py @@ -5,6 +5,13 @@ IMAGE = "registry.access.redhat.com/ubi9/python-311:latest" +@dsl.component(base_image=IMAGE) +def preflight_check_op( + repo_branch: str, + repo_pr: Optional[int], +): + pass + @dsl.component(base_image=IMAGE) def git_clone_op( taxonomy: dsl.Output[dsl.Dataset],