Skip to content

Commit

Permalink
renaming folder to storage initializer
Browse files Browse the repository at this point in the history
  • Loading branch information
deepanker13 committed Jan 10, 2024
1 parent 0de6aa4 commit 4488d02
Show file tree
Hide file tree
Showing 13 changed files with 25 additions and 49 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/publish-core-images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,6 @@ jobs:
dockerfile: build/images/training-operator/Dockerfile
- component-name: kubectl-delivery
dockerfile: build/images/kubectl-delivery/Dockerfile
- component-name: storage-initializer
dockerfile: sdk/python/kubeflow/storage_initializer/Dockerfile
context: sdk/python/kubeflow/storage_initializer
1 change: 0 additions & 1 deletion sdk/python/kubeflow/storage_init_container/constants.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ FROM python:3.11
WORKDIR /app

# Copy the Python package and its source code into the container
COPY . /app/storage_init_container
COPY . /app/storage_initializer

# Copy the requirements.txt file into the container
COPY requirements.txt /app/requirements.txt
Expand All @@ -14,4 +14,4 @@ COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Run storage.py when the container launches
ENTRYPOINT ["python", "-m", "storage_init_container.storage"]
ENTRYPOINT ["python", "-m", "storage_initializer.storage"]
3 changes: 3 additions & 0 deletions sdk/python/kubeflow/storage_initializer/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Mount point shared by the storage-initializer init container and the trainer.
INIT_CONTAINER_MOUNT_PATH = "/workspace"

# Sub-paths under the shared mount where the dataset and the model are staged.
VOLUME_PATH_DATASET = f"{INIT_CONTAINER_MOUNT_PATH}/dataset"
VOLUME_PATH_MODEL = f"{INIT_CONTAINER_MOUNT_PATH}/model"
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from urllib.parse import urlparse
import json, os
from typing import Union
from .constants import INIT_CONTAINER_MOUNT_PATH
from .constants import VOLUME_PATH_DATASET, VOLUME_PATH_MODEL
from .abstract_model_provider import modelProvider
from .abstract_dataset_provider import datasetProvider

Expand All @@ -24,21 +24,12 @@ class HuggingFaceModelParams:
model_uri: str
transformer_type: TRANSFORMER_TYPES
access_token: str = None
download_dir: str = field(default=os.path.join(INIT_CONTAINER_MOUNT_PATH, "models"))

def __post_init__(self):
# Custom checks or validations can be added here
if self.model_uri == "" or self.model_uri is None:
raise ValueError("model_uri cannot be empty.")

@property
def download_dir(self):
return self.download_dir

@download_dir.setter
def download_dir(self, value):
raise AttributeError("Cannot modify read-only field 'download_dir'")


@dataclass
class HuggingFaceTrainParams:
Expand All @@ -62,35 +53,24 @@ def download_model_and_tokenizer(self):
transformer_type_class.from_pretrained(
self.model,
token=self.config.access_token,
cache_dir=self.config.download_dir,
cache_dir=VOLUME_PATH_MODEL,
trust_remote_code=True,
)
transformers.AutoTokenizer.from_pretrained(
self.model, cache_dir=self.config.download_dir
self.model, cache_dir=VOLUME_PATH_MODEL
)


@dataclass
class HfDatasetParams:
repo_id: str
access_token: str = None
download_dir: str = field(
default=os.path.join(INIT_CONTAINER_MOUNT_PATH, "datasets")
)

def __post_init__(self):
# Custom checks or validations can be added here
if self.repo_id == "" or self.repo_id is None:
raise ValueError("repo_id is None")

@property
def download_dir(self):
return self.download_dir

@download_dir.setter
def download_dir(self, value):
raise AttributeError("Cannot modify read-only field 'download_dir'")


class HuggingFaceDataset(datasetProvider):
def load_config(self, serialised_args):
Expand All @@ -104,4 +84,4 @@ def download_dataset(self):
if self.config.access_token:
huggingface_hub.login(self.config.access_token)

load_dataset(self.config.repo_id, cache_dir=self.config.download_dir)
load_dataset(self.config.repo_id, cache_dir=VOLUME_PATH_DATASET)
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import boto3
from urllib.parse import urlparse
from .abstract_dataset_provider import datasetProvider
from .constants import INIT_CONTAINER_MOUNT_PATH
from .constants import VOLUME_PATH_DATASET


@dataclass
Expand All @@ -14,9 +14,6 @@ class S3DatasetParams:
region_name: str = None
access_key: str = None
secret_key: str = None
download_dir: str = field(
default=os.path.join(INIT_CONTAINER_MOUNT_PATH, "datasets")
)

def is_valid_url(self, url):
try:
Expand All @@ -36,14 +33,6 @@ def __post_init__(self):
raise ValueError("bucket_name or endpoint_url or file_key is None")
self.is_valid_url(self.endpoint_url)

@property
def download_dir(self):
return self.download_dir

@download_dir.setter
def download_dir(self, value):
raise AttributeError("Cannot modify read-only field 'download_dir'")


class S3(datasetProvider):
def load_config(self, serialised_args):
Expand All @@ -63,6 +52,6 @@ def download_dataset(self):
s3_client.download_file(
self.config.bucket_name,
self.config.file_key,
os.path.join(self.config.download_dir, self.config.file_key),
os.path.join(VOLUME_PATH_DATASET, self.config.file_key),
)
print(f"File downloaded to: {self.config.download_dir}")
print(f"File downloaded to: {VOLUME_PATH_DATASET}")
16 changes: 10 additions & 6 deletions sdk/python/kubeflow/training/api/training_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@
from kubeflow.training.api_client import ApiClient
from kubeflow.training.constants import constants
from kubeflow.training.utils import utils
from kubeflow.storage_init_container.constants import INIT_CONTAINER_MOUNT_PATH
from kubeflow.storage_initializer.constants import (
INIT_CONTAINER_MOUNT_PATH,
VOLUME_PATH_DATASET,
VOLUME_PATH_MODEL,
)


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -116,8 +120,8 @@ def train(
print(
"train api dependencies not installed. Run pip install -U 'kubeflow-training[huggingface]' "
)
from kubeflow.storage_init_container.s3 import S3DatasetParams
from kubeflow.storage_init_container.hugging_face import (
from kubeflow.storage_initializer.s3 import S3DatasetParams
from kubeflow.storage_initializer.hugging_face import (
HuggingFaceModelParams,
HuggingFaceTrainParams,
HfDatasetParams,
Expand Down Expand Up @@ -209,9 +213,9 @@ def train(
"--transformer_type",
model_provider_parameters.transformer_type.__class__.__name__,
"--model_dir",
model_provider_parameters.download_dir,
VOLUME_PATH_MODEL,
"--dataset_dir",
dataset_provider_parameters.download_dir,
VOLUME_PATH_DATASET,
"--dataset_name",
dataset_provider_parameters.repo_id,
"--lora_config",
Expand All @@ -222,7 +226,7 @@ def train(
volume_mounts=[
models.V1VolumeMount(
name=constants.TRAINER_PV,
mount_path=constants.TRAINER_CONTAINER_MOUNT_PATH,
mount_path=INIT_CONTAINER_MOUNT_PATH,
)
],
resources=resources_per_worker,
Expand Down
2 changes: 0 additions & 2 deletions sdk/python/kubeflow/training/constants/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,5 +168,3 @@
models.KubeflowOrgV1MPIJob,
models.KubeflowOrgV1PaddleJob,
]

TRAINER_CONTAINER_MOUNT_PATH = "/workspace"

0 comments on commit 4488d02

Please sign in to comment.