Cherry pick fixes and features (#164)

* Support ms swift * fix * Upgrade cuda version * fix branch mapping error * fix resource display issue * use graceful deletion * bug fix --------- Co-authored-by: James <xzgan@opencsg.com>
OpenCSGs · Nov 5, 2024 · b563d97 · b563d97
1 parent 71d236b
commit b563d97
Show file tree

Hide file tree

Showing 26 changed files with 1,316 additions and 35 deletions.
diff --git a/api/handler/model.go b/api/handler/model.go
@@ -277,6 +277,10 @@ func (h *ModelHandler) SDKModelInfo(ctx *gin.Context) {
 		return
 	}
 	ref := ctx.Param("ref")
+	mappedBranch := ctx.Param("branch_mapped")
+	if mappedBranch != "" {
+		ref = mappedBranch
+	}
 	currentUser := httpbase.GetCurrentUser(ctx)
 	modelInfo, err := h.c.SDKModelInfo(ctx, namespace, name, ref, currentUser)
 	if err != nil {

diff --git a/api/handler/repo.go b/api/handler/repo.go
@@ -731,6 +731,10 @@ func (h *RepoHandler) SDKListFiles(ctx *gin.Context) {
 		return
 	}
 	ref := ctx.Param("ref")
+	mappedBranch := ctx.Param("branch_mapped")
+	if mappedBranch != "" {
+		ref = mappedBranch
+	}
 	files, err := h.c.SDKListFiles(ctx, common.RepoTypeFromContext(ctx), namespace, name, ref, currentUser)
 	if err != nil {
 		if errors.Is(err, component.ErrUnauthorized) {
@@ -786,6 +790,10 @@ func (h *RepoHandler) HeadSDKDownload(ctx *gin.Context) {
 	filePath := ctx.Param("file_path")
 	filePath = convertFilePathFromRoute(filePath)
 	branch := ctx.Param("branch")
+	mappedBranch := ctx.Param("branch_mapped")
+	if mappedBranch != "" {
+		branch = mappedBranch
+	}
 	req := &types.GetFileReq{
 		Namespace: namespace,
 		Name:      name,
@@ -855,6 +863,10 @@ func (h *RepoHandler) handleDownload(ctx *gin.Context, isResolve bool) {
 		ctx.Set("X-OPENCSG-S3-Internal", true)
 	}
 
+	mappedBranch := ctx.Param("branch_mapped")
+	if mappedBranch != "" {
+		branch = mappedBranch
+	}
 	req := &types.GetFileReq{
 		Namespace: namespace,
 		Name:      name,

diff --git a/api/middleware/repo.go b/api/middleware/repo.go
@@ -25,6 +25,10 @@ func RepoMapping(repo_type types.RepositoryType) gin.HandlerFunc {
 		common.SetRepoTypeContext(ctx, repo_type)
 		namespace := ctx.Param("namespace")
 		name := ctx.Param("name")
+		branch := ctx.Param("branch")
+		if branch == "" {
+			branch = ctx.Param("ref")
+		}
 		mapping := GetMapping(ctx)
 		if mapping == types.CSGHubMapping {
 			ctx.Next()
@@ -38,6 +42,10 @@ func RepoMapping(repo_type types.RepositoryType) gin.HandlerFunc {
 			slog.Info("namespace changed: ", "namespace", repo_id[0])
 			ctx.Set("namespace_mapped", repo_id[0])
 			ctx.Set("name_mapped", repo_id[1])
+			// for modelscope, the default branch is master, so we should map it to the repository's real default branch
+			if (branch == "main" || branch == "master") && mirror.Repository.DefaultBranch != branch {
+				ctx.Set("branch_mapped", mirror.Repository.DefaultBranch)
+			}
 			ctx.Next()
 			return
 		}

diff --git a/builder/deploy/cluster/cluster_manager.go b/builder/deploy/cluster/cluster_manager.go
@@ -188,9 +188,13 @@ func GetNodeResources(clientset *kubernetes.Clientset, config *config.Config) (m
 
 func getXPULabel(node v1.Node, config *config.Config) (string, string) {
 	if _, found := node.Labels[config.Space.GPUModelLabel]; found {
-		//for default clsuter
+		//for default cluster
 		return "nvidia.com/gpu", config.Space.GPUModelLabel
 	}
+	if _, found := node.Labels["nvidia.com/nvidia_name"]; found {
+		//for k3s cluster
+		return "nvidia.com/gpu", "nvidia.com/nvidia_name"
+	}
 	if _, found := node.Labels["kubemore_xpu_type"]; found {
 		//for huawei gpu
 		return "huawei.com/Ascend910", "kubemore_xpu_type"

diff --git a/builder/deploy/scheduler/deploy_runner.go b/builder/deploy/scheduler/deploy_runner.go
@@ -271,6 +271,7 @@ func (t *DeployRunner) makeDeployRequest() (*types.RunRequest, error) {
 		envMap["port"] = strconv.Itoa(deploy.ContainerPort)
 		envMap["HF_ENDPOINT"], _ = url.JoinPath(t.deployCfg.ModelDownloadEndpoint, "hf")
 		envMap["HF_TOKEN"] = token.Token
+		envMap["USE_CSGHUB_MODEL"] = "1"
 	}
 
 	if t.deployCfg.PublicRootDomain == "" {

diff --git a/builder/store/database/migrations/20241018113252_init_swift_runtime_framework.down.sql b/builder/store/database/migrations/20241018113252_init_swift_runtime_framework.down.sql
diff --git a/builder/store/database/migrations/20241018113252_init_swift_runtime_framework.up.sql b/builder/store/database/migrations/20241018113252_init_swift_runtime_framework.up.sql
diff --git a/builder/store/database/mirror.go b/builder/store/database/mirror.go
@@ -56,6 +56,15 @@ func (s *MirrorStore) IsExist(ctx context.Context, repoID int64) (exists bool, e
 		Exists(ctx)
 	return
 }
+func (s *MirrorStore) IsRepoExist(ctx context.Context, repoType types.RepositoryType, namespace, name string) (exists bool, err error) {
+	var repo Repository
+	exists, err = s.db.Operator.Core.
+		NewSelect().
+		Model(&repo).
+		Where("git_path=?", fmt.Sprintf("%ss_%s/%s", repoType, namespace, name)).
+		Exists(ctx)
+	return
+}
 
 func (s *MirrorStore) FindByRepoID(ctx context.Context, repoID int64) (*Mirror, error) {
 	var mirror Mirror
@@ -109,10 +118,16 @@ func (s *MirrorStore) FindWithMapping(ctx context.Context, repoType types.Reposi
 			Scan(ctx)
 	} else {
 		// auto mapping
+		// fix: some repo ids have a mirror that is not public, for example: https://opencsg.com/models/Qwen/Qwen_Qwen2-7B-Instruct
+		exist, _ := s.IsRepoExist(ctx, repoType, namespace, name)
+		if exist {
+			// no mapping is needed if the repo id already exists as a repository
+			return nil, fmt.Errorf("repo already exists, no need mapping")
+		}
 		err = s.db.Operator.Core.NewSelect().
 			Model(&mirror).
 			Relation("Repository").
-			Where("LOWER(repository.git_path) = LOWER(?) OR mirror.source_repo_path=?", fmt.Sprintf("%ss_%s/%s", repoType, namespace, name), fmt.Sprintf("%s/%s", namespace, name)).
+			Where("mirror.source_repo_path=?", fmt.Sprintf("%s/%s", namespace, name)).
 			Where("repository.repository_type=?", repoType).
 			Scan(ctx)
 	}

diff --git a/component/repo.go b/component/repo.go
@@ -1959,16 +1959,16 @@ func (c *RepoComponent) DeployDetail(ctx context.Context, detailReq types.Deploy
 func (c *RepoComponent) generateEndpoint(ctx context.Context, deploy *database.Deploy) (string, string) {
 	var endpoint string
 	provider := ""
+	cls, err := c.cluster.ByClusterID(ctx, deploy.ClusterID)
+	zone := ""
+	if err != nil {
+		slog.Warn("Get cluster with error", slog.Any("error", err))
+	} else {
+		zone = cls.Zone
+		provider = cls.Provider
+	}
 	if len(deploy.SvcName) > 0 && deploy.Status == deployStatus.Running {
 		// todo: zone.provider.endpoint to support multi-zone, multi-provider
-		cls, err := c.cluster.ByClusterID(ctx, deploy.ClusterID)
-		zone := ""
-		if err != nil {
-			slog.Warn("Get cluster with error", slog.Any("error", err))
-		} else {
-			zone = cls.Zone
-			provider = cls.Provider
-		}
 		regionDomain := ""
 		if len(zone) > 0 && len(provider) > 0 {
 			regionDomain = fmt.Sprintf(".%s.%s", zone, provider)

diff --git a/docker/finetune/Dockerfile.ms-swift b/docker/finetune/Dockerfile.ms-swift
@@ -0,0 +1,60 @@
+# ms-swift fine-tuning image: CUDA devel base plus JupyterLab, ms-swift, vLLM
+# and a patched gradio wheel, all started through supervisord.
+# pull from devel image instead of base
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
+# Image-wide environment: default shell, proxy path prefixes for
+# JupyterHub/Gradio, timezone, NCCL switches, HF cache dir, non-interactive apt
+ENV SHELL=/bin/bash \
+    JUPYTERHUB_SERVICE_PREFIX=/proxy/ \
+    GRADIO_ROOT_PATH=/proxy/7860/ \
+    TZ=Asia/Shanghai \
+    NCCL_IB_DISABLE=1 NCCL_P2P_DISABLE=1 \
+    HF_HOME=/workspace/.cache \
+    DEBIAN_FRONTEND=noninteractive
+
+# Build with some basic utilities
+RUN apt-get update && apt-get install -y \
+    python3-pip apt-utils \
+    wget curl vim \
+    git git-lfs \
+    supervisor \
+    unzip
+# set timezone
+# `apt-get update` runs in the same layer as the install so that a cached
+# earlier layer can never leave this step with stale package lists. The lists
+# are removed afterwards because no later step uses apt.
+# DEBIAN_FRONTEND is already provided by the ENV above (ENV overrides an ARG of
+# the same name), so no extra ARG is declared here.
+RUN apt-get update && apt-get install -y tzdata \
+    && ln -snf /usr/share/zoneinfo/$TZ /etc/localtime \
+    && echo $TZ > /etc/timezone \
+    && dpkg-reconfigure -f noninteractive tzdata \
+    && rm -rf /var/lib/apt/lists/*
+
+# alias python='python3' (-f keeps the step from failing if the link already exists)
+RUN ln -sf /usr/bin/python3 /usr/bin/python
+RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+
+# Install the appropriate torch version
+#RUN pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121
+RUN pip install --no-cache-dir jupyterlab numpy==1.26.4 \
+    torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 \
+    jupyter-server-proxy==4.4.0 deepspeed \
+    gradio-client==1.4.0
+# Check out ms-swift under /etc/csghub and install it in editable mode
+WORKDIR /etc/csghub
+#RUN git clone https://github.com/modelscope/ms-swift.git --branch v2.5.0 --single-branch
+RUN git clone https://gitee.com/xzgan/ms-swift.git --branch v2.5.0 --single-branch
+RUN cd ms-swift && pip install --no-cache-dir -e ".[llm]"
+# these libraries are updated frequently, so they get their own layer
+RUN pip install --no-cache-dir vllm==v0.6.3.post1 transformers==4.45.2 timm==1.0.11 evalscope==0.5.5
+# setup supervisord
+RUN mkdir -p /var/log/supervisord
+COPY swift/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
+COPY swift/jupyter_notebook_config.py /root/.jupyter/jupyter_notebook_config.py
+COPY swift/ /etc/csghub/
+RUN chmod +x /etc/csghub/*.sh
+# use dark mode and show line numbers in code cells
+RUN mkdir -p /root/.jupyter/lab/user-settings/@jupyterlab/apputils-extension && \
+    echo '{"theme":"JupyterLab Dark"}' > /root/.jupyter/lab/user-settings/@jupyterlab/apputils-extension/themes.jupyterlab-settings && \
+    mkdir -p /root/.jupyter/lab/user-settings/@jupyterlab/notebook-extension && \
+    echo '{"codeCellConfig":{"lineNumbers":true }}' > /root/.jupyter/lab/user-settings/@jupyterlab/notebook-extension/tracker.jupyterlab-settings
+# fix gradio proxy issue: replace the installed gradio with the patched wheel
+RUN pip uninstall -y gradio && pip install https://opencsg-public-resource.oss-cn-beijing.aliyuncs.com/csghub/gradio/gradio-5.1.0-py3-none-any.whl
+
+# Default working directory at container start
+WORKDIR /workspace/
+ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
+EXPOSE 8000
diff --git a/docker/finetune/README.md b/docker/finetune/README.md
@@ -22,15 +22,38 @@ docker buildx build --platform linux/amd64,linux/arm64 \
   -f Dockerfile.llamafactory \
   --push .
 ```
+
+## Build Multi-Platform Images for swift
+```bash
+export BUILDX_NO_DEFAULT_ATTESTATIONS=1
+export IMAGE_TAG=1.0-cuda12.1-devel-ubuntu22.04-py310-torch2.4.0
+docker buildx build --platform linux/amd64,linux/arm64 \
+  -t ${OPENCSG_ACR}/public/ms-swift:${IMAGE_TAG} \
+  -t ${OPENCSG_ACR}/public/ms-swift:latest \
+  -f Dockerfile.ms-swift \
+  --push .
+```
 *Note: The above command will create `linux/amd64` and `linux/arm64` images with the tags `${IMAGE_TAG}` and `latest` at the same time.*
 
+## Build the Gradio Wheel
+```
+1. Build the gradio base image, or pull it from opencsg-registry.cn-beijing.cr.aliyuncs.com/public/gradio-build-base:1.0
+2. Start a container from it: docker run -itd base_image
+3. Download the gradio source: git clone https://gitee.com/xzgan/gradio.git --branch 5.1.0 --single-branch
+4. Build the frontend JS: bash scripts/build_frontend.sh
+5. Build the wheel: python3 -m build -w
+6. Check the whl file in the dist folder and upload it to https://git-devops.opencsg.com/opensource/gradio/
+```
+
+
 ## Run Finetune Image Locally
 ```bash
 docker run -d \
-  -e ACCESS_TOKEN=xxx \
+  --gpus device=7 \
+  -e HF_TOKEN=xxx \
   -e REPO_ID="OpenCSG/csg-wukong-1B" \
-  -e HF_ENDPOINT=https://opencsg.com/hf \
-  -p 8000:8000 \
+  -e HF_ENDPOINT=https://hub.opencsg.com/hf \
+  -p 30148:8000 \
   ${OPENCSG_ACR}/public/llama-factory:${IMAGE_TAG}
 ```
 *Note: HF_ENDPOINT should be set to the real CSGHub address.*

diff --git a/docker/finetune/swift/jupyter_notebook_config.py b/docker/finetune/swift/jupyter_notebook_config.py
@@ -0,0 +1,41 @@
+import os
+
+c.ServerApp.ip = '0.0.0.0'
+c.ServerApp.token = ""
+c.ServerApp.open_browser = False
+c.ServerApp.allow_root = True
+c.ServerApp.port_retries = 0
+c.ServerApp.quit_button = False
+c.ServerApp.allow_remote_access = True
+c.ServerApp.disable_check_xsrf = True
+c.ServerApp.allow_origin = '*'
+c.ServerApp.trust_xheaders = True
+c.ServerApp.open_browser = False
+c.ServerApp.answer_yes = True
+c.ServerApp.tornado_settings = {
+    "headers": {
+        "Content-Security-Policy": "frame-ancestors \'self\' *"
+    }
+}
+
+# c.ServerApp.base_url = context_path
+
+# opt-in the async version to file handler and checkpoints
+c.ServerApp.checkpoints_class = "jupyter_server.services.contents.checkpoints.AsyncCheckpoints"
+
+# Do not delete files to trash: https://github.com/jupyter/notebook/issues/3130
+c.FileContentsManager.delete_to_trash = False
+
+c.ContentsManager.allow_hidden = True
+
+# improve the performance of autocompletion, disable Jedi in IPython (the LSP servers for Python use Jedi too)
+c.Completer.use_jedi = False
+
+# https://forums.fast.ai/t/jupyter-notebook-enhancements-tips-and-tricks/17064/22
+c.NotebookApp.iopub_msg_rate_limit = 100000000
+c.NotebookApp.iopub_data_rate_limit = 2147483647
+
+# inject proxy js (it is hack)
+
+# c.ServerProxy['non_service_rewrite_response'] = [proxy_local_server]
+c.FileContentsManager.always_delete_dir = True
diff --git a/docker/finetune/swift/mem_monitor.sh b/docker/finetune/swift/mem_monitor.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+while true; do
+  if test -f "/sys/fs/cgroup/cpu.max"; then
+    max_memory=$(cat /sys/fs/cgroup/memory.max)
+    current_memory=$(cat /sys/fs/cgroup/memory.current)
+  fi
+
+  if test -f "/sys/fs/cgroup/memory/memory.limit_in_bytes"; then
+    max_memory=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)
+    MEMORY_STAT_PATH="/sys/fs/cgroup/memory/memory.stat"
+    current_memory=$(awk '$1 == "rss" {print $2}' $MEMORY_STAT_PATH)
+  fi
+
+  if [ "${max_memory}" == "max" ]; then
+    sleep 86400
+    continue
+  fi
+  # reserve 200M
+  threshold=209715200
+  less_max_memory=$((max_memory - threshold))
+  if [ "$current_memory" -gt "$less_max_memory" ]; then
+    # Get the PID of the process with the highest memory usage
+    pid=$(ps -eo pid,%mem --sort=-%mem | awk 'NR==2 {print $1}')
+
+    # Kill the process
+    kill "$pid"
+    echo "Process with PID $pid killed due to memory exceeding the limit."
+  fi
+
+  sleep 10
+done