Skip to content

Commit

Permalink
Cherry pick fixes and features (#164)
Browse files Browse the repository at this point in the history
* Support ms swift

* fix

* Upgrade cuda version

* fix branch mapping error

* fix resource display issue

* use graceful deletion

* bug fix

---------

Co-authored-by: James <xzgan@opencsg.com>
  • Loading branch information
ganisback and James authored Nov 5, 2024
1 parent 71d236b commit b563d97
Show file tree
Hide file tree
Showing 26 changed files with 1,316 additions and 35 deletions.
4 changes: 4 additions & 0 deletions api/handler/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@ func (h *ModelHandler) SDKModelInfo(ctx *gin.Context) {
return
}
ref := ctx.Param("ref")
mappedBranch := ctx.Param("branch_mapped")
if mappedBranch != "" {
ref = mappedBranch
}
currentUser := httpbase.GetCurrentUser(ctx)
modelInfo, err := h.c.SDKModelInfo(ctx, namespace, name, ref, currentUser)
if err != nil {
Expand Down
12 changes: 12 additions & 0 deletions api/handler/repo.go
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,10 @@ func (h *RepoHandler) SDKListFiles(ctx *gin.Context) {
return
}
ref := ctx.Param("ref")
mappedBranch := ctx.Param("branch_mapped")
if mappedBranch != "" {
ref = mappedBranch
}
files, err := h.c.SDKListFiles(ctx, common.RepoTypeFromContext(ctx), namespace, name, ref, currentUser)
if err != nil {
if errors.Is(err, component.ErrUnauthorized) {
Expand Down Expand Up @@ -786,6 +790,10 @@ func (h *RepoHandler) HeadSDKDownload(ctx *gin.Context) {
filePath := ctx.Param("file_path")
filePath = convertFilePathFromRoute(filePath)
branch := ctx.Param("branch")
mappedBranch := ctx.Param("branch_mapped")
if mappedBranch != "" {
branch = mappedBranch
}
req := &types.GetFileReq{
Namespace: namespace,
Name: name,
Expand Down Expand Up @@ -855,6 +863,10 @@ func (h *RepoHandler) handleDownload(ctx *gin.Context, isResolve bool) {
ctx.Set("X-OPENCSG-S3-Internal", true)
}

mappedBranch := ctx.Param("branch_mapped")
if mappedBranch != "" {
branch = mappedBranch
}
req := &types.GetFileReq{
Namespace: namespace,
Name: name,
Expand Down
8 changes: 8 additions & 0 deletions api/middleware/repo.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ func RepoMapping(repo_type types.RepositoryType) gin.HandlerFunc {
common.SetRepoTypeContext(ctx, repo_type)
namespace := ctx.Param("namespace")
name := ctx.Param("name")
branch := ctx.Param("branch")
if branch == "" {
branch = ctx.Param("ref")
}
mapping := GetMapping(ctx)
if mapping == types.CSGHubMapping {
ctx.Next()
Expand All @@ -38,6 +42,10 @@ func RepoMapping(repo_type types.RepositoryType) gin.HandlerFunc {
slog.Info("namespace changed: ", "namespace", repo_id[0])
ctx.Set("namespace_mapped", repo_id[0])
ctx.Set("name_mapped", repo_id[1])
// for modelscope, the default branch is master, so we should map it to the real default branch
if (branch == "main" || branch == "master") && mirror.Repository.DefaultBranch != branch {
ctx.Set("branch_mapped", mirror.Repository.DefaultBranch)
}
ctx.Next()
return
}
Expand Down
6 changes: 5 additions & 1 deletion builder/deploy/cluster/cluster_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,9 +188,13 @@ func GetNodeResources(clientset *kubernetes.Clientset, config *config.Config) (m

func getXPULabel(node v1.Node, config *config.Config) (string, string) {
if _, found := node.Labels[config.Space.GPUModelLabel]; found {
//for default clsuter
//for default cluster
return "nvidia.com/gpu", config.Space.GPUModelLabel
}
if _, found := node.Labels["nvidia.com/nvidia_name"]; found {
//for k3s cluster
return "nvidia.com/gpu", "nvidia.com/nvidia_name"
}
if _, found := node.Labels["kubemore_xpu_type"]; found {
//for huawei gpu
return "huawei.com/Ascend910", "kubemore_xpu_type"
Expand Down
1 change: 1 addition & 0 deletions builder/deploy/scheduler/deploy_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ func (t *DeployRunner) makeDeployRequest() (*types.RunRequest, error) {
envMap["port"] = strconv.Itoa(deploy.ContainerPort)
envMap["HF_ENDPOINT"], _ = url.JoinPath(t.deployCfg.ModelDownloadEndpoint, "hf")
envMap["HF_TOKEN"] = token.Token
envMap["USE_CSGHUB_MODEL"] = "1"
}

if t.deployCfg.PublicRootDomain == "" {
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

17 changes: 16 additions & 1 deletion builder/store/database/mirror.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,15 @@ func (s *MirrorStore) IsExist(ctx context.Context, repoID int64) (exists bool, e
Exists(ctx)
return
}
// IsRepoExist reports whether a repository row exists whose git_path equals
// "<repoType>s_<namespace>/<name>".
func (s *MirrorStore) IsRepoExist(ctx context.Context, repoType types.RepositoryType, namespace, name string) (exists bool, err error) {
	gitPath := fmt.Sprintf("%ss_%s/%s", repoType, namespace, name)

	var repo Repository
	query := s.db.Operator.Core.
		NewSelect().
		Model(&repo).
		Where("git_path=?", gitPath)

	return query.Exists(ctx)
}

func (s *MirrorStore) FindByRepoID(ctx context.Context, repoID int64) (*Mirror, error) {
var mirror Mirror
Expand Down Expand Up @@ -109,10 +118,16 @@ func (s *MirrorStore) FindWithMapping(ctx context.Context, repoType types.Reposi
Scan(ctx)
} else {
// auto mapping
// fix: some repo IDs have a mirror that is not public, for example: https://opencsg.com/models/Qwen/Qwen_Qwen2-7B-Instruct
exist, _ := s.IsRepoExist(ctx, repoType, namespace, name)
if exist {
// no mapping is needed if the repo ID already exists in the repository table
return nil, fmt.Errorf("repo already exists, no need mapping")
}
err = s.db.Operator.Core.NewSelect().
Model(&mirror).
Relation("Repository").
Where("LOWER(repository.git_path) = LOWER(?) OR mirror.source_repo_path=?", fmt.Sprintf("%ss_%s/%s", repoType, namespace, name), fmt.Sprintf("%s/%s", namespace, name)).
Where("mirror.source_repo_path=?", fmt.Sprintf("%s/%s", namespace, name)).
Where("repository.repository_type=?", repoType).
Scan(ctx)
}
Expand Down
16 changes: 8 additions & 8 deletions component/repo.go
Original file line number Diff line number Diff line change
Expand Up @@ -1959,16 +1959,16 @@ func (c *RepoComponent) DeployDetail(ctx context.Context, detailReq types.Deploy
func (c *RepoComponent) generateEndpoint(ctx context.Context, deploy *database.Deploy) (string, string) {
var endpoint string
provider := ""
cls, err := c.cluster.ByClusterID(ctx, deploy.ClusterID)
zone := ""
if err != nil {
slog.Warn("Get cluster with error", slog.Any("error", err))
} else {
zone = cls.Zone
provider = cls.Provider
}
if len(deploy.SvcName) > 0 && deploy.Status == deployStatus.Running {
// todo: zone.provider.endpoint to support multi-zone, multi-provider
cls, err := c.cluster.ByClusterID(ctx, deploy.ClusterID)
zone := ""
if err != nil {
slog.Warn("Get cluster with error", slog.Any("error", err))
} else {
zone = cls.Zone
provider = cls.Provider
}
regionDomain := ""
if len(zone) > 0 && len(provider) > 0 {
regionDomain = fmt.Sprintf(".%s.%s", zone, provider)
Expand Down
60 changes: 60 additions & 0 deletions docker/finetune/Dockerfile.ms-swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Fine-tuning image for ms-swift: CUDA 12.1 devel base with JupyterLab,
# PyTorch 2.4.0 and ms-swift v2.5.0, started through supervisord.

# Pull from the devel image instead of the base image.
FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
# Image-wide environment: bash as the default shell, the proxy path prefixes
# used for JupyterHub and Gradio, the timezone, NCCL transport switches, the
# Hugging Face cache location, and non-interactive apt.
ENV SHELL=/bin/bash \
JUPYTERHUB_SERVICE_PREFIX=/proxy/ \
GRADIO_ROOT_PATH=/proxy/7860/ \
TZ=Asia/Shanghai \
NCCL_IB_DISABLE=1 NCCL_P2P_DISABLE=1 \
HF_HOME=/workspace/.cache \
DEBIAN_FRONTEND=noninteractive

# Install basic utilities (pip, download tools, git with LFS, supervisor).
RUN apt-get update && apt-get install -y \
python3-pip apt-utils \
wget curl vim \
git git-lfs \
supervisor \
unzip
# Set the system timezone from $TZ.
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get install -y tzdata \
&& ln -snf /usr/share/zoneinfo/$TZ /etc/localtime \
&& echo $TZ > /etc/timezone \
&& dpkg-reconfigure -f noninteractive tzdata

# Make `python` resolve to python3, and use the Tsinghua PyPI mirror for pip.
RUN ln -s /usr/bin/python3 /usr/bin/python
RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

# Install JupyterLab and the pinned torch stack.
#RUN pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121
RUN pip install --no-cache-dir jupyterlab numpy==1.26.4 \
torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 \
jupyter-server-proxy==4.4.0 deepspeed \
gradio-client==1.4.0
# Directory that holds ms-swift and the csghub helper scripts.
WORKDIR /etc/csghub
#RUN git clone https://github.com/modelscope/ms-swift.git --branch v2.5.0 --single-branch
RUN git clone https://gitee.com/xzgan/ms-swift.git --branch v2.5.0 --single-branch
RUN cd ms-swift && pip install --no-cache-dir -e ".[llm]"
# These libraries are updated frequently, so they are installed in their own
# layer.
RUN pip install --no-cache-dir vllm==v0.6.3.post1 transformers==4.45.2 timm==1.0.11 evalscope==0.5.5
# Set up supervisord: log directory, its config, the Jupyter config, and the
# helper scripts from swift/.
RUN mkdir -p /var/log/supervisord
COPY swift/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
COPY swift/jupyter_notebook_config.py /root/.jupyter/jupyter_notebook_config.py
COPY swift/ /etc/csghub/
RUN chmod +x /etc/csghub/*.sh
# Default JupyterLab user settings: dark theme and line numbers in code cells.
RUN mkdir -p /root/.jupyter/lab/user-settings/@jupyterlab/apputils-extension && \
echo '{"theme":"JupyterLab Dark"}' > /root/.jupyter/lab/user-settings/@jupyterlab/apputils-extension/themes.jupyterlab-settings && \
mkdir -p /root/.jupyter/lab/user-settings/@jupyterlab/notebook-extension && \
echo '{"codeCellConfig":{"lineNumbers":true }}' > /root/.jupyter/lab/user-settings/@jupyterlab/notebook-extension/tracker.jupyterlab-settings
# Fix the gradio proxy issue by replacing gradio with the patched 5.1.0 wheel.
RUN pip uninstall -y gradio && pip install https://opencsg-public-resource.oss-cn-beijing.aliyuncs.com/csghub/gradio/gradio-5.1.0-py3-none-any.whl

# Default working directory of the running container.
WORKDIR /workspace/
ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
EXPOSE 8000
29 changes: 26 additions & 3 deletions docker/finetune/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,38 @@ docker buildx build --platform linux/amd64,linux/arm64 \
-f Dockerfile.llamafactory \
--push .
```

## Build Multi-Platform Images for swift
```bash
export BUILDX_NO_DEFAULT_ATTESTATIONS=1
export IMAGE_TAG=1.0-cuda12.1-devel-ubuntu22.04-py310-torch2.4.0
docker buildx build --platform linux/amd64,linux/arm64 \
-t ${OPENCSG_ACR}/public/ms-swift:${IMAGE_TAG} \
-t ${OPENCSG_ACR}/public/ms-swift:latest \
-f Dockerfile.ms-swift \
--push .
```
*Note: The above command will create `linux/amd64` and `linux/arm64` images with the tags `${IMAGE_TAG}` and `latest` at the same time.*

## build gradio whl
```
1. build gradio base image or pick from opencsg-registry.cn-beijing.cr.aliyuncs.com/public/gradio-build-base:1.0
2. docker run -itd base_image
3. download gradio resource: git clone https://gitee.com/xzgan/gradio.git --branch 5.1.0 --single-branch
4. build frontend js: bash scripts/build_frontend.sh
5. build whl: python3 -m build -w
6. check whl file in dist folder and upload to https://git-devops.opencsg.com/opensource/gradio/
```


## Run Finetune Image Locally
```bash
docker run -d \
-e ACCESS_TOKEN=xxx \
--gpus device=7 \
-e HF_TOKEN=xxx \
-e REPO_ID="OpenCSG/csg-wukong-1B" \
-e HF_ENDPOINT=https://opencsg.com/hf \
-p 8000:8000 \
-e HF_ENDPOINT=https://hub.opencsg.com/hf \
-p 30148:8000 \
${OPENCSG_ACR}/public/llama-factory:${IMAGE_TAG}
```
*Note: HF_ENDPOINT should be set to the real CSGHub address.*
Expand Down
41 changes: 41 additions & 0 deletions docker/finetune/swift/jupyter_notebook_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Jupyter Server configuration for the ms-swift fine-tuning image.
#
# `c` is the config object injected by the Jupyter (traitlets) config loader;
# it is not defined in this file.
import os

# Listen on all interfaces and never try to open a local browser.
c.ServerApp.ip = '0.0.0.0'
c.ServerApp.open_browser = False
c.ServerApp.allow_root = True
# Fail instead of picking another port when the configured one is taken.
c.ServerApp.port_retries = 0
c.ServerApp.quit_button = False
c.ServerApp.answer_yes = True

# NOTE(review): token auth and the XSRF check are disabled and every origin is
# allowed. Presumably the server is only reachable through an authenticating
# reverse proxy -- confirm before exposing this port directly.
c.ServerApp.token = ""
c.ServerApp.allow_remote_access = True
c.ServerApp.disable_check_xsrf = True
c.ServerApp.allow_origin = '*'
c.ServerApp.trust_xheaders = True

# Allow the UI to be embedded in frames served from other origins.
c.ServerApp.tornado_settings = {
    "headers": {
        "Content-Security-Policy": "frame-ancestors \'self\' *"
    }
}

# c.ServerApp.base_url = context_path

# opt-in the async version to file handler and checkpoints
c.ServerApp.checkpoints_class = "jupyter_server.services.contents.checkpoints.AsyncCheckpoints"

# Do not delete files to trash: https://github.com/jupyter/notebook/issues/3130
c.FileContentsManager.delete_to_trash = False
c.FileContentsManager.always_delete_dir = True

c.ContentsManager.allow_hidden = True

# improve the performance of autocompletion, disable Jedi in IPython (the LSP servers for Python use Jedi too)
c.Completer.use_jedi = False

# Raise the IOPub rate limits so large cell outputs are not dropped.
# https://forums.fast.ai/t/jupyter-notebook-enhancements-tips-and-tricks/17064/22
# Set on ServerApp (not the legacy NotebookApp class) to match the rest of
# this file.
c.ServerApp.iopub_msg_rate_limit = 100000000
c.ServerApp.iopub_data_rate_limit = 2147483647

# inject proxy js (it is hack)
# c.ServerProxy['non_service_rewrite_response'] = [proxy_local_server]
32 changes: 32 additions & 0 deletions docker/finetune/swift/mem_monitor.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
#
# Memory watchdog.
#
# Every 10 seconds this script reads the memory limit and current usage from
# the cgroup filesystem. When usage comes within 200 MiB of the limit it kills
# the process with the highest %MEM (presumably so a single runaway process is
# stopped before the whole container hits its limit -- confirm intent).
# If the cgroup reports no limit ("max"), it sleeps for a day and re-checks.

# reserve 200M
threshold=209715200

while true; do
    # Start each round from a clean state so stale values are never compared.
    max_memory=""
    current_memory=""

    # cgroup v2 layout. Test for the memory controller file that is actually
    # read below (the previous check looked at cpu.max instead).
    if test -f "/sys/fs/cgroup/memory.max"; then
        max_memory=$(cat /sys/fs/cgroup/memory.max)
        current_memory=$(cat /sys/fs/cgroup/memory.current)
    fi

    # cgroup v1 layout.
    if test -f "/sys/fs/cgroup/memory/memory.limit_in_bytes"; then
        max_memory=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)
        MEMORY_STAT_PATH="/sys/fs/cgroup/memory/memory.stat"
        current_memory=$(awk '$1 == "rss" {print $2}' "$MEMORY_STAT_PATH")
    fi

    if [ "${max_memory}" == "max" ]; then
        sleep 86400
        continue
    fi

    # Neither layout was readable: skip this round instead of feeding empty
    # strings into the arithmetic and the integer comparison below.
    if [ -z "${max_memory}" ] || [ -z "${current_memory}" ]; then
        sleep 10
        continue
    fi

    less_max_memory=$((max_memory - threshold))
    if [ "$current_memory" -gt "$less_max_memory" ]; then
        # Get the PID of the process with the highest memory usage
        pid=$(ps -eo pid,%mem --sort=-%mem | awk 'NR==2 {print $1}')

        # Kill the process
        if [ -n "$pid" ]; then
            kill "$pid"
            echo "Process with PID $pid killed due to memory exceeding the limit."
        fi
    fi

    sleep 10
done
Loading

0 comments on commit b563d97

Please sign in to comment.