diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index 6f1673dc..0b2af2c6 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -23,9 +23,12 @@ jobs:
strategy:
fail-fast: false
matrix:
- python: ["3.8", "3.9", "3.10", "3.11"]
+ python: ["3.8", "3.9", "3.10", "3.11", "3.12"]
cuda: ["11.8", "12.1"]
- torch: ["2.1.2", "2.2.2", "2.3.1"]
+ torch: ["2.2.2", "2.3.1", "2.4.0"]
+ include:
+ - cuda: "12.4"
+ torch: "2.4.0"
runs-on: [self-hosted, linux, release]
env:
PYTHON_VERSION: ${{ matrix.python }}
diff --git a/.github/workflows/package_test.yml b/.github/workflows/package_test.yml
index ead55f76..e15bd472 100644
--- a/.github/workflows/package_test.yml
+++ b/.github/workflows/package_test.yml
@@ -38,9 +38,9 @@ jobs:
strategy:
fail-fast: false
matrix:
- python: ["3.10"]
- cuda: ["12.1"]
- torch: ["2.3.1"]
+ python: ["3.12"]
+ cuda: ["12.4"]
+ torch: ["2.4.0"]
runs-on: [self-hosted, linux, build]
env:
PYTHON_VERSION: ${{ matrix.python }}
diff --git a/.github/workflows/publish_devel_image.yml b/.github/workflows/publish_devel_image.yml
index ea17fb66..7dd81321 100644
--- a/.github/workflows/publish_devel_image.yml
+++ b/.github/workflows/publish_devel_image.yml
@@ -42,6 +42,7 @@ jobs:
tags: |
vectorchai/scalellm_devel:cuda12.4-ubuntu22.04
vectorchai/scalellm_devel:cuda12.4
+ vectorchai/scalellm_devel:latest
- name: Build devel image for cuda 12.1
uses: docker/build-push-action@v5
@@ -58,7 +59,6 @@ jobs:
tags: |
vectorchai/scalellm_devel:cuda12.1-ubuntu22.04
vectorchai/scalellm_devel:cuda12.1
- vectorchai/scalellm_devel:latest
- name: Build devel image for cuda 11.8
uses: docker/build-push-action@v5
diff --git a/.github/workflows/publish_manylinux_image.yml b/.github/workflows/publish_manylinux_image.yml
index a05f2c2a..2ed14873 100644
--- a/.github/workflows/publish_manylinux_image.yml
+++ b/.github/workflows/publish_manylinux_image.yml
@@ -27,7 +27,7 @@ jobs:
- name: Create cache directory
run: mkdir -p $CI_CACHE_DIR/.buildx-cache
- - name: Build base for cuda 12.4 (experimental)
+ - name: Build base for cuda 12.4
uses: docker/build-push-action@v5
with:
context: ./docker
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
index 30e2f4f5..a28f05c7 100644
--- a/.github/workflows/publish_wheel.yml
+++ b/.github/workflows/publish_wheel.yml
@@ -22,9 +22,9 @@ jobs:
strategy:
fail-fast: false
matrix:
- python: ["3.8", "3.9", "3.10", "3.11"]
- cuda: ["12.1"]
- torch: ["2.3.1"]
+ python: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+ cuda: ["12.4"]
+ torch: ["2.4.0"]
runs-on: [self-hosted, linux, release]
env:
PYTHON_VERSION: ${{ matrix.python }}
diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml
index adb2b0f4..749a7749 100644
--- a/.github/workflows/release_test.yml
+++ b/.github/workflows/release_test.yml
@@ -13,9 +13,9 @@ jobs:
strategy:
fail-fast: false
matrix:
- python: ["3.10"]
- cuda: ["12.1"]
- torch: ["2.3.1"]
+ python: ["3.12"]
+ cuda: ["12.4"]
+ torch: ["2.4.0"]
runs-on: [self-hosted, linux, build]
env:
PYTHON_VERSION: ${{ matrix.python }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 20880087..0250f8a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -194,25 +194,25 @@ if (DEFINED ENV{LIBTORCH_ROOT})
else()
include(FetchContent)
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.4)
- # download nightly libtorch with cuda 12.4 from pytorch.org (experimental)
+ # download libtorch 2.4.0 with cuda 12.4 from pytorch.org
if (USE_CXX11_ABI)
- set(LIBTORCH_URL "https://download.pytorch.org/libtorch/nightly/cu124/libtorch-cxx11-abi-shared-with-deps-latest.zip")
+ set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.4.0%2Bcu124.zip")
else()
- set(LIBTORCH_URL "https://download.pytorch.org/libtorch/nightly/cu124/libtorch-shared-with-deps-latest.zip")
+ set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu124/libtorch-shared-with-deps-2.4.0%2Bcu124.zip")
endif()
elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.1)
- # download libtorch 2.3.1 with cuda 12.1 from pytorch.org
+ # download libtorch 2.4.0 with cuda 12.1 from pytorch.org
if (USE_CXX11_ABI)
- set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.3.1%2Bcu121.zip")
+ set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.4.0%2Bcu121.zip")
else()
- set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-shared-with-deps-2.3.1%2Bcu121.zip")
+ set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-shared-with-deps-2.4.0%2Bcu121.zip")
endif()
elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.8)
- # download libtorch 2.3.1 with cuda 11.8 from pytorch.org
+ # download libtorch 2.4.0 with cuda 11.8 from pytorch.org
if (USE_CXX11_ABI)
- set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.3.1%2Bcu118.zip")
+ set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.4.0%2Bcu118.zip")
else()
- set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu118/libtorch-shared-with-deps-2.3.1%2Bcu118.zip")
+ set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu118/libtorch-shared-with-deps-2.4.0%2Bcu118.zip")
endif()
else()
# error out if cuda version is not supported
@@ -232,7 +232,7 @@ else()
FetchContent_MakeAvailable(libtorch)
find_package(Torch REQUIRED PATHS ${libtorch_SOURCE_DIR} NO_DEFAULT_PATH)
- message(STATUS "Downloading and using libtorch 2.3.1 for cuda ${CUDA_VERSION} at ${libtorch_SOURCE_DIR}")
+ message(STATUS "Downloading and using libtorch 2.4.0 for cuda ${CUDA_VERSION} at ${libtorch_SOURCE_DIR}")
endif()
# check if USE_CXX11_ABI is set correctly
diff --git a/README.md b/README.md
index 2105e28e..7590024a 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,15 @@
-# ScaleLLM: An efficient LLM Inference solution
-[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![GitHub Repo stars](https://img.shields.io/github/stars/vectorch-ai/ScaleLLM?style=social)](https://github.com/vectorch-ai/ScaleLLM/stargazers) [![build and test](https://github.com/vectorch-ai/ScaleLLM/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/vectorch-ai/ScaleLLM/actions/workflows/build.yml)
+
+ScaleLLM: An efficient LLM Inference solution
+
+[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+[![build](https://github.com/vectorch-ai/ScaleLLM/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/vectorch-ai/ScaleLLM/actions/workflows/build.yml)
+[![PyPI](https://badge.fury.io/py/scalellm.svg)](https://badge.fury.io/py/scalellm)
+[![Twitter](https://img.shields.io/twitter/url?label=%20%40VectorchAI&style=social&url=https://x.com/VectorchAI)](https://x.com/VectorchAI)
+[![Discord](https://dcbadge.vercel.app/api/server/PKe5gvBZfn?compact=true&style=flat)](https://discord.gg/PKe5gvBZfn)
-[![Discord](https://dcbadge.vercel.app/api/server/PKe5gvBZfn)](https://discord.gg/PKe5gvBZfn)
-[ScaleLLM]() is a cutting-edge inference system engineered for large language models (LLMs), meticulously designed to meet the demands of production environments. It extends its support to a wide range of popular open-source models, including [Llama3](https://github.com/meta-llama/llama3), [Gemma](https://github.com/google-deepmind/gemma), Bloom, GPT-NeoX, and more.
+[ScaleLLM](#) is a cutting-edge inference system engineered for large language models (LLMs), meticulously designed to meet the demands of production environments. It extends its support to a wide range of popular open-source models, including [Llama3](https://github.com/meta-llama/llama3), [Gemma](https://github.com/google-deepmind/gemma), Bloom, GPT-NeoX, and more.
ScaleLLM is currently undergoing active development. We are fully committed to consistently enhancing its efficiency while also incorporating additional features. Feel free to explore our [**_Roadmap_**](https://github.com/vectorch-ai/ScaleLLM/issues/84) for more details.
@@ -45,14 +50,14 @@ ScaleLLM is currently undergoing active development. We are fully committed to c
ScaleLLM is available as a Python Wheel package on PyPI. You can install it using pip:
```bash
-# Install scalellm with CUDA 12.1 and Pytorch 2.3
+# Install scalellm with CUDA 12.4 and PyTorch 2.4.0
pip install scalellm
```
-If you want to install ScaleLLM with different version of CUDA and Pytorch, you can pip install it with provding index URL of the version. For example, to install ScaleLLM with CUDA 11.8 and Pytorch 2.2.2, you can use the following command:
+If you want to install ScaleLLM with a different version of CUDA and PyTorch, you can pip install it by providing the index URL of the version. For example, to install ScaleLLM with CUDA 12.1 and PyTorch 2.2.2, you can use the following command:
```bash
-pip install scalellm -i https://whl.vectorch.com/cu118/torch2.2.2/
+pip install scalellm -i https://whl.vectorch.com/cu121/torch2.2.2/
```
### Build from source
diff --git a/docker/common/install_cuda.sh b/docker/common/install_cuda.sh
index fd32031b..1e90973c 100755
--- a/docker/common/install_cuda.sh
+++ b/docker/common/install_cuda.sh
@@ -4,6 +4,9 @@
set -ex
+NCCL_VERSION=v2.21.5-1
+CUDNN_VERSION=9.1.0.70
+
function install_cusparselt_040 {
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && pushd tmp_cusparselt
@@ -27,7 +30,7 @@ function install_cusparselt_052 {
}
function install_118 {
- echo "Installing CUDA 11.8 and cuDNN 8.7 and NCCL 2.15 and cuSparseLt-0.4.0"
+ echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
rm -rf /usr/local/cuda-11.8 /usr/local/cuda
# install CUDA 11.8.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
@@ -38,16 +41,16 @@ function install_118 {
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
- wget -q https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz -O cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
- tar xf cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
- cp -a cudnn-linux-x86_64-8.7.0.84_cuda11-archive/include/* /usr/local/cuda/include/
- cp -a cudnn-linux-x86_64-8.7.0.84_cuda11-archive/lib/* /usr/local/cuda/lib64/
+ wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
+ tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
+ cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/include/* /usr/local/cuda/include/
+ cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
- git clone -b v2.20.5-1 --depth 1 https://github.com/NVIDIA/nccl.git
+ git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
@@ -60,7 +63,7 @@ function install_118 {
}
function install_121 {
- echo "Installing CUDA 12.1 and cuDNN 8.9 and NCCL 2.20.5 and cuSparseLt-0.5.2"
+ echo "Installing CUDA 12.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
rm -rf /usr/local/cuda-12.1 /usr/local/cuda
# install CUDA 12.1.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run
@@ -71,16 +74,16 @@ function install_121 {
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
- wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz -O cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz
- tar xf cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz
- cp -a cudnn-linux-x86_64-8.9.2.26_cuda12-archive/include/* /usr/local/cuda/include/
- cp -a cudnn-linux-x86_64-8.9.2.26_cuda12-archive/lib/* /usr/local/cuda/lib64/
+ wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+ tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+ cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+ cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
- git clone -b v2.20.5-1 --depth 1 https://github.com/NVIDIA/nccl.git
+ git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
@@ -93,7 +96,7 @@ function install_121 {
}
function install_124 {
- echo "Installing CUDA 12.4 and cuDNN 8.9 and NCCL 2.20.5 and cuSparseLt-0.5.2"
+ echo "Installing CUDA 12.4 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
rm -rf /usr/local/cuda-12.4 /usr/local/cuda
# install CUDA 12.4.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run
@@ -104,16 +107,16 @@ function install_124 {
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
- wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz -O cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz
- tar xf cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz
- cp -a cudnn-linux-x86_64-8.9.2.26_cuda12-archive/include/* /usr/local/cuda/include/
- cp -a cudnn-linux-x86_64-8.9.2.26_cuda12-archive/lib/* /usr/local/cuda/lib64/
+ wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+ tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+ cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+ cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
- git clone -b v2.20.5-1 --depth 1 https://github.com/NVIDIA/nccl.git
+ git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
@@ -201,6 +204,9 @@ function prune_124 {
if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi
+ if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
+ export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+ fi
# all CUDA libs except CuDNN and CuBLAS
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
@@ -232,4 +238,4 @@ do
;;
esac
shift
-done
+done
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 7592cdb7..cc95c4e7 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -12,7 +12,7 @@ ScaleLLM is available as a Python Wheel package on `PyPI