[Core] manage nccl via a pypi package & upgrade to pt 2.2.1 (vllm-pro…

youkaichao authored Apr 4, 2024
1 parent b778200 commit ca81ff5
Showing 8 changed files with 36 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -49,7 +49,7 @@ jobs:
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11']
-pytorch-version: ['2.1.2'] # Must be the most recent version that meets requirements.txt.
+pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements.txt.
cuda-version: ['11.8', '12.1']

steps:
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -31,7 +31,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.2.1")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")

10 changes: 7 additions & 3 deletions Dockerfile
@@ -24,6 +24,13 @@ RUN --mount=type=cache,target=/root/.cache/pip \
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt

+# cuda arch list used by torch
+# can be useful for both `dev` and `test`
+# explicitly set the list to avoid issues with torch 2.2
+# see https://github.com/pytorch/pytorch/pull/123243
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
+ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################


@@ -47,9 +54,6 @@ COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

-# cuda arch list used by torch
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
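The hunks above move `TORCH_CUDA_ARCH_LIST` into the base build image so it applies to both the `dev` and `test` stages instead of only the extension-build stage. A minimal sketch of how one might verify the setting inside a container, assuming a CUDA build of torch is installed (the environment variable only affects extensions compiled afterwards; `torch.cuda.get_arch_list()` reports what the installed torch binary itself was compiled for):

```python
import os

import torch

# TORCH_CUDA_ARCH_LIST tells torch which SM architectures to target when
# building extensions; the Dockerfile now sets it explicitly rather than
# relying on auto-detection under torch 2.2.
print("TORCH_CUDA_ARCH_LIST =",
      os.environ.get("TORCH_CUDA_ARCH_LIST", "<unset: torch will auto-detect>"))

# Architectures the installed torch wheel was compiled for (CUDA builds only).
if torch.version.cuda is not None:
    print("torch binary arch list:", torch.cuda.get_arch_list())
```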
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ requires = [
"ninja",
"packaging",
"setuptools >= 49.4.0",
-"torch == 2.1.2",
+"torch == 2.2.1",
"wheel",
]
build-backend = "setuptools.build_meta"
2 changes: 1 addition & 1 deletion requirements-build.txt
@@ -3,5 +3,5 @@ cmake>=3.21
ninja
packaging
setuptools>=49.4.0
-torch==2.1.2
+torch==2.2.1
wheel
5 changes: 3 additions & 2 deletions requirements.txt
@@ -4,11 +4,11 @@ psutil
ray >= 2.9
sentencepiece # Required for LLaMA tokenizer.
numpy
-torch == 2.2.1
+torch == 2.2.1
requests
py-cpuinfo
transformers >= 4.39.1 # Required for StarCoder2 & Llava.
-xformers == 0.0.23.post1 # Required for CUDA 12.1.
+xformers == 0.0.25 # Requires PyTorch 2.2.1.
fastapi
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
@@ -17,3 +17,4 @@ pynvml == 11.5.0
triton >= 2.1.0
outlines == 0.0.34
tiktoken == 0.6.0 # Required for DBRX tokenizer
+vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library
10 changes: 10 additions & 0 deletions setup.py
@@ -328,6 +328,16 @@ def get_requirements() -> List[str]:
    if _is_cuda():
        with open(get_path("requirements.txt")) as f:
            requirements = f.read().strip().split("\n")
+        cuda_major = torch.version.cuda.split(".")[0]
+        modified_requirements = []
+        for req in requirements:
+            if "vllm-nccl-cu12" in req:
+                modified_requirements.append(
+                    req.replace("vllm-nccl-cu12",
+                                f"vllm-nccl-cu{cuda_major}"))
+            else:
+                modified_requirements.append(req)
+        requirements = modified_requirements
    elif _is_hip():
        with open(get_path("requirements-rocm.txt")) as f:
            requirements = f.read().strip().split("\n")
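Taken together with the `vllm-nccl-cu12` entry added to requirements.txt, this hunk swaps the placeholder package name for the one matching the CUDA major version torch was built against. A standalone sketch of the same rewrite (the helper name and the assert are illustrative, not part of the commit):

```python
from typing import List


def rewrite_nccl_requirement(requirements: List[str],
                             cuda_version: str) -> List[str]:
    """Replace the vllm-nccl-cu12 placeholder with the package for this CUDA major."""
    cuda_major = cuda_version.split(".")[0]
    return [
        req.replace("vllm-nccl-cu12", f"vllm-nccl-cu{cuda_major}")
        if "vllm-nccl-cu12" in req else req
        for req in requirements
    ]


# On a CUDA 11.8 build, the CUDA 12 placeholder becomes the cu11 package.
assert rewrite_nccl_requirement(
    ["torch == 2.2.1", "vllm-nccl-cu12>=2.18,<2.19"], "11.8",
) == ["torch == 2.2.1", "vllm-nccl-cu11>=2.18,<2.19"]
```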
14 changes: 12 additions & 2 deletions vllm/model_executor/parallel_utils/pynccl.py
@@ -21,6 +21,7 @@

import ctypes
import datetime
+import glob
import os

# ===================== import region =====================
@@ -34,18 +35,27 @@

so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")

+# check if we have vllm-managed nccl
+vllm_nccl_path = None
+if torch.version.cuda is not None:
+    cuda_major = torch.version.cuda.split(".")[0]
+    path = os.path.expanduser(
+        f"~/.config/vllm/nccl/cu{cuda_major}/libnccl.so.*")
+    files = glob.glob(path)
+    vllm_nccl_path = files[0] if files else None

# manually load the nccl library
if so_file:
    logger.info(
        f"Loading nccl from environment variable VLLM_NCCL_SO_PATH={so_file}")
else:
    if torch.version.cuda is not None:
-        so_file = "libnccl.so.2"
+        so_file = vllm_nccl_path or "libnccl.so.2"
    elif torch.version.hip is not None:
        so_file = "librccl.so.1"
    else:
        raise ValueError("NCCL only supports CUDA and ROCm backends.")
-    logger.debug(f"Loading nccl from library {so_file}")
+    logger.info(f"Loading nccl from library {so_file}")

try:
    nccl = ctypes.CDLL(so_file)
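The resolution order implemented above is: an explicit `VLLM_NCCL_SO_PATH` wins, then a vllm-nccl-managed copy under `~/.config/vllm/nccl/cu{major}/`, and finally the system-wide `libnccl.so.2`. A minimal standalone sketch of that lookup (the function name is illustrative, not the module's API):

```python
import glob
import os


def resolve_nccl_so(cuda_major: str) -> str:
    """Mirror pynccl's search order for the NCCL shared library."""
    # 1. Explicit override via environment variable.
    env_path = os.environ.get("VLLM_NCCL_SO_PATH", "")
    if env_path:
        return env_path
    # 2. Library downloaded by the vllm-nccl-cu{11,12} pypi package.
    managed = glob.glob(
        os.path.expanduser(f"~/.config/vllm/nccl/cu{cuda_major}/libnccl.so.*"))
    if managed:
        return managed[0]
    # 3. Fall back to whatever the dynamic loader finds on the system.
    return "libnccl.so.2"


print(resolve_nccl_so("12"))
```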
