Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bugfix] Allow vllm to still work if triton is not installed. #6786

Merged
merged 18 commits into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion requirements-cpu.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,3 @@
# Dependencies for x86_64 CPUs
torch == 2.3.1+cpu; platform_machine != "ppc64le"
torchvision == 0.18.1+cpu; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
2 changes: 0 additions & 2 deletions requirements-openvino.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,3 @@
torch >= 2.1.2
openvino ~= 2024.3.0.dev
optimum-intel[openvino] >= 1.18.1

triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
1 change: 0 additions & 1 deletion requirements-tpu.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,3 @@
# Dependencies for TPU
# Currently, the TPU backend uses a nightly version of PyTorch XLA.
# You can install the dependencies in Dockerfile.tpu.
triton # To avoid import errors
7 changes: 4 additions & 3 deletions tests/kernels/test_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
import triton
import triton.language as tl

from vllm.model_executor.layers.ops.sample import (
MAX_TRITON_N_COLS, _uniform_to_exponential, get_num_triton_sampler_splits,
sample)
from vllm.model_executor.layers.ops.sample import (_uniform_to_exponential,
sample)
from vllm.model_executor.sampling_metadata import SamplingTensors
from vllm.model_executor.utils import set_random_seed
from vllm.triton_utils.sample import (MAX_TRITON_N_COLS,
get_num_triton_sampler_splits)

SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size
MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100
Expand Down
5 changes: 4 additions & 1 deletion vllm/attention/ops/paged_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
import torch

from vllm import _custom_ops as ops
from vllm.attention.ops.prefix_prefill import context_attention_fwd
from vllm.triton_utils import HAS_TRITON

if HAS_TRITON:
from vllm.attention.ops.prefix_prefill import context_attention_fwd

# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
_PARTITION_SIZE = 512
Expand Down
16 changes: 12 additions & 4 deletions vllm/model_executor/layers/fused_moe/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts, fused_moe, fused_topk, get_config_file_name, grouped_topk)
from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
FusedMoEMethodBase)
from vllm.triton_utils import HAS_TRITON

if HAS_TRITON:
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts, fused_moe, fused_topk, get_config_file_name,
grouped_topk)

__all__ = [
"FusedMoE",
"FusedMoEMethodBase",
"fused_moe",
"fused_topk",
"fused_experts",
"get_config_file_name",
"grouped_topk",
"FusedMoE",
"FusedMoEMethodBase",
]

if not HAS_TRITON:
# need to do it like this other ruff complains
__all__ = __all__[:2]
14 changes: 1 addition & 13 deletions vllm/model_executor/layers/ops/sample.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,14 @@
import math
from typing import Optional, Tuple

import torch
import triton
import triton.language as tl

from vllm.model_executor.layers.ops.rand import seeded_uniform
from vllm.triton_utils.sample import get_num_triton_sampler_splits

_EPS = 1e-6

# This is a hardcoded limit in Triton (max block size).
MAX_TRITON_N_COLS = 131072


def get_num_triton_sampler_splits(n_cols: int) -> int:
"""Get the number of splits to use for Triton sampling.

Triton has a limit on the number of columns it can handle, so we need to
split the tensor and call the kernel multiple times if it's too large.
"""
return math.ceil(n_cols / MAX_TRITON_N_COLS)


def _multi_split_sample(
probs: torch.Tensor,
Expand Down
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/quantization/fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@

from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
fused_moe)
from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization.base_config import (
Expand Down Expand Up @@ -404,6 +403,7 @@ def apply(self,
num_expert_group: Optional[int] = None,
topk_group: Optional[int] = None) -> torch.Tensor:

from vllm.model_executor.layers.fused_moe import fused_moe
return fused_moe(x,
layer.w13_weight,
layer.w2_weight,
Expand Down
6 changes: 5 additions & 1 deletion vllm/model_executor/layers/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
import torch
import torch.nn as nn

from vllm.model_executor.layers.ops.sample import sample as sample_triton
from vllm.triton_utils import HAS_TRITON

if HAS_TRITON:
from vllm.model_executor.layers.ops.sample import sample as sample_triton

from vllm.model_executor.sampling_metadata import (SamplingMetadata,
SamplingTensors,
SequenceGroupToSample)
Expand Down
2 changes: 1 addition & 1 deletion vllm/model_executor/sampling_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@

import torch

from vllm.model_executor.layers.ops.sample import get_num_triton_sampler_splits
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.sequence import SequenceData, SequenceGroupMetadata
from vllm.triton_utils.sample import get_num_triton_sampler_splits
from vllm.utils import (async_tensor_h2d, is_pin_memory_available,
make_tensor_with_pad, maybe_expand_dim)

Expand Down
15 changes: 10 additions & 5 deletions vllm/triton_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
from vllm.triton_utils.custom_cache_manager import (
maybe_set_triton_cache_manager)
from vllm.triton_utils.importing import HAS_TRITON

__all__ = [
"maybe_set_triton_cache_manager",
]
if HAS_TRITON:
from vllm.triton_utils.custom_cache_manager import (
maybe_set_triton_cache_manager)

__all__ = ["HAS_TRITON", "maybe_set_triton_cache_manager"]

if not HAS_TRITON:
# need to do this afterwards due to ruff complaining
__all__.pop()
11 changes: 11 additions & 0 deletions vllm/triton_utils/importing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from importlib.util import find_spec

from vllm.logger import init_logger

logger = init_logger(__name__)

HAS_TRITON = find_spec("triton") is not None

if not HAS_TRITON:
logger.info("Triton not installed; certain GPU-related functions"
" will be not be available.")
13 changes: 13 additions & 0 deletions vllm/triton_utils/sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import math

# This is a hardcoded limit in Triton (max block size).
MAX_TRITON_N_COLS = 131072


def get_num_triton_sampler_splits(n_cols: int) -> int:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Off topic: @Yard1 I feel this should be a general function for all triton kernels instead of just sampler. Do you think it makes sense to rename it to get_num_triton_input_chunks so something similar, and use it here as well?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that would make sense

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to clarify: is this something you'd like to have addressed in this PR?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No it's not necessary. We can merge this PR first.

"""Get the number of splits to use for Triton sampling.

Triton has a limit on the number of columns it can handle, so we need to
split the tensor and call the kernel multiple times if it's too large.
"""
return math.ceil(n_cols / MAX_TRITON_N_COLS)
Loading