added 'disable_custom_kernels' gflag to allow disabling all custom kernels
guocuimi committed Nov 11, 2023
1 parent 5064657 commit 149b943
Showing 14 changed files with 76 additions and 96 deletions.
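The change follows the standard gflags pattern: the flag is defined once (in src/layers/attention.cpp below) and pulled in with DECLARE_bool wherever a dispatch decision is made, so every CUDA-vs-generic choice can be overridden from the command line. The following is a minimal, self-contained sketch of that mechanism with hypothetical names; it only illustrates how such a flag is defined, parsed, and checked, and is not the repository's code.

```cpp
// Minimal gflags sketch (illustration only, not ScaleLLM's code).
#include <gflags/gflags.h>
#include <iostream>

// One translation unit owns the definition; other files would reference it
// with DECLARE_bool(disable_custom_kernels);
DEFINE_bool(disable_custom_kernels, false, "disable all custom kernels");

// Dispatch sites gate the custom path on both the device type and the flag.
bool use_custom_kernel(bool device_is_cuda) {
  return device_is_cuda && !FLAGS_disable_custom_kernels;
}

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
  // e.g. `./server --disable_custom_kernels` forces the generic path.
  std::cout << (use_custom_kernel(/*device_is_cuda=*/true) ? "custom" : "generic")
            << " kernels selected\n";
  return 0;
}
```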
8 changes: 4 additions & 4 deletions README.md
@@ -69,12 +69,12 @@ docker run -it --gpus=all --net=host --shm-size=1g \

This command starts the Docker container with GPU support and various configuration options.
> **Warning**<br />
> NCCL might fall back to using the host memory if NVLink or PCI is not available. To allow NCCL to use the host memory, we added '--shm-size=1g' to the command. If you have NVLink or PCI available, you can remove this option.
> NCCL might fall back to using the host memory if NVLink or PCI is not available. To allow NCCL to use the host memory, we added '--shm-size=1g' to the docker run command.
- `HF_MODEL_ID` specifies which Hugging Face model you want to run.
- `HF_MODEL_REVISION` specifies which Hugging Face model revision you want to run. by default, it is set to `"main"`.
- `HF_MODEL_ALLOW_PATTERN` specifies which types of files are allowed to be downloaded. by default, it is set to `"*.json,*.safetensors,*.model"`.
- `DEVICE` specifies the device on which this model should run. by default, it is set to `"auto"`.
- `HF_MODEL_REVISION` specifies which Hugging Face model revision you want to run. By default, it is set to `"main"`.
- `HF_MODEL_ALLOW_PATTERN` specifies which types of files are allowed to be downloaded. By default, it is set to `"*.json,*.safetensors,*.model"`.
- `DEVICE` specifies the device on which this model should run. By default, it is set to `"auto"`.
- `HUGGING_FACE_HUB_TOKEN` specifies the token from [huggingface](https://huggingface.co/settings/tokens) for gated models.

> **Note**<br />
4 changes: 0 additions & 4 deletions entrypoint.sh
@@ -20,10 +20,6 @@ if [ -n "$HF_MODEL_ID" ]; then
exit 1
fi
ARGS+=" --model_path "$MODEL_PATH" --model_id "$HF_MODEL_ID""
elif [ -n "$HF_MODEL_PATH" ]; then
echo "Using model from the specified path "$HF_MODEL_PATH""

ARGS+=" --model_path "$HF_MODEL_PATH""
fi

ARGS+=" --device "$DEVICE""
1 change: 1 addition & 0 deletions scalellm.yml
@@ -17,6 +17,7 @@ services:
- HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
volumes:
- $HOME/.cache/huggingface/hub:/models
shm_size: 1g
command: --logtostderr
# turn on GPU access
deploy:
4 changes: 0 additions & 4 deletions scripts/start_scalellm.sh
@@ -18,10 +18,6 @@ if [ -n "$HF_MODEL_ID" ]; then
exit 1
fi
ARGS+=" --model_path "$MODEL_PATH" --model_id "$HF_MODEL_ID""
elif [ -n "$HF_MODEL_PATH" ]; then
echo "Using model from the specified path "$HF_MODEL_PATH""

ARGS+=" --model_path "$HF_MODEL_PATH""
fi

ARGS+=" --device "$DEVICE""
43 changes: 15 additions & 28 deletions src/common/process_group.cpp
@@ -13,23 +13,21 @@ namespace llm {
namespace {

// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define NCCLCHECK(cmd) \
do { \
ncclResult_t r = cmd; \
if (r != ncclSuccess) { \
GLOG(FATAL) << "Failed, NCCL error " << __FILE__ << ":" << __LINE__ \
<< " " << ncclGetErrorString(r); \
} \
#define NCCLCHECK(cmd) \
do { \
ncclResult_t r = cmd; \
if (r != ncclSuccess) { \
GLOG(FATAL) << "Failed, NCCL error :" << ncclGetErrorString(r); \
} \
} while (0)

// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define CUDACHECK(cmd) \
do { \
cudaError_t err = cmd; \
if (err != cudaSuccess) { \
GLOG(FATAL) << "Failed, Cuda error " << __FILE__ << ":" << __LINE__ \
<< " " << cudaGetErrorString(err); \
} \
#define CUDACHECK(cmd) \
do { \
cudaError_t err = cmd; \
if (err != cudaSuccess) { \
GLOG(FATAL) << "Failed, Cuda error :" << cudaGetErrorString(err); \
} \
} while (0)

at::Tensor flatten_for_scatter_gather(std::vector<at::Tensor>& tensors) {
@@ -81,15 +79,14 @@ std::vector<std::unique_ptr<ProcessGroup>> ProcessGroup::create_process_groups(
GCHECK(device.is_cuda()) << "device should be cuda device";
}

const int world_size = static_cast<int>(devices.size());

std::vector<ncclComm_t> comms;
comms.reserve(devices.size());
std::vector<int> device_idxs;
device_idxs.reserve(devices.size());
for (const auto& device : devices) {
device_idxs.push_back(device.index());
}

std::vector<ncclComm_t> comms(devices.size());
const int world_size = static_cast<int>(devices.size());
NCCLCHECK(ncclCommInitAll(comms.data(), world_size, device_idxs.data()));

std::vector<std::unique_ptr<ProcessGroup>> process_groups;
@@ -102,16 +99,6 @@
}

// Constructor.
ProcessGroupNCCL::ProcessGroupNCCL(int rank,
int world_size,
const torch::Device& device,
const ncclUniqueId& comm_id)
: ProcessGroup(rank, world_size, device) {
torch::DeviceGuard device_guard(device);
NCCLCHECK(ncclCommInitRank(&comm_, world_size, comm_id, rank));
CUDACHECK(cudaStreamCreate(&stream_));
}

ProcessGroupNCCL::ProcessGroupNCCL(int rank,
int world_size,
const torch::Device& device,
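Besides wiring in the flag, this file drops the ncclUniqueId-based constructor and lets create_process_groups build every communicator with a single ncclCommInitAll call. Below is a self-contained sketch of that single-process initialization pattern; the helper name and the simplified error handling are assumptions, not the repository's code.

```cpp
// Sketch: one-call NCCL initialization for all local GPUs (illustration only).
#include <nccl.h>

#include <cstdio>
#include <cstdlib>
#include <vector>

std::vector<ncclComm_t> init_comms(const std::vector<int>& device_idxs) {
  std::vector<ncclComm_t> comms(device_idxs.size());
  // ncclCommInitAll creates a clique of communicators for the listed devices;
  // no ncclUniqueId exchange is needed when all ranks share one process.
  const ncclResult_t r = ncclCommInitAll(
      comms.data(), static_cast<int>(device_idxs.size()), device_idxs.data());
  if (r != ncclSuccess) {
    std::fprintf(stderr, "NCCL error: %s\n", ncclGetErrorString(r));
    std::abort();
  }
  return comms;
}
```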
5 changes: 0 additions & 5 deletions src/common/process_group.h
@@ -56,11 +56,6 @@ class ProcessGroup {
class ProcessGroupNCCL : public ProcessGroup {
public:
// Constructor.
ProcessGroupNCCL(int rank,
int world_size,
const torch::Device& device,
const ncclUniqueId& id);

ProcessGroupNCCL(int rank,
int world_size,
const torch::Device& device,
7 changes: 7 additions & 0 deletions src/engine/engine.cpp
@@ -1,4 +1,5 @@
#include "engine.h"
#include <gflags/gflags_declare.h>

#include <boost/algorithm/string.hpp>
#include <memory>
@@ -21,6 +22,8 @@ DEFINE_double(max_memory_utilization,
0.9,
"maximum memory utilization allowed, default 0.9");

DECLARE_bool(disable_custom_kernels);

namespace llm {
namespace {
torch::ScalarType parse_dtype(const std::string& dtype_str,
@@ -64,6 +67,10 @@ Engine::Engine(const std::vector<torch::Device>& devices) : devices_(devices) {
ParallelArgs parallel_args(rank, world_size, pg);
workers_.emplace_back(std::make_unique<Worker>(parallel_args, devices[i]));
}

if (FLAGS_disable_custom_kernels) {
GLOG(WARNING) << "Custom kernels are disabled, using generic kernels.";
}
}

bool Engine::init(const std::string& model_weights_path) {
23 changes: 17 additions & 6 deletions src/layers/activation.cpp
@@ -8,6 +8,8 @@

#include "common/logging.h"

DECLARE_bool(disable_custom_kernels);

namespace llm {
namespace detail {
torch::Tensor gelu(torch::Tensor x) {
@@ -84,10 +86,12 @@ ActFunc Activation::get_act_func(const std::string& name,
return gelu;
}
if (boost::iequals(name, "gelu_fast")) {
return device.is_cuda() ? kernel::gelu_fast : gelu_fast;
return device.is_cuda() && !FLAGS_disable_custom_kernels ? kernel::gelu_fast
: gelu_fast;
}
if (boost::iequals(name, "gelu_new")) {
return device.is_cuda() ? kernel::gelu_new : gelu_new;
return device.is_cuda() && !FLAGS_disable_custom_kernels ? kernel::gelu_new
: gelu_new;
}
if (boost::iequals(name, "gelu_pytorch_tanh")) {
return gelu_pytorch_tanh;
@@ -96,7 +100,8 @@
return relu;
}
if (boost::iequals(name, "silu")) {
return device.is_cuda() ? kernel::silu : silu;
return device.is_cuda() && !FLAGS_disable_custom_kernels ? kernel::silu
: silu;
}

GLOG(ERROR) << "Unsupported activation function: " << name;
@@ -111,10 +116,14 @@ ActFunc Activation::get_act_with_mul_func(const std::string& name,
return gelu_with_mul;
}
if (boost::iequals(name, "gelu_fast")) {
return device.is_cuda() ? kernel::gelu_fast_with_mul : gelu_fast_with_mul;
return device.is_cuda() && !FLAGS_disable_custom_kernels
? kernel::gelu_fast_with_mul
: gelu_fast_with_mul;
}
if (boost::iequals(name, "gelu_new")) {
return device.is_cuda() ? kernel::gelu_new_with_mul : gelu_new_with_mul;
return device.is_cuda() && !FLAGS_disable_custom_kernels
? kernel::gelu_new_with_mul
: gelu_new_with_mul;
}
if (boost::iequals(name, "gelu_pytorch_tanh")) {
return gelu_pytorch_tanh_with_mul;
Expand All @@ -123,7 +132,9 @@ ActFunc Activation::get_act_with_mul_func(const std::string& name,
return relu_with_mul;
}
if (boost::iequals(name, "silu")) {
return device.is_cuda() ? kernel::silu_with_mul : silu_with_mul;
return device.is_cuda() && !FLAGS_disable_custom_kernels
? kernel::silu_with_mul
: silu_with_mul;
}

GLOG(ERROR) << "Unsupported activation function: " << name;
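The activation selection above now consults the flag in addition to the device check before handing back a kernel function pointer. A reduced sketch of that dispatch pattern is shown below; kernel_silu is a placeholder standing in for the project's CUDA kernel, and the flag is defined locally only to keep the example self-contained.

```cpp
// Reduced sketch of the activation dispatch (illustration only).
#include <gflags/gflags.h>
#include <torch/torch.h>

DEFINE_bool(disable_custom_kernels, false, "disable all custom kernels");

using ActFunc = torch::Tensor (*)(torch::Tensor);

torch::Tensor silu_generic(torch::Tensor x) { return torch::silu(x); }
// Placeholder for the custom CUDA kernel (assumption, not the real kernel).
torch::Tensor kernel_silu(torch::Tensor x) { return torch::silu(x); }

ActFunc pick_silu(const torch::Device& device) {
  // The custom kernel is eligible only on CUDA, and only when not disabled.
  return device.is_cuda() && !FLAGS_disable_custom_kernels ? kernel_silu
                                                           : silu_generic;
}
```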
55 changes: 20 additions & 35 deletions src/layers/attention.cpp
@@ -6,16 +6,7 @@

#include "common/logging.h"

DEFINE_string(varlen_masked_self_attention,
"",
"type of attention to use for varlen_masked_self_attention, "
"slow, cuda, or empty for auto");

DEFINE_string(
single_query_masked_self_attention,
"",
"type of attention to use for single_query_masked_self_attention, slow, "
"cuda, or empty for auto");
DEFINE_bool(disable_custom_kernels, false, "disable all custom kernels");

DEFINE_bool(
force_use_paged_attention_v2,
@@ -188,19 +179,16 @@ void varlen_masked_self_attention(
int32_t max_seq_len, // maximum sequence length
float scale, // scale for softmax
torch::Tensor& output) {
if (query.is_cuda()) {
if (query.is_cuda() && !FLAGS_disable_custom_kernels) {
// use cuda kernel
if (FLAGS_varlen_masked_self_attention.empty() ||
FLAGS_varlen_masked_self_attention == "cuda") {
return varlen_masked_self_attention_cuda(query,
key,
value,
cu_seq_lens,
alibi_slopes,
max_seq_len,
scale,
output);
}
return varlen_masked_self_attention_cuda(query,
key,
value,
cu_seq_lens,
alibi_slopes,
max_seq_len,
scale,
output);
}
return varlen_masked_self_attention_generic(
query, key, value, cu_seq_lens, alibi_slopes, scale, output);
@@ -216,20 +204,17 @@ void single_query_masked_self_attention(
int32_t max_context_len, // maximum context length
float scale, // scale for softmax
torch::Tensor& output) {
if (query.is_cuda()) {
if (query.is_cuda() && !FLAGS_disable_custom_kernels) {
// use cuda kernel
if (FLAGS_single_query_masked_self_attention.empty() ||
FLAGS_single_query_masked_self_attention == "cuda") {
return single_query_masked_self_attention_cuda(kv_cache,
kv_head_mapping,
query,
block_tables,
context_lens,
alibi_slopes,
max_context_len,
scale,
output);
}
return single_query_masked_self_attention_cuda(kv_cache,
kv_head_mapping,
query,
block_tables,
context_lens,
alibi_slopes,
max_context_len,
scale,
output);
}
return single_query_masked_self_attention_generic(kv_cache,
query,
3 changes: 1 addition & 2 deletions src/layers/attention.h
@@ -7,8 +7,7 @@
#include "memory/kv_cache.h"
#include "models/input_parameters.h"

DECLARE_string(varlen_masked_self_attention);
DECLARE_string(single_query_masked_self_attention);
DECLARE_bool(disable_custom_kernels);

namespace llm {

5 changes: 3 additions & 2 deletions src/layers/normalization.h
@@ -7,6 +7,7 @@
#include "kernels/layernorm_kernels.h"
#include "model_loader/state_dict.h"

DECLARE_bool(disable_custom_kernels);
namespace llm {
namespace detail {
inline torch::Tensor rms_norm(torch::Tensor input,
@@ -58,7 +59,7 @@ class LayerNormImpl : public torch::nn::Module {
}

torch::Tensor forward(torch::Tensor input) {
if (input.is_cuda()) {
if (input.is_cuda() && !FLAGS_disable_custom_kernels) {
auto output = torch::empty_like(input);
kernel::layer_norm(output, input, weight_, bias_, eps_);
return output;
@@ -131,7 +132,7 @@ class RMSNormImpl : public torch::nn::Module {
}

torch::Tensor forward(torch::Tensor input) {
if (input.is_cuda()) {
if (input.is_cuda() && !FLAGS_disable_custom_kernels) {
auto output = torch::empty_like(input);
kernel::rms_norm(output, input, weight_, eps_);
return output;
8 changes: 5 additions & 3 deletions src/layers/pos_embedding.cpp
@@ -8,6 +8,7 @@
#include "common/logging.h"
#include "kernels/pos_embedding_kernels.h"

DECLARE_bool(disable_custom_kernels);
namespace llm {

namespace {
@@ -57,7 +58,8 @@ std::shared_ptr<RotaryEmbeddingImpl> create(int64_t rotary_dim,
bool interleaved,
torch::ScalarType dtype,
const torch::Device& device) {
if (device.is_cuda()) {
if (device.is_cuda() && !FLAGS_disable_custom_kernels) {
// use custom kernels
return std::make_shared<RotaryEmbeddingKernel>(rotary_dim,
max_position_embeddings,
scaling_factor,
@@ -146,7 +148,7 @@ RotaryEmbeddingGeneric::RotaryEmbeddingGeneric(int64_t rotary_dim,

const auto cos_sin = torch::cat({emd.cos(), emd.sin()}, /*dim=*/-1);
const auto options = torch::dtype(dtype).device(device);
cos_sin_cache_ = register_buffer("cos_sin_cached", cos_sin);
cos_sin_cache_ = register_buffer("cos_sin_cache", cos_sin.to(options));
}

// inplace rotary positional embedding
@@ -181,8 +183,8 @@ RotaryEmbeddingKernel::RotaryEmbeddingKernel(int64_t rotary_dim,
: rotary_dim_(rotary_dim), interleaved_(interleaved) {
const auto freqs = detail::compute_freqs(
max_position_embeddings, rotary_dim, scaling_factor, theta);
const auto cos_sin = torch::cat({freqs.cos(), freqs.sin()}, /*dim=*/-1);

const auto cos_sin = torch::cat({freqs.cos(), freqs.sin()}, /*dim=*/-1);
const auto options = torch::dtype(dtype).device(device);
cos_sin_cache_ = register_buffer("cos_sin_cache", cos_sin.to(options));
}
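This file also renames the generic rotary embedding's buffer to cos_sin_cache and casts it to the requested dtype/device before registering it, matching what the kernel variant already did. A minimal sketch of that register-then-cast idiom follows; the module and the frequency computation are simplified stand-ins, not the repository's implementation.

```cpp
// Sketch: caching rotary cos/sin values as a module buffer (illustration only).
#include <torch/torch.h>

struct RotaryCacheImpl : torch::nn::Module {
  RotaryCacheImpl(int64_t max_positions, int64_t rotary_dim,
                  torch::ScalarType dtype, const torch::Device& device) {
    // Frequencies are computed in float32 for accuracy...
    const auto pos = torch::arange(max_positions, torch::kFloat32);
    const auto inv_freq = torch::pow(
        10000.0, -torch::arange(0, rotary_dim, 2, torch::kFloat32) / rotary_dim);
    const auto freqs = pos.unsqueeze(1) * inv_freq.unsqueeze(0);
    const auto cos_sin = torch::cat({freqs.cos(), freqs.sin()}, /*dim=*/-1);
    // ...and the cache is cast to the model's dtype/device when registered,
    // mirroring the change made here to the generic path.
    const auto options = torch::dtype(dtype).device(device);
    cos_sin_cache_ = register_buffer("cos_sin_cache", cos_sin.to(options));
  }

  torch::Tensor cos_sin_cache_;
};
TORCH_MODULE(RotaryCache);
```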
4 changes: 2 additions & 2 deletions src/model_loader/model_loader.cpp
@@ -258,8 +258,8 @@ bool HFModelLoader::load_model_args(const std::string& model_weights_path) {

// always use float16 for quantization
if (!quant_args_.quant_method().empty() && args_.dtype() != "float16") {
LOG(WARNING) << "Overwriting dtype from " << args_.dtype() << " to float16 "
<< "for quantization";
GLOG(WARNING) << "Overwriting dtype from " << args_.dtype()
<< " to float16 for quantization";
args_.dtype() = "float16";
}

2 changes: 1 addition & 1 deletion src/server/main.cpp
@@ -91,7 +91,7 @@ int main(int argc, char** argv) {

// check if model path exists
if (!std::filesystem::exists(FLAGS_model_path)) {
LOG(FATAL) << "Model path " << FLAGS_model_path << " does not exist.";
GLOG(FATAL) << "Model path " << FLAGS_model_path << " does not exist.";
}

if (FLAGS_model_id.empty()) {
