Skip to content

Commit

Permalink
cuda : improve cuda pool efficiency using virtual memory (#4606)
Browse files Browse the repository at this point in the history
* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
  • Loading branch information
slaren and JohannesGaessler authored Dec 24, 2023
1 parent 708e179 commit 5bf3953
Show file tree
Hide file tree
Showing 8 changed files with 328 additions and 208 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,8 @@ if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)

if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
Expand Down
6 changes: 2 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -367,17 +367,15 @@ endif # LLAMA_BLIS

ifdef LLAMA_CUBLAS
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
MK_NVCCFLAGS = -use_fast_math
ifndef JETSON_EOL_MODULE_DETECT
MK_NVCCFLAGS += --forward-unknown-to-host-compiler
endif # JETSON_EOL_MODULE_DETECT

ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo
endif

endif # LLAMA_DEBUG
ifdef LLAMA_CUDA_NVCC
NVCC = $(LLAMA_CUDA_NVCC)
else
Expand Down
16 changes: 6 additions & 10 deletions ggml-backend.c
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ static void ggml_backend_registry_init(void) {
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);

int id = ggml_backend_registry_count;
size_t id = ggml_backend_registry_count;

ggml_backend_registry[id] = (struct ggml_backend_reg) {
/* .name = */ {0},
Expand Down Expand Up @@ -330,6 +330,8 @@ size_t ggml_backend_reg_find_by_name(const char * name) {
return i;
}
}

// not found
return SIZE_MAX;
}

Expand All @@ -340,15 +342,15 @@ ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str)
const char * params = strchr(backend_str, ':');
char backend_name[128];
if (params == NULL) {
strcpy(backend_name, backend_str);
snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
params = "";
} else {
strncpy(backend_name, backend_str, params - backend_str);
backend_name[params - backend_str] = '\0';
snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
params++;
}

size_t backend_i = ggml_backend_reg_find_by_name(backend_name);

if (backend_i == SIZE_MAX) {
fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
return NULL;
Expand Down Expand Up @@ -396,18 +398,12 @@ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
}

static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

memcpy((char *)tensor->data + offset, data, size);

GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

memcpy(data, (const char *)tensor->data + offset, size);

GGML_UNUSED(buffer);
Expand Down
Loading

6 comments on commit 5bf3953

@whoreson
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm this commit broke CUDA 10 compilation again.

ggml-cuda.cu(6685): error: identifier "CUmemAllocationProp" is undefined

ggml-cuda.cu(6686): error: identifier "CU_MEM_ALLOCATION_TYPE_PINNED" is undefined

ggml-cuda.cu(6687): error: identifier "CU_MEM_LOCATION_TYPE_DEVICE" is undefined

ggml-cuda.cu(6689): error: identifier "CUmemGenericAllocationHandle" is undefined

ggml-cuda.cu(6690): error: identifier "cuMemCreate" is undefined

ggml-cuda.cu(6694): error: identifier "cuMemAddressReserve" is undefined

ggml-cuda.cu(6698): error: identifier "cuMemMap" is undefined

ggml-cuda.cu(6701): error: identifier "cuMemRelease" is undefined

ggml-cuda.cu(6704): error: identifier "CUmemAccessDesc" is undefined

ggml-cuda.cu(6707): error: identifier "CU_MEM_ACCESS_FLAGS_PROT_READWRITE" is undefined

ggml-cuda.cu(6708): error: identifier "cuMemSetAccess" is undefined

ggml-cuda.cu(6842): error: identifier "CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED" is undefined

ggml-cuda.cu(6845): error: identifier "CUmemAllocationProp" is undefined

ggml-cuda.cu(6846): error: identifier "CU_MEM_ALLOCATION_TYPE_PINNED" is undefined
ggml-cuda.cu(6847): error: identifier "CU_MEM_LOCATION_TYPE_DEVICE" is undefined

ggml-cuda.cu(6849): error: identifier "CU_MEM_ALLOC_GRANULARITY_RECOMMENDED" is undefined

ggml-cuda.cu(6849): error: identifier "cuMemGetAllocationGranularity" is undefined

@LostRuins
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're trying to build for CUDA with Conda, and since this commit we've been getting a linker(?) error: `../x86_64-conda-linux-gnu/bin/ld: cannot find -lcuda: No such file or directory`. This is using the same library includes as the Makefile. Is it possible that the library path is different, or that something else is missing?

cc: @henk717

@henk717
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The issue for us is bigger than that: -lcuda tries to target the driver directly, while we are trying to create a universal binary for our end users.
GitHub CI doesn't have a GPU, so we can't install libcuda on it; it is part of the driver, so I have no way to install it.

@ggerganov
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@LostRuins You need to add the path to the CUDA driver library. The llama.cpp Makefile does not cover all possible locations. You can open a PR to add the path suitable for Conda:

MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib

@henk717 I'm not sure what the common practice is in this case, but surely there has to be a way to link CUDA. A quick search shows that people are using stub libraries:

https://stackoverflow.com/questions/20186848/can-i-compile-a-cuda-program-without-having-a-cuda-device

@henk717
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The stub was a good shout; I had been misdirected last night into looking for the non-stub library.
With your reminder and a good night's sleep I've got it now: in our case the stub is provided by the conda package cuda-driver-dev, so I can work with that package.

@LostRuins
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think adding -Lconda/envs/linux/lib -Lconda/envs/linux/lib/stubs does seem to work for conda, although I wonder if there are any other cases missing.

Please sign in to comment.