Commit abfb6dc

feat: sync llama.cpp

jhen0409 committed Jan 19, 2024
1 parent 08d0a13 commit abfb6dc
Showing 7 changed files with 35 additions and 19 deletions.
cpp/common.cpp (10 additions, 0 deletions)
@@ -687,6 +687,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.hellaswag_tasks = std::stoi(argv[i]);
+        } else if (arg == "--winogrande") {
+            params.winogrande = true;
+        } else if (arg == "--winogrande-tasks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.winogrande_tasks = std::stoi(argv[i]);
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
@@ -932,6 +940,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --logits-all              return logits for all tokens in the batch (default: disabled)\n");
     printf("  --hellaswag               compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf("  --hellaswag-tasks N       number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
+    printf("  --winogrande              compute Winogrande score over random tasks from datafile supplied with -f\n");
+    printf("  --winogrande-tasks N      number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
     printf("  --keep N                  number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     printf("  --draft N                 number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
     printf("  --chunks N                max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
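
The two new switches mirror the existing --hellaswag pair: a bare boolean flag, plus a flag that consumes the next argument behind the ++i >= argc guard. A minimal standalone sketch of the same pattern (hypothetical demo code, not part of common.cpp):

// Sketch of the flag-parsing pattern above; names are illustrative only.
#include <cstdio>
#include <string>

struct demo_params {
    bool   winogrande       = false;
    size_t winogrande_tasks = 0; // 0 = use all tasks
};

int main(int argc, char ** argv) {
    demo_params params;

    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "--winogrande") {
            params.winogrande = true;                 // bare switch, no value
        } else if (arg == "--winogrande-tasks") {
            if (++i >= argc) {                        // guard: switch given without a value
                fprintf(stderr, "error: missing value for --winogrande-tasks\n");
                return 1;
            }
            params.winogrande_tasks = std::stoi(argv[i]);
        }
    }

    printf("winogrande=%d, tasks=%zu\n", params.winogrande, params.winogrande_tasks);
    return 0;
}

Invoked as ./demo --winogrande --winogrande-tasks 200, the sketch prints winogrande=1, tasks=200.
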
cpp/common.h (3 additions, 0 deletions)
@@ -94,6 +94,9 @@ struct gpt_params {
     bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
 
+    bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+
     bool mul_mat_q     = true;  // if true, use mul_mat_q kernels instead of cuBLAS
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color     = false; // use color to distinguish generations and inputs
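
Note that winogrande_tasks defaults to 0, which the comment defines as "compute all tasks", whereas hellaswag_tasks defaults to 400. A consumer would presumably resolve the sentinel along these lines (hypothetical helper, not code from this commit):

// "0 means all" convention for winogrande_tasks, as a standalone helper.
#include <algorithm>
#include <cstddef>

size_t resolve_task_count(size_t requested, size_t available) {
    // requested == 0 is a sentinel for "use every task in the datafile";
    // otherwise clamp to the number of tasks actually loaded.
    return requested == 0 ? available : std::min(requested, available);
}
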
cpp/ggml-metal.m (6 additions, 10 deletions)
@@ -238,21 +238,19 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format,
 static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
-    id<MTLDevice> device;
-    NSString * s;
 
-#if TARGET_OS_OSX
+#if TARGET_OS_OSX && !LM_GGML_METAL_NDEBUG
     // Show all the Metal device instances in the system
     NSArray * devices = MTLCopyAllDevices();
-    for (device in devices) {
-        s = [device name];
+    for (id<MTLDevice> device in devices) {
+        NSString * s = [device name];
         LM_GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
     }
     [devices release]; // since it was created by a *Copy* C method
 #endif
 
     // Pick and show default Metal device
-    device = MTLCreateSystemDefaultDevice();
-    s = [device name];
+    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+    NSString * s = [device name];
     LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
 
     // Configure context
@@ -712,7 +710,6 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx,
 static bool lm_ggml_metal_graph_compute(
         struct lm_ggml_metal_context * ctx,
                struct lm_ggml_cgraph * gf) {
-    @autoreleasepool {
 
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
     edesc.dispatchType = MTLDispatchTypeSerial;
@@ -2255,7 +2252,6 @@ static bool lm_ggml_metal_graph_compute(
         }
 
     return true;
-    }
 }
 
 ////////////////////////////////////////////////////////////////////////////////
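
Two cleanups are visible in lm_ggml_metal_init: the device listing is now compiled only when LM_GGML_METAL_NDEBUG is not defined, and the temporaries device and s are declared at their point of use instead of at function scope. The same shape in a portable C++ sketch (hypothetical names throughout; MTLCopyAllDevices and the other Objective-C APIs are not reproduced here):

// Conditional-compilation plus narrow-scope pattern from the hunk above.
#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> enumerate_devices() {
    return {"device A", "device B"};   // stand-in for MTLCopyAllDevices()
}

static void init_backend() {
#if !defined(MY_BACKEND_NDEBUG)        // stand-in for !LM_GGML_METAL_NDEBUG
    // Debug builds only: show every device in the system
    for (const std::string & name : enumerate_devices()) { // loop-scoped, like 'device' above
        printf("found device: %s\n", name.c_str());
    }
#endif
    printf("picking default device\n");
}

int main() {
    init_backend();
    return 0;
}
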
cpp/llama.cpp (11 additions, 4 deletions)
@@ -1610,7 +1610,7 @@ struct llama_model {
     std::unique_ptr<llama_mmap> mapping;
 
     // objects representing data potentially being locked in memory
-    llama_mlock mlock_buf;
+    std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
     llama_mlock mlock_mmap;
 
     // for quantize-stats only
@@ -3449,7 +3449,12 @@ static bool llm_load_tensors(
             {
                 model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                if (lm_gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                } else {
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
+                    ml.n_created--; // artificial tensor
+                }
             }
 
             for (int i = 0; i < n_layer; ++i) {
@@ -3826,8 +3831,10 @@ static bool llm_load_tensors(
         else {
             buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf != nullptr && use_mlock && lm_ggml_backend_buffer_is_host(buf)) {
-                model.mlock_buf.init   (lm_ggml_backend_buffer_get_base(buf));
-                model.mlock_buf.grow_to(lm_ggml_backend_buffer_get_size(buf));
+                model.mlock_bufs.emplace_back(new llama_mlock);
+                auto & mlock_buf = model.mlock_bufs.back();
+                mlock_buf->init   (lm_ggml_backend_buffer_get_base(buf));
+                mlock_buf->grow_to(lm_ggml_backend_buffer_get_size(buf));
             }
         }
         if (buf == nullptr) {
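
Two behaviors change here. The output-tensor hunk appears to handle models whose output projection is tied to the token embeddings: when no separate output weight exists in the GGUF file, the loader reuses the token-embedding tensor and decrements n_created so the tensor accounting still balances. The mlock hunks replace the single mlock_buf with one llama_mlock per backend buffer, since weights may now live in several host buffers. A reduced sketch of that ownership pattern (lock_t is a hypothetical stand-in for llama_mlock; init/grow_to would wrap mlock(2) in the real implementation):

// One lock object per host buffer, owned through unique_ptr so each
// lock's address stays stable as the vector grows.
#include <cstddef>
#include <cstdio>
#include <memory>
#include <vector>

struct lock_t {
    void * addr = nullptr;
    size_t size = 0;
    void init(void * a)    { addr = a; } // remember the buffer base address
    void grow_to(size_t s) { size = s; } // lock the first s bytes
};

struct model_t {
    std::vector<std::unique_ptr<lock_t>> mlock_bufs;
};

static void lock_host_buffer(model_t & model, void * base, size_t size) {
    model.mlock_bufs.emplace_back(new lock_t);
    auto & mlock_buf = model.mlock_bufs.back();
    mlock_buf->init(base);
    mlock_buf->grow_to(size);
}

int main() {
    model_t model;
    static char buf[4096];               // pretend backend buffer
    lock_host_buffer(model, buf, sizeof(buf));
    printf("locked %zu buffer(s)\n", model.mlock_bufs.size());
    return 0;
}
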
example/src/App.tsx (1 addition, 1 deletion)
@@ -105,7 +105,7 @@ export default function App() {
     initLlama({
       model: file.uri,
       use_mlock: true,
-      n_gpu_layers: 0, // > 0: enable GPU
+      n_gpu_layers: Platform.OS === 'ios' ? 1 : 0, // > 0: enable GPU
       // embedding: true,
     })
       .then((ctx) => {
scripts/ggml-metal.m.patch (3 additions, 3 deletions)
@@ -1,6 +1,6 @@
---- ggml-metal.m.orig	2024-01-18 11:47:27
-+++ ggml-metal.m	2024-01-18 11:47:29
-@@ -290,7 +290,7 @@
+--- ggml-metal.m.orig	2024-01-19 10:06:53
++++ ggml-metal.m	2024-01-19 10:06:54
+@@ -288,7 +288,7 @@
      if (ggmlMetalPathResources) {
          sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"];
      } else {
